Compare commits

...

3 Commits
v25.4 ... v25.7

Author SHA1 Message Date
Jay D Dee
8f2f9ec3e9 v25.7 2025-11-15 10:44:32 -05:00
Jay D Dee
12480a3ea5 v25.6 2025-07-20 19:43:10 -04:00
Jay D Dee
aa47e880d5 v25.5 2025-07-09 01:32:38 -04:00
37 changed files with 1158 additions and 1231 deletions

View File

@@ -54,9 +54,9 @@ Supported Algorithms
allium Garlicoin allium Garlicoin
anime Animecoin anime Animecoin
argon2 Argon2 coin (AR2)
argon2d250 argon2d250
argon2d500 argon2d500
argon2d1000
argon2d4096 argon2d4096
blake Blake-256 blake Blake-256
blake2b Blake2-512 blake2b Blake2-512

View File

@@ -75,6 +75,25 @@ If not what makes it happen or not happen?
Change Log Change Log
---------- ----------
v25.7
Fixed a bug calculating TTF longer than 1 year.
Faster argon2d.
Faster hamsi AVX512.
Faster switfftx AVX2.
Other small fixes and improvements.
v25.6
Added argon2d1000, argon2d16000 algos.
Target specific AES optimizations improve shavite for ARM64 & x86_64.
v25.5
x86_64: Fixed an insidious bug in sha256 early rejection optimization for AVX2 & AVX512.
x86_64: Faster sha256d, sha256dt for AVX2 & AVX512.
Other small bug fixes.
v25.4 v25.4
x86_64: improved handling of vector constants used for byte permutations. x86_64: improved handling of vector constants used for byte permutations.

View File

@@ -297,6 +297,8 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_ANIME: rc = register_anime_algo ( gate ); break; case ALGO_ANIME: rc = register_anime_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d250_algo ( gate ); break; case ALGO_ARGON2D250: rc = register_argon2d250_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d500_algo ( gate ); break; case ALGO_ARGON2D500: rc = register_argon2d500_algo ( gate ); break;
case ALGO_ARGON2D1000: rc = register_argon2d1000_algo ( gate ); break;
case ALGO_ARGON2D16000: rc = register_argon2d16000_algo ( gate ); break;
case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break; case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break;
case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break; case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break;
case ALGO_BLAKE: rc = register_blake_algo ( gate ); break; case ALGO_BLAKE: rc = register_blake_algo ( gate ); break;

View File

@@ -172,8 +172,11 @@ void ( *set_work_data_endian ) ( struct work* );
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* ); json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
// Deprecated
set_t optimizations; set_t optimizations;
int ( *get_work_data_size ) (); int ( *get_work_data_size ) ();
int ntime_index; int ntime_index;
int nbits_index; int nbits_index;
int nonce_index; // use with caution, see warning below int nonce_index; // use with caution, see warning below
@@ -274,8 +277,6 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx ); void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx ); void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
// OpenSSL sha256 deprecated
//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
bool std_le_work_decode( struct work *work ); bool std_le_work_decode( struct work *work );
bool std_be_work_decode( struct work *work ); bool std_be_work_decode( struct work *work );

View File

@@ -6,6 +6,38 @@ static const size_t INPUT_BYTES = 80; // Lenth of a block header in bytes. Inpu
static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS
// generic, works with most variations of argon2d
int scanhash_argon2d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const int thr_id = mythr->id;
const uint32_t first_nonce = (const uint32_t)pdata[19];
const uint32_t last_nonce = (const uint32_t)max_nonce;
uint32_t nonce = first_nonce;
const bool bench = opt_benchmark;
v128_bswap32_80( edata, pdata );
do
{
edata[19] = nonce;
algo_gate.hash( hash, edata, thr_id );
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( nonce );
submit_solution( work, hash, mythr );
}
nonce++;
} while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 0;
}
void argon2d250_hash( void *output, const void *input ) void argon2d250_hash( void *output, const void *input )
{ {
argon2_context context; argon2_context context;
@@ -32,41 +64,10 @@ void argon2d250_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d ); argon2_ctx( &context, Argon2_d );
} }
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;
swab32_array( edata, pdata, 20 );
do {
be32enc(&edata[19], nonce);
argon2d250_hash( hash, edata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash, mythr );
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
bool register_argon2d250_algo( algo_gate_t* gate ) bool register_argon2d250_algo( algo_gate_t* gate )
{ {
gate->scanhash = (void*)&scanhash_argon2d250; gate->scanhash = (void*)&scanhash_argon2d;
gate->hash = (void*)&argon2d250_hash; gate->hash = (void*)&argon2d250_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 65536.0; opt_target_factor = 65536.0;
return true; return true;
} }
@@ -97,43 +98,78 @@ void argon2d500_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d ); argon2_ctx( &context, Argon2_d );
} }
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const int thr_id = mythr->id;
const uint32_t first_nonce = (const uint32_t)pdata[19];
const uint32_t last_nonce = (const uint32_t)max_nonce;
uint32_t nonce = first_nonce;
const bool bench = opt_benchmark;
v128_bswap32_80( edata, pdata );
do
{
edata[19] = nonce;
argon2d500_hash( hash, edata );
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
&& !bench ) )
{
pdata[19] = bswap_32( nonce );;
submit_solution( work, hash, mythr );
}
nonce++;
} while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 0;
}
bool register_argon2d500_algo( algo_gate_t* gate ) bool register_argon2d500_algo( algo_gate_t* gate )
{ {
gate->scanhash = (void*)&scanhash_argon2d500; gate->scanhash = (void*)&scanhash_argon2d;
gate->hash = (void*)&argon2d500_hash; gate->hash = (void*)&argon2d500_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT; opt_target_factor = 65536.0;
return true;
}
void argon2d1000_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
context.outlen = (uint32_t)OUTPUT_BYTES;
context.pwd = (uint8_t *)input;
context.pwdlen = (uint32_t)INPUT_BYTES;
context.salt = (uint8_t *)input; //salt = input
context.saltlen = (uint32_t)INPUT_BYTES;
context.secret = NULL;
context.secretlen = 0;
context.ad = NULL;
context.adlen = 0;
context.allocate_cbk = NULL;
context.free_cbk = NULL;
context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
// main configurable Argon2 hash parameters
context.m_cost = 1000; // Memory in KiB (1MB)
context.lanes = 8; // Degree of Parallelism
context.threads = 1; // Threads
context.t_cost = 2; // Iterations
context.version = ARGON2_VERSION_10;
argon2_ctx( &context, Argon2_d );
}
bool register_argon2d1000_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d;
gate->hash = (void*)&argon2d1000_hash;
opt_target_factor = 65536.0;
return true;
}
void argon2d16000_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
context.outlen = (uint32_t)OUTPUT_BYTES;
context.pwd = (uint8_t *)input;
context.pwdlen = (uint32_t)INPUT_BYTES;
context.salt = (uint8_t *)input; //salt = input
context.saltlen = (uint32_t)INPUT_BYTES;
context.secret = NULL;
context.secretlen = 0;
context.ad = NULL;
context.adlen = 0;
context.allocate_cbk = NULL;
context.free_cbk = NULL;
context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
// main configurable Argon2 hash parameters
context.m_cost = 16000; // Memory in KiB (~16384KB)
context.lanes = 1; // Degree of Parallelism
context.threads = 1; // Threads
context.t_cost = 1; // Iterations
context.version = ARGON2_VERSION_10;
argon2_ctx( &context, Argon2_d );
}
bool register_argon2d16000_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d;
gate->hash = (void*)&argon2d16000_hash;
opt_target_factor = 65536.0; opt_target_factor = 65536.0;
return true; return true;
} }
@@ -148,7 +184,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = (const uint32_t)max_nonce; const uint32_t last_nonce = (const uint32_t)max_nonce;
uint32_t n = first_nonce; uint32_t n = first_nonce;
const int thr_id = mythr->id; // thr_id arg is deprecated const int thr_id = mythr->id;
uint32_t t_cost = 1; // 1 iteration uint32_t t_cost = 1; // 1 iteration
uint32_t m_cost = 4096; // use 4MB uint32_t m_cost = 4096; // use 4MB
uint32_t parallelism = 1; // 1 thread, 2 lanes uint32_t parallelism = 1; // 1 thread, 2 lanes
@@ -176,7 +212,6 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
bool register_argon2d4096_algo( algo_gate_t* gate ) bool register_argon2d4096_algo( algo_gate_t* gate )
{ {
gate->scanhash = (void*)&scanhash_argon2d4096; gate->scanhash = (void*)&scanhash_argon2d4096;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT |NEON_OPT;
opt_target_factor = 65536.0; opt_target_factor = 65536.0;
return true; return true;
} }

View File

@@ -4,22 +4,27 @@
#include "algo-gate-api.h" #include "algo-gate-api.h"
#include <stdint.h> #include <stdint.h>
int scanhash_argon2d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Credits: version = 0x10, m_cost = 250. // Credits: version = 0x10, m_cost = 250.
bool register_argon2d250_algo( algo_gate_t* gate ); bool register_argon2d250_algo( algo_gate_t* gate );
void argon2d250_hash( void *state, const void *input ); void argon2d250_hash( void *state, const void *input );
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Dynamic: version = 0x10, m_cost = 500. // Dynamic: version = 0x10, m_cost = 500.
bool register_argon2d500_algo( algo_gate_t* gate ); bool register_argon2d500_algo( algo_gate_t* gate );
void argon2d500_hash( void *state, const void *input ); void argon2d500_hash( void *state, const void *input );
int scanhash_argon2d500( struct work *work, uint32_t max_nonce, // Zero Dynamics Cash: version = 0x10, m_cost = 1000.
uint64_t *hashes_done, struct thr_info *mythr ); bool register_argon2d1000_algo( algo_gate_t* gate );
void argon2d1000_hash( void *state, const void *input );
bool register_argon2d16000_algo( algo_gate_t* gate );
void argon2d16000_hash( void *state, const void *input );
// Unitus: version = 0x13, m_cost = 4096. // Unitus: version = 0x13, m_cost = 4096.
bool register_argon2d4096_algo( algo_gate_t* gate ); bool register_argon2d4096_algo( algo_gate_t* gate );

View File

@@ -66,82 +66,60 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#if defined(__SSSE3__) || defined(__ARM_NEON) #if defined(__SSSE3__) || defined(__ARM_NEON)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ #define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
do { \ { \
v128_t t0 = v128_alignr8(B1, B0, 8); \ v128_t t = v128_alignr8( B1, B0, 8 ); \
v128_t t1 = v128_alignr8(B0, B1, 8); \ B1 = v128_alignr8( B0, B1, 8 ); \
B0 = t0; \ B0 = t; \
B1 = t1; \ t = v128_alignr8( D1, D0, 8 ); \
\ D0 = v128_alignr8( D0, D1, 8 ); \
t0 = C0; \ D1 = t; \
C0 = C1; \ }
C1 = t0; \
\
t0 = v128_alignr8(D1, D0, 8); \
t1 = v128_alignr8(D0, D1, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ #define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
do { \ { \
v128_t t0 = v128_alignr8(B0, B1, 8); \ v128_t t = v128_alignr8( B0, B1, 8 ); \
v128_t t1 = v128_alignr8(B1, B0, 8); \ B1 = v128_alignr8( B1, B0, 8 ); \
B0 = t0; \ B0 = t; \
B1 = t1; \ t = v128_alignr8( D0, D1, 8 ); \
\ D0 = v128_alignr8( D1, D0, 8 ); \
t0 = C0; \ D1 = t; \
C0 = C1; \ }
C1 = t0; \
\
t0 = v128_alignr8(D0, D1, 8); \
t1 = v128_alignr8(D1, D0, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#else /* SSE2 */ #else /* SSE2 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ #define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
do { \ { \
v128_t t0 = D0; \ v128_t t = D0; \
v128_t t1 = B0; \ D0 = v128_unpackhi64( D1, v128_unpacklo64( D0, D0 ) ); \
D0 = C0; \ D1 = v128_unpackhi64( t, v128_unpacklo64( D1, D1 ) ); \
C0 = C1; \ t = B0; \
C1 = D0; \ B0 = v128_unpackhi64( B0, v128_unpacklo64( B1, B1 ) ); \
D0 = v128_unpackhi64(D1, v128_unpacklo64(t0, t0)); \ B1 = v128_unpackhi64( B1, v128_unpacklo64( t, t ) ); \
D1 = v128_unpackhi64(t0, v128_unpacklo64(D1, D1)); \ }
B0 = v128_unpackhi64(B0, v128_unpacklo64(B1, B1)); \
B1 = v128_unpackhi64(B1, v128_unpacklo64(t1, t1)); \ #define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
} while ((void)0, 0) { \
v128_t t = B0; \
B0 = v128_unpackhi64( B1, v128_unpacklo64( B0, B0 ) ); \
B1 = v128_unpackhi64( t, v128_unpacklo64( B1, B1 ) ); \
t = D0; \
D0 = v128_unpackhi64( D0, v128_unpacklo64( D1, D1 ) ); \
D1 = v128_unpackhi64( D1, v128_unpacklo64( t, t ) ); \
}
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0, t1; \
t0 = C0; \
C0 = C1; \
C1 = t0; \
t0 = B0; \
t1 = D0; \
B0 = v128_unpackhi64(B1, v128_unpacklo64(B0, B0)); \
B1 = v128_unpackhi64(t0, v128_unpacklo64(B1, B1)); \
D0 = v128_unpackhi64(D0, v128_unpacklo64(D1, D1)); \
D1 = v128_unpackhi64(D1, v128_unpacklo64(t1, t1)); \
} while ((void)0, 0)
#endif #endif
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \ #define BLAKE2_ROUND( A0, A1, B0, B1, C0, C1, D0, D1 ) \
do { \ { \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \ G1( A0, B0, C0, D0, A1, B1, C1, D1 ); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \ G2( A0, B0, C0, D0, A1, B1, C1, D1 ); \
\ DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ G1( A0, B0, C1, D0, A1, B1, C0, D1 ); \
\ G2( A0, B0, C1, D0, A1, B1, C0, D1 ); \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \ UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \ }
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)
#else /* __AVX2__ */ #else /* __AVX2__ */
#include <immintrin.h> #include <immintrin.h>
@@ -211,7 +189,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \ B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \ D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \ B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \ D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
@@ -219,17 +196,14 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ #define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \ do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0x33); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0xCC); \
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
tmp1 = C0; \ B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
C0 = C1; \
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
C1 = tmp1; \
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \ tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
} while(0); } while(0);
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ #define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -237,7 +211,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \ B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \ D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \ B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \ D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
@@ -247,27 +220,21 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
do { \ do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
tmp1 = C0; \ B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
C0 = C1; \
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \ tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
C1 = tmp1; \
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \ tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
} while((void)0, 0); } while((void)0, 0);
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \ #define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
do{ \ do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
} while((void)0, 0); } while((void)0, 0);
@@ -275,12 +242,9 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
do{ \ do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
\ G1_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G2_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
} while((void)0, 0); } while((void)0, 0);
@@ -290,12 +254,73 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#include <immintrin.h> #include <immintrin.h>
/*
static inline __m512i muladd(__m512i x, __m512i y) static inline __m512i muladd(__m512i x, __m512i y)
{ {
__m512i z = _mm512_mul_epu32(x, y); __m512i z = _mm512_mul_epu32(x, y);
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z)); return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
} }
*/
#define G1( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
__m512i z0, z1; \
z0 = _mm512_mul_epu32( A0, B0 ); \
z1 = _mm512_mul_epu32( A1, B1 ); \
A0 = _mm512_add_epi64( A0, B0 ); \
A1 = _mm512_add_epi64( A1, B1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
A0 = _mm512_add_epi64( A0, z0 ); \
A1 = _mm512_add_epi64( A1, z1 ); \
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
D0 = _mm512_ror_epi64(D0, 32); \
D1 = _mm512_ror_epi64(D1, 32); \
z0 = _mm512_mul_epu32( C0, D0 ); \
z1 = _mm512_mul_epu32( C1, D1 ); \
C0 = _mm512_add_epi64( C0, D0 ); \
C1 = _mm512_add_epi64( C1, D1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
C0 = _mm512_add_epi64( C0, z0 ); \
C1 = _mm512_add_epi64( C1, z1 ); \
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
B0 = _mm512_ror_epi64(B0, 24); \
B1 = _mm512_ror_epi64(B1, 24); \
}
#define G2( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
__m512i z0, z1; \
z0 = _mm512_mul_epu32( A0, B0 ); \
z1 = _mm512_mul_epu32( A1, B1 ); \
A0 = _mm512_add_epi64( A0, B0 ); \
A1 = _mm512_add_epi64( A1, B1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
A0 = _mm512_add_epi64( A0, z0 ); \
A1 = _mm512_add_epi64( A1, z1 ); \
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
D0 = _mm512_ror_epi64(D0, 16); \
D1 = _mm512_ror_epi64(D1, 16); \
z0 = _mm512_mul_epu32( C0, D0 ); \
z1 = _mm512_mul_epu32( C1, D1 ); \
C0 = _mm512_add_epi64( C0, D0 ); \
C1 = _mm512_add_epi64( C1, D1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
C0 = _mm512_add_epi64( C0, z0 ); \
C1 = _mm512_add_epi64( C1, z1 ); \
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
B0 = _mm512_ror_epi64(B0, 63); \
B1 = _mm512_ror_epi64(B1, 63); \
}
/*
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \ #define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \ do { \
A0 = muladd(A0, B0); \ A0 = muladd(A0, B0); \
@@ -316,7 +341,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_ror_epi64(B0, 24); \ B0 = _mm512_ror_epi64(B0, 24); \
B1 = _mm512_ror_epi64(B1, 24); \ B1 = _mm512_ror_epi64(B1, 24); \
} while ((void)0, 0) } while ((void)0, 0)
*/
/*
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ #define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \ do { \
A0 = muladd(A0, B0); \ A0 = muladd(A0, B0); \
@@ -337,15 +363,14 @@ static inline __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_ror_epi64(B0, 63); \ B0 = _mm512_ror_epi64(B0, 63); \
B1 = _mm512_ror_epi64(B1, 63); \ B1 = _mm512_ror_epi64(B1, 63); \
} while ((void)0, 0) } while ((void)0, 0)
*/
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \ do { \
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \ B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \ B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
\
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \ D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \ D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
} while ((void)0, 0) } while ((void)0, 0)
@@ -354,10 +379,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
do { \ do { \
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \ B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \ B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
\
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \ D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \ D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
} while ((void)0, 0) } while ((void)0, 0)
@@ -366,15 +389,17 @@ static inline __m512i muladd(__m512i x, __m512i y)
do { \ do { \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \ G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \ G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
G1(A0, B0, C0, D0, A1, B1, C1, D1); \ G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \ G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0) } while ((void)0, 0)
static const __m512i swap_q0 = { 0,1, 8,9, 2,3, 10,11 };
static const __m512i swap_q1 = { 4,5, 12,13, 6,7, 14,15 };
static const __m512i uswap_q0 = { 0,1, 4,5, 8,9, 12,13 };
static const __m512i uswap_q1 = { 2,3, 6,7, 10,11, 14,15 };
#define SWAP_HALVES(A0, A1) \ #define SWAP_HALVES(A0, A1) \
do { \ do { \
__m512i t; \ __m512i t; \
@@ -383,19 +408,36 @@ static inline __m512i muladd(__m512i x, __m512i y)
A0 = t; \ A0 = t; \
} while((void)0, 0) } while((void)0, 0)
#define SWAP_QUARTERS(A0, A1) \
{ \
__m512i t = _mm512_permutex2var_epi64( A0, swap_q0, A1 ); \
A1 = _mm512_permutex2var_epi64( A0, swap_q1, A1 ); \
A0 = t; \
}
#define UNSWAP_QUARTERS(A0, A1) \
{ \
__m512i t = _mm512_permutex2var_epi64( A0, uswap_q0, A1 ); \
A1 = _mm512_permutex2var_epi64( A0, uswap_q1, A1 ); \
A0 = t; \
}
/*
#define SWAP_QUARTERS(A0, A1) \ #define SWAP_QUARTERS(A0, A1) \
do { \ do { \
SWAP_HALVES(A0, A1); \ SWAP_HALVES(A0, A1); \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \ A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \ A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
} while((void)0, 0) } while((void)0, 0)
*/
/*
#define UNSWAP_QUARTERS(A0, A1) \ #define UNSWAP_QUARTERS(A0, A1) \
do { \ do { \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \ A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \ A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
SWAP_HALVES(A0, A1); \ SWAP_HALVES(A0, A1); \
} while((void)0, 0) } while((void)0, 0)
*/
#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \ #define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
do { \ do { \

View File

@@ -683,8 +683,9 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
mj[14] = mm256_rol_64( M[14], 15 ); mj[14] = mm256_rol_64( M[14], 15 );
mj[15] = mm256_rol_64( M[15], 16 ); mj[15] = mm256_rol_64( M[15], 16 );
__m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL ); __m256i K = _mm256_set1_epi64x( 0x5555555555555550ULL );
const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL ); static const __m256i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL };
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K ); qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm256_add_epi64( K, Kincr ); K = _mm256_add_epi64( K, Kincr );
@@ -1094,7 +1095,7 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
__m512i dH[16] ) __m512i dH[16] )
{ {
__m512i qt[32], xl, xh; __m512i qt[32], xl, xh;
__m512i mh[16]; __m512i mh[16], mj[16];
int i; int i;
for ( i = 0; i < 16; i++ ) for ( i = 0; i < 16; i++ )
@@ -1117,8 +1118,6 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] ); qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] ); qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
__m512i mj[16];
mj[ 0] = mm512_rol_64( M[ 0], 1 ); mj[ 0] = mm512_rol_64( M[ 0], 1 );
mj[ 1] = mm512_rol_64( M[ 1], 2 ); mj[ 1] = mm512_rol_64( M[ 1], 2 );
mj[ 2] = mm512_rol_64( M[ 2], 3 ); mj[ 2] = mm512_rol_64( M[ 2], 3 );
@@ -1136,8 +1135,11 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
mj[14] = mm512_rol_64( M[14], 15 ); mj[14] = mm512_rol_64( M[14], 15 );
mj[15] = mm512_rol_64( M[15], 16 ); mj[15] = mm512_rol_64( M[15], 16 );
__m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL ); __m512i K = _mm512_set1_epi64( 0x5555555555555550ULL );
const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL ); static const __m512i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL };
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K ); qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm512_add_epi64( K, Kincr ); K = _mm512_add_epi64( K, Kincr );

View File

@@ -503,32 +503,28 @@ do { \
SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \ SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \ SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
s4 = mm512_swap64_32( s4 ); \ s4 = mm512_swap64_32( s4 ); \
s5 = mm512_swap64_32( s5 ); \ t0 = _mm512_mask_shuffle_epi32( s4, 0xaaaa, s5, 0xb1 ); \
sD = mm512_swap64_32( sD ); \ sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \ t1 = _mm512_mask_shuffle_epi32( sD, 0xaaaa, sE, 0xb1 ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
L8( s0, t0, s9, t1 ); \ L8( s0, t0, s9, t1 ); \
s6 = mm512_swap64_32( s6 ); \ s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \ sF = mm512_swap64_32( sF ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \ t2 = _mm512_mask_shuffle_epi32( s6, 0x5555, s5, 0xb1 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \ t3 = _mm512_mask_shuffle_epi32( sF, 0x5555, sE, 0xb1 ); \
L8( s1, t2, sA, t3 ); \ L8( s1, t2, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \ s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \ sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
\ \
s7 = mm512_swap64_32( s7 ); \ t4 = _mm512_mask_shuffle_epi32( s6, 0xaaaa, s7, 0xb1 ); \
sC = mm512_swap64_32( sC ); \ t5 = _mm512_mask_shuffle_epi32( sF, 0xaaaa, sC, 0xb1 ); \
t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
L8( s2, t4, sB, t5 ); \ L8( s2, t4, sB, t5 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \ s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \ sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
s6 = mm512_swap64_32( s6 ); \ s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \ sF = mm512_swap64_32( sF ); \
\ \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \ t2 = _mm512_mask_shuffle_epi32( s4, 0x5555, s7, 0xb1 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \ t3 = _mm512_mask_shuffle_epi32( sD, 0x5555, sC, 0xb1 ); \
L8( s3, t2, s8, t3 ); \ L8( s3, t2, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \ s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \ s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
@@ -537,21 +533,20 @@ do { \
s7 = mm512_swap64_32( s7 ); \ s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \ sC = mm512_swap64_32( sC ); \
\ \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \ t0 = _mm512_mask_shuffle_epi32( s0, 0xaaaa, s8, 0xb1 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \ t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \ t2 = _mm512_mask_shuffle_epi32( sA, 0x5555, s2, 0xb1 ); \
t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \ t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
t3 = mm512_swap64_32( t3 ); \ t3 = mm512_swap64_32( t3 ); \
L8( t0, t1, t2, t3 ); \ L8( t0, t1, t2, t3 ); \
t3 = mm512_swap64_32( t3 ); \
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \ s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \ s8 = _mm512_mask_shuffle_epi32( s8, 0x5555, t0, 0xb1 ); \
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \ s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \ s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \ s2 = _mm512_mask_shuffle_epi32( s2, 0xaaaa, t2, 0xb1 ); \
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \ sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \ s3 = _mm512_mask_shuffle_epi32( s3, 0xaaaa, t3, 0xb1 ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \ sB = _mm512_mask_shuffle_epi32( sB, 0x5555, t3, 0xb1 ); \
\ \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \ t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \ t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
@@ -1268,7 +1263,7 @@ do { \
} while (0) } while (0)
#endif #endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions // v3, 15 instructions
#define SBOX( a, b, c, d ) \ #define SBOX( a, b, c, d ) \
{ \ { \
__m256i tb, td; \ __m256i tb, td; \
@@ -1286,7 +1281,7 @@ do { \
#endif #endif
/* /*
/ v2, 16 instructions, 10 TL equivalent instructions / v2, 16 instructions
#define SBOX( a, b, c, d ) \ #define SBOX( a, b, c, d ) \
{ \ { \
__m256i t = mm256_xorand( d, a, c ); \ __m256i t = mm256_xorand( d, a, c ); \

View File

@@ -80,14 +80,14 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
__m512i t = a0; \ __m512i t = a0; \
a0 = mm512_xoror( a3, a0, a1 ); \ a0 = mm512_xoror( a3, a0, a1 ); \
a2 = _mm512_xor_si512( a2, a3 ); \ a2 = _mm512_xor_si512( a2, a3 ); \
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \ a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
a3 = mm512_xorand( a2, a3, t ); \ a3 = mm512_xorand( a2, a3, t ); \
a2 = mm512_xorand( a1, a2, a0); \ a2 = mm512_xorand( a1, a2, a0); \
a1 = _mm512_or_si512( a1, a3 ); \ a1 = _mm512_or_si512( a1, a3 ); \
a3 = _mm512_xor_si512( a3, a2 ); \ a3 = _mm512_xor_si512( a3, a2 ); \
t = _mm512_xor_si512( t, a1 ); \ t = _mm512_xor_si512( t, a1 ); \
a2 = _mm512_and_si512( a2, a1 ); \ a2 = _mm512_and_si512( a2, a1 ); \
a1 = mm512_xnor( a1, a0 ); \ a1 = mm512_nxor( a1, a0 ); \
a0 = t; \ a0 = t; \
} }
@@ -527,14 +527,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
__m256i t = a0; \ __m256i t = a0; \
a0 = mm256_xoror( a3, a0, a1 ); \ a0 = mm256_xoror( a3, a0, a1 ); \
a2 = _mm256_xor_si256( a2, a3 ); \ a2 = _mm256_xor_si256( a2, a3 ); \
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \ a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
a3 = mm256_xorand( a2, a3, t ); \ a3 = mm256_xorand( a2, a3, t ); \
a2 = mm256_xorand( a1, a2, a0); \ a2 = mm256_xorand( a1, a2, a0); \
a1 = _mm256_or_si256( a1, a3 ); \ a1 = _mm256_or_si256( a1, a3 ); \
a3 = _mm256_xor_si256( a3, a2 ); \ a3 = _mm256_xor_si256( a3, a2 ); \
t = _mm256_xor_si256( t, a1 ); \ t = _mm256_xor_si256( t, a1 ); \
a2 = _mm256_and_si256( a2, a1 ); \ a2 = _mm256_and_si256( a2, a1 ); \
a1 = mm256_xnor( a1, a0 ); \ a1 = mm256_nxor( a1, a0 ); \
a0 = t; \ a0 = t; \
} }

View File

@@ -69,18 +69,18 @@
v128_t t = a0; \ v128_t t = a0; \
a0 = v128_xoror( a3, a0, a1 ); \ a0 = v128_xoror( a3, a0, a1 ); \
a2 = v128_xor( a2, a3 ); \ a2 = v128_xor( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \ a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* ~a1 ^ (a3 & t) */ \
a3 = v128_xorand( a2, a3, t ); \ a3 = v128_xorand( a2, a3, t ); \
a2 = v128_xorand( a1, a2, a0 ); \ a2 = v128_xorand( a1, a2, a0 ); \
a1 = v128_or( a1, a3 ); \ a1 = v128_or( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \ a3 = v128_xor( a3, a2 ); \
t = v128_xor( t, a1 ); \ t = v128_xor( t, a1 ); \
a2 = v128_and( a2, a1 ); \ a2 = v128_and( a2, a1 ); \
a1 = v128_xnor( a1, a0 ); \ a1 = v128_nxor( a1, a0 ); \
a0 = t; \ a0 = t; \
} }
#else #elif defined(__ARM_NEON) || defined(__SSE2__)
#define SUBCRUMB( a0, a1, a2, a3 ) \ #define SUBCRUMB( a0, a1, a2, a3 ) \
{ \ { \

View File

@@ -441,57 +441,6 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] ); \ W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] ); \
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] ); W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
#if defined(VL256)
// AVX512 or AVX10-256
#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )
#define MAJx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 )
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i) ] ), \
W[ i ] ); \
__m256i T1 = BSG2_1x( E ); \
__m256i T2 = BSG2_0x( A ); \
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
T1 = _mm256_add_epi32( T1, H ); \
T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \
T1 = _mm256_add_epi32( T1, T0 ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
} while (0)
#define SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, j ); \
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, j ); \
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, j ); \
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, j ); \
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, j ); \
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, j ); \
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, j ); \
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, j ); \
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, j ); \
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, j ); \
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 10, j ); \
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 11, j ); \
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 12, j ); \
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 13, j ); \
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, j ); \
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, j );
// Not used with AVX512, needed to satisfy the compiler
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
v256_32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
}
#else // AVX2
#define CHx(X, Y, Z) \ #define CHx(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
@@ -503,61 +452,58 @@ do { \
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \ #define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \ { \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \ H = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
v256_32( K256[(i)+(j)] ) ); \ v256_32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ __m256i T = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
Y_xor_Z = X_xor_Y; \ Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi32( D, T1 ); \ D = _mm256_add_epi32( D, H ); \
H = _mm256_add_epi32( T1, T2 ); \ H = _mm256_add_epi32( H, T ); \
} }
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \ #define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \ { \
__m256i T0 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \ __m256i T1 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \
__m256i T1 = BSG2_1x( E ); \ H = _mm256_add_epi32( H, BSG2_1x( E ) ); \
__m256i T2 = BSG2_0x( A ); \ __m256i T2 = BSG2_0x( A ); \
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ T1 = _mm256_add_epi32( T1, CHx( E, F, G ) ); \
T1 = _mm256_add_epi32( T1, H ); \
T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \ T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \
T1 = _mm256_add_epi32( T1, T0 ); \ H = _mm256_add_epi32( H, T1 ); \
Y_xor_Z = X_xor_Y; \ Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi32( D, T1 ); \ D = _mm256_add_epi32( D, H ); \
H = _mm256_add_epi32( T1, T2 ); \ H = _mm256_add_epi32( H, T2 ); \
} while (0) }
// read Y_xor_Z, update X_xor_Y // read Y_xor_Z, update X_xor_Y
#define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \ #define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) ) Y_xor_Z ) )
// start with toc initialized to y^z: toc = B ^ C // start with toc initialized to y^z, toc = B ^ C for first ound.
// First round reads toc as Y_xor_Z and saves X_xor_Y as tic. // First round reads toc as Y_xor_Z and saves X_xor_Y as tic.
// Second round reads tic as Y_xor_Z and saves X_xor_Y as toc. // Second round reads tic as Y_xor_Z and saves X_xor_Y as toc.
#define SHA256_8WAY_2ROUNDS( A, B, C, D, E, F, G, H, i0, i1, j ) \ #define SHA256_8WAY_2ROUNDS( A, B, C, D, E, F, G, H, i0, i1, j ) \
do { \ { \
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \ __m256i T1 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \
W[ i0 ] ); \ W[ i0 ] ); \
__m256i T1 = BSG2_1x( E ); \ H = _mm256_add_epi32( H, BSG2_1x( E ) ); \
__m256i T2 = BSG2_0x( A ); \ __m256i T2 = BSG2_0x( A ); \
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ T1 = _mm256_add_epi32( T1, CHx( E, F, G ) ); \
T1 = _mm256_add_epi32( T1, H ); \
T2 = _mm256_add_epi32( T2, MAJ_2step( A, B, C, tic, toc ) ); \ T2 = _mm256_add_epi32( T2, MAJ_2step( A, B, C, tic, toc ) ); \
T1 = _mm256_add_epi32( T1, T0 ); \ H = _mm256_add_epi32( H, T1 ); \
D = _mm256_add_epi32( D, T1 ); \ D = _mm256_add_epi32( D, H ); \
H = _mm256_add_epi32( T1, T2 ); \ H = _mm256_add_epi32( H, T2 ); \
\ \
T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \ T1 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \
W[ (i1) ] ); \ W[ (i1) ] ); \
T1 = BSG2_1x( D ); \ G = _mm256_add_epi32( G, BSG2_1x( D ) ); \
T2 = BSG2_0x( H ); \ T2 = BSG2_0x( H ); \
T0 = _mm256_add_epi32( T0, CHx( D, E, F ) ); \ T1 = _mm256_add_epi32( T1, CHx( D, E, F ) ); \
T1 = _mm256_add_epi32( T1, G ); \
T2 = _mm256_add_epi32( T2, MAJ_2step( H, A, B, toc, tic ) ); \ T2 = _mm256_add_epi32( T2, MAJ_2step( H, A, B, toc, tic ) ); \
T1 = _mm256_add_epi32( T1, T0 ); \ G = _mm256_add_epi32( G, T1 ); \
C = _mm256_add_epi32( C, T1 ); \ C = _mm256_add_epi32( C, G ); \
G = _mm256_add_epi32( T1, T2 ); \ G = _mm256_add_epi32( G, T2 ); \
} while (0) }
#define SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ #define SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
{ \ { \
@@ -572,8 +518,6 @@ do { \
SHA256_8WAY_2ROUNDS( C, D, E, F, G, H, A, B, 14, 15, j ); \ SHA256_8WAY_2ROUNDS( C, D, E, F, G, H, A, B, 14, 15, j ); \
} }
#endif // AVX512VL else AVX2
static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W, static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W,
const __m256i *in ) \ const __m256i *in ) \
{ {
@@ -650,9 +594,7 @@ void sha256_8x32_prehash_3rounds( __m256i *state_mid, __m256i *X,
G = _mm256_load_si256( state_in + 6 ); G = _mm256_load_si256( state_in + 6 );
H = _mm256_load_si256( state_in + 7 ); H = _mm256_load_si256( state_in + 7 );
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 ); SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 ); SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
@@ -692,9 +634,7 @@ void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_mid + 6 ); G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 ); H = _mm256_load_si256( state_mid + 7 );
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G ); __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
#endif
// round 3 part 2, add nonces // round 3 part 2, add nonces
A = _mm256_add_epi32( A, W[3] ); A = _mm256_add_epi32( A, W[3] );
@@ -779,10 +719,10 @@ void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data, int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target ) const __m256i *state_in, const uint32_t *target )
{ {
__m256i A, B, C, D, E, F, G, H, T0, T1, T2; __m256i A, B, C, D, E, F, G, H, G57, H56;
__m256i vmask, targ, hash; __m256i vmask, targ, hash;
__m256i W[16]; memcpy_256( W, data, 16 ); __m256i W[16]; memcpy_256( W, data, 16 );
uint8_t flip, t6_mask; uint8_t flip, t6_mask, t7_mask;
A = _mm256_load_si256( state_in ); A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in+1 ); B = _mm256_load_si256( state_in+1 );
@@ -793,12 +733,10 @@ int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_in+6 ); G = _mm256_load_si256( state_in+6 );
H = _mm256_load_si256( state_in+7 ); H = _mm256_load_si256( state_in+7 );
const __m256i IV7 = H; const __m256i istate6 = G;
const __m256i IV6 = G; const __m256i istate7 = H;
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
// rounds 0 to 16, ignore zero padding W[9..14] // rounds 0 to 16, ignore zero padding W[9..14]
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 ); SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
@@ -841,11 +779,9 @@ int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
W[11] = SHA256_8WAY_MEXP( W[ 9], W[ 4], W[12], W[11] ); W[11] = SHA256_8WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] ); W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );
#if !defined(VL256)
Y_xor_Z = _mm256_xor_si256( B, C ); Y_xor_Z = _mm256_xor_si256( B, C );
#endif
// rounds 48 to 57 // Rounds 48 to 55
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 ); SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 ); SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 ); SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
@@ -854,77 +790,83 @@ int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 48 ); SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 48 ); SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 48 ); SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 48 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
// round 58 to 60 part 1 // Round 56
T0 = _mm256_add_epi32( v256_32( K256[58] ), H = _mm256_add_epi32( v256_32( K256[56] ),
mm256_add4_32( BSG2_1x( E ), CHx( E, F, G ), W[ 8], H ) );
D = _mm256_add_epi32( D, H );
H56 = _mm256_add_epi32( H, _mm256_add_epi32( BSG2_0x( A ),
MAJx( A, B, C ) ) );
Y_xor_Z = X_xor_Y;
// Rounds 57 to 60 part 1
G = _mm256_add_epi32( v256_32( K256[57] ),
mm256_add4_32( BSG2_1x( D ), CHx( D, E, F ), W[ 9], G ) );
C = _mm256_add_epi32( C, G );
G57 = _mm256_add_epi32( G, MAJx( H56, A, B ) );
F = _mm256_add_epi32( v256_32( K256[58] ),
mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), W[10], F ) ); mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), W[10], F ) );
B = _mm256_add_epi32( B, T0 ); B = _mm256_add_epi32( B, F );
T1 = _mm256_add_epi32( v256_32( K256[59] ), E = _mm256_add_epi32( v256_32( K256[59] ),
mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), W[11], E ) ); mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), W[11], E ) );
A = _mm256_add_epi32( A, T1 ); A = _mm256_add_epi32( A, E );
T2 = _mm256_add_epi32( v256_32( K256[60] ), D = _mm256_add_epi32( v256_32( K256[60] ),
mm256_add4_32( BSG2_1x( A ), CHx( A, B, C ), W[12], D ) ); mm256_add4_32( BSG2_1x( A ), CHx( A, B, C ), W[12], D ) );
H = _mm256_add_epi32( H, T2 ); H = _mm256_add_epi32( H56, D );
// Got H, test it. // Got H, test it.
hash = mm256_bswap_32( _mm256_add_epi32( H, istate7 ) );
targ = v256_32( target[7] ); targ = v256_32( target[7] );
hash = mm256_bswap_32( _mm256_add_epi32( H, IV7 ) ); // A simple unsigned LE test is complicated by the lack of a cmple
if ( target[7] ) // instruction, and lack of unsigned compares in AVX2.
{
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash ); flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0xff == ( flip ^ if ( likely( 0xff == ( t7_mask = ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) )) mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) )))
return 0; return 0;
}
t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) ); t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) );
// round 58 part 2 // Round 57 part 2
F = _mm256_add_epi32( T0, _mm256_add_epi32( BSG2_0x( G ), G57 = _mm256_add_epi32( G57, BSG2_0x( H56 ) );
MAJx( G, H, A ) ) ); Y_xor_Z = X_xor_Y;
// round 61 part 1
W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm256_add_epi32( v256_32( K256[61] ),
mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) );
G = _mm256_add_epi32( G, T0 );
if ( t6_mask ) // Round 61 part 1
W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] );
C = _mm256_add_epi32( v256_32( K256[61] ),
mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) );
G = _mm256_add_epi32( G57, C );
if ( t6_mask == (0xff & ~t7_mask ) )
{ {
// Testing H was inconclusive: hash7 == target7, need to test G // Testing H was inconclusive: hash7 == target7, need to test G
targ = _mm256_and_si256( vmask, v256_32( target[6] ) ); targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
hash = mm256_bswap_32( _mm256_add_epi32( G, IV6 ) ); hash = mm256_bswap_32( _mm256_add_epi32( G, istate6 ) );
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
_mm256_cmpeq_epi32( hash, targ ) ) ) ))
{
flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash ); flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0 != ( t6_mask & ( flip ^ if ( likely( 0 != ( t6_mask & ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ) )) mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ) ))
return 0; return 0;
if ( likely( ( target[6] == 0x80000000 )
&& ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32(
hash, _mm256_xor_si256( hash, hash ) ) ) ) ) ))
return 0;
}
// else inconclusive, testing targ5 isn't practical, fininsh hashing
} }
// At this point either the hash will be good or the test was inconclusive. // Rounds 58 to 61 part 2
// If the latter it's probably a high target difficulty with a nearly equal F = _mm256_add_epi32( F, _mm256_add_epi32( BSG2_0x( G57 ),
// high difficulty hash that has a good chance of being good. MAJx( G57, H, A ) ) );
Y_xor_Z = X_xor_Y;
// rounds 59 to 61 part 2 E = _mm256_add_epi32( E, _mm256_add_epi32( BSG2_0x( F ),
E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x( F ), MAJx( F, G57, H ) ) );
MAJx( F, G, H ) ) ); Y_xor_Z = X_xor_Y;
D = _mm256_add_epi32( T2, _mm256_add_epi32( BSG2_0x( E ),
MAJx( E, F, G ) ) ); D = _mm256_add_epi32( D, _mm256_add_epi32( BSG2_0x( E ),
C = _mm256_add_epi32( T0, _mm256_add_epi32( BSG2_0x( D ), MAJx( E, F, G57 ) ) );
Y_xor_Z = X_xor_Y;
C = _mm256_add_epi32( C, _mm256_add_epi32( BSG2_0x( D ),
MAJx( D, E, F ) ) ); MAJx( D, E, F ) ) );
Y_xor_Z = X_xor_Y;
// rounds 62 & 63 // Rounds 62 & 63
W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] ); W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] ); W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
@@ -1077,40 +1019,26 @@ void sha256_8x32_full( void *dst, const void *data, size_t len )
W[15] = SHA256_16WAY_MEXP( W[13], W[ 8], W[ 0], W[15] ); W[15] = SHA256_16WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
#define SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \ #define SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \ { \
__m512i T0 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \ __m512i T1 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \
__m512i T1 = BSG2_1x16( E ); \ H = _mm512_add_epi32( H, BSG2_1x16( E ) ); \
__m512i T2 = BSG2_0x16( A ); \ __m512i T2 = BSG2_0x16( A ); \
T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \ T1 = _mm512_add_epi32( T1, CHx16( E, F, G ) ); \
T1 = _mm512_add_epi32( T1, H ); \
T2 = _mm512_add_epi32( T2, MAJx16( A, B, C ) ); \ T2 = _mm512_add_epi32( T2, MAJx16( A, B, C ) ); \
T1 = _mm512_add_epi32( T1, T0 ); \ H = _mm512_add_epi32( H, T1 ); \
D = _mm512_add_epi32( D, T1 ); \ D = _mm512_add_epi32( D, H ); \
H = _mm512_add_epi32( T1, T2 ); \ H = _mm512_add_epi32( H, T2 ); \
} while (0) }
#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \ #define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \ { \
__m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \ H = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
v512_32( K256[(i)+(j)] ) ); \ v512_32( K256[(i)+(j)] ) ); \
__m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \ __m512i T = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, T1 ); \ D = _mm512_add_epi32( D, H ); \
H = _mm512_add_epi32( T1, T2 ); \ H = _mm512_add_epi32( H, T ); \
} }
/*
#define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \
do { \
__m512i T1, T2; \
__m512i K = v512_32( K256[( (j)+(i) )] ); \
T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \
K, W[i] ) ); \
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
} while (0)
*/
#define SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ #define SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, j ); \ SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, j ); \
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, j ); \ SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, j ); \
@@ -1332,10 +1260,9 @@ void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data,
int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data, int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target ) const __m512i *state_in, const uint32_t *target )
{ {
__m512i A, B, C, D, E, F, G, H, hash, targ; __m512i A, B, C, D, E, F, G, H, hash, targ, G57, H56;
__m512i T0, T1, T2;
__m512i W[16]; memcpy_512( W, data, 16 ); __m512i W[16]; memcpy_512( W, data, 16 );
__mmask16 t6_mask; __mmask16 mask;
A = _mm512_load_si512( state_in ); A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in+1 ); B = _mm512_load_si512( state_in+1 );
@@ -1346,8 +1273,8 @@ int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
G = _mm512_load_si512( state_in+6 ); G = _mm512_load_si512( state_in+6 );
H = _mm512_load_si512( state_in+7 ); H = _mm512_load_si512( state_in+7 );
const __m512i IV6 = G; const __m512i istate6 = G;
const __m512i IV7 = H; const __m512i istate7 = H;
// rounds 0 to 8 // rounds 0 to 8
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 ); SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
@@ -1419,7 +1346,7 @@ int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
W[11] = SHA256_16WAY_MEXP( W[ 9], W[ 4], W[12], W[11] ); W[11] = SHA256_16WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_16WAY_MEXP( W[10], W[ 5], W[13], W[12] ); W[12] = SHA256_16WAY_MEXP( W[10], W[ 5], W[13], W[12] );
// Rounds 48 to 57 // Rounds 48 to 55
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 ); SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 ); SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
SHA256_16WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 ); SHA256_16WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
@@ -1428,62 +1355,67 @@ int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
SHA256_16WAY_ROUND( D, E, F, G, H, A, B, C, 5, 48 ); SHA256_16WAY_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
SHA256_16WAY_ROUND( C, D, E, F, G, H, A, B, 6, 48 ); SHA256_16WAY_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 7, 48 ); SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 8, 48 );
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
// rounds 58 to 60 part 1 // Round 56
T0 = _mm512_add_epi32( v512_32( K256[58] ), H = _mm512_add_epi32( v512_32( K256[56] ),
mm512_add4_32( BSG2_1x16( E ), CHx16( E, F, G ), W[ 8], H ) );
D = _mm512_add_epi32( D, H );
H56 = _mm512_add_epi32( H, _mm512_add_epi32( BSG2_0x16( A ),
MAJx16( A, B, C ) ) );
// Rounds 57 to 60 part 1
G = _mm512_add_epi32( v512_32( K256[57] ),
mm512_add4_32( BSG2_1x16( D ), CHx16( D, E, F ), W[ 9], G ) );
C = _mm512_add_epi32( C, G );
G57 = _mm512_add_epi32( G, MAJx16( H56, A, B ) );
F = _mm512_add_epi32( v512_32( K256[58] ),
mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) ); mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) );
B = _mm512_add_epi32( B, T0 ); B = _mm512_add_epi32( B, F );
T1 = _mm512_add_epi32( v512_32( K256[59] ), E = _mm512_add_epi32( v512_32( K256[59] ),
mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) ); mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) );
A = _mm512_add_epi32( A, T1 ); A = _mm512_add_epi32( A, E );
T2 = _mm512_add_epi32( v512_32( K256[60] ), D = _mm512_add_epi32( v512_32( K256[60] ),
mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ) ); mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ) );
H = _mm512_add_epi32( H, T2 ); H = _mm512_add_epi32( H56, D );
// got H, test it against target[7] // got final H, test it against target[7]
hash = mm512_bswap_32( _mm512_add_epi32( H , IV7 ) ); hash = mm512_bswap_32( _mm512_add_epi32( H , istate7 ) );
targ = v512_32( target[7] ); targ = v512_32( target[7] );
if ( target[7] ) if ( likely( 0 == ( mask = _mm512_cmple_epu32_mask( hash, targ ) ) ))
if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) ))
return 0; return 0;
t6_mask = _mm512_cmpeq_epi32_mask( hash, targ );
// round 58 part 2 // Round 57 part 2
F = _mm512_add_epi32( T0, _mm512_add_epi32( BSG2_0x16( G ), G57 = _mm512_add_epi32( G57, BSG2_0x16( H56 ) );
MAJx16( G, H, A ) ) );
// round 61 part 1 // Round 61 part 1
W[13] = SHA256_16WAY_MEXP( W[11], W[ 6], W[14], W[13] ); W[13] = SHA256_16WAY_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm512_add_epi32( v512_32( K256[61] ), C = _mm512_add_epi32( v512_32( K256[61] ),
mm512_add4_32( BSG2_1x16( H ), CHx16( H, A, B ), W[13], C ) ); mm512_add4_32( BSG2_1x16( H ), CHx16( H, A, B ), W[13], C ) );
G = _mm512_add_epi32( G, T0 ); G = _mm512_add_epi32( G57, C );
// got G, test it against target[6] if indicated // got final G, test it against target[6] if indicated.
if ( (uint16_t)t6_mask ) if ( mask == _mm512_cmpeq_epi32_mask( hash, targ ) )
{ {
hash = mm512_bswap_32( _mm512_add_epi32( G, IV6 ) ); hash = mm512_bswap_32( _mm512_add_epi32( G, istate6 ) );
targ = v512_32( target[6] ); targ = v512_32( target[6] );
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) )) if ( likely( 0 == _mm512_mask_cmple_epu32_mask( mask, hash, targ ) ))
return 0; return 0;
} }
// round 59 part 2 // Round 58 to 61 part 2
E = _mm512_add_epi32( T1, _mm512_add_epi32( BSG2_0x16( F ), F = _mm512_add_epi32( F, _mm512_add_epi32( BSG2_0x16( G57 ),
MAJx16( F, G, H ) ) ); MAJx16( G57, H, A ) ) );
E = _mm512_add_epi32( E, _mm512_add_epi32( BSG2_0x16( F ),
// round 60 part 2 MAJx16( F, G57, H ) ) );
D = _mm512_add_epi32( T2, _mm512_add_epi32( BSG2_0x16( E ), D = _mm512_add_epi32( D, _mm512_add_epi32( BSG2_0x16( E ),
MAJx16( E, F, G ) ) ); MAJx16( E, F, G57 ) ) );
C = _mm512_add_epi32( C, _mm512_add_epi32( BSG2_0x16( D ),
// round 61 part 2
C = _mm512_add_epi32( T0, _mm512_add_epi32( BSG2_0x16( D ),
MAJx16( D, E, F ) ) ); MAJx16( D, E, F ) ) );
// rounds 62, 63 // Rounds 62, 63
W[14] = SHA256_16WAY_MEXP( W[12], W[ 7], W[15], W[14] ); W[14] = SHA256_16WAY_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA256_16WAY_MEXP( W[13], W[ 8], W[ 0], W[15] ); W[15] = SHA256_16WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );

View File

@@ -67,7 +67,7 @@ static const uint64_t K512[80] =
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
}; };
#if defined(__AVX2__) && defined(__SHA512__) #if defined(__AVX__) && defined(__SHA512__)
// SHA-512 implemented using SHA512 CPU extension. // SHA-512 implemented using SHA512 CPU extension.
@@ -783,29 +783,6 @@ void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data,
mm256_ror_64( x, 61 ), \ mm256_ror_64( x, 61 ), \
_mm256_srli_epi64( x, 6 ) ) _mm256_srli_epi64( x, 6 ) )
#if defined(VL256)
// 4 way is not used whith AVX512 but will be whith AVX10_256 when it
// becomes available.
#define CH( X, Y, Z ) _mm256_ternarylogic_epi64( X, Y, Z, 0xca )
#define MAJ( X, Y, Z ) _mm256_ternarylogic_epi64( X, Y, Z, 0xe8 )
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m256i T0 = _mm256_add_epi64( v256_64( K512[i] ), W[i] ); \
__m256i T1 = BSG5_1( E ); \
__m256i T2 = BSG5_0( A ); \
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
T1 = _mm256_add_epi64( T1, H ); \
T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \
T1 = _mm256_add_epi64( T1, T0 ); \
D = _mm256_add_epi64( D, T1 ); \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
#else // AVX2 only
#define CH(X, Y, Z) \ #define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
@@ -827,19 +804,12 @@ do { \
H = _mm256_add_epi64( T1, T2 ); \ H = _mm256_add_epi64( T1, T2 ); \
} while (0) } while (0)
#endif // AVX512VL AVX10_256
static void static void
sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] ) sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] )
{ {
int i; int i;
register __m256i A, B, C, D, E, F, G, H; register __m256i A, B, C, D, E, F, G, H;
#if !defined(VL256)
// Disable for AVX10_256
__m256i X_xor_Y, Y_xor_Z; __m256i X_xor_Y, Y_xor_Z;
#endif
__m256i W[80]; __m256i W[80];
mm256_block_bswap_64( W , in ); mm256_block_bswap_64( W , in );
@@ -872,10 +842,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] )
H = v256_64( 0x5BE0CD19137E2179 ); H = v256_64( 0x5BE0CD19137E2179 );
} }
#if !defined(VL256)
// Disable for AVX10_256
Y_xor_Z = _mm256_xor_si256( B, C ); Y_xor_Z = _mm256_xor_si256( B, C );
#endif
for ( i = 0; i < 80; i += 8 ) for ( i = 0; i < 80; i += 8 )
{ {

View File

@@ -5,7 +5,7 @@
#include "simd-utils.h" #include "simd-utils.h"
#include "sph_sha2.h" #include "sph_sha2.h"
#if defined(__SHA512__) && defined(__AVX2__) #if defined(__SHA512__) && defined(__AVX__)
// Experimental, untested // Experimental, untested
// Need to substitute for sph_sha512 // Need to substitute for sph_sha512

View File

@@ -305,7 +305,7 @@ do { \
xb0 = mm512_rol_32( xb0, 1 ); \ xb0 = mm512_rol_32( xb0, 1 ); \
xa0 = mm512_xor3( xm, xb1, \ xa0 = mm512_xor3( xm, xb1, \
mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \ mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, xb0 ); \ xb0 = mm512_nxor( xa0, xb0 ); \
} while (0) } while (0)
#define PERM_STEP_0_16 do { \ #define PERM_STEP_0_16 do { \
@@ -898,7 +898,7 @@ do { \
xb0 = mm256_rol_32( xb0, 1 ); \ xb0 = mm256_rol_32( xb0, 1 ); \
xa0 = mm256_xor3( xm, xb1, \ xa0 = mm256_xor3( xm, xb1, \
mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \ mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, xb0 ); \ xb0 = mm256_nxor( xa0, xb0 ); \
} while (0) } while (0)
#define PERM_STEP_0_8 do { \ #define PERM_STEP_0_8 do { \

View File

@@ -21,7 +21,7 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
__m512i *H = (__m512i*)ctx->h; __m512i *H = (__m512i*)ctx->h;
const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2, const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2,
ctx->count1, ctx->count0 ); ctx->count1, ctx->count0 );
int r; const __m512i zero = _mm512_setzero_si512();
P0 = H[0]; P0 = H[0];
P1 = H[1]; P1 = H[1];
@@ -37,182 +37,160 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
K6 = M[6]; K6 = M[6];
K7 = M[7]; K7 = M[7];
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero ); // round 0
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P0 = _mm512_xor_si512( P0, X ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P2 );
P2 = _mm512_xor_si512( P2, X ); for ( int r = 0; r < 3; r ++ )
// round
for ( r = 0; r < 3; r ++ )
{ {
// round 1, 5, 9 // round 1, 5, 9
K0 = _mm512_xor_si512( K7, mm512_shuflr128_32( K0 = _mm512_xor_si512( K7, mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ) ); _mm512_aesenc_epi128( K0, zero ) ) );
if ( r == 0 ) if ( r == 0 )
K0 = _mm512_xor_si512( K0, K0 = _mm512_xor_si512( K0,
_mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) ); _mm512_mask_ternarylogic_epi32( count, 0x8888, count, count, 1 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), zero );
K1 = _mm512_xor_si512( K0, K1 = _mm512_xor_si512( K0,
mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); mm512_shuflr128_32( _mm512_aesenc_epi128( K1, zero ) ) );
if ( r == 1 ) if ( r == 1 )
K1 = _mm512_xor_si512( K1, mm512_shuflr128_32( K1 = _mm512_xor_si512( K1, mm512_shuflr128_32(
_mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) ); _mm512_mask_ternarylogic_epi32( count, 0x1111, count, count, 1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( K1, K2 = _mm512_xor_si512( K1,
mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) ); mm512_shuflr128_32( _mm512_aesenc_epi128( K2, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( K2, K3 = _mm512_xor_si512( K2,
mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) ); mm512_shuflr128_32( _mm512_aesenc_epi128( K3, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P3 );
P3 = _mm512_xor_si512( P3, X );
K4 = _mm512_xor_si512( K3, K4 = _mm512_xor_si512( K3,
mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) ); mm512_shuflr128_32( _mm512_aesenc_epi128( K4, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), zero );
K5 = _mm512_xor_si512( K4, K5 = _mm512_xor_si512( K4,
mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) ); mm512_shuflr128_32( _mm512_aesenc_epi128( K5, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( K5, K6 = _mm512_xor_si512( K5,
mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) ); mm512_shuflr128_32( _mm512_aesenc_epi128( K6, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( K6, K7 = _mm512_xor_si512( K6,
mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); mm512_shuflr128_32( _mm512_aesenc_epi128( K7, zero ) ) );
if ( r == 2 ) if ( r == 2 )
K7 = _mm512_xor_si512( K7, mm512_swap128_64( K7 = _mm512_xor_si512( K7, mm512_swap128_64(
_mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) ); _mm512_mask_ternarylogic_epi32( count, 0x2222, count, count, 1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P1 );
P1 = _mm512_xor_si512( P1, X );
// round 2, 6, 10 // round 2, 6, 10
K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) ); K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), zero );
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) ); K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) ); K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) ); K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P2 );
P2 = _mm512_xor_si512( P2, X );
K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) ); K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), zero );
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) ); K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) ); K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) ); K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P0 );
P0 = _mm512_xor_si512( P0, X );
// round 3, 7, 11 // round 3, 7, 11
K0 = _mm512_xor_si512( mm512_shuflr128_32( K0 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); _mm512_aesenc_epi128( K0, zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), zero );
K1 = _mm512_xor_si512( mm512_shuflr128_32( K1 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); _mm512_aesenc_epi128( K1, zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( mm512_shuflr128_32( K2 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); _mm512_aesenc_epi128( K2, zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( mm512_shuflr128_32( K3 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); _mm512_aesenc_epi128( K3, zero ) ), K2 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P1 );
P1 = _mm512_xor_si512( P1, X );
K4 = _mm512_xor_si512( mm512_shuflr128_32( K4 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); _mm512_aesenc_epi128( K4, zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), zero );
K5 = _mm512_xor_si512( mm512_shuflr128_32( K5 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); _mm512_aesenc_epi128( K5, zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( mm512_shuflr128_32( K6 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K6, m512_zero ) ), K5 ); _mm512_aesenc_epi128( K6, zero ) ), K5 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( mm512_shuflr128_32( K7 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); _mm512_aesenc_epi128( K7, zero ) ), K6 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P3 );
P3 = _mm512_xor_si512( P3, X );
// round 4, 8, 12 // round 4, 8, 12
K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) ); K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), zero );
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) ); K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) ); K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) ); K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P0 );
P0 = _mm512_xor_si512( P0, X );
K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) ); K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), zero );
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) ); K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) ); K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) ); K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P2 );
P2 = _mm512_xor_si512( P2, X );
} }
// round 13 // round 13
K0 = _mm512_xor_si512( mm512_shuflr128_32( K0 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); _mm512_aesenc_epi128( K0, zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), zero );
K1 = _mm512_xor_si512( mm512_shuflr128_32( K1 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); _mm512_aesenc_epi128( K1, zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( mm512_shuflr128_32( K2 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); _mm512_aesenc_epi128( K2, zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( mm512_shuflr128_32( K3 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); _mm512_aesenc_epi128( K3, zero ) ), K2 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P3 );
P3 = _mm512_xor_si512( P3, X );
K4 = _mm512_xor_si512( mm512_shuflr128_32( K4 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); _mm512_aesenc_epi128( K4, zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), zero );
K5 = _mm512_xor_si512( mm512_shuflr128_32( K5 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); _mm512_aesenc_epi128( K5, zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ); K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, zero ) );
K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, mm512_swap64_32( K6 = mm512_xor3( K6, K5, mm512_swap64_32(
_mm512_mask_xor_epi32( count, 0x4444, count, m512_neg1 ) ) ) ); _mm512_mask_ternarylogic_epi32( count, 0x4444, count, count, 1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7= _mm512_xor_si512( mm512_shuflr128_32( K7= _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); _mm512_aesenc_epi128( K7, zero ) ), K6 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P1 );
P1 = _mm512_xor_si512( P1, X );
H[0] = _mm512_xor_si512( H[0], P2 ); H[0] = _mm512_xor_si512( H[0], P2 );
H[1] = _mm512_xor_si512( H[1], P3 ); H[1] = _mm512_xor_si512( H[1], P3 );

View File

@@ -1,159 +0,0 @@
#include "miner.h"
#include "algo-gate-api.h"
#include <string.h>
#include <stdint.h>
#include "sph_shavite.h"
extern void inkhash(void *state, const void *input)
{
sph_shavite512_context ctx_shavite;
uint32_t hash[16];
sph_shavite512_init(&ctx_shavite);
sph_shavite512 (&ctx_shavite, (const void*) input, 80);
sph_shavite512_close(&ctx_shavite, (void*) hash);
sph_shavite512_init(&ctx_shavite);
sph_shavite512(&ctx_shavite, (const void*) hash, 64);
sph_shavite512_close(&ctx_shavite, (void*) hash);
memcpy(state, hash, 32);
/*
int ii;
printf("result: ");
for (ii=0; ii < 32; ii++)
{
printf ("%.2x",((uint8_t*)state)[ii]);
};
printf ("\n");
*/
}
int scanhash_ink( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
//const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(32) hash64[8];
uint32_t endiandata[32];
//char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"};
//we need bigendian data...
//lessons learned: do NOT endianchange directly in pdata, this will all proof-of-works be considered as stale from minerd....
int kk=0;
for (; kk < 32; kk++)
{
be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
};
// if (opt_debug)
// {
// applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
// }
/* I'm to lazy to put the loop in an inline function... so dirty copy'n'paste.... */
/* i know that i could set a variable, but i don't know how the compiler will optimize it, not that then the cpu needs to load the value *everytime* in a register */
if (ptarget[7]==0) {
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (((hash64[7]&0xFFFFFFFF)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}
else if (ptarget[7]<=0xF)
{
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (((hash64[7]&0xFFFFFFF0)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}
else if (ptarget[7]<=0xFF)
{
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (((hash64[7]&0xFFFFFF00)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}
else if (ptarget[7]<=0xFFF)
{
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (((hash64[7]&0xFFFFF000)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}
else if (ptarget[7]<=0xFFFF)
{
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (((hash64[7]&0xFFFF0000)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}
else
{
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
inkhash(hash64, endiandata);
if (fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
bool register_shavite_algo( algo_gate_t* gate )
{
algo_not_implemented();
return false;
// gate->scanhash = (void*)&scanhash_ink;
// gate->hash = (void*)&inkhash;
// return true;
};

View File

@@ -50,7 +50,8 @@ extern "C"{
#pragma warning (disable: 4146) #pragma warning (disable: 4146)
#endif #endif
static const sph_u32 IV512[] = { static const sph_u32 IV512[] =
{
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC, 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
@@ -71,38 +72,26 @@ c512( sph_shavite_big_context *sc, const void *msg )
p2 = h[2]; p2 = h[2];
p3 = h[3]; p3 = h[3];
// round
k00 = m[0]; k00 = m[0];
x = v128_xor( p1, k00 );
x = v128_aesenc_nokey( x );
k01 = m[1]; k01 = m[1];
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = m[2]; k02 = m[2];
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
k03 = m[3]; k03 = m[3];
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
p0 = v128_xor( p0, x );
k10 = m[4]; k10 = m[4];
x = v128_xor( p3, k10 );
x = v128_aesenc_nokey( x );
k11 = m[5]; k11 = m[5];
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
k12 = m[6]; k12 = m[6];
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
k13 = m[7]; k13 = m[7];
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p2 = v128_xor( p2, x ); // round 0
x = v128_xoraesenc( p1, k00 );
x = v128_xoraesenc( x, k01 );
x = v128_xoraesenc( x, k02 );
p0 = v128_xoraesencxor( x, k03, p0 );
x = v128_xoraesenc( p3, k10 );
x = v128_xoraesenc( x, k11 );
x = v128_xoraesenc( x, k12 );
p2 = v128_xoraesencxor( x, k13, p2 );
for ( r = 0; r < 3; r ++ ) for ( r = 0; r < 3; r ++ )
{ {
@@ -113,198 +102,165 @@ c512( sph_shavite_big_context *sc, const void *msg )
if ( r == 0 ) if ( r == 0 )
k00 = v128_xor( k00, v128_set32( k00 = v128_xor( k00, v128_set32(
~sc->count3, sc->count2, sc->count1, sc->count0 ) ); ~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = v128_xoraesenc( p0, k00 );
x = v128_xor( p0, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) ); k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 ); k01 = v128_xor( k01, k00 );
if ( r == 1 ) if ( r == 1 )
k01 = v128_xor( k01, v128_set32( k01 = v128_xor( k01, v128_set32(
~sc->count0, sc->count1, sc->count2, sc->count3 ) ); ~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = v128_xoraesenc( x, k01 );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) ); k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 ); k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 ); x = v128_xoraesenc( x, k02 );
x = v128_aesenc_nokey( x );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) ); k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 ); k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 ); p3 = v128_xoraesencxor( x, k03, p3 );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) ); k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 ); k10 = v128_xor( k10, k03 );
x = v128_xoraesenc( p2, k10 );
x = v128_xor( p2, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) ); k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 ); k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 ); x = v128_xoraesenc( x, k11 );
x = v128_aesenc_nokey( x );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) ); k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, k11 ); k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 ); x = v128_xoraesenc( x, k12 );
x = v128_aesenc_nokey( x );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) ); k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 ); k13 = v128_xor( k13, k12 );
if ( r == 2 ) if ( r == 2 )
k13 = v128_xor( k13, v128_set32( k13 = v128_xor( k13, v128_set32(
~sc->count1, sc->count0, sc->count3, sc->count2 ) ); ~sc->count1, sc->count0, sc->count3, sc->count2 ) );
p1 = v128_xoraesencxor( x, k13, p1 );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
// round 2, 6, 10 // round 2, 6, 10
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) ); k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p3, k00 ); x = v128_xoraesenc( p3, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
p2 = v128_xor( p2, x ); k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xoraesenc( x, k01 );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xoraesenc( x, k02 );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
p2 = v128_xoraesencxor( x, k03, p2 );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) ); k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p1, k10 ); x = v128_xoraesenc( p1, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p0 = v128_xor( p0, x ); k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xoraesenc( x, k11 );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xoraesenc( x, k12 );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
p0 = v128_xoraesencxor( x, k13, p0 );
// round 3, 7, 11 // round 3, 7, 11
k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) ); k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 ); k00 = v128_xor( k00, k13 );
x = v128_xor( p2, k00 ); x = v128_xoraesenc( p2, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) ); k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 ); k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 ); x = v128_xoraesenc( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) ); k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 ); k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 ); x = v128_xoraesenc( x, k02 );
x = v128_aesenc_nokey( x );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) ); k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 ); k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 ); p1 = v128_xoraesencxor( x, k03, p1 );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) ); k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 ); k10 = v128_xor( k10, k03 );
x = v128_xor( p0, k10 ); x = v128_xoraesenc( p0, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) ); k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 ); k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 ); x = v128_xoraesenc( x, k11 );
x = v128_aesenc_nokey( x );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) ); k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, k11 ); k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 ); x = v128_xoraesenc( x, k12 );
x = v128_aesenc_nokey( x );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) ); k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 ); k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 ); p3 = v128_xoraesencxor( x, k13, p3 );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
// round 4, 8, 12 // round 4, 8, 12
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) ); k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p1, k00 ); x = v128_xoraesenc( p1, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
p0 = v128_xor( p0, x ); k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xoraesenc( x, k01 );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xoraesenc( x, k02 );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
p0 = v128_xoraesencxor( x, k03, p0 );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) ); k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p3, k10 ); x = v128_xoraesenc( p3, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p2 = v128_xor( p2, x ); k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xoraesenc( x, k11 );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xoraesenc( x, k12 );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
p2 = v128_xoraesencxor( x, k13, p2 );
} }
// round 13 // round 13
k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) ); k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 ); k00 = v128_xor( k00, k13 );
x = v128_xor( p0, k00 ); x = v128_xoraesenc( p0, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) ); k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 ); k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 ); x = v128_xoraesenc( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) ); k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 ); k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 ); x = v128_xoraesenc( x, k02 );
x = v128_aesenc_nokey( x );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) ); k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 ); k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 ); p3 = v128_xoraesencxor( x, k03, p3 );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) ); k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 ); k10 = v128_xor( k10, k03 );
x = v128_xor( p2, k10 ); x = v128_xoraesenc( p2, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) ); k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 ); k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 ); x = v128_xoraesenc( x, k11 );
x = v128_aesenc_nokey( x );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) ); k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, v128_xor( k11, v128_set32( k12 = v128_xor( k12, v128_xor( k11, v128_set32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) ); ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = v128_xor( x, k12 ); x = v128_xoraesenc( x, k12 );
x = v128_aesenc_nokey( x );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) ); k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 ); k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 ); p1 = v128_xoraesencxor( x, k13, p1 );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
h[0] = v128_xor( h[0], p2 ); h[0] = v128_xor( h[0], p2 );
h[1] = v128_xor( h[1], p3 ); h[1] = v128_xor( h[1], p3 );

View File

@@ -171,6 +171,53 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
{{ -30, 55, -58, -65, -95, -40, -98, 94 }}, {{ -30, 55, -58, -65, -95, -40, -98, 94 }},
}; };
#if defined(__AVX2__)
static const __m256i V256_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff,
0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
#define V128_00FF _mm256_castsi256_si128( V256_00FF )
#elif defined(__SSE2__) || defined(__ARM_NEON )
static const v128u64_t V128_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
#endif
#if defined(SIMD512)
static const __m512i V512_0101 = { 0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101 };
#define V256_0101 _mm512_castsi512_si256( V512_0101 )
#define V128_0101 _mm512_castsi512_si128( V512_0101 )
static const __m512i V512_0080 = { 0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080 };
#define V256_0080 _mm512_castsi512_si256( V512_0080 )
#define V128_0080 _mm512_castsi512_si128( V512_0080 )
#elif defined(__AVX2__)
static const __m256i V256_0101 = { 0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101 };
#define V128_0101 _mm256_castsi256_si128( V256_0101 )
static const __m256i V256_0080 = { 0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080 };
#define V128_0080 _mm256_castsi256_si128( V256_0080 )
#elif defined(__SSE2__) || defined(__ARM_NEON )
static const v128u64_t V128_0101 = { 0x0101010101010101, 0x0101010101010101 };
static const v128u64_t V128_0080 = { 0x0080008000800080, 0x0080008000800080 };
#endif
#if defined(__x86_64__) #if defined(__x86_64__)
#define SHUFXOR_1(x) _mm_shuffle_epi32(x,0xb1) #define SHUFXOR_1(x) _mm_shuffle_epi32(x,0xb1)
@@ -190,13 +237,10 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
#define shufxor(x,s) XCAT(SHUFXOR_,s)(x) #define shufxor(x,s) XCAT(SHUFXOR_,s)(x)
#define REDUCE(x) \ #define REDUCE(x) \
v128_sub16( v128_and( x, v128_64( \ v128_sub16( v128_and( x, V128_00FF ), v128_sra16( x, 8 ) )
0x00ff00ff00ff00ff ) ), v128_sra16( x, 8 ) )
#define EXTRA_REDUCE_S(x)\ #define EXTRA_REDUCE_S(x)\
v128_sub16( x, v128_and( \ v128_sub16( x, v128_and( V128_0101, v128_cmpgt16( x, V128_0080 ) ) )
v128_64( 0x0101010101010101 ), \
v128_cmpgt16( x, v128_64( 0x0080008000800080 ) ) ) )
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) ) #define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
@@ -293,10 +337,9 @@ do { \
// This will make the full FFT_64 in order. // This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \ #define INTERLEAVE(i,j) \
do { \ do { \
v128u16_t t1= X(i); \ v128u16_t t = X(i); \
v128u16_t t2= X(j); \ X(i) = v128_unpacklo16( t, X(j) ); \
X(i) = v128_unpacklo16( t1, t2 ); \ X(j) = v128_unpackhi16( t, X(j) ); \
X(j) = v128_unpackhi16( t1, t2 ); \
} while(0) } while(0)
INTERLEAVE( 1, 0 ); INTERLEAVE( 1, 0 );
@@ -803,23 +846,12 @@ static const m256_v16 FFT256_Twiddle[] =
#define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x) #define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x)
#if defined(VL256)
#define REDUCE(x) \ #define REDUCE(x) \
_mm256_sub_epi16( _mm256_maskz_mov_epi8( 0x55555555, x ), \ _mm256_sub_epi16( _mm256_and_si256( x, V256_00FF ), _mm256_srai_epi16( x, 8 ) )
_mm256_srai_epi16( x, 8 ) )
#else
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi64x( \
0x00ff00ff00ff00ff ) ), _mm256_srai_epi16( x, 8 ) )
#endif
#define EXTRA_REDUCE_S(x)\ #define EXTRA_REDUCE_S(x)\
_mm256_sub_epi16( x, _mm256_and_si256( \ _mm256_sub_epi16( x, _mm256_and_si256( V256_0101, \
_mm256_set1_epi64x( 0x0101010101010101 ), \ _mm256_cmpgt_epi16( x, V256_0080 ) ) )
_mm256_cmpgt_epi16( x, _mm256_set1_epi64x( 0x0080008000800080 ) ) ) )
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) ) #define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
@@ -917,10 +949,9 @@ do { \
// This will make the full FFT_64 in order. // This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \ #define INTERLEAVE(i,j) \
do { \ do { \
__m256i t1= X(i); \ __m256i t = X(i); \
__m256i t2= X(j); \ X(i) = _mm256_unpacklo_epi16( t, X(j) ); \
X(i) = _mm256_unpacklo_epi16( t1, t2 ); \ X(j) = _mm256_unpackhi_epi16( t, X(j) ); \
X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
} while(0) } while(0)
INTERLEAVE( 1, 0 ); INTERLEAVE( 1, 0 );
@@ -1658,10 +1689,8 @@ static const m512_v16 FFT256_Twiddle4w[] =
_mm512_srai_epi16( x, 8 ) ) _mm512_srai_epi16( x, 8 ) )
#define EXTRA_REDUCE_S4w(x) \ #define EXTRA_REDUCE_S4w(x) \
_mm512_sub_epi16( x, _mm512_and_si512( \ _mm512_sub_epi16( x, _mm512_and_si512( V512_0101, \
_mm512_set1_epi64( 0x0101010101010101 ), \ _mm512_movm_epi16( _mm512_cmpgt_epi16_mask( x, V512_0080 ) ) ) )
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
x, _mm512_set1_epi64( 0x0080008000800080 ) ) ) ) )
// generic, except it calls targetted macros // generic, except it calls targetted macros
#define REDUCE_FULL_S4w( x ) EXTRA_REDUCE_S4w( REDUCE4w (x ) ) #define REDUCE_FULL_S4w( x ) EXTRA_REDUCE_S4w( REDUCE4w (x ) )

View File

@@ -640,24 +640,25 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
#if defined(__AVX2__) #if defined(__AVX2__)
__m256i F0, F1, F2, F3, F4, F5, F6, F7; __m256i F0, F1, F2, F3, F4, F5, F6, F7;
__m256i tbl = *(__m256i*)&( fftTable[ input[0] << 3 ] ); __m256i *table = (__m256i*)fftTable;
__m256i tbl = table[ input[0] ];
__m256i *mul = (__m256i*)multipliers; __m256i *mul = (__m256i*)multipliers;
__m256i *out = (__m256i*)output; __m256i *out = (__m256i*)output;
F0 = _mm256_mullo_epi32( mul[0], tbl ); F0 = _mm256_mullo_epi32( mul[0], tbl );
tbl = *(__m256i*)&( fftTable[ input[1] << 3 ] ); tbl = table[ input[1] ];
F1 = _mm256_mullo_epi32( mul[1], tbl ); F1 = _mm256_mullo_epi32( mul[1], tbl );
tbl = *(__m256i*)&( fftTable[ input[2] << 3 ] ); tbl = table[ input[2] ];
F2 = _mm256_mullo_epi32( mul[2], tbl ); F2 = _mm256_mullo_epi32( mul[2], tbl );
tbl = *(__m256i*)&( fftTable[ input[3] << 3 ] ); tbl = table[ input[3] ];
F3 = _mm256_mullo_epi32( mul[3], tbl ); F3 = _mm256_mullo_epi32( mul[3], tbl );
tbl = *(__m256i*)&( fftTable[ input[4] << 3 ] ); tbl = table[ input[4] ];
F4 = _mm256_mullo_epi32( mul[4], tbl ); F4 = _mm256_mullo_epi32( mul[4], tbl );
tbl = *(__m256i*)&( fftTable[ input[5] << 3 ] ); tbl = table[ input[5] ];
F5 = _mm256_mullo_epi32( mul[5], tbl ); F5 = _mm256_mullo_epi32( mul[5], tbl );
tbl = *(__m256i*)&( fftTable[ input[6] << 3 ] ); tbl = table[ input[6] ];
F6 = _mm256_mullo_epi32( mul[6], tbl ); F6 = _mm256_mullo_epi32( mul[6], tbl );
tbl = *(__m256i*)&( fftTable[ input[7] << 3 ] ); tbl = table[ input[7] ];
F7 = _mm256_mullo_epi32( mul[7], tbl ); F7 = _mm256_mullo_epi32( mul[7], tbl );
#define ADD_SUB( a, b ) \ #define ADD_SUB( a, b ) \
@@ -677,9 +678,9 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
ADD_SUB( F1, F3 ); ADD_SUB( F1, F3 );
ADD_SUB( F4, F6 ); ADD_SUB( F4, F6 );
ADD_SUB( F5, F7 ); ADD_SUB( F5, F7 );
F5 = _mm256_slli_epi32( F5, 2 );
F6 = _mm256_slli_epi32( F6, 4 ); F6 = _mm256_slli_epi32( F6, 4 );
F7 = _mm256_slli_epi32( F7, 6 ); F7 = _mm256_slli_epi32( F7, 6 );
F5 = _mm256_slli_epi32( F5, 2 );
ADD_SUB( F0, F4 ); ADD_SUB( F0, F4 );
ADD_SUB( F1, F5 ); ADD_SUB( F1, F5 );
ADD_SUB( F2, F6 ); ADD_SUB( F2, F6 );

View File

@@ -4,11 +4,11 @@
# during development. However, the information contained may provide compilation # during development. However, the information contained may provide compilation
# tips to users. # tips to users.
rm cpuminer cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null rm cpuminer cpuminer-m2 cpuminer-m4 cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto > /dev/null
# armv9 needs gcc-13 # armv9 needs gcc-13
# -march-armv9-a includes SVE2 but no crypto # -march-armv9-a includes SVE2 but no crypto
# -march=armv9-a+crypto adds AES & SHA2 but not SHA512 # -march=armv9-a+crypto adds AES & SHA256 but not SHA512
make distclean || echo clean make distclean || echo clean
rm -f config.status rm -f config.status
@@ -27,18 +27,37 @@ CFLAGS="-O3 -march=armv9-a -Wall -flax-vector-conversions" ./configure --with-c
make -j $(nproc) make -j $(nproc)
mv cpuminer cpuminer-armv9 mv cpuminer cpuminer-armv9
# Apple M4: armv9.2, AES, SHA3, SVE2
make clean || echo clean
CFLAGS="-O3 -march=armv9.2-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-m4
# Apple M2: armv8.6, AES, SHA3
make clean || echo clean
CFLAGS="-O3 -march=armv8.6-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-m2
# SVE2 available in armv8.5 # SVE2 available in armv8.5
make clean || echo clean make clean || echo clean
CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure --with-curl CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc) make -j $(nproc)
mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2 mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2
# SHA3 available in armv8.4 # Apple M1: armv8.4 AES, SHA3
make clean || echo clean make clean || echo clean
CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc) make -j $(nproc)
mv cpuminer cpuminer-armv8.4-crypto-sha3 mv cpuminer cpuminer-armv8.4-crypto-sha3
# Cortex-A76 (Orange Pi 5)
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8.2-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-armv8-crypto
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl

View File

@@ -4,7 +4,7 @@
# during develpment. However the information contained may provide compilation # during develpment. However the information contained may provide compilation
# tips to users. # tips to users.
rm cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null ./clean-all.sh
# AVX512 SHA VAES: Intel Core Icelake, Rocketlake # AVX512 SHA VAES: Intel Core Icelake, Rocketlake
make distclean || echo clean make distclean || echo clean
@@ -18,39 +18,37 @@ strip -s cpuminer
mv cpuminer cpuminer-avx512-sha-vaes mv cpuminer cpuminer-avx512-sha-vaes
# Intel Core Alderlake: AVX2 SHA VAES, needs gcc-12 # Intel Core Alderlake: AVX2 SHA VAES, needs gcc-12
#make clean || echo clean make clean || echo clean
#rm -f config.status rm -f config.status
#CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
#make -j $(nproc) make -j $(nproc)
#strip -s cpuminer strip -s cpuminer
#mv cpuminer cpuminer-alderlake mv cpuminer cpuminer-alderlake
# Intel Core Arrowlake-s: AVX2 SHA512 VAES, needs gcc-14 # Intel Core Arrowlake-s: AVX2 SHA512 VAES, needs gcc-14
# Arrowlake-s includes SHA512, Arrowlake does not? # Arrowlake-s includes SHA512, Arrowlake does not?
#make clean || echo clean make clean || echo clean
#rm -f config.status rm -f config.status
#CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl
#make -j $(nproc) make -j $(nproc)
#strip -s cpuminer strip -s cpuminer
#mv cpuminer cpuminer-arrowlake-s mv cpuminer cpuminer-arrowlake-s
# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14 # Intel Core Graniterapids: AVX512, SHA256, VAES, AMX, needs gcc-14
# Granitrapids does not build with AVX10, SHA512 or APX. make clean || echo clean
# wait for Diamondrapids & gcc-15. rm -f config.status
#make clean || echo clean CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
#rm -f config.status make -j $(nproc)
#CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl strip -s cpuminer
#make -j $(nproc) mv cpuminer cpuminer-graniterapids
#strip -s cpuminer
#mv cpuminer cpuminer-graniterapids
# SHA512 AVX10.1 # Graniterapids + SHA512, AVX10.1
#make clean || echo clean make clean || echo clean
#rm -f config.status rm -f config.status
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl
#make -j $(nproc) make -j $(nproc)
#strip -s cpuminer strip -s cpuminer
#mv cpuminer cpuminer-avx10_1 mv cpuminer cpuminer-avx10.1
# SHA512 AVX10.2 # SHA512 AVX10.2
#make clean || echo clean #make clean || echo clean
@@ -69,20 +67,23 @@ mv cpuminer cpuminer-avx512-sha-vaes
#mv cpuminer cpuminer-diamondrapids #mv cpuminer cpuminer-diamondrapids
# Zen5: AVX512 SHA VAES, requires gcc-14. # Zen5: AVX512 SHA VAES, requires gcc-14.
#make clean || echo clean make clean || echo clean
#rm -f config.status rm -f config.status
#CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl
#make -j $(nproc) # zen4 is close enough for older compiler
#strip -s cpuminer #CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl
#mv cpuminer cpuminer-zen5
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-zen5
# Zen4: AVX512 SHA VAES # Zen4: AVX512 SHA VAES
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
# Zen4: AVX512, SHA, VAES, needs gcc-12.3. # Zen4: AVX512, SHA, VAES, needs gcc-12.3.
#CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl
# Inclomplete list of Zen4 AVX512 extensions but includes all extensions used by cpuminer. # Inclomplete list of Zen4 AVX512 extensions but includes all extensions used by cpuminer.
CFLAGS="-O3 -march=znver3 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq -Wall" ./configure --with-curl #CFLAGS="-O3 -march=znver3 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq -Wall" ./configure --with-curl
make -j $(nproc) make -j $(nproc)
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-zen4 mv cpuminer cpuminer-zen4
@@ -115,8 +116,8 @@ mv cpuminer cpuminer-avx2-sha-vaes
# AVX2 SHA AES: AMD Zen1 # AVX2 SHA AES: AMD Zen1
make clean || echo done make clean || echo done
rm -f config.status rm -f config.status
#CFLAGS="-O3 -march=znver1 -maes -Wall" ./configure --with-curl CFLAGS="-O3 -march=znver1 -maes -Wall" ./configure --with-curl
CFLAGS="-O3 -maes -mavx2 -msha -Wall" ./configure --with-curl #CFLAGS="-O3 -maes -mavx2 -msha -Wall" ./configure --with-curl
make -j $(nproc) make -j $(nproc)
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-avx2-sha mv cpuminer cpuminer-avx2-sha
@@ -138,13 +139,21 @@ make -j $(nproc)
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-avx mv cpuminer cpuminer-avx
# SSE4.2 AES: Intel Westmere, most Pentium & Celeron # SSE4.2 AES SHA: Intel Atom Goldmont, newer Pentium & Celeron
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=goldmont -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-sse42-aes-sha
# SSE4.2 AES: Intel Westmere, older Pentium & Celeron
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=westmere -maes -Wall" ./configure --with-curl CFLAGS="-O3 -march=westmere -maes -Wall" ./configure --with-curl
make -j $(nproc) make -j $(nproc)
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-aes-sse42 mv cpuminer cpuminer-sse42-aes
# SSE4.2: Intel Nehalem # SSE4.2: Intel Nehalem
make clean || echo clean make clean || echo clean

View File

@@ -2,8 +2,8 @@
# #
# make clean and rm all the targetted executables. # make clean and rm all the targetted executables.
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9 cpuminer-armv9-crypto cpuminer-armv9-crypto-sha3 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8-crypto cpuminer-armv8 > /dev/null rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512* cpuminer-alderlake cpuminer-avx10* cpuminer-avx2* cpuminer-avx cpuminer-sse* cpuminer-ssse3 cpuminer-zen* cpuminer-x64 cpuminer-armv* cpuminer-m2 cpuminer-m4 > /dev/null
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null rm cpuminer-avx512* cpuminer-avx2* cpuminer-avx.exe cpuminer-sse* cpuminer-zen* cpuminer-x64.exe > /dev/null
make distclean > /dev/null make distclean > /dev/null

View File

@@ -108,7 +108,24 @@ extern "C"{
} while (0) } while (0)
#define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \ #define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \
AES_ROUND_LE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3) { \
(Y0) = AES0[(X0) & 0xFF] \
^ AES1[((X1) >> 8) & 0xFF] \
^ AES2[((X2) >> 16) & 0xFF] \
^ AES3[((X3) >> 24) & 0xFF]; \
(Y1) = AES0[(X1) & 0xFF] \
^ AES1[((X2) >> 8) & 0xFF] \
^ AES2[((X3) >> 16) & 0xFF] \
^ AES3[((X0) >> 24) & 0xFF]; \
(Y2) = AES0[(X2) & 0xFF] \
^ AES1[((X3) >> 8) & 0xFF] \
^ AES2[((X0) >> 16) & 0xFF] \
^ AES3[((X1) >> 24) & 0xFF]; \
(Y3) = AES0[(X3) & 0xFF] \
^ AES1[((X0) >> 8) & 0xFF] \
^ AES2[((X1) >> 16) & 0xFF] \
^ AES3[((X2) >> 24) & 0xFF]; \
}
#endif #endif

28
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.4. # Generated by GNU Autoconf 2.71 for cpuminer-opt 25.7.
# #
# #
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.4' PACKAGE_VERSION='25.7'
PACKAGE_STRING='cpuminer-opt 25.4' PACKAGE_STRING='cpuminer-opt 25.7'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1359,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 25.4 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 25.7 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1431,7 +1431,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.4:";; short | recursive ) echo "Configuration of cpuminer-opt 25.7:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1536,7 +1536,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 25.4 cpuminer-opt configure 25.7
generated by GNU Autoconf 2.71 generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc. Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1983,7 +1983,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.4, which was It was created by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.71. Invocation command line was generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw $ $0$ac_configure_args_raw
@@ -3591,7 +3591,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='25.4' VERSION='25.7'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -5808,11 +5808,11 @@ if test x$ac_prog_cxx_stdcxx = xno
then : then :
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++11 features" >&5 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++11 features" >&5
printf %s "checking for $CXX option to enable C++11 features... " >&6; } printf %s "checking for $CXX option to enable C++11 features... " >&6; }
if test ${ac_cv_prog_cxx_11+y} if test ${ac_cv_prog_cxx_cxx11+y}
then : then :
printf %s "(cached) " >&6 printf %s "(cached) " >&6
else $as_nop else $as_nop
ac_cv_prog_cxx_11=no ac_cv_prog_cxx_cxx11=no
ac_save_CXX=$CXX ac_save_CXX=$CXX
cat confdefs.h - <<_ACEOF >conftest.$ac_ext cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */ /* end confdefs.h. */
@@ -5854,11 +5854,11 @@ if test x$ac_prog_cxx_stdcxx = xno
then : then :
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++98 features" >&5 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++98 features" >&5
printf %s "checking for $CXX option to enable C++98 features... " >&6; } printf %s "checking for $CXX option to enable C++98 features... " >&6; }
if test ${ac_cv_prog_cxx_98+y} if test ${ac_cv_prog_cxx_cxx98+y}
then : then :
printf %s "(cached) " >&6 printf %s "(cached) " >&6
else $as_nop else $as_nop
ac_cv_prog_cxx_98=no ac_cv_prog_cxx_cxx98=no
ac_save_CXX=$CXX ac_save_CXX=$CXX
cat confdefs.h - <<_ACEOF >conftest.$ac_ext cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */ /* end confdefs.h. */
@@ -7435,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 25.4, which was This file was extended by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.71. Invocation command line was generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -7503,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped' ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 25.4 cpuminer-opt config.status 25.7
configured by $0, generated by GNU Autoconf 2.71, configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [25.4]) AC_INIT([cpuminer-opt], [25.7])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.4. # Generated by GNU Autoconf 2.72 for cpuminer-opt 25.7.
# #
# #
# Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation, # Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
@@ -601,8 +601,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.4' PACKAGE_VERSION='25.7'
PACKAGE_STRING='cpuminer-opt 25.4' PACKAGE_STRING='cpuminer-opt 25.7'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1352,7 +1352,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
'configure' configures cpuminer-opt 25.4 to adapt to many kinds of systems. 'configure' configures cpuminer-opt 25.7 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1424,7 +1424,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.4:";; short | recursive ) echo "Configuration of cpuminer-opt 25.7:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1528,7 +1528,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 25.4 cpuminer-opt configure 25.7
generated by GNU Autoconf 2.72 generated by GNU Autoconf 2.72
Copyright (C) 2023 Free Software Foundation, Inc. Copyright (C) 2023 Free Software Foundation, Inc.
@@ -1949,7 +1949,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.4, which was It was created by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.72. Invocation command line was generated by GNU Autoconf 2.72. Invocation command line was
$ $0$ac_configure_args_raw $ $0$ac_configure_args_raw
@@ -3065,7 +3065,7 @@ ac_config_headers="$ac_config_headers cpuminer-config.h"
am__api_version='1.17' am__api_version='1.18'
# Find a good install program. We prefer a C program (faster), # Find a good install program. We prefer a C program (faster),
@@ -3334,10 +3334,14 @@ am_lf='
' '
case `pwd` in case `pwd` in
*[\\\"\#\$\&\'\`$am_lf]*) *[\\\"\#\$\&\'\`$am_lf]*)
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
as_fn_error $? "unsafe absolute working directory name" "$LINENO" 5;; as_fn_error $? "unsafe absolute working directory name" "$LINENO" 5;;
esac esac
case $srcdir in case $srcdir in
*[\\\"\#\$\&\'\`$am_lf\ \ ]*) *[\\\"\#\$\&\'\`$am_lf\ \ ]*)
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
as_fn_error $? "unsafe srcdir value: '$srcdir'" "$LINENO" 5;; as_fn_error $? "unsafe srcdir value: '$srcdir'" "$LINENO" 5;;
esac esac
@@ -3764,7 +3768,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='25.4' VERSION='25.7'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -3802,9 +3806,133 @@ AMTAR='$${TAR-tar}'
# We'll loop over all known methods to create a tar archive until one works. # We'll loop over all known methods to create a tar archive until one works.
_am_tools='gnutar pax cpio none' _am_tools='gnutar plaintar pax cpio none'
am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -' # The POSIX 1988 'ustar' format is defined with fixed-size fields.
# There is notably a 21 bits limit for the UID and the GID. In fact,
# the 'pax' utility can hang on bigger UID/GID (see automake bug#8343
# and bug#13588).
am_max_uid=2097151 # 2^21 - 1
am_max_gid=$am_max_uid
# The $UID and $GID variables are not portable, so we need to resort
# to the POSIX-mandated id(1) utility. Errors in the 'id' calls
# below are definitely unexpected, so allow the users to see them
# (that is, avoid stderr redirection).
am_uid=`id -u || echo unknown`
am_gid=`id -g || echo unknown`
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether UID '$am_uid' is supported by ustar format" >&5
printf %s "checking whether UID '$am_uid' is supported by ustar format... " >&6; }
if test x$am_uid = xunknown; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: ancient id detected; assuming current UID is ok, but dist-ustar might not work" >&5
printf "%s\n" "$as_me: WARNING: ancient id detected; assuming current UID is ok, but dist-ustar might not work" >&2;}
elif test $am_uid -le $am_max_uid; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
_am_tools=none
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GID '$am_gid' is supported by ustar format" >&5
printf %s "checking whether GID '$am_gid' is supported by ustar format... " >&6; }
if test x$gm_gid = xunknown; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: ancient id detected; assuming current GID is ok, but dist-ustar might not work" >&5
printf "%s\n" "$as_me: WARNING: ancient id detected; assuming current GID is ok, but dist-ustar might not work" >&2;}
elif test $am_gid -le $am_max_gid; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
_am_tools=none
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking how to create a ustar tar archive" >&5
printf %s "checking how to create a ustar tar archive... " >&6; }
# Go ahead even if we have the value already cached. We do so because we
# need to set the values for the 'am__tar' and 'am__untar' variables.
_am_tools=${am_cv_prog_tar_ustar-$_am_tools}
for _am_tool in $_am_tools; do
case $_am_tool in
gnutar)
for _am_tar in tar gnutar gtar; do
{ echo "$as_me:$LINENO: $_am_tar --version" >&5
($_am_tar --version) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); } && break
done
am__tar="$_am_tar --format=ustar -chf - "'"$$tardir"'
am__tar_="$_am_tar --format=ustar -chf - "'"$tardir"'
am__untar="$_am_tar -xf -"
;;
plaintar)
# Must skip GNU tar: if it does not support --format= it doesn't create
# ustar tarball either.
(tar --version) >/dev/null 2>&1 && continue
am__tar='tar chf - "$$tardir"'
am__tar_='tar chf - "$tardir"'
am__untar='tar xf -'
;;
pax)
am__tar='pax -L -x ustar -w "$$tardir"'
am__tar_='pax -L -x ustar -w "$tardir"'
am__untar='pax -r'
;;
cpio)
am__tar='find "$$tardir" -print | cpio -o -H ustar -L'
am__tar_='find "$tardir" -print | cpio -o -H ustar -L'
am__untar='cpio -i -H ustar -d'
;;
none)
am__tar=false
am__tar_=false
am__untar=false
;;
esac
# If the value was cached, stop now. We just wanted to have am__tar
# and am__untar set.
test -n "${am_cv_prog_tar_ustar}" && break
# tar/untar a dummy directory, and stop if the command works.
rm -rf conftest.dir
mkdir conftest.dir
echo GrepMe > conftest.dir/file
{ echo "$as_me:$LINENO: tardir=conftest.dir && eval $am__tar_ >conftest.tar" >&5
(tardir=conftest.dir && eval $am__tar_ >conftest.tar) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); }
rm -rf conftest.dir
if test -s conftest.tar; then
{ echo "$as_me:$LINENO: $am__untar <conftest.tar" >&5
($am__untar <conftest.tar) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); }
{ echo "$as_me:$LINENO: cat conftest.dir/file" >&5
(cat conftest.dir/file) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); }
grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
fi
done
rm -rf conftest.dir
if test ${am_cv_prog_tar_ustar+y}
then :
printf %s "(cached) " >&6
else case e in #(
e) am_cv_prog_tar_ustar=$_am_tool ;;
esac
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $am_cv_prog_tar_ustar" >&5
printf "%s\n" "$am_cv_prog_tar_ustar" >&6; }
@@ -4986,7 +5114,10 @@ _ACEOF
break break
fi fi
done done
rm -f core conftest* # aligned with autoconf, so not including core; see bug#72225.
rm -f -r a.out a.exe b.out conftest.$ac_ext conftest.$ac_objext \
conftest.dSYM conftest1.$ac_ext conftest1.$ac_objext conftest1.dSYM \
conftest2.$ac_ext conftest2.$ac_objext conftest2.dSYM
unset am_i ;; unset am_i ;;
esac esac
fi fi
@@ -7450,7 +7581,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 25.4, which was This file was extended by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.72. Invocation command line was generated by GNU Autoconf 2.72. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -7518,7 +7649,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped' ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 25.4 cpuminer-opt config.status 25.7
configured by $0, generated by GNU Autoconf 2.72, configured by $0, generated by GNU Autoconf 2.72,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -921,39 +921,32 @@ out:
return rc; return rc;
} }
// returns the unit prefix and the hashrate appropriately scaled. // Does not account for leap years.
void scale_hash_for_display ( double* hashrate, char* prefix )
{
if ( *hashrate < 1e4 ) *prefix = 0;
else if ( *hashrate < 1e7 ) { *prefix = 'k'; *hashrate /= 1e3; }
else if ( *hashrate < 1e10 ) { *prefix = 'M'; *hashrate /= 1e6; }
else if ( *hashrate < 1e13 ) { *prefix = 'G'; *hashrate /= 1e9; }
else if ( *hashrate < 1e16 ) { *prefix = 'T'; *hashrate /= 1e12; }
else if ( *hashrate < 1e19 ) { *prefix = 'P'; *hashrate /= 1e15; }
else if ( *hashrate < 1e22 ) { *prefix = 'E'; *hashrate /= 1e18; }
else if ( *hashrate < 1e25 ) { *prefix = 'Z'; *hashrate /= 1e21; }
else { *prefix = 'Y'; *hashrate /= 1e24; }
}
static inline void sprintf_et( char *str, long unsigned int seconds ) static inline void sprintf_et( char *str, long unsigned int seconds )
{ {
long unsigned int min = seconds / 60; long unsigned int minutes = seconds / 60;
long unsigned int sec = seconds % 60; if ( minutes )
long unsigned int hrs = min / 60; {
long unsigned int hours = minutes / 60;
if ( unlikely( hrs ) ) if ( hours )
{
long unsigned int days = hours / 24;
if ( days )
{ {
long unsigned int days = hrs / 24;
long unsigned int years = days / 365; long unsigned int years = days / 365;
if ( years ) // 0y000d if ( years )
sprintf( str, "%luy%lud", years, years % 365 ); sprintf( str, "%luy%03lud", years, days % 365 ); // 0y000d
else if ( days ) // 0d00h else
sprintf( str, "%lud%02luh", days, hrs % 24 ); sprintf( str, "%lud%02luh", days, hours % 24 ); // 0d00h
else // 0h00m
sprintf( str, "%luh%02lum", hrs, min % 60 );
} }
else // 0m00s else
sprintf( str, "%lum%02lus", min, sec ); sprintf( str, "%luh%02lum", hours, minutes % 60 ); // 0h00m
}
else
sprintf( str, "%lum%02lus", minutes, seconds % 60 ); // 0m00s
}
else
sprintf( str, "%lus", seconds ); // 0s
} }
const long double exp32 = EXP32; // 2**32 const long double exp32 = EXP32; // 2**32
@@ -2833,67 +2826,29 @@ static void show_credits()
static bool cpu_capability( bool display_only ) static bool cpu_capability( bool display_only )
{ {
char cpu_brand[0x40]; char cpu_brand[0x40];
bool cpu_has_sse2 = has_sse2(); // X86_64 only
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
bool cpu_has_sse41 = has_sse41(); // X86_64 only
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx();
bool cpu_has_neon = has_neon(); // AArch64
bool cpu_has_sve = has_sve(); // aarch64 only, insignificant
bool cpu_has_sve2 = has_sve2(); // AArch64 only
bool cpu_has_sme = has_sme();
bool cpu_has_sme2 = has_sme2();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_avx512 = has_avx512();
bool cpu_has_avx10 = has_avx10();
bool cpu_has_aes = has_aes(); // x86_64 or AArch64
bool cpu_has_vaes = has_vaes(); // X86_64 only
bool cpu_has_sha256 = has_sha256(); // x86_64 or AArch64
bool cpu_has_sha512 = has_sha512();
bool sw_has_x86_64 = false; bool sw_has_x86_64 = false;
bool sw_has_aarch64 = false; bool sw_has_aarch64 = false;
int sw_arm_arch = 0; // AArch64 version int sw_arm_arch = 0; // AArch64 version
bool sw_has_neon = false; // AArch64 bool sw_has_neon = false; // AArch64
bool sw_has_sve = false; // AArch64 bool sw_has_sve = false;
bool sw_has_sve2 = false; // AArch64 bool sw_has_sve2 = false;
bool sw_has_sme = false; bool sw_has_sme = false;
bool sw_has_sme2 = false; bool sw_has_sme2 = false;
bool sw_has_sse2 = false; // x86_64 bool sw_has_sse2 = false; // x86_64
bool sw_has_ssse3 = false; // x86_64 bool sw_has_ssse3 = false;
bool sw_has_sse41 = false; // x86_64 bool sw_has_sse41 = false;
bool sw_has_sse42 = false; bool sw_has_sse42 = false;
bool sw_has_avx = false; bool sw_has_avx = false;
bool sw_has_avx2 = false; bool sw_has_avx2 = false;
bool sw_has_avx512 = false; bool sw_has_avx512 = false;
bool sw_has_avx10 = false; bool sw_has_avx10 = false;
bool sw_has_aes = false; bool sw_has_amx = false;
bool sw_has_vaes = false; bool sw_has_apx = false;
bool sw_has_aes = false; // x86_64 or AArch64
bool sw_has_vaes = false; // x86_64
bool sw_has_sha256 = false; // x86_64 or AArch64 bool sw_has_sha256 = false; // x86_64 or AArch64
bool sw_has_sha512 = false; // x86_64 or AArch64 bool sw_has_sha512 = false;
/*
set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
bool algo_has_sha256 = set_incl( SHA256_OPT, algo_features );
bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features );
bool algo_has_neon = set_incl( NEON_OPT, algo_features );
bool use_sse2;
bool use_sse42;
bool use_avx;
bool use_avx2;
bool use_avx512;
bool use_aes;
bool use_vaes;
bool use_sha256;
bool use_sha512;
bool use_neon;
bool use_none;
*/
#if defined(__x86_64__) #if defined(__x86_64__)
sw_has_x86_64 = true; sw_has_x86_64 = true;
#elif defined(__aarch64__) #elif defined(__aarch64__)
@@ -2928,14 +2883,15 @@ static bool cpu_capability( bool display_only )
#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)) #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
sw_has_avx512 = true; sw_has_avx512 = true;
#endif #endif
// AVX10 version is not significant as of AVX10.2. If that changes use a better #if defined(__AVX10_1__) // version is not significant
// way to test the version than sequentially.
// #if defined(__AVX10_2__)
//
// #elif defined(__AVX10_1__)
#if defined(__AVX10_1__)
sw_has_avx10 = true; sw_has_avx10 = true;
#endif #endif
#ifdef __AMX_TILE__
sw_has_amx = true;
#endif
#ifdef __APX_F__
sw_has_apx = true;
#endif
// x86_64 or AArch64 // x86_64 or AArch64
#if defined(__AES__) || defined(__ARM_FEATURE_AES) #if defined(__AES__) || defined(__ARM_FEATURE_AES)
@@ -2955,6 +2911,7 @@ static bool cpu_capability( bool display_only )
#if defined(__ARM_NEON) #if defined(__ARM_NEON)
sw_has_neon = true; sw_has_neon = true;
#endif #endif
// FYI, SVE & SME not used by cpuminer
#if defined(__ARM_FEATURE_SVE) #if defined(__ARM_FEATURE_SVE)
sw_has_sve = true; sw_has_sve = true;
#endif #endif
@@ -2975,8 +2932,7 @@ static bool cpu_capability( bool display_only )
// Build // Build
printf( "SW built on " __DATE__ printf( "SW built on " __DATE__
#if defined(__clang__) #if defined(__clang__)
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__, " with CLANG-%d.%d.%d", __clang_major__, __clang_minor__, __clang_patchlevel__ );
__clang_patchlevel__ );
#elif defined(__GNUC__) #elif defined(__GNUC__)
" with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ ); " with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ );
#endif #endif
@@ -3002,27 +2958,30 @@ static bool cpu_capability( bool display_only )
printf("CPU features: "); printf("CPU features: ");
if ( cpu_arch_x86_64() ) if ( cpu_arch_x86_64() )
{ {
if ( cpu_has_avx10 ) printf( " AVX10.%d", avx10_version() ); if ( has_avx10() ) printf( " AVX10.%d", avx10_version() );
if ( cpu_has_avx512 ) printf( " AVX512" ); else if ( has_avx512() ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " ); else if ( has_avx2() ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " ); else if ( has_avx() ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" ); else if ( has_sse42() ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" ); else if ( has_sse41() ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " ); else if ( has_ssse3() ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " ); else if ( has_sse2() ) printf( " SSE2 " );
if ( has_amx() ) printf( " AMX" );
if ( has_apx_f() ) printf( " APX" );
} }
else if ( cpu_arch_aarch64() ) else if ( cpu_arch_aarch64() )
{ {
if ( cpu_has_neon ) printf( " NEON" ); if ( has_neon() ) printf( " NEON" );
if ( cpu_has_sve2 ) printf( " SVE2-%d", sve_vector_length() ); if ( has_sve2() ) printf( " SVE2-%d", sve_vector_length() );
else if ( cpu_has_sve ) printf( " SVE" ); else if ( has_sve() ) printf( " SVE" );
if ( cpu_has_sme2 ) printf( " SME2" ); if ( has_sme2() ) printf( " SME2" );
else if ( cpu_has_sme ) printf( " SME" ); else if ( has_sme() ) printf( " SME" );
} }
if ( cpu_has_vaes ) printf( " VAES" ); if ( has_vaes() ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" ); else if ( has_aes() ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" ); if ( has_sha512() ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" ); else if ( has_sha256() ) printf( " SHA256" );
printf("\nSW features: "); printf("\nSW features: ");
if ( sw_has_x86_64 ) if ( sw_has_x86_64 )
@@ -3035,6 +2994,8 @@ static bool cpu_capability( bool display_only )
else if ( sw_has_sse41 ) printf( " SSE4.1" ); else if ( sw_has_sse41 ) printf( " SSE4.1" );
else if ( sw_has_ssse3 ) printf( " SSSE3 " ); else if ( sw_has_ssse3 ) printf( " SSSE3 " );
else if ( sw_has_sse2 ) printf( " SSE2 " ); else if ( sw_has_sse2 ) printf( " SSE2 " );
if ( sw_has_amx ) printf( " AMX" );
if ( sw_has_apx ) printf( " APX" );
} }
else if ( sw_has_aarch64 ) else if ( sw_has_aarch64 )
{ {
@@ -3760,10 +3721,10 @@ int main(int argc, char *argv[])
#if defined(_WIN32_WINNT) #if defined(_WIN32_WINNT)
if (opt_debug) if (opt_debug)
applog( LOG_INFO, "_WIN232_WINNT = 0x%04x", _WIN32_WINNT ); applog( LOG_INFO, "_WIN32_WINNT = 0x%04x", _WIN32_WINNT );
#else #else
if (opt_debug) if (opt_debug)
applog( LOG_INFO, "_WIN232_WINNT undefined." ); applog( LOG_INFO, "_WIN32_WINNT undefined." );
#endif #endif
#if defined(WINDOWS_CPU_GROUPS_ENABLED) #if defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) ) if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )

View File

@@ -542,7 +542,9 @@ void applog_hash(void *hash);
void format_hashrate(double hashrate, char *output); void format_hashrate(double hashrate, char *output);
void print_hash_tests(void); void print_hash_tests(void);
// Factors of 1000 used for hashes, ie kH/s, Mh/s.
void scale_hash_for_display ( double* hashrate, char* units ); void scale_hash_for_display ( double* hashrate, char* units );
// Factors of 1024 used for bytes, ie kiB, MiB.
void format_number_si( double* hashrate, char* si_units ); void format_number_si( double* hashrate, char* si_units );
void report_summary_log( bool force ); void report_summary_log( bool force );
@@ -582,6 +584,8 @@ enum algos {
ALGO_ANIME, ALGO_ANIME,
ALGO_ARGON2D250, ALGO_ARGON2D250,
ALGO_ARGON2D500, ALGO_ARGON2D500,
ALGO_ARGON2D1000,
ALGO_ARGON2D16000,
ALGO_ARGON2D4096, ALGO_ARGON2D4096,
ALGO_AXIOM, ALGO_AXIOM,
ALGO_BLAKE, ALGO_BLAKE,
@@ -677,6 +681,8 @@ static const char* const algo_names[] = {
"anime", "anime",
"argon2d250", "argon2d250",
"argon2d500", "argon2d500",
"argon2d1000",
"argon2d16000",
"argon2d4096", "argon2d4096",
"axiom", "axiom",
"blake", "blake",
@@ -837,6 +843,8 @@ Options:\n\
anime Animecoin (ANI)\n\ anime Animecoin (ANI)\n\
argon2d250\n\ argon2d250\n\
argon2d500\n\ argon2d500\n\
argon2d1000\n\
argon2d16000\n\
argon2d4096\n\ argon2d4096\n\
axiom Shabal-256 MemoHash\n\ axiom Shabal-256 MemoHash\n\
blake blake256r14 (SFR)\n\ blake blake256r14 (SFR)\n\

View File

@@ -137,10 +137,24 @@
#define v128_unpackhi8 _mm_unpackhi_epi8 #define v128_unpackhi8 _mm_unpackhi_epi8
// AES // AES
// Nokey means nothing on x86_64 but it saves an instruction and a register
// on ARM. // xor key with result after encryption, x86_64 format.
#define v128_aesenc _mm_aesenc_si128 #define v128_aesencxor _mm_aesenc_si128
// default is x86_64 format.
#define v128_aesenc v128_aesencxor
// xor key with v before encryption, arm64 format.
#define v128_xoraesenc( v, k ) \
_mm_aesenc_si128( v128_xor( v, k ), v128_zero )
// xor v with k_in before encryption then xor the result with k_out afterward.
// Uses the applicable optimization based on the target.
#define v128_xoraesencxor( v, k_in, k_out ) \
_mm_aesenc_si128( v128_xor( v, k_in ), k_out )
// arm64 optimized
#define v128_aesenc_nokey(v) _mm_aesenc_si128( v, v128_zero ) #define v128_aesenc_nokey(v) _mm_aesenc_si128( v, v128_zero )
#define v128_aesenclast _mm_aesenclast_si128 #define v128_aesenclast _mm_aesenclast_si128
#define v128_aesenclast_nokey(v) _mm_aesenclast_si128( v, v128_zero ) #define v128_aesenclast_nokey(v) _mm_aesenclast_si128( v, v128_zero )
#define v128_aesdec _mm_aesdec_si128 #define v128_aesdec _mm_aesdec_si128
@@ -433,7 +447,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 ) #define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b // ~( a ^ b ), same as (~a) ^ b
#define v128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 ) #define v128_nxor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#else #else
@@ -455,7 +469,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) ) #define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_xnor( a, b ) v128_not( _mm_xor_si128( a, b ) ) #define v128_nxor( a, b ) v128_not( _mm_xor_si128( a, b ) )
#endif #endif

View File

@@ -170,7 +170,7 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_orand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xf8 ) #define mm256_orand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b // ~( a ^ b ), same as (~a) ^ b
#define mm256_xnor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 ) #define mm256_nxor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
#else #else
@@ -208,7 +208,7 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_orand( a, b, c ) \ #define mm256_orand( a, b, c ) \
_mm256_or_si256( a, _mm256_and_si256( b, c ) ) _mm256_or_si256( a, _mm256_and_si256( b, c ) )
#define mm256_xnor( a, b ) \ #define mm256_nxor( a, b ) \
mm256_not( _mm256_xor_si256( a, b ) ) mm256_not( _mm256_xor_si256( a, b ) )
#endif #endif
@@ -217,7 +217,9 @@ static inline __m256i mm256_not( const __m256i v )
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask. // Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Returns 4 or 8 bit integer mask from MSBit of 64 or 32 bit elements. // Returns 4 or 8 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test. // Effectively a sign test.
// The functions return int which can promote small integers to int when used
// in an expression. Users should mask the slack bits strategically to maintain
// data integrity.
#define mm256_movmask_64( v ) \ #define mm256_movmask_64( v ) \
_mm256_movemask_pd( _mm256_castsi256_pd( v ) ) _mm256_movemask_pd( _mm256_castsi256_pd( v ) )

View File

@@ -14,18 +14,17 @@
// vectors. It is therefore not technically required for any 512 bit vector // vectors. It is therefore not technically required for any 512 bit vector
// utilities defined below. // utilities defined below.
// if avx10 // avx512 is always set
// if evex512: yes
// else if avx512 : yes // avx512 is set but not avx10
// else : no // avx512 not set or avx10.1 is set without evex512
#if defined(SIMD512) #if defined(SIMD512)
// AVX512 intrinsics have a few changes from previous conventions. // AVX512 intrinsics have a few changes from previous conventions.
// //
// "_mm512_cmp" instructions now returns a bitmask instead of a vector mask. // "_mm512_cmp" instructions now return a bitmask instead of a vector mask.
// This removes the need for an explicit movemask instruction. // This removes the need for an explicit movemask instruction. It is also
// slower than AVX2 cmp. There is no version of AVX512 cmp that returns a
// vector result resulting in a double penalty if a vector result is needed:
// slower cmp instruction & extra instruction to convert bit mask into
// vector mask. 256 bit & 128 bit still have legacy cmp returning vector
// while AVX512VL adds masked versions returning bit mask.
// //
// Many previously sizeless (si) instructions now have sized (epi) versions // Many previously sizeless (si) instructions now have sized (epi) versions
// to accomodate masking packed elements. // to accomodate masking packed elements.
@@ -36,7 +35,7 @@
// list. // list.
// //
// "_mm512_permutex_epi64" only shuffles within 256 bit lanes. All other // "_mm512_permutex_epi64" only shuffles within 256 bit lanes. All other
// AVX512 permutes can cross all lanes. // AVX512 instructions using the permute name can cross all lanes.
// //
// New alignr instructions for epi64 and epi32 operate across the entire // New alignr instructions for epi64 and epi32 operate across the entire
// vector but slower than epi8 which continues to be restricted to 128 bit // vector but slower than epi8 which continues to be restricted to 128 bit
@@ -56,16 +55,17 @@
// parentheses to ensure the expression argument is evaluated first. // parentheses to ensure the expression argument is evaluated first.
// - if an argument is to referenced multiple times a C inline function // - if an argument is to referenced multiple times a C inline function
// should be used instead of a macro to prevent an expression argument // should be used instead of a macro to prevent an expression argument
// from being evaluated multiple times (wasteful) or produces side // from being evaluated multiple times (wasteful) or produce side
// effects (very bad). // effects (very bad).
// //
// There are 2 areas where overhead is a major concern: constants and // There are 2 areas where overhead is a major concern: constants and
// permutations. // permutations.
// //
// Constants need to be composed at run time by assembling individual // Constants need to be composed at run time by assembling individual
// elements, very expensive. The cost is proportional to the number of // elements or loaded from memory, very expensive. The cost of runtime
// different elements therefore use the largest element size possible, // construction is proportional to the number of different elements
// merge smaller integer elements to 64 bits, and group repeated elements. // therefore use the largest element size possible merging smaller integer
// elements to 64 bits, and group repeated elements.
// //
// Constants with repeating patterns can be optimized with the smaller // Constants with repeating patterns can be optimized with the smaller
// patterns repeated more frequently being more efficient. // patterns repeated more frequently being more efficient.
@@ -73,14 +73,15 @@
// Some specific constants can be very efficient. Zero is very efficient, // Some specific constants can be very efficient. Zero is very efficient,
// 1 and -1 slightly less so. // 1 and -1 slightly less so.
// //
// If an expensive constant is to be reused in the same function it should // If an expensive constant is to be reused in the same function it may
// be declared as a local variable defined once and reused. // be declared as a local variable defined once and reused. If frequently
// used it can be declared as a static const in memory.
// //
// Permutations can be very expensive if they use a vector control index, // Permutations can be very expensive if they use a vector control index,
// even if the permutation itself is quite efficient. // even if the permute instruction itself is quite efficient.
// The index is essentially a constant with all the baggage that brings. // The index is essentially a vector constant with all the baggage that
// The same rules apply, if an index is to be reused it should be defined // brings. The same rules apply, if an index is to be reused it should either
// as a local. This applies specifically to bswap operations. // be defined as a local or static const.
// //
// Permutations that cross 128 bit lanes are typically slower and often need // Permutations that cross 128 bit lanes are typically slower and often need
// a vector control index. If the permutation doesn't need to cross 128 bit // a vector control index. If the permutation doesn't need to cross 128 bit
@@ -227,7 +228,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_nor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x01 ) #define mm512_nor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x01 )
// ~( a ^ b ), (~a) ^ b // ~( a ^ b ), (~a) ^ b
#define mm512_xnor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 ) #define mm512_nxor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
// ~( a & b ) // ~( a & b )
#define mm512_nand( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0xef ) #define mm512_nand( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0xef )
@@ -247,6 +248,15 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_ror_32 _mm512_ror_epi32 #define mm512_ror_32 _mm512_ror_epi32
#define mm512_rol_32 _mm512_rol_epi32 #define mm512_rol_32 _mm512_rol_epi32
/* not used
#if defined(__AVX512VBMI2__)
#define mm512_ror_16( v, c ) _mm512_shrdi_epi16( c, v, v )
#define mm512_rol_16( v, c ) _mm512_shldi_epi16( c, v, v )
#endif
*/
// //
// Reverse byte order of packed elements, vectorized endian conversion. // Reverse byte order of packed elements, vectorized endian conversion.
@@ -255,9 +265,17 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_bswap_32( v ) _mm512_shuffle_epi8( v, V512_BSWAP32 ) #define mm512_bswap_32( v ) _mm512_shuffle_epi8( v, V512_BSWAP32 )
/* not used /* not used
#if defined(__AVX512VBMI2__)
#define mm512_bswap_16( v ) mm512_ror_16( v, 8 )
#else
#define mm512_bswap_16( v ) \ #define mm512_bswap_16( v ) \
_mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \ _mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) ) 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
#endif
*/ */
#define mm512_bswap_16( v ) \ #define mm512_bswap_16( v ) \
@@ -437,8 +455,7 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
_mm512_castsi512_ps( v2 ), c ) ); _mm512_castsi512_ps( v2 ), c ) );
// 64 bit lanes // 64 bit lanes
// ROL, ROR not necessary with AVX512, included for consistency with AVX2/SSE. // Redundant with ror & rol but included for consistency with AVX2/SSE.
#define mm512_qrev32( v ) _mm512_shuffle_epi32( v, 0xb1 ) #define mm512_qrev32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_swap64_32 mm512_qrev32 // grandfathered #define mm512_swap64_32 mm512_qrev32 // grandfathered

View File

@@ -4,9 +4,10 @@
#if defined(__aarch64__) && defined(__ARM_NEON) #if defined(__aarch64__) && defined(__ARM_NEON)
// Targeted functions supporting NEON SIMD 128 & 64 bit vectors. // Targeted functions supporting NEON SIMD 128 & 64 bit vectors.
// Element size matters!
// //
// Intel naming is generally used. // Intel style naming is generally used, however, this not an attempt to emulate Intel
// intructions. It's focussed on the functions used in this program and the best way
// to implement them with NEON.
// //
// Some advanced logical operations that require SHA3. Prior to GCC-13 // Some advanced logical operations that require SHA3. Prior to GCC-13
// they also require armv8.2 // they also require armv8.2
@@ -125,7 +126,7 @@
#define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 ) #define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 )
// ~( v1 ^ v0 ), same as (~v1) ^ v0 // ~( v1 ^ v0 ), same as (~v1) ^ v0
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) ) #define v128_nxor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
// ~v1 | v0, args reversed for consistency with x86_64 // ~v1 | v0, args reversed for consistency with x86_64
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 ) #define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
@@ -186,9 +187,21 @@
// vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version. // vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version.
// AES // AES
// consistent with Intel AES intrinsics, break up for optimizing
#define v128_aesenc( v, k ) \ // xor key with result after encryption, x86_64 format.
v128_xor( k, vaesmcq_u8( vaeseq_u8( v, v128_zero ) ) ) #define v128_aesencxor( v, k ) \
v128_xor( vaesmcq_u8( vaeseq_u8( v, v128_zero ) ), k )
// default is x86_64 format.
#define v128_aesenc v128_aesencxor
// xor key with v before encryption, arm64 format.
#define v128_xoraesenc( v, k ) \
vaesmcq_u8( vaeseq_u8( v, k ) )
// xor v with k_in before encryption then xor the result with k_out afterward.
// Uses the applicable optimization based on the target.
#define v128_xoraesencxor( v, k_in, k_out ) \
v128_xor( v128_xoraesenc( v, k_in ), k_out )
#define v128_aesenc_nokey( v ) \ #define v128_aesenc_nokey( v ) \
vaesmcq_u8( vaeseq_u8( v, v128_zero ) ) vaesmcq_u8( vaeseq_u8( v, v128_zero ) )
@@ -336,52 +349,6 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) ) vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
// Bit rotation // Bit rotation
/*
#define v128_shuflr64_8( v ) v128_shuffle8( v, V128_SHUFLR64_8 )
#define v128_shufll64_8( v ) v128_shuffle8( v, V128_SHUFLL64_8 )
#define v128_shuflr64_16(v ) v128_shuffle8( v, V128_SHUFLR64_16 )
#define v128_shufll64_16(v ) v128_shuffle8( v, V128_SHUFLL64_16 )
#define v128_shuflr64_24(v ) v128_shuffle8( v, V128_SHUFLR64_24 )
#define v128_shufll64_24(v ) v128_shuffle8( v, V128_SHUFLL64_24 )
#define v128_shuflr32_8( v ) v128_shuffle8( v, V128_SHUFLR32_8 )
#define v128_shufll32_8( v ) v128_shuffle8( v, V128_SHUFLL32_8 )
#define v128_ror64( v, c ) \
( (c) == 8 ) ? v128_shuflr64_8( v ) \
: ( (c) == 16 ) ? v128_shuflr64_16( v ) \
: ( (c) == 24 ) ? v128_shuflr64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shufll64_24( v ) \
: ( (c) == 48 ) ? v128_shufll64_16( v ) \
: ( (c) == 56 ) ? v128_shufll64_8( v ) \
: vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_rol64( v, c ) \
( (c) == 8 ) ? v128_shufll64_8( v ) \
: ( (c) == 16 ) ? v128_shufll64_16( v ) \
: ( (c) == 24 ) ? v128_shufll64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shuflr64_24( v ) \
: ( (c) == 48 ) ? v128_shuflr64_16( v ) \
: ( (c) == 56 ) ? v128_shuflr64_8( v ) \
: vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_ror32( v, c ) \
( (c) == 8 ) ? v128_shuflr32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shufll32_8( v ) \
: vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
#define v128_rol32( v, c ) \
( (c) == 8 ) ? v128_shufll32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shuflr32_8( v ) \
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
*/
#define v128_ror64( v, c ) \ #define v128_ror64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \ ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \

View File

@@ -474,6 +474,7 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
} }
/* /*
// ARM feature compiler flags
#ifdef __aarch64__ #ifdef __aarch64__
#warning "__aarch64__" #warning "__aarch64__"
#endif #endif
@@ -509,16 +510,6 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
#endif #endif
*/ */
// Typical display format: AVX10.[version]_[vectorlength], if vector length is
// omitted 256 is the default.
// Ex: AVX10.1_512
// Flags:
// AVX10 128 256 512
// 0 0 0 0 = AVX10 not supported
// 1 1 1 0 = AVX10 256 bit max (version 2)
// 1 1 1 1 = AVX10 512 bit max (version 1 granite rapids)
// Other combinations are not defined.
static inline bool cpu_arch_x86_64() static inline bool cpu_arch_x86_64()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
@@ -766,6 +757,17 @@ static inline bool has_vbmi2()
#endif #endif
} }
static inline bool has_amx()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EDX_Reg ] & AMX_TILE_Flag;
#else
return false;
#endif
}
static inline bool has_aes() static inline bool has_aes()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
@@ -815,12 +817,9 @@ static inline bool has_sveaes()
static inline bool has_sha256() static inline bool has_sha256()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info ); cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & SHA_Flag; return cpu_info[ EBX_Reg ] & SHA_Flag;
}
return false; return false;
#elif defined(__aarch64__) && defined(HWCAP_SHA2) #elif defined(__aarch64__) && defined(HWCAP_SHA2)
// NEON SHA256 // NEON SHA256
@@ -835,7 +834,7 @@ static inline bool has_sha256()
static inline bool has_sha512() static inline bool has_sha512()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
if ( has_avx2() ) if ( has_avx() )
{ {
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info ); cpuid( EXTENDED_FEATURES, 1, cpu_info );
@@ -852,7 +851,6 @@ static inline bool has_sha512()
#endif #endif
} }
// Arm only
static inline bool has_sha3() static inline bool has_sha3()
{ {
#if defined(__aarch64__) && defined(HWCAP_SHA3) #if defined(__aarch64__) && defined(HWCAP_SHA3)
@@ -944,16 +942,6 @@ static inline int sve_vector_length()
return 0; return 0;
} }
// Assume min_vlen refers to the register size
static inline int rvv_vector_length()
{
#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
return __riscv_v_min_vlen;
#endif
return 0;
}
// generic
static inline int vector_length() static inline int vector_length()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
@@ -965,8 +953,8 @@ static inline int vector_length()
return has_sve() ? sve_vector_length() return has_sve() ? sve_vector_length()
: has_neon() ? 128 : has_neon() ? 128
: 0; : 0;
#elif defined(__riscv) && defined(__riscv_vector) #elif defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
return rvv_vector_length(); return __riscv_v_min_vlen;
#endif #endif
return 0; return 0;
} }

49
util.c
View File

@@ -304,39 +304,28 @@ void get_defconfig_path(char *out, size_t bufsize, char *argv0)
free(cmd); free(cmd);
} }
// Decimal SI, factors 0f 1000
void format_hashrate(double hashrate, char *output) void scale_hash_for_display ( double* hashrate, char* prefix )
{ {
char prefix = '\0'; if ( *hashrate < 1e4 ) *prefix = 0;
else if ( *hashrate < 1e7 ) { *prefix = 'k'; *hashrate /= 1e3; }
if (hashrate < 10000) { else if ( *hashrate < 1e10 ) { *prefix = 'M'; *hashrate /= 1e6; }
// nop else if ( *hashrate < 1e13 ) { *prefix = 'G'; *hashrate /= 1e9; }
} else if ( *hashrate < 1e16 ) { *prefix = 'T'; *hashrate /= 1e12; }
else if (hashrate < 1e7) { else if ( *hashrate < 1e19 ) { *prefix = 'P'; *hashrate /= 1e15; }
prefix = 'k'; else if ( *hashrate < 1e22 ) { *prefix = 'E'; *hashrate /= 1e18; }
hashrate *= 1e-3; else if ( *hashrate < 1e25 ) { *prefix = 'Z'; *hashrate /= 1e21; }
} else { *prefix = 'Y'; *hashrate /= 1e24; }
else if (hashrate < 1e10) {
prefix = 'M';
hashrate *= 1e-6;
}
else if (hashrate < 1e13) {
prefix = 'G';
hashrate *= 1e-9;
}
else {
prefix = 'T';
hashrate *= 1e-12;
}
sprintf(
output,
prefix ? "%.2f %cH/s" : "%.2f H/s%c",
hashrate, prefix
);
} }
// For use with MiB etc void format_hashrate( double hashrate, char *output )
{
char prefix = '\0';
scale_hash_for_display( &hashrate, &prefix );
sprintf( output, prefix ? "%.2f %cH/s" : "%.2f H/s%c", hashrate, prefix );
}
// Binary SI, factors of 1024
void format_number_si( double* n, char* si_units ) void format_number_si( double* n, char* si_units )
{ {
if ( *n < 1024*10 ) { *si_units = 0; return; } if ( *n < 1024*10 ) { *si_units = 0; return; }

View File

@@ -55,9 +55,9 @@ the current directory it will be created.
Data file creation can take up to 30 minutes on a spinning hard drive. Data file creation can take up to 30 minutes on a spinning hard drive.
Once created the new data file will be verified and used immediately Once created the new data file will be verified and used immediately
if a valid url and user were included on the command line. if a valid url and user was included on the command line.
A default data file can be created by ommitting the url option. That will A default data file can also be created by ommitting the url option. That will
either verify an existing default data file or create one and verify it, either verify an existing default data file or create one and verify it,
then exit. then exit.