Compare commits

...

4 Commits

Author SHA1 Message Date
Jay D Dee
a52c5eccf7 v3.9.10 2019-11-22 20:29:18 -05:00
Jay D Dee
86b889e1b0 v3.9.9.1 2019-10-24 14:11:26 -04:00
Jay D Dee
72330eb5a7 v3.9.9 2019-10-10 19:58:34 -04:00
Jay D Dee
789c8b70bc v3.9.8.1 2019-10-01 14:17:36 -04:00
97 changed files with 6827 additions and 2921 deletions

View File

@@ -282,7 +282,9 @@ cpuminer_SOURCES = \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-best.c \
algo/yespower/yespower.c \
algo/yespower/yespower-gate.c \
algo/yespower/yespower-blake2b.c \
algo/yespower/crypto/blake2b-yp.c \
algo/yespower/sha256_p.c \
algo/yespower/yespower-opt.c

View File

@@ -92,6 +92,7 @@ Supported Algorithms
phi2-lux identical to phi2
pluck Pluck:128 (Supcoin)
polytimos Ninja
power2b MicroBitcoin (MBC)
quark Quark
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)
@@ -121,10 +122,10 @@ Supported Algorithms
x13sm3 hsr (Hshare)
x14 X14
x15 X15
x16r Ravencoin (RVN) (original algo)
x16rv2 Ravencoin (RVN) (new algo)
x16r
x16rv2 Ravencoin (RVN)
x16rt Gincoin (GIN)
x16rt_veil Veil (VEIL)
x16rt-veil Veil (VEIL)
x16s Pigeoncoin (PGN)
x17
x21s
@@ -135,6 +136,7 @@ Supported Algorithms
yescryptr32 WAVI
yespower Cryply
yespowerr16 Yenten (YTN)
yespower-b2b generic yespower + blake2b
zr5 Ziftr
Errata
@@ -158,10 +160,12 @@ Bugs
----
Users are encouraged to post their bug reports using git issues or on the
Bitcoin Talk forum at:
Bitcoin Talk forum or opening an issue in git:
https://bitcointalk.org/index.php?topic=1326803.0
https://github.com/JayDDee/cpuminer-opt/issues
All problem reports must be accompanied by a proper problem definition.
This should include how the problem occurred, the command line and
output from the miner showing the startup messages and any errors.
@@ -173,10 +177,6 @@ Donations
cpuminer-opt has no fees of any kind but donations are accepted.
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ
Happy mining!

View File

@@ -1,14 +1,6 @@
cpuminer-opt is a console program run from the command line using the
keyboard, not the mouse.
cpuminer-opt now supports HW SHA acceleration available on AMD Ryzen CPUs.
This feature requires recent SW including GCC version 5 or higher and
openssl version 1.1 or higher. It may also require using "-march=znver1"
compile flag.
cpuminer-opt is a console program, if you're using a mouse you're doing it
wrong.
Security warning
----------------
@@ -34,10 +26,50 @@ Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
64 bit Linux or Windows operating system. Apple and Android are not supported.
FreeBSD YMMV.
Change Log
----------
v3.9.10
Faster X* algos with AVX2.
Small improvements to summary stats report.
v3.9.9.1
Fixed a day1 bug that could cause the miner to idle for up to 2 minutes
under certain circumstances.
Redesigned summary stats report now includes session statistics.
More robust handling of statistics to reduce corruption.
Removed --hide-diff option.
Better handling of cpu-affinity with more than 64 CPUs.
v3.9.9
Added power2b algo for MicroBitcoin.
Added generic yespower-b2b (yespower + blake2b) algo to be used with
the parameters introduced in v3.9.7 for yespower & yescrypt.
Display additional info when a share is rejected.
Some low level enhancements and minor tweaking of log output.
RELEASE_NOTES (this file) and README.md added to Windows release package.
v3.9.8.1
Summary log report will be generated on stratum diff change or after 5 minutes,
whichever comes first, to prevent incorrect data in the report.
Removed phi2-lux alias (introduced in v3.9.8) due to Luxcoin's planned fork
to a new algo. The new Luxcoin algo is not supported by cpuminer-opt.
Until the fork Luxcoin can be mined using phi2 algo.
--hide-diff option is deprecated and has no effect. It will be removed in a
future release.
v3.9.8
Changes to log output to provide data more relevant to actual mining

View File

@@ -116,8 +116,6 @@ void init_algo_gate( algo_gate_t* gate )
gate->get_nonceptr = (void*)&std_get_nonceptr;
gate->work_decode = (void*)&std_le_work_decode;
gate->decode_extra_data = (void*)&do_nothing;
gate->wait_for_diff = (void*)&std_wait_for_diff;
gate->get_max64 = (void*)&get_max64_0x1fffffLL;
gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
gate->stratum_gen_work = (void*)&std_stratum_gen_work;
gate->build_stratum_request = (void*)&std_le_build_stratum_request;
@@ -204,6 +202,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_PHI2: register_phi2_algo ( gate ); break;
case ALGO_PLUCK: register_pluck_algo ( gate ); break;
case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break;
case ALGO_POWER2B: register_power2b_algo ( gate ); break;
case ALGO_QUARK: register_quark_algo ( gate ); break;
case ALGO_QUBIT: register_qubit_algo ( gate ); break;
case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
@@ -251,6 +250,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
case ALGO_YESPOWER: register_yespower_algo ( gate ); break;
case ALGO_YESPOWERR16: register_yespowerr16_algo ( gate ); break;
case ALGO_YESPOWER_B2B: register_yespower_b2b_algo ( gate ); break;
case ALGO_ZR5: register_zr5_algo ( gate ); break;
default:
applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
@@ -276,7 +276,7 @@ bool register_json_rpc2( algo_gate_t *gate )
applog(LOG_WARNING,"supported by cpuminer-opt. Shares submitted will");
applog(LOG_WARNING,"likely be rejected. Proceed at your own risk.\n");
gate->wait_for_diff = (void*)&do_nothing;
// gate->wait_for_diff = (void*)&do_nothing;
gate->get_new_work = (void*)&jr2_get_new_work;
gate->get_nonceptr = (void*)&jr2_get_nonceptr;
gate->stratum_gen_work = (void*)&jr2_stratum_gen_work;
@@ -337,7 +337,6 @@ const char* const algo_alias_map[][2] =
{ "myriad", "myr-gr" },
{ "neo", "neoscrypt" },
{ "phi", "phi1612" },
{ "phi2-lux", "phi2" },
{ "sib", "x11gost" },
{ "timetravel8", "timetravel" },
{ "veil", "x16rt-veil" },

View File

@@ -35,7 +35,7 @@
// 6. Determine if other non existant functions are required.
// That is determined by the need to add code in cpu-miner.c
// that applies only to the new algo. That is forbidden. All
// algo specific code must be in theh algo's file.
// algo specific code must be in the algo's file.
//
// 7. If new functions need to be added to the gate add the type
// to the structure, declare a null instance in this file and define
@@ -48,10 +48,10 @@
// instances as they are defined by default, or unsafe functions that
// are not needed by the algo.
//
// 9. Add an case entry to the switch/case in function register_gate
// 9. Add a case entry to the switch/case in function register_gate
// in file algo-gate-api.c for the new algo.
//
// 10 If a new function type was defined add an entry to ini talgo_gate
// 10 If a new function type was defined add an entry to init algo_gate
// to initialize the new function to its null instance described in step 7.
//
// 11. If the new algo has aliases add them to the alias array in
@@ -85,14 +85,16 @@
typedef uint32_t set_t;
#define EMPTY_SET 0
#define SSE2_OPT 1
#define AES_OPT 2
#define SSE42_OPT 4
#define AVX_OPT 8
#define AVX2_OPT 0x10
#define SHA_OPT 0x20
#define AVX512_OPT 0x40
#define EMPTY_SET 0
#define SSE2_OPT 1
#define AES_OPT 2
#define SSE42_OPT 4
#define AVX_OPT 8 // Sandybridge
#define AVX2_OPT 0x10 // Haswell
#define SHA_OPT 0x20 // sha256 (Ryzen, Ice Lake)
#define AVX512_OPT 0x40 // AVX512- F, VL, DQ, BW (Skylake-X)
#define VAES_OPT 0x80 // VAES (Ice Lake)
// return set containing all elements from sets a & b
inline set_t set_union ( set_t a, set_t b ) { return a | b; }
@@ -108,14 +110,7 @@ inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
typedef struct
{
// special case, only one target, provides a callback for scanhash to
// submit work with less overhead.
// bool (*submit_work ) ( struct thr_info*, const struct work* );
// mandatory functions, must be overwritten
// Added a 5th arg for the thread_info structure to replace the int thr id
// in the first arg. Both will co-exist during the trasition.
//int ( *scanhash ) ( int, struct work*, uint32_t, uint64_t* );
int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );
// optional unsafe, must be overwritten if algo uses function
@@ -123,27 +118,55 @@ void ( *hash ) ( void*, const void*, uint32_t ) ;
void ( *hash_suw ) ( void*, const void* );
//optional, safe to use default in most cases
// Allocate thread local buffers and other initialization specific to miner
// threads.
bool ( *miner_thread_init ) ( int );
// Generate global blockheader from stratum data.
void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* );
// Get thread local copy of blockheader with unique nonce.
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t*,
bool );
// Return pointer to nonce in blockheader.
uint32_t *( *get_nonceptr ) ( uint32_t* );
void ( *decode_extra_data ) ( struct work*, uint64_t* );
void ( *wait_for_diff ) ( struct stratum_ctx* );
int64_t ( *get_max64 ) ();
// Decode getwork blockheader
bool ( *work_decode ) ( const json_t*, struct work* );
// Extra getwork data
void ( *decode_extra_data ) ( struct work*, uint64_t* );
bool ( *submit_getwork_result ) ( CURL*, struct work* );
void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
// Increment extranonce
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
uint32_t*, uint32_t, uint32_t );
uint32_t*, uint32_t, uint32_t );
// Build mining.submit message
void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
char* ( *malloc_txs_request ) ( struct work* );
// Big or little
void ( *set_work_data_endian ) ( struct work* );
double ( *calc_network_diff ) ( struct work* );
// Wait for first work
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
void ( *resync_threads ) ( struct work* );
// Diverge mining threads
bool ( *do_this_thread ) ( int );
// After do_this_thread
void ( *resync_threads ) ( struct work* );
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
bool ( *stratum_handle_response )( json_t* );
set_t optimizations;
@@ -198,8 +221,6 @@ void null_hash_suw();
// optional safe targets, default listed first unless noted.
void std_wait_for_diff();
uint32_t *std_get_nonceptr( uint32_t *work_data );
uint32_t *jr2_get_nonceptr( uint32_t *work_data );
@@ -214,21 +235,13 @@ void jr2_stratum_gen_work( struct stratum_ctx *sctx, struct work *work );
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
// pick your favorite or define your own
int64_t get_max64_0x1fffffLL(); // default
int64_t get_max64_0x40LL();
int64_t get_max64_0x3ffff();
int64_t get_max64_0x3fffffLL();
int64_t get_max64_0x1ffff();
int64_t get_max64_0xffffLL();
bool std_le_work_decode( const json_t *val, struct work *work );
bool std_be_work_decode( const json_t *val, struct work *work );
bool jr2_work_decode( const json_t *val, struct work *work );
bool jr2_work_decode( const json_t *val, struct work *work );
bool std_le_submit_getwork_result( CURL *curl, struct work *work );
bool std_be_submit_getwork_result( CURL *curl, struct work *work );
bool jr2_submit_getwork_result( CURL *curl, struct work *work );
bool jr2_submit_getwork_result( CURL *curl, struct work *work );
void std_le_build_stratum_request( char *req, struct work *work );
void std_be_build_stratum_request( char *req, struct work *work );
@@ -242,8 +255,8 @@ void set_work_data_big_endian( struct work *work );
double std_calc_network_diff( struct work *work );
void std_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_root,
uint32_t ntime, uint32_t nbits );
uint32_t *prevhash, uint32_t *merkle_root,
uint32_t ntime, uint32_t nbits );
void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
@@ -264,8 +277,8 @@ int std_get_work_data_size();
// by calling the algo's register function.
bool register_algo_gate( int algo, algo_gate_t *gate );
// Override any default gate functions that are applicable and do any other
// algo-specific initialization.
// Called by algos toverride any default gate functions that are applicable
// and do any other algo-specific initialization.
// The register functions for all the algos can be declared here to reduce
// compiler warnings but that's just more work for devs adding new algos.
bool register_algo( algo_gate_t *gate );
@@ -278,5 +291,7 @@ bool register_json_rpc2( algo_gate_t *gate );
// use this to call the hash function of an algo directly, ie util.c test.
void exec_hash_function( int algo, void *output, const void *pdata );
void get_algo_alias( char** algo_or_alias );
// Validate a string as a known algo and alias, updates arg to proper
// algo name if valid alias, NULL if invalid alias or algo.
void get_algo_alias( char **algo_or_alias );

View File

@@ -74,18 +74,12 @@ int scanhash_argon2( struct work* work, uint32_t max_nonce,
return 0;
}
int64_t argon2_get_max64 ()
{
return 0x1ffLL;
}
bool register_argon2_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_argon2;
gate->hash = (void*)&argon2hash;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->get_max64 = (void*)&argon2_get_max64;
opt_target_factor = 65536.0;
return true;

View File

@@ -179,12 +179,9 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
return 0;
}
int64_t get_max64_0x1ff() { return 0x1ff; }
bool register_argon2d4096_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d4096;
gate->get_max64 = (void*)&get_max64_0x1ff;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 65536.0;
return true;

View File

@@ -1,18 +1,8 @@
#include "blake-gate.h"
int64_t blake_get_max64 ()
{
return 0x7ffffLL;
}
bool register_blake_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT;
gate->get_max64 = (void*)&blake_get_max64;
//#if defined (__AVX2__) && defined (FOUR_WAY)
// gate->optimizations = SSE2_OPT | AVX2_OPT;
// gate->scanhash = (void*)&scanhash_blake_8way;
// gate->hash = (void*)&blakehash_8way;
#if defined(BLAKE_4WAY)
four_way_not_tested();
gate->scanhash = (void*)&scanhash_blake_4way;

View File

@@ -1,13 +1,5 @@
#include "blake2b-gate.h"
/*
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blake2s_get_max64 ()
{
return 0x7ffffLL;
}
*/
bool register_blake2b_algo( algo_gate_t* gate )
{
#if defined(BLAKE2B_4WAY)
@@ -17,7 +9,6 @@ bool register_blake2b_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_blake2b;
gate->hash = (void*)&blake2b_hash;
#endif
// gate->get_max64 = (void*)&blake2s_get_max64;
gate->optimizations = AVX2_OPT;
return true;
};

View File

@@ -1,12 +1,5 @@
#include "blake2s-gate.h"
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blake2s_get_max64 ()
{
return 0x7ffffLL;
}
bool register_blake2s_algo( algo_gate_t* gate )
{
#if defined(BLAKE2S_8WAY)
@@ -19,7 +12,6 @@ bool register_blake2s_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_blake2s;
gate->hash = (void*)&blake2s_hash;
#endif
gate->get_max64 = (void*)&blake2s_get_max64;
gate->optimizations = SSE2_OPT | AVX2_OPT;
return true;
};

View File

@@ -70,18 +70,3 @@ int scanhash_blake2s( struct work *work,
return 0;
}
/*
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blake2s_get_max64 ()
{
return 0x7ffffLL;
}
bool register_blake2s_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_blake2s;
gate->hash = (void*)&blake2s_hash;
gate->get_max64 = (void*)&blake2s_get_max64;
return true;
};
*/

View File

@@ -403,7 +403,9 @@ static const sph_u64 CB[16] = {
__m256i M[16]; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
unsigned r; \
const __m256i shuff_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) \
unsigned r; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -412,53 +414,53 @@ static const sph_u64 CB[16] = {
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_bswap_64( *(buf+0) ); \
M[0x1] = mm256_bswap_64( *(buf+1) ); \
M[0x2] = mm256_bswap_64( *(buf+2) ); \
M[0x3] = mm256_bswap_64( *(buf+3) ); \
M[0x4] = mm256_bswap_64( *(buf+4) ); \
M[0x5] = mm256_bswap_64( *(buf+5) ); \
M[0x6] = mm256_bswap_64( *(buf+6) ); \
M[0x7] = mm256_bswap_64( *(buf+7) ); \
M[0x8] = mm256_bswap_64( *(buf+8) ); \
M[0x9] = mm256_bswap_64( *(buf+9) ); \
M[0xA] = mm256_bswap_64( *(buf+10) ); \
M[0xB] = mm256_bswap_64( *(buf+11) ); \
M[0xC] = mm256_bswap_64( *(buf+12) ); \
M[0xD] = mm256_bswap_64( *(buf+13) ); \
M[0xE] = mm256_bswap_64( *(buf+14) ); \
M[0xF] = mm256_bswap_64( *(buf+15) ); \
V8 = _mm256_xor_si256( S0, _mm256_set1_epi64x( CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set1_epi64x( CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set1_epi64x( CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set1_epi64x( CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
_mm256_set1_epi64x( CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
_mm256_set1_epi64x( CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
_mm256_set1_epi64x( CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
_mm256_set1_epi64x( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = _mm256_shuffle_epi8( *(buf+ 0), shuff_bswap64 ); \
M[0x1] = _mm256_shuffle_epi8( *(buf+ 1), shuff_bswap64 ); \
M[0x2] = _mm256_shuffle_epi8( *(buf+ 2), shuff_bswap64 ); \
M[0x3] = _mm256_shuffle_epi8( *(buf+ 3), shuff_bswap64 ); \
M[0x4] = _mm256_shuffle_epi8( *(buf+ 4), shuff_bswap64 ); \
M[0x5] = _mm256_shuffle_epi8( *(buf+ 5), shuff_bswap64 ); \
M[0x6] = _mm256_shuffle_epi8( *(buf+ 6), shuff_bswap64 ); \
M[0x7] = _mm256_shuffle_epi8( *(buf+ 7), shuff_bswap64 ); \
M[0x8] = _mm256_shuffle_epi8( *(buf+ 8), shuff_bswap64 ); \
M[0x9] = _mm256_shuffle_epi8( *(buf+ 9), shuff_bswap64 ); \
M[0xA] = _mm256_shuffle_epi8( *(buf+10), shuff_bswap64 ); \
M[0xB] = _mm256_shuffle_epi8( *(buf+11), shuff_bswap64 ); \
M[0xC] = _mm256_shuffle_epi8( *(buf+12), shuff_bswap64 ); \
M[0xD] = _mm256_shuffle_epi8( *(buf+13), shuff_bswap64 ); \
M[0xE] = _mm256_shuffle_epi8( *(buf+14), shuff_bswap64 ); \
M[0xF] = _mm256_shuffle_epi8( *(buf+15), shuff_bswap64 ); \
for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( \
H4 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( \
H5 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( \
H6 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( \
H7 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
} while (0)
} while (0)
#else
@@ -491,8 +493,7 @@ static const sph_u64 CB[16] = {
m256_const1_64( CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
m256_const1_64( CB7 ) ); \
shuf_bswap64 = m256_const_64( 0x08090a0b0c0d0e0f, 0x0001020304050607, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
shuf_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
@@ -620,7 +621,7 @@ blake64_4way_close( blake_4way_big_context *sc,
bit_len = ((unsigned)ptr << 3);
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
buf[ptr>>3] = _mm256_set1_epi64x( zz );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )

View File

@@ -1,13 +1,6 @@
#include "blakecoin-gate.h"
#include <memory.h>
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blakecoin_get_max64 ()
{
return 0x7ffffLL;
// return 0x3fffffLL;
}
// vanilla uses default gen merkle root, otherwise identical to blakecoin
bool register_vanilla_algo( algo_gate_t* gate )
{
@@ -23,7 +16,6 @@ bool register_vanilla_algo( algo_gate_t* gate )
gate->hash = (void*)&blakecoinhash;
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->get_max64 = (void*)&blakecoin_get_max64;
return true;
}

View File

@@ -93,33 +93,3 @@ int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
return 0;
}
/*
void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
{
SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
}
*/
/*
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blakecoin_get_max64 ()
{
return 0x7ffffLL;
}
// vanilla uses default gen merkle root, otherwise identical to blakecoin
bool register_vanilla_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_blakecoin;
gate->hash = (void*)&blakecoinhash;
gate->get_max64 = (void*)&blakecoin_get_max64;
blakecoin_init( &blake_init_ctx );
return true;
}
bool register_blakecoin_algo( algo_gate_t* gate )
{
register_vanilla_algo( gate );
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return true;
}
*/

View File

@@ -38,7 +38,7 @@ void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
if (!have_longpoll && work->height > *net_blocks + 1)
{
char netinfo[64] = { 0 };
if (opt_showdiff && net_diff > 0.)
if ( net_diff > 0. )
{
if (net_diff != work->targetdiff)
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
@@ -116,7 +116,7 @@ void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
// block header suffix from coinb2 (stake version)
memcpy( &g_work->data[44],
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
sctx->bloc_height = g_work->data[32];
sctx->block_height = g_work->data[32];
//applog_hex(work->data, 180);
//applog_hex(&work->data[36], 36);
}
@@ -154,7 +154,6 @@ bool register_decred_algo( algo_gate_t* gate )
#endif
gate->optimizations = AVX2_OPT;
gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
gate->decode_extra_data = (void*)&decred_decode_extradata;
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;

View File

@@ -143,7 +143,7 @@ void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
if (!have_longpoll && work->height > *net_blocks + 1)
{
char netinfo[64] = { 0 };
if (opt_showdiff && net_diff > 0.)
if (net_diff > 0.)
{
if (net_diff != work->targetdiff)
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
@@ -269,7 +269,6 @@ bool register_decred_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_decred;
gate->hash = (void*)&decred_hash;
gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
gate->decode_extra_data = (void*)&decred_decode_extradata;
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;

View File

@@ -10,7 +10,6 @@ bool register_pentablake_algo( algo_gate_t* gate )
gate->hash = (void*)&pentablakehash;
#endif
gate->optimizations = AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -1,11 +1,8 @@
#include "bmw512-gate.h"
int64_t bmw512_get_max64() { return 0x7ffffLL; }
bool register_bmw512_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT;
gate->get_max64 = (void*)&bmw512_get_max64;
opt_target_factor = 256.0;
#if defined (BMW512_4WAY)
gate->scanhash = (void*)&scanhash_bmw512_4way;

View File

@@ -363,7 +363,6 @@ bool register_cryptolight_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_cryptolight;
gate->hash = (void*)&cryptolight_hash;
gate->hash_suw = (void*)&cryptolight_hash;
gate->get_max64 = (void*)&get_max64_0x40LL;
return true;
};

View File

@@ -111,7 +111,6 @@ bool register_cryptonight_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_cryptonight;
gate->hash = (void*)&cryptonight_hash;
gate->hash_suw = (void*)&cryptonight_hash_suw;
gate->get_max64 = (void*)&get_max64_0x40LL;
return true;
};
@@ -123,7 +122,6 @@ bool register_cryptonightv7_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_cryptonight;
gate->hash = (void*)&cryptonight_hash;
gate->hash_suw = (void*)&cryptonight_hash_suw;
gate->get_max64 = (void*)&get_max64_0x40LL;
return true;
};

View File

@@ -7,7 +7,7 @@
// 2x128
/*
// The result of hashing 10 rounds of initial data which consists of params
// zero padded.
static const uint64_t IV256[] =
@@ -25,7 +25,7 @@ static const uint64_t IV512[] =
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};
*/
static void transform_2way( cube_2way_context *sp )
{
@@ -97,39 +97,30 @@ static void transform_2way( cube_2way_context *sp )
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
int blockbytes )
{
__m128i* h = (__m128i*)sp->h;
__m256i *h = (__m256i*)sp->h;
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
: (__m128i*)IV256 );
sp->hashlen = hashbitlen/128;
sp->blocksize = blockbytes/16;
sp->rounds = rounds;
sp->pos = 0;
if ( hashbitlen == 512 )
{
h[ 0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
h[ 2] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
h[ 4] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
h[ 6] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
h[ 8] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
h[10] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
h[12] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
h[14] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
h[1] = h[ 0]; h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6];
h[9] = h[ 8]; h[11] = h[10]; h[13] = h[12]; h[15] = h[14];
}
else
{
h[ 0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
h[ 2] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
h[ 4] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
h[ 6] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
h[ 8] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
h[10] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
h[12] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
h[14] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
h[1] = h[ 0]; h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6];
h[9] = h[ 8]; h[11] = h[10]; h[13] = h[12]; h[15] = h[14];
}
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
h[ 3] = m256_const1_128( iv[3] );
h[ 4] = m256_const1_128( iv[4] );
h[ 5] = m256_const1_128( iv[5] );
h[ 6] = m256_const1_128( iv[6] );
h[ 7] = m256_const1_128( iv[7] );
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
h[ 3] = m256_const1_128( iv[3] );
h[ 4] = m256_const1_128( iv[4] );
h[ 5] = m256_const1_128( iv[5] );
h[ 6] = m256_const1_128( iv[6] );
h[ 7] = m256_const1_128( iv[7] );
return 0;
}
@@ -164,11 +155,11 @@ int cube_2way_close( cube_2way_context *sp, void *output )
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
_mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ) );
m256_const2_64( 0, 0x0000000000000080 ) );
transform_2way( sp );
sp->h[7] = _mm256_xor_si256( sp->h[7],
_mm256_set_epi32( 1,0,0,0, 1,0,0,0 ) );
m256_const2_64( 0x0000000100000000, 0 ) );
for ( i = 0; i < 10; ++i ) transform_2way( sp );
@@ -197,13 +188,13 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
_mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ) );
m256_const2_64( 0, 0x0000000000000080 ) );
transform_2way( sp );
sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
1,0,0,0 ) );
sp->h[7] = _mm256_xor_si256( sp->h[7],
m256_const2_64( 0x0000000100000000, 0 ) );
for ( i = 0; i < 10; ++i ) transform_2way( sp );
for ( i = 0; i < 10; ++i ) transform_2way( sp );
memcpy( hash, sp->h, sp->hashlen<<5 );
return 0;

View File

@@ -100,7 +100,6 @@ bool register_dmd_gr_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_groestl;
gate->hash = (void*)&groestlhash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
opt_target_factor = 256.0;
return true;
};

View File

@@ -88,15 +88,3 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
/*
bool register_myriad_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT;
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriadhash;
// gate->hash_alt = (void*)&myriadhash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};
*/

View File

@@ -12,7 +12,6 @@ bool register_myriad_algo( algo_gate_t* gate )
gate->hash = (void*)&myriad_hash;
#endif
gate->optimizations = AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -32,8 +32,6 @@
#include <stddef.h>
#include <string.h>
//#include "miner.h"
#include "hamsi-hash-4way.h"
#if defined(__AVX2__)
@@ -100,7 +98,7 @@ extern "C"{
#endif
//#include "hamsi-helper-4way.c"
/*
static const sph_u32 IV512[] = {
SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
@@ -109,7 +107,7 @@ static const sph_u32 IV512[] = {
SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
SPH_C32(0x6769756d)
};
*/
static const sph_u32 alpha_n[] = {
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
@@ -138,6 +136,7 @@ static const sph_u32 alpha_f[] = {
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
};
// imported from hamsi helper
/* Note: this table lists bits within each byte from least
@@ -529,49 +528,34 @@ static const sph_u32 T512[64][16] = {
SPH_C32(0xe7e00a94) }
};
#define INPUT_BIG \
do { \
const __m256i zero = _mm256_setzero_si256(); \
__m256i db = *buf; \
const sph_u32 *tp = &T512[0][0]; \
m0 = zero; \
m1 = zero; \
m2 = zero; \
m3 = zero; \
m4 = zero; \
m5 = zero; \
m6 = zero; \
m7 = zero; \
const uint64_t *tp = (uint64_t*)&T512[0][0]; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
__m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \
dm = mm256_negate_32( _mm256_or_si256( dm, \
_mm256_slli_epi64( dm, 32 ) ) ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0x1], tp[0x0], tp[0x1], tp[0x0], \
tp[0x1], tp[0x0], tp[0x1], tp[0x0] ) ) ); \
m256_const1_64( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0x3], tp[0x2], tp[0x3], tp[0x2], \
tp[0x3], tp[0x2], tp[0x3], tp[0x2] ) ) ); \
m256_const1_64( tp[1] ) ) ); \
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0x5], tp[0x4], tp[0x5], tp[0x4], \
tp[0x5], tp[0x4], tp[0x5], tp[0x4] ) ) ); \
m256_const1_64( tp[2] ) ) ); \
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0x7], tp[0x6], tp[0x7], tp[0x6], \
tp[0x7], tp[0x6], tp[0x7], tp[0x6] ) ) ); \
m256_const1_64( tp[3] ) ) ); \
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0x9], tp[0x8], tp[0x9], tp[0x8], \
tp[0x9], tp[0x8], tp[0x9], tp[0x8] ) ) ); \
m256_const1_64( tp[4] ) ) ); \
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0xB], tp[0xA], tp[0xB], tp[0xA], \
tp[0xB], tp[0xA], tp[0xB], tp[0xA] ) ) ); \
m256_const1_64( tp[5] ) ) ); \
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0xD], tp[0xC], tp[0xD], tp[0xC], \
tp[0xD], tp[0xC], tp[0xD], tp[0xC] ) ) ); \
m256_const1_64( tp[6] ) ) ); \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0xF], tp[0xE], tp[0xF], tp[0xE], \
tp[0xF], tp[0xE], tp[0xF], tp[0xE] ) ) ); \
tp += 0x10; \
m256_const1_64( tp[7] ) ) ); \
tp += 8; \
db = _mm256_srli_epi64( db, 1 ); \
} \
} while (0)
@@ -662,55 +646,39 @@ do { \
#define ROUND_BIG(rc, alpha) \
do { \
__m256i t0, t1, t2, t3; \
s0 = _mm256_xor_si256( s0, _mm256_set_epi32( \
alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00], \
alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00] ) ); \
s1 = _mm256_xor_si256( s1, _mm256_set_epi32( \
alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02], \
alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02] ) ); \
s2 = _mm256_xor_si256( s2, _mm256_set_epi32( \
alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04], \
alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04] ) ); \
s3 = _mm256_xor_si256( s3, _mm256_set_epi32( \
alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06], \
alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06] ) ); \
s4 = _mm256_xor_si256( s4, _mm256_set_epi32( \
alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08], \
alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08] ) ); \
s5 = _mm256_xor_si256( s5, _mm256_set_epi32( \
alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A], \
alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A] ) ); \
s6 = _mm256_xor_si256( s6, _mm256_set_epi32( \
alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C], \
alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C] ) ); \
s7 = _mm256_xor_si256( s7, _mm256_set_epi32( \
alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E], \
alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E] ) ); \
s8 = _mm256_xor_si256( s8, _mm256_set_epi32( \
alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10], \
alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10] ) ); \
s9 = _mm256_xor_si256( s9, _mm256_set_epi32( \
alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12], \
alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12] ) ); \
sA = _mm256_xor_si256( sA, _mm256_set_epi32( \
alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14], \
alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14] ) ); \
sB = _mm256_xor_si256( sB, _mm256_set_epi32( \
alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16], \
alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16] ) ); \
sC = _mm256_xor_si256( sC, _mm256_set_epi32( \
alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18], \
alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18] ) ); \
sD = _mm256_xor_si256( sD, _mm256_set_epi32( \
alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A], \
alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A] ) ); \
sE = _mm256_xor_si256( sE, _mm256_set_epi32( \
alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C], \
alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C] ) ); \
sF = _mm256_xor_si256( sF, _mm256_set_epi32( \
alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E], \
alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E] ) ); \
__m256i t0, t1, t2, t3; \
s0 = _mm256_xor_si256( s0, m256_const1_64( \
( ( (uint64_t)( (rc) ^ alpha[1] ) << 32 ) ) | (uint64_t)alpha[0] ) ); \
s1 = _mm256_xor_si256( s1, m256_const1_64( \
( (uint64_t)alpha[ 3] << 32 ) | (uint64_t)alpha[ 2] ) ); \
s2 = _mm256_xor_si256( s2, m256_const1_64( \
( (uint64_t)alpha[ 5] << 32 ) | (uint64_t)alpha[ 4] ) ); \
s3 = _mm256_xor_si256( s3, m256_const1_64( \
( (uint64_t)alpha[ 7] << 32 ) | (uint64_t)alpha[ 6] ) ); \
s4 = _mm256_xor_si256( s4, m256_const1_64( \
( (uint64_t)alpha[ 9] << 32 ) | (uint64_t)alpha[ 8] ) ); \
s5 = _mm256_xor_si256( s5, m256_const1_64( \
( (uint64_t)alpha[11] << 32 ) | (uint64_t)alpha[10] ) ); \
s6 = _mm256_xor_si256( s6, m256_const1_64( \
( (uint64_t)alpha[13] << 32 ) | (uint64_t)alpha[12] ) ); \
s7 = _mm256_xor_si256( s7, m256_const1_64( \
( (uint64_t)alpha[15] << 32 ) | (uint64_t)alpha[14] ) ); \
s8 = _mm256_xor_si256( s8, m256_const1_64( \
( (uint64_t)alpha[17] << 32 ) | (uint64_t)alpha[16] ) ); \
s9 = _mm256_xor_si256( s9, m256_const1_64( \
( (uint64_t)alpha[19] << 32 ) | (uint64_t)alpha[18] ) ); \
sA = _mm256_xor_si256( sA, m256_const1_64( \
( (uint64_t)alpha[21] << 32 ) | (uint64_t)alpha[20] ) ); \
sB = _mm256_xor_si256( sB, m256_const1_64( \
( (uint64_t)alpha[23] << 32 ) | (uint64_t)alpha[22] ) ); \
sC = _mm256_xor_si256( sC, m256_const1_64( \
( (uint64_t)alpha[25] << 32 ) | (uint64_t)alpha[24] ) ); \
sD = _mm256_xor_si256( sD, m256_const1_64( \
( (uint64_t)alpha[27] << 32 ) | (uint64_t)alpha[26] ) ); \
sE = _mm256_xor_si256( sE, m256_const1_64( \
( (uint64_t)alpha[29] << 32 ) | (uint64_t)alpha[28] ) ); \
sF = _mm256_xor_si256( sF, m256_const1_64( \
( (uint64_t)alpha[31] << 32 ) | (uint64_t)alpha[30] ) ); \
\
SBOX( s0, s4, s8, sC ); \
SBOX( s1, s5, s9, sD ); \
@@ -864,47 +832,22 @@ void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
void hamsi512_4way_init( hamsi_4way_big_context *sc )
{
sc->partial_len = 0;
sph_u32 lo, hi;
sc->count_high = sc->count_low = 0;
for ( int i = 0; i < 8; i++ )
{
lo = 2*i;
hi = 2*i + 1;
sc->h[i] = _mm256_set_epi32( IV512[hi], IV512[lo], IV512[hi], IV512[lo],
IV512[hi], IV512[lo], IV512[hi], IV512[lo] );
}
sc->h[0] = m256_const1_64( 0x6c70617273746565 );
sc->h[1] = m256_const1_64( 0x656e62656b204172 );
sc->h[2] = m256_const1_64( 0x302c206272672031 );
sc->h[3] = m256_const1_64( 0x3434362c75732032 );
sc->h[4] = m256_const1_64( 0x3030312020422d33 );
sc->h[5] = m256_const1_64( 0x656e2d484c657576 );
sc->h[6] = m256_const1_64( 0x6c65652c65766572 );
sc->h[7] = m256_const1_64( 0x6769756d2042656c );
}
void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
// It looks like the only way to get in here is if core was previously called
// with a very small len
// That's not likely even with 80 byte input so deprecate partial len
/*
if ( sc->partial_len != 0 )
{
size_t mlen;
mlen = 8 - sc->partial_len;
if ( len < mlen )
{
memcpy_256( sc->partial + (sc->partial_len >> 3), data, len>>3 );
sc->partial_len += len;
return;
}
else
{
memcpy_256( sc->partial + (sc->partial_len >> 3), data, mlen>>3 );
len -= mlen;
vdata += mlen>>3;
hamsi_big( sc, sc->partial, 1 );
sc->partial_len = 0;
}
}
*/
hamsi_big( sc, vdata, len>>3 );
vdata += ( (len& ~(size_t)7) >> 3 );
len &= (size_t)7;
@@ -920,8 +863,9 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
sph_enc32be( &ch, sc->count_high );
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
pad[0] = _mm256_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch );
sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL,
0UL, 0x80UL, 0UL, 0x80UL );
sc->buf[0] = m256_const1_64( 0x80 );
// sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL,
// 0UL, 0x80UL, 0UL, 0x80UL );
hamsi_big( sc, sc->buf, 1 );
hamsi_big_final( sc, pad );

View File

@@ -94,7 +94,7 @@ extern "C"{
#define Sb(x0, x1, x2, x3, c) \
do { \
__m256i cc = _mm256_set_epi64x( c, c, c, c ); \
__m256i cc = _mm256_set1_epi64x( c ); \
x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \

View File

@@ -1,12 +1,10 @@
#include "keccak-gate.h"
int64_t keccak_get_max64() { return 0x7ffffLL; }
bool register_keccak_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->get_max64 = (void*)&keccak_get_max64;
opt_target_factor = 128.0;
#if defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
@@ -22,7 +20,6 @@ bool register_keccakc_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT;
gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
gate->get_max64 = (void*)&keccak_get_max64;
opt_target_factor = 256.0;
#if defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;

View File

@@ -1,23 +1,3 @@
/*
* luffa_for_sse2.c
* Version 2.0 (Sep 15th 2009)
*
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
*
* Hitachi, Ltd. is the owner of this software and hereby grant
* the U.S. Government and any interested party the right to use
* this software for the purposes of the SHA-3 evaluation process,
* notwithstanding that this software is copyrighted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <string.h>
#include <immintrin.h>
#include "luffa-hash-2way.h"
@@ -26,31 +6,30 @@
#include "simd-utils.h"
#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
0UL, 0UL, 0UL, 0xffffffffUL )
#define cns(i) m256_const1_128( ( (__m128i*)CNS_INIT)[i] )
#define ADD_CONSTANT(a,b,c0,c1)\
a = _mm256_xor_si256(a,c0);\
b = _mm256_xor_si256(b,c1);\
#define MULT2(a0,a1) \
#define MULT2( a0, a1, mask ) \
do { \
register __m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256(a1,MASK), 16 ) ); \
__m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256(a1,mask), 16 ) ); \
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)
// confirm pointer arithmetic
// ok but use array indexes
#define STEP_PART(x,c,t)\
#define STEP_PART(x,c0,c1,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
MIXWORD(*x,*(x+4),*t,*(t+1));\
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
ADD_CONSTANT(*x, *(x+4), *c, *(c+1));
ADD_CONSTANT(*x, *(x+4), c0, c1);
#define SUBCRUMB(a0,a1,a2,a3,t)\
t = _mm256_load_si256(&a0);\
@@ -245,7 +224,7 @@ static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
0x00000000,0x00000000,0x00000000,0xfc053c31
};
__m256i CNS[32];
/***************************************************/
/* Round function */
@@ -258,6 +237,7 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
__m256i msg0, msg1;
__m256i tmp[2];
__m256i x[8];
const __m256i MASK = m256_const2_64( 0, 0x00000000ffffffff );
t0 = chainv[0];
t1 = chainv[1];
@@ -271,7 +251,7 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
t0 = _mm256_xor_si256( t0, chainv[8] );
t1 = _mm256_xor_si256( t1, chainv[9] );
MULT2( t0, t1 );
MULT2( t0, t1, MASK );
msg0 = _mm256_shuffle_epi32( msg[0], 27 );
msg1 = _mm256_shuffle_epi32( msg[1], 27 );
@@ -290,66 +270,66 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
t0 = chainv[0];
t1 = chainv[1];
MULT2( chainv[0], chainv[1]);
MULT2( chainv[0], chainv[1], MASK );
chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
MULT2( chainv[2], chainv[3]);
MULT2( chainv[2], chainv[3], MASK );
chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
MULT2( chainv[4], chainv[5]);
MULT2( chainv[4], chainv[5], MASK );
chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
MULT2( chainv[6], chainv[7]);
MULT2( chainv[6], chainv[7], MASK );
chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9]);
MULT2( chainv[8], chainv[9], MASK );
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t0 = chainv[8];
t1 = chainv[9];
MULT2( chainv[8], chainv[9]);
MULT2( chainv[8], chainv[9], MASK );
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
MULT2( chainv[6], chainv[7]);
MULT2( chainv[6], chainv[7], MASK );
chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
MULT2( chainv[4], chainv[5]);
MULT2( chainv[4], chainv[5], MASK );
chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
MULT2( chainv[2], chainv[3] );
MULT2( chainv[2], chainv[3], MASK );
chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
MULT2( chainv[0], chainv[1] );
MULT2( chainv[0], chainv[1], MASK );
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
MULT2( msg0, msg1);
MULT2( msg0, msg1, MASK );
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
MULT2( msg0, msg1);
MULT2( msg0, msg1, MASK );
chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
MULT2( msg0, msg1);
MULT2( msg0, msg1, MASK );
chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
MULT2( msg0, msg1);
MULT2( msg0, msg1, MASK );
chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
MULT2( msg0, msg1);
MULT2( msg0, msg1, MASK );
chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ),
_mm256_srli_epi32( chainv[3], 31 ) );
@@ -365,14 +345,14 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
chainv[1],chainv[3],chainv[5],chainv[7],
x[4], x[5], x[6], x[7] );
STEP_PART( &x[0], &CNS[ 0], &tmp[0] );
STEP_PART( &x[0], &CNS[ 2], &tmp[0] );
STEP_PART( &x[0], &CNS[ 4], &tmp[0] );
STEP_PART( &x[0], &CNS[ 6], &tmp[0] );
STEP_PART( &x[0], &CNS[ 8], &tmp[0] );
STEP_PART( &x[0], &CNS[10], &tmp[0] );
STEP_PART( &x[0], &CNS[12], &tmp[0] );
STEP_PART( &x[0], &CNS[14], &tmp[0] );
STEP_PART( &x[0], cns( 0), cns( 1), &tmp[0] );
STEP_PART( &x[0], cns( 2), cns( 3), &tmp[0] );
STEP_PART( &x[0], cns( 4), cns( 5), &tmp[0] );
STEP_PART( &x[0], cns( 6), cns( 7), &tmp[0] );
STEP_PART( &x[0], cns( 8), cns( 9), &tmp[0] );
STEP_PART( &x[0], cns(10), cns(11), &tmp[0] );
STEP_PART( &x[0], cns(12), cns(13), &tmp[0] );
STEP_PART( &x[0], cns(14), cns(15), &tmp[0] );
MIXTON1024( x[0], x[1], x[2], x[3],
chainv[0], chainv[2], chainv[4],chainv[6],
@@ -380,25 +360,24 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
chainv[1],chainv[3],chainv[5],chainv[7]);
/* Process last 256-bit block */
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[16], CNS[17],
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[18], CNS[19],
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[20], CNS[21],
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[22], CNS[23],
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[24], CNS[25],
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[26], CNS[27],
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[28], CNS[29],
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29),
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[30], CNS[31],
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31),
tmp[0], tmp[1] );
}
/***************************************************/
/* Finalization function */
/* state: hash context */
@@ -410,8 +389,9 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
__m256i* chainv = state->chainv;
__m256i t[2];
__m256i zero[2];
zero[0] = zero[1] = _mm256_setzero_si256();
zero[0] = zero[1] = m256_zero;
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
/*---- blank round with m=0 ----*/
rnd512_2way( state, zero );
@@ -433,8 +413,10 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 1 ), shuff_bswap32 );
rnd512_2way( state, zero );
@@ -455,26 +437,27 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
casti_m256i( b, 3 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 1 ), shuff_bswap32 );
}
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
{
int i;
state->hashbitlen = hashbitlen;
for ( i=0; i<32; i++ ) CNS[i] =
_mm256_set_epi32( CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ],
CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ] );
for ( i=0; i<10; i++ ) state->chainv[i] =
_mm256_set_epi32( IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
IV[ (i<<2) +1 ], IV[ (i<<2) ],
IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
IV[ (i<<2) +1 ], IV[ (i<<2) ] );
__m128i *iv = (__m128i*)IV;
state->chainv[0] = m256_const1_128( iv[0] );
state->chainv[1] = m256_const1_128( iv[1] );
state->chainv[2] = m256_const1_128( iv[2] );
state->chainv[3] = m256_const1_128( iv[3] );
state->chainv[4] = m256_const1_128( iv[4] );
state->chainv[5] = m256_const1_128( iv[5] );
state->chainv[6] = m256_const1_128( iv[6] );
state->chainv[7] = m256_const1_128( iv[7] );
state->chainv[8] = m256_const1_128( iv[8] );
state->chainv[9] = m256_const1_128( iv[9] );
((__m256i*)state->buffer)[0] = m256_zero;
((__m256i*)state->buffer)[1] = m256_zero;
@@ -492,13 +475,15 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
__m256i msg[2];
int i;
int blocks = (int)len >> 5;
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
state-> rembytes = (int)len & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = mm256_bswap_32( vdata[ 0] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
rnd512_2way( state, msg );
}
@@ -507,9 +492,8 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
if ( state->rembytes )
{
// remaining data bytes
buffer[0] = mm256_bswap_32( vdata[0] );
buffer[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
buffer[1] = m256_const2_64( 0, 0x0000000080000000 );
}
return 0;
}
@@ -525,8 +509,7 @@ int luffa_2way_close( luffa_2way_context *state, void *hashval )
rnd512_2way( state, buffer );
else
{ // empty pad block, constant data
msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
msg[0] = m256_const2_64( 0, 0x0000000080000000 );
msg[1] = m256_zero;
rnd512_2way( state, msg );
}
@@ -545,13 +528,16 @@ int luffa_2way_update_close( luffa_2way_context *state,
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
rnd512_2way( state, msg );
}
@@ -559,16 +545,14 @@ int luffa_2way_update_close( luffa_2way_context *state,
if ( state->rembytes )
{
// padding of partial block
msg[0] = mm256_bswap_32( vdata[0] );
msg[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = m256_const2_64( 0, 0x0000000080000000 );
rnd512_2way( state, msg );
}
else
{
// empty pad block
msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
msg[0] = m256_const2_64( 0, 0x0000000080000000 );
msg[1] = m256_zero;
rnd512_2way( state, msg );
}

View File

@@ -541,7 +541,9 @@ static void finalization512( hashState_luffa *state, uint32 *b )
uint32 hash[8] __attribute((aligned(64)));
__m256i* chainv = (__m256i*)state->chainv;
__m256i t;
const __m128i zero = _mm_setzero_si128();
const __m128i zero = m128_zero;
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
rnd512( state, zero, zero );
@@ -555,7 +557,9 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
// casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
rnd512( state, zero, zero );
@@ -568,7 +572,9 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
// casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
}
#else

View File

@@ -127,7 +127,6 @@ bool register_lyra2z_algo( algo_gate_t* gate )
gate->hash = (void*)&lyra2z_hash;
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
opt_target_factor = 256.0;
return true;
};
@@ -147,15 +146,12 @@ bool register_lyra2h_algo( algo_gate_t* gate )
gate->hash = (void*)&lyra2h_hash;
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
opt_target_factor = 256.0;
return true;
};
/////////////////////////////////
int64_t allium_get_max64_0xFFFFLL() { return 0xFFFFLL; }
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_4WAY)
@@ -168,7 +164,6 @@ bool register_allium_algo( algo_gate_t* gate )
gate->hash = (void*)&allium_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->get_max64 = (void*)&allium_get_max64_0xFFFFLL;
opt_target_factor = 256.0;
return true;
};
@@ -214,7 +209,6 @@ bool register_phi2_algo( algo_gate_t* gate )
gate->get_work_data_size = (void*)&phi2_get_work_data_size;
gate->decode_extra_data = (void*)&phi2_decode_extra_data;
gate->build_extraheader = (void*)&phi2_build_extraheader;
gate->get_max64 = (void*)&get_max64_0xffffLL;
opt_target_factor = 256.0;
#if defined(PHI2_4WAY)
gate->scanhash = (void*)&scanhash_phi2_4way;

View File

@@ -113,18 +113,12 @@ int scanhash_lyra2re( struct work *work, uint32_t max_nonce,
return 0;
}
int64_t lyra2re_get_max64 ()
{
return 0xffffLL;
}
bool register_lyra2re_algo( algo_gate_t* gate )
{
init_lyra2re_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_lyra2re;
gate->hash = (void*)&lyra2re_hash;
gate->get_max64 = (void*)&lyra2re_get_max64;
opt_target_factor = 128.0;
return true;
};

View File

@@ -113,17 +113,18 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
lyra2rev3_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
} while ( likely( (n < max_nonce-8) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce + 1;
return 0;
}

View File

@@ -70,7 +70,6 @@ bool register_lyra2z330_algo( algo_gate_t* gate )
gate->miner_thread_init = (void*)&lyra2z330_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z330;
gate->hash = (void*)&lyra2z330_hash;
gate->get_max64 = (void*)&get_max64_0xffffLL;
opt_target_factor = 256.0;
return true;
};

View File

@@ -263,10 +263,9 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In,
#if defined (__AVX2__)
register __m256i state0, state1, state2, state3;
const __m256i zero = m256_zero;
state0 = zero;
state1 = zero;
state0 =
state1 = m256_zero;
state2 = m256_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state3 = m256_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
@@ -290,12 +289,11 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In,
#elif defined (__SSE2__)
__m128i state0, state1, state2, state3, state4, state5, state6, state7;
const __m128i zero = m128_zero;
state0 = zero;
state1 = zero;
state2 = zero;
state3 = zero;
state0 =
state1 =
state2 =
state3 = m128_zero;
state4 = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state5 = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
state6 = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );

View File

@@ -296,8 +296,6 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
pdata[19] = n;
// can this be skipped after finding a share? Seems to work ok.
//out:
mpf_set_prec_raw(magifpi, prec0);
mpf_set_prec_raw(magifpi0, prec0);
mpf_set_prec_raw(mptmp, prec0);
@@ -323,7 +321,6 @@ bool register_m7m_algo( algo_gate_t *gate )
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->get_max64 = (void*)&get_max64_0x1ffff;
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
opt_target_factor = 65536.0;
return true;

View File

@@ -208,12 +208,6 @@ void zr5_get_new_work( struct work* work, struct work* g_work, int thr_id,
++(*nonceptr);
}
int64_t zr5_get_max64 ()
{
// return 0x1ffffLL;
return 0x1fffffLL;
}
void zr5_display_pok( struct work* work )
{
if ( work->data[0] & 0x00008000 )
@@ -229,7 +223,6 @@ bool register_zr5_algo( algo_gate_t* gate )
gate->get_new_work = (void*)&zr5_get_new_work;
gate->scanhash = (void*)&scanhash_zr5;
gate->hash = (void*)&zr5hash;
gate->get_max64 = (void*)&zr5_get_max64;
gate->decode_extra_data = (void*)&zr5_display_pok;
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;

View File

@@ -94,8 +94,6 @@ void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
g_work->data[28] = 0x80000000;
}
int64_t lbry_get_max64() { return 0x1ffffLL; }
int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
bool register_lbry_algo( algo_gate_t* gate )
@@ -112,7 +110,6 @@ bool register_lbry_algo( algo_gate_t* gate )
gate->hash = (void*)&lbry_hash;
#endif
gate->calc_network_diff = (void*)&lbry_calc_network_diff;
gate->get_max64 = (void*)&lbry_get_max64;
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
// gate->build_block_header = (void*)&build_block_header;
gate->build_extraheader = (void*)&lbry_build_extraheader;

View File

@@ -1070,17 +1070,6 @@ int scanhash_neoscrypt( struct work *work,
return 0;
}
int64_t get_neoscrypt_max64() { return 0x3ffff; }
void neoscrypt_wait_for_diff( struct stratum_ctx *stratum )
{
while ( !stratum->job.diff )
{
// applog(LOG_DEBUG, "Waiting for Stratum to set the job difficulty");
sleep(1);
}
}
int neoscrypt_get_work_data_size () { return 80; }
bool register_neoscrypt_algo( algo_gate_t* gate )
@@ -1088,8 +1077,6 @@ bool register_neoscrypt_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_neoscrypt;
gate->hash = (void*)&neoscrypt;
gate->get_max64 = (void*)&get_neoscrypt_max64;
gate->wait_for_diff = (void*)&neoscrypt_wait_for_diff;
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;

View File

@@ -483,11 +483,6 @@ int scanhash_pluck( struct work *work, uint32_t max_nonce,
return 0;
}
int64_t pluck_get_max64 ()
{
return 0x1ffLL;
}
bool pluck_miner_thread_init( int thr_id )
{
scratchbuf = malloc( 128 * 1024 );
@@ -503,7 +498,6 @@ bool register_pluck_algo( algo_gate_t* gate )
gate->miner_thread_init = (void*)&pluck_miner_thread_init;
gate->scanhash = (void*)&scanhash_pluck;
gate->hash = (void*)&pluck_hash;
gate->get_max64 = (void*)&pluck_get_max64;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -766,8 +766,6 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
return 0;
}
int64_t scrypt_get_max64() { return 0xfff; }
bool scrypt_miner_thread_init( int thr_id )
{
scratchbuf = scrypt_buffer_alloc( scratchbuf_size );
@@ -783,10 +781,8 @@ bool register_scrypt_algo( algo_gate_t* gate )
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt;
// gate->hash = (void*)&scrypt_1024_1_1_256_24way;
gate->get_max64 = (void*)&scrypt_get_max64;
opt_target_factor = 65536.0;
if ( !opt_param_n )
{
opt_param_n = 1024;

View File

@@ -240,7 +240,6 @@ bool register_scryptjane_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_scryptjane;
gate->hash = (void*)&scryptjanehash;
gate->get_max64 = (void*)&get_max64_0x40LL;
opt_target_factor = 65536.0;
// figure out if arg in N or Nfactor

View File

@@ -305,9 +305,11 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
low = low << 3;
sc->buf[ pad >> 2 ] =
mm128_bswap_32( _mm_set1_epi32( high ) );
mm128_bswap_32( m128_const1_32( high ) );
// mm128_bswap_32( _mm_set1_epi32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] =
mm128_bswap_32( _mm_set1_epi32( low ) );
mm128_bswap_32( m128_const1_32( low ) );
// mm128_bswap_32( _mm_set1_epi32( low ) );
sha256_4way_round( sc, sc->buf, sc->val );
mm128_block_bswap_32( dst, sc->val );
@@ -538,9 +540,9 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
low = low << 3;
sc->buf[ pad >> 2 ] =
mm256_bswap_32( _mm256_set1_epi32( high ) );
mm256_bswap_32( m256_const1_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] =
mm256_bswap_32( _mm256_set1_epi32( low ) );
mm256_bswap_32( m256_const1_32( low ) );
sha256_8way_round( sc, sc->buf, sc->val );

View File

@@ -15,7 +15,6 @@ bool register_sha256t_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_sha256t;
gate->hash = (void*)&sha256t_hash;
#endif
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}
@@ -34,7 +33,6 @@ bool register_sha256q_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_sha256q;
gate->hash = (void*)&sha256q_hash;
#endif
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}

View File

@@ -252,16 +252,6 @@ void sha512_4way_init( sha512_4way_context *sc )
{
sc->initialized = false;
sc->count = 0;
/*
sc->val[0] = _mm256_set1_epi64x( H512[0] );
sc->val[1] = _mm256_set1_epi64x( H512[1] );
sc->val[2] = _mm256_set1_epi64x( H512[2] );
sc->val[3] = _mm256_set1_epi64x( H512[3] );
sc->val[4] = _mm256_set1_epi64x( H512[4] );
sc->val[5] = _mm256_set1_epi64x( H512[5] );
sc->val[6] = _mm256_set1_epi64x( H512[6] );
sc->val[7] = _mm256_set1_epi64x( H512[7] );
*/
}
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
@@ -295,6 +285,8 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
const __m256i shuff_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f,
0x0001020304050607 );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
@@ -308,10 +300,10 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
else
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
sc->buf[ pad >> 3 ] = _mm256_shuffle_epi8(
_mm256_set1_epi64x( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm256_shuffle_epi8(
_mm256_set1_epi64x( sc->count << 3 ), shuff_bswap64 );
sha512_4way_round( sc, sc->buf, sc->val );
mm256_block_bswap_64( dst, sc->val );

View File

@@ -5,6 +5,7 @@
#if defined(__AVX2__)
static const uint32_t IV512[] =
{
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
@@ -13,6 +14,7 @@ static const uint32_t IV512[] =
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
};
#define mm256_ror2x256hi_1x32( a, b ) \
_mm256_blend_epi32( mm256_ror1x32_128( a ), \
mm256_ror1x32_128( b ), 0x88 )
@@ -232,18 +234,14 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
void shavite512_2way_init( shavite512_2way_context *ctx )
{
casti_m256i( ctx->h, 0 ) =
_mm256_set_epi32( IV512[ 3], IV512[ 2], IV512[ 1], IV512[ 0],
IV512[ 3], IV512[ 2], IV512[ 1], IV512[ 0] );
casti_m256i( ctx->h, 1 ) =
_mm256_set_epi32( IV512[ 7], IV512[ 6], IV512[ 5], IV512[ 4],
IV512[ 7], IV512[ 6], IV512[ 5], IV512[ 4] );
casti_m256i( ctx->h, 2 ) =
_mm256_set_epi32( IV512[11], IV512[10], IV512[ 9], IV512[ 8],
IV512[11], IV512[10], IV512[ 9], IV512[ 8] );
casti_m256i( ctx->h, 3 ) =
_mm256_set_epi32( IV512[15], IV512[14], IV512[13], IV512[12],
IV512[15], IV512[14], IV512[13], IV512[12] );
__m256i *h = (__m256i*)ctx->h;
__m128i *iv = (__m128i*)IV512;
h[0] = m256_const1_128( iv[0] );
h[1] = m256_const1_128( iv[1] );
h[2] = m256_const1_128( iv[2] );
h[3] = m256_const1_128( iv[3] );
ctx->ptr = 0;
ctx->count0 = 0;
ctx->count1 = 0;
@@ -251,6 +249,7 @@ void shavite512_2way_init( shavite512_2way_context *ctx )
ctx->count3 = 0;
}
// not tested, use update_close
void shavite512_2way_update( shavite512_2way_context *ctx, const void *data,
size_t len )
{
@@ -287,6 +286,7 @@ void shavite512_2way_update( shavite512_2way_context *ctx, const void *data,
ctx->ptr = ptr;
}
// not tested
void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
{
unsigned char *buf;
@@ -300,7 +300,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
uint32_t vp = ctx->ptr>>5;
// Terminating byte then zero pad
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 );
// Zero pad full vectors up to count
for ( ; vp < 6; vp++ )
@@ -314,14 +314,12 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
count.u32[2] = ctx->count2;
count.u32[3] = ctx->count3;
casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0,
count.u16[0], 0,0,0,0,0,0,0 );
casti_m256i( buf, 7 ) = _mm256_set_epi16(
0x0200 , count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1],
0x0200 , count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] );
casti_m256i( buf, 6 ) = m256_const1_128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
c512_2way( ctx, buf);
casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 );
@@ -382,23 +380,21 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
if ( vp == 0 ) // empty buf, xevan.
{
casti_m256i( buf, 0 ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
casti_m256i( buf, 0 ) = m256_const2_64( 0, 0x0000000000000080 );
memset_zero_256( (__m256i*)buf + 1, 5 );
ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
}
else // half full buf, everyone else.
{
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 );
memset_zero_256( (__m256i*)buf + vp, 6 - vp );
}
casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0,
count.u16[0], 0,0,0,0,0,0,0 );
casti_m256i( buf, 7 ) = _mm256_set_epi16(
0x0200 , count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1],
0x0200 , count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] );
casti_m256i( buf, 6 ) = m256_const1_128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
c512_2way( ctx, buf);

View File

@@ -110,14 +110,26 @@ static const m256_v16 FFT256_Twiddle[] =
// imported from vector.c
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, m256_const1_64( \
0x00ff00ff00ff00ff ) ), _mm256_srai_epi16( x, 8 ) )
/*
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi16( 255 ) ), \
_mm256_srai_epi16( x, 8 ) )
*/
#define EXTRA_REDUCE_S(x)\
_mm256_sub_epi16( x, _mm256_and_si256( \
m256_const1_64( 0x0101010101010101 ), \
_mm256_cmpgt_epi16( x, m256_const1_64( 0x0080008000800080 ) ) ) )
/*
#define EXTRA_REDUCE_S(x)\
_mm256_sub_epi16( x, \
_mm256_and_si256( _mm256_set1_epi16( 257 ), \
_mm256_cmpgt_epi16( x, _mm256_set1_epi16( 128 ) ) ) )
*/
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )

View File

@@ -2,8 +2,6 @@
#include "sph_skein.h"
#include "skein-hash-4way.h"
int64_t skein_get_max64() { return 0x7ffffLL; }
bool register_skein_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | SHA_OPT;
@@ -14,7 +12,6 @@ bool register_skein_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif
gate->get_max64 = (void*)&skein_get_max64;
return true;
};

View File

@@ -2,11 +2,6 @@
#include <stdint.h>
#include "sph_skein.h"
int64_t skein2_get_max64 ()
{
return 0x7ffffLL;
}
bool register_skein2_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT;
@@ -17,7 +12,6 @@ bool register_skein2_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;
#endif
gate->get_max64 = (void*)&skein2_get_max64;
return true;
};

View File

@@ -181,7 +181,7 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
for( j =0; j < 16; j++ )
{
SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
mm128_rol_32( T, j ) ), 7 );
mm128_rol_var_32( T, j ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ),
SS2 ), W1[j] );
@@ -201,7 +201,7 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
for( j =16; j < 64; j++ )
{
SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
mm128_rol_32( T, j&31 ) ), 7 );
mm128_rol_var_32( T, j&31 ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ),
SS2 ), W1[j] );

View File

@@ -12,7 +12,6 @@ bool register_c11_algo( algo_gate_t* gate )
gate->hash = (void*)&c11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -125,7 +125,6 @@ bool register_fresh_algo( algo_gate_t* gate )
algo_not_tested();
gate->scanhash = (void*)&scanhash_fresh;
gate->hash = (void*)&freshhash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
opt_target_factor = 256.0;
return true;
};

View File

@@ -12,7 +12,6 @@ bool register_timetravel_algo( algo_gate_t* gate )
gate->hash = (void*)&timetravel_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
opt_target_factor = 256.0;
return true;
};

View File

@@ -12,7 +12,6 @@ bool register_timetravel10_algo( algo_gate_t* gate )
gate->hash = (void*)&timetravel10_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
opt_target_factor = 256.0;
return true;
};

View File

@@ -3,7 +3,6 @@
bool register_tribus_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x1ffff;
#if defined (TRIBUS_4WAY)
// init_tribus_4way_ctx();
gate->scanhash = (void*)&scanhash_tribus_4way;

View File

@@ -12,7 +12,6 @@ bool register_x11_algo( algo_gate_t* gate )
gate->hash = (void*)&x11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -12,7 +12,6 @@ bool register_x11gost_algo( algo_gate_t* gate )
gate->hash = (void*)&x11gost_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -12,7 +12,6 @@ bool register_x12_algo( algo_gate_t* gate )
gate->hash = (void*)&x12hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -12,7 +12,6 @@ bool register_phi1612_algo( algo_gate_t* gate )
gate->hash = (void*)&phi1612_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -12,7 +12,6 @@ bool register_x13_algo( algo_gate_t* gate )
gate->hash = (void*)&x13hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -12,7 +12,6 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
//#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
@@ -28,7 +27,6 @@ typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
// luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd_2way_context simd;
@@ -49,7 +47,6 @@ void init_x13bcd_4way_ctx()
skein512_4way_init( &x13bcd_4way_ctx.skein );
jh512_4way_init( &x13bcd_4way_ctx.jh );
keccak512_4way_init( &x13bcd_4way_ctx.keccak );
// luffa_2way_init( &x13bcd_4way_ctx.luffa, 512 );
cubehashInit( &x13bcd_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13bcd_4way_ctx.shavite );
simd_2way_init( &x13bcd_4way_ctx.simd, 512 );
@@ -72,8 +69,6 @@ void x13bcd_4way_hash( void *state, const void *input )
// Blake
memcpy( &ctx.blake, &x13bcd_ctx_mid, sizeof(x13bcd_ctx_mid) );
blake512_4way( &ctx.blake, input + (64<<2), 16 );
// blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Bmw
@@ -127,17 +122,6 @@ void x13bcd_4way_hash( void *state, const void *input )
sm3_4way_close( &ctx.sm3, sm3_vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
/*
// Luffa
intrlv_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
intrlv_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
*/
// Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x13bcd_4way_ctx.cube, sizeof(cubehashParam) );
@@ -185,26 +169,6 @@ void x13bcd_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
/*
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
// SM3 parallel 32 bit
uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
memset( sm3_vhash, 0, sizeof sm3_vhash );
uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
memset( sm3_hash0, 0, sizeof sm3_hash0 );
uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
memset( sm3_hash1, 0, sizeof sm3_hash1 );
uint32_t sm3_hash2[32] __attribute__ ((aligned (32)));
memset( sm3_hash2, 0, sizeof sm3_hash2 );
uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
memset( sm3_hash3, 0, sizeof sm3_hash3 );
sm3_4way( &ctx.sm3, vhash, 64 );
sm3_4way_close( &ctx.sm3, sm3_vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
*/
// Hamsi parallel 4x32x2
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );

View File

@@ -12,7 +12,6 @@ bool register_x13sm3_algo( algo_gate_t* gate )
gate->hash = (void*)&x13sm3_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};
@@ -28,7 +27,6 @@ bool register_x13bcd_algo( algo_gate_t* gate )
gate->hash = (void*)&x13bcd_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -83,6 +83,5 @@ bool register_axiom_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_axiom;
gate->hash = (void*)&axiomhash;
gate->get_max64 = (void*)&get_max64_0x40LL;
return true;
}

View File

@@ -11,7 +11,6 @@ bool register_polytimos_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_polytimos;
gate->hash = (void*)&polytimos_hash;
#endif
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -12,7 +12,6 @@ bool register_veltor_algo( algo_gate_t* gate )
gate->hash = (void*)&veltor_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -12,7 +12,6 @@ bool register_x14_algo( algo_gate_t* gate )
gate->hash = (void*)&x14hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -275,34 +275,31 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
if ( s_ntime != endiandata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
uint64_t *edata = (uint64_t*)endiandata;
intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
do
{
@@ -312,14 +309,15 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
x16r_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg )
if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
for ( int i = 0; i < 4; i++ )
if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( n < max_nonce ) && !(*restart) );
} while ( likely( ( n < max_nonce ) && !(*restart) ) );
*hashes_done = n - first_nonce + 1;
return 0;

View File

@@ -24,7 +24,6 @@
#include "algo/sha/sha-hash-4way.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread bool s_implemented = false;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
union _x16rt_4way_context_overlay
@@ -64,26 +63,8 @@ void x16rt_4way_hash( void* output, const void* input )
dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
/*
void *in = (void*) input;
uint32_t *in32 = (uint32_t*) hash0;
uint32_t ntime = in32[17];
if ( s_ntime == UINT32_MAX )
{
uint32_t _ALIGN(64) timeHash[8];
x16rt_getTimeHash(ntime, &timeHash);
x16rt_getAlgoString(&timeHash[0], hashOrder);
}
*/
// Input data is both 64 bit interleaved (input)
// and deinterleaved in inp0-3.
// If First function uses 64 bit data it is not required to interleave inp
// first. It may use the inerleaved data dmost convenient, ie 4way 64 bit.
// All other functions assume data is deinterleaved in hash0-3
// All functions must exit with data deinterleaved in hash0-3.
// Alias in0-3 points to either inp0-3 or hash0-3 according to
// its hashOrder position. Size is also set accordingly.
// and deinterleaved in inp0-3. First function has no need re-interleave.
for ( int i = 0; i < 16; i++ )
{
const char elem = hashOrder[i];
@@ -290,44 +271,31 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t _ALIGN(64) timeHash[4*8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
if ( opt_benchmark )
ptarget[7] = 0x0cff;
uint32_t ntime = swab32( pdata[17] );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16rt_getTimeHash( ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], hashOrder );
s_ntime = ntime;
s_implemented = true;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
hashOrder, ntime, timeHash );
}
if ( !s_implemented )
{
applog( LOG_WARNING, "s not implemented");
sleep(1);
return 0;
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
uint64_t *edata = (uint64_t*)endiandata;
intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{

View File

@@ -331,35 +331,32 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
if ( opt_benchmark )
ptarget[7] = 0x0fff;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
if ( s_ntime != endiandata[17] )
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
const uint32_t ntime = bswap_32(pdata[17]);
if ( s_ntime != ntime )
{
uint32_t ntime = swab32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
uint64_t *edata = (uint64_t*)endiandata;
intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
@@ -368,14 +365,15 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
x16rv2_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg )
if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
for ( int i = 0; i < 4; i++ )
if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( n < max_nonce ) && !(*restart) );
} while ( likely( ( n < max_nonce ) && !(*restart) ) );
*hashes_done = n - first_nonce + 1;
return 0;

View File

@@ -368,7 +368,7 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -378,25 +378,22 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
__m256i *noncev = (__m256i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
if ( opt_benchmark )
ptarget[7] = 0x0cff;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
if ( s_ntime != endiandata[17] )
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
uint32_t ntime = swab32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
uint64_t *edata = (uint64_t*)endiandata;
intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(

View File

@@ -803,52 +803,40 @@ void sonoa_4way_hash( void *state, const void *input )
haval256_5_4way_close( &ctx.haval, state );
}
int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
int scanhash_sonoa_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *hash7 = &( hash[7<<2] );
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const int thr_id = mythr->id;
// Need big endian data
mm256_bswap32_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
do
{
uint32_t mask = masks[m];
do
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ), *noncev );
sonoa_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash7[ lane ] <= Htarg ) )
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ),
*noncev );
sonoa_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
n += 4;
} while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
break;
}
}
n += 4;
} while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}

View File

@@ -11,7 +11,6 @@ bool register_sonoa_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_sonoa;
gate->hash = (void*)&sonoa_hash;
#endif
gate->get_max64 = (void*)&get_max64_0x1ffff;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
return true;
};

View File

@@ -205,50 +205,40 @@ void x17_4way_hash( void *state, const void *input )
int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
// Need big endian data
mm256_bswap32_intrlv80_4x64( vdata, pdata );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
const uint32_t mask = masks[ m ];
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x17_4way_hash( hash, vdata );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x17_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( ( hash7[ lane ] & mask ) == 0 )
{
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
break;
}
for ( int lane = 0; lane < 4; lane++ )
if unlikely( ( hash7[ lane ] <= Htarg ) )
{
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( likely( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce + 1;
return 0;
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

View File

@@ -12,7 +12,6 @@ bool register_xevan_algo( algo_gate_t* gate )
gate->hash = (void*)&xevan_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
opt_target_factor = 256.0;
return true;
};

View File

@@ -416,16 +416,6 @@ int scanhash_yescrypt( struct work *work, uint32_t max_nonce,
return 0;
}
int64_t yescrypt_get_max64()
{
return 0x1ffLL;
}
int64_t yescryptr16_get_max64()
{
return 0xfffLL;
}
void yescrypt_gate_base(algo_gate_t *gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
@@ -437,7 +427,6 @@ void yescrypt_gate_base(algo_gate_t *gate )
bool register_yescrypt_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescrypt_get_max64;
if ( opt_param_n ) YESCRYPT_N = opt_param_n;
else YESCRYPT_N = 2048;
@@ -469,7 +458,6 @@ bool register_yescrypt_algo( algo_gate_t* gate )
bool register_yescryptr8_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescrypt_get_max64;
yescrypt_client_key = "Client Key";
yescrypt_client_key_len = 10;
YESCRYPT_N = 2048;
@@ -481,7 +469,6 @@ bool register_yescryptr8_algo( algo_gate_t* gate )
bool register_yescryptr16_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescryptr16_get_max64;
yescrypt_client_key = "Client Key";
yescrypt_client_key_len = 10;
YESCRYPT_N = 4096;
@@ -493,7 +480,6 @@ bool register_yescryptr16_algo( algo_gate_t* gate )
bool register_yescryptr32_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescryptr16_get_max64;
yescrypt_client_key = "WaviBanana";
yescrypt_client_key_len = 10;
YESCRYPT_N = 4096;

View File

@@ -0,0 +1,322 @@
/*
* Copyright 2009 Colin Percival, 2014 savale
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <algo/yespower/crypto/sph_types.h>
#include <algo/yespower/utils/sysendian.h>
#include "blake2b-yp.h"
// Cyclic right rotation.
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
#endif
// Little-endian byte access.
#define B2B_GET64(p) \
(((uint64_t) ((uint8_t *) (p))[0]) ^ \
(((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \
(((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \
(((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \
(((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \
(((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \
(((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
(((uint64_t) ((uint8_t *) (p))[7]) << 56))
// G Mixing function.
#define B2B_G(a, b, c, d, x, y) { \
v[a] = v[a] + v[b] + x; \
v[d] = ROTR64(v[d] ^ v[a], 32); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 24); \
v[a] = v[a] + v[b] + y; \
v[d] = ROTR64(v[d] ^ v[a], 16); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 63); }
// Initialization Vector.
static const uint64_t blake2b_iv[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
// Compression function. "last" flag indicates last block.
static void blake2b_compress(blake2b_yp_ctx *ctx, int last)
{
const uint8_t sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
int i;
uint64_t v[16], m[16];
// init work variables
for (i = 0; i < 8; i++) {
v[i] = ctx->h[i];
v[i + 8] = blake2b_iv[i];
}
v[12] ^= ctx->t[0]; // low 64 bits of offset
v[13] ^= ctx->t[1]; // high 64 bits
// last block flag set ?
if (last) {
v[14] = ~v[14];
}
// get little-endian words
for (i = 0; i < 16; i++) {
m[i] = B2B_GET64(&ctx->b[8 * i]);
}
// twelve rounds
for (i = 0; i < 12; i++) {
B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
}
for(i = 0; i < 8; ++i) {
ctx->h[i] ^= v[i] ^ v[i + 8];
}
}
// Initialize the hashing context "ctx" with optional key "key".
// 1 <= outlen <= 64 gives the digest size in bytes.
// Secret key (also <= 64 bytes) is optional (keylen = 0).
int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen,
const void *key, size_t keylen) // (keylen=0: no key)
{
size_t i;
// illegal parameters
if (outlen == 0 || outlen > 64 || keylen > 64) {
return -1;
}
// state, "param block"
for (i = 0; i < 8; i++) {
ctx->h[i] = blake2b_iv[i];
}
ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
ctx->t[0] = 0; // input count low word
ctx->t[1] = 0; // input count high word
ctx->c = 0; // pointer within buffer
ctx->outlen = outlen;
// zero input block
for (i = keylen; i < 128; i++) {
ctx->b[i] = 0;
}
if (keylen > 0) {
blake2b_yp_update(ctx, key, keylen);
ctx->c = 128; // at the end
}
return 0;
}
// Add "inlen" bytes from "in" into the hash.
void blake2b_yp_update(blake2b_yp_ctx *ctx,
const void *in, size_t inlen) // data bytes
{
size_t i;
for (i = 0; i < inlen; i++) {
if (ctx->c == 128) { // buffer full ?
ctx->t[0] += ctx->c; // add counters
if (ctx->t[0] < ctx->c) // carry overflow ?
ctx->t[1]++; // high word
blake2b_compress(ctx, 0); // compress (not last)
ctx->c = 0; // counter to zero
}
ctx->b[ctx->c++] = ((const uint8_t *) in)[i];
}
}
// Generate the message digest (size given in init).
// Result placed in "out".
void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out)
{
size_t i;
ctx->t[0] += ctx->c; // mark last block offset
// carry overflow
if (ctx->t[0] < ctx->c) {
ctx->t[1]++; // high word
}
// fill up with zeros
while (ctx->c < 128) {
ctx->b[ctx->c++] = 0;
}
blake2b_compress(ctx, 1); // final block flag = 1
// little endian convert and store
for (i = 0; i < ctx->outlen; i++) {
((uint8_t *) out)[i] =
(ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF;
}
}
// inlen = number of bytes
void blake2b_yp_hash(void *out, const void *in, size_t inlen) {
blake2b_yp_ctx ctx;
blake2b_yp_init(&ctx, 32, NULL, 0);
blake2b_yp_update(&ctx, in, inlen);
blake2b_yp_final(&ctx, out);
}
// // keylen = number of bytes
void hmac_blake2b_yp_init(hmac_yp_ctx *hctx, const void *_key, size_t keylen) {
const uint8_t *key = _key;
uint8_t keyhash[32];
uint8_t pad[64];
uint64_t i;
if (keylen > 64) {
blake2b_yp_hash(keyhash, key, keylen);
key = keyhash;
keylen = 32;
}
blake2b_yp_init(&hctx->inner, 32, NULL, 0);
memset(pad, 0x36, 64);
for (i = 0; i < keylen; ++i) {
pad[i] ^= key[i];
}
blake2b_yp_update(&hctx->inner, pad, 64);
blake2b_yp_init(&hctx->outer, 32, NULL, 0);
memset(pad, 0x5c, 64);
for (i = 0; i < keylen; ++i) {
pad[i] ^= key[i];
}
blake2b_yp_update(&hctx->outer, pad, 64);
memset(keyhash, 0, 32);
}
// datalen = number of bits
void hmac_blake2b_yp_update(hmac_yp_ctx *hctx, const void *data, size_t datalen) {
// update the inner state
blake2b_yp_update(&hctx->inner, data, datalen);
}
void hmac_blake2b_yp_final(hmac_yp_ctx *hctx, uint8_t *digest) {
uint8_t ihash[32];
blake2b_yp_final(&hctx->inner, ihash);
blake2b_yp_update(&hctx->outer, ihash, 32);
blake2b_yp_final(&hctx->outer, digest);
memset(ihash, 0, 32);
}
// // keylen = number of bytes; inlen = number of bytes
void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen) {
hmac_yp_ctx hctx;
hmac_blake2b_yp_init(&hctx, key, keylen);
hmac_blake2b_yp_update(&hctx, in, inlen);
hmac_blake2b_yp_final(&hctx, out);
}
void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
hmac_yp_ctx PShctx, hctx;
size_t i;
uint8_t ivec[4];
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Compute HMAC state after processing P and S. */
hmac_blake2b_yp_init(&PShctx, passwd, passwdlen);
hmac_blake2b_yp_update(&PShctx, salt, saltlen);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(hmac_yp_ctx));
hmac_blake2b_yp_update(&hctx, ivec, 4);
hmac_blake2b_yp_final(&hctx, U);
/* T_i = U_1 ... */
memcpy(T, U, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
hmac_blake2b_yp_init(&hctx, passwd, passwdlen);
hmac_blake2b_yp_update(&hctx, U, 32);
hmac_blake2b_yp_final(&hctx, U);
/* ... xor U_j ... */
for (k = 0; k < 32; k++) {
T[k] ^= U[k];
}
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32) {
clen = 32;
}
memcpy(&buf[i * 32], T, clen);
}
/* Clean PShctx, since we never called _Final on it. */
memset(&PShctx, 0, sizeof(hmac_yp_ctx));
}

View File

@@ -0,0 +1,42 @@
#pragma once
#ifndef __BLAKE2B_H__
#define __BLAKE2B_H__
#include <stddef.h>
#include <stdint.h>
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__)
#define NATIVE_LITTLE_ENDIAN
#endif
// state context
typedef struct {
uint8_t b[128]; // input buffer
uint64_t h[8]; // chained state
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_yp_ctx;
typedef struct {
blake2b_yp_ctx inner;
blake2b_yp_ctx outer;
} hmac_yp_ctx;
#if defined(__cplusplus)
extern "C" {
#endif
int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen, const void *key, size_t keylen);
void blake2b_yp_update(blake2b_yp_ctx *ctx, const void *in, size_t inlen);
void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out);
void blake2b_yp_hash(void *out, const void *in, size_t inlen);
void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen);
void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen);
#if defined(__cplusplus)
}
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1 @@
#define insecure_memzero(buf, len) /* empty */

View File

@@ -0,0 +1,94 @@
/*-
* Copyright 2007-2014 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _SYSENDIAN_H_
#define _SYSENDIAN_H_
#include <stdint.h>
/* Avoid namespace collisions with BSD <sys/endian.h>. */
#define be32dec libcperciva_be32dec
#define be32enc libcperciva_be32enc
#define be64enc libcperciva_be64enc
#define le32dec libcperciva_le32dec
#define le32enc libcperciva_le32enc
static inline uint32_t
be32dec(const void * pp)
{
const uint8_t * p = (uint8_t const *)pp;
return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) +
((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24));
}
static inline void
be32enc(void * pp, uint32_t x)
{
uint8_t * p = (uint8_t *)pp;
p[3] = x & 0xff;
p[2] = (x >> 8) & 0xff;
p[1] = (x >> 16) & 0xff;
p[0] = (x >> 24) & 0xff;
}
static inline void
be64enc(void * pp, uint64_t x)
{
uint8_t * p = (uint8_t *)pp;
p[7] = x & 0xff;
p[6] = (x >> 8) & 0xff;
p[5] = (x >> 16) & 0xff;
p[4] = (x >> 24) & 0xff;
p[3] = (x >> 32) & 0xff;
p[2] = (x >> 40) & 0xff;
p[1] = (x >> 48) & 0xff;
p[0] = (x >> 56) & 0xff;
}
static inline uint32_t
le32dec(const void * pp)
{
const uint8_t * p = (uint8_t const *)pp;
return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
}
static inline void
le32enc(void * pp, uint32_t x)
{
uint8_t * p = (uint8_t *)pp;
p[0] = x & 0xff;
p[1] = (x >> 8) & 0xff;
p[2] = (x >> 16) & 0xff;
p[3] = (x >> 24) & 0xff;
}
#endif /* !_SYSENDIAN_H_ */

File diff suppressed because it is too large Load Diff

View File

@@ -70,9 +70,43 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce,
return 0;
}
int64_t yespower_get_max64()
void yespower_b2b_hash( const char *input, char *output, uint32_t len )
{
return 0xfffLL;
yespower_b2b_tls( input, len, &yespower_params, (yespower_binary_t*)output );
}
int scanhash_yespower_b2b( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) vhash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
for (int k = 0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
do {
be32enc(&endiandata[19], n);
yespower_b2b_hash((char*) endiandata, (char*) vhash, 80);
if ( vhash[7] < Htarg && fulltest( vhash, ptarget )
&& !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, vhash, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
bool register_yespower_algo( algo_gate_t* gate )
@@ -102,7 +136,6 @@ bool register_yespower_algo( algo_gate_t* gate )
applog( LOG_NOTICE,"Key= \"%s\"\n", yespower_params.pers );
gate->optimizations = SSE2_OPT;
gate->get_max64 = (void*)&yespower_get_max64;
gate->scanhash = (void*)&scanhash_yespower;
gate->hash = (void*)&yespower_hash;
opt_target_factor = 65536.0;
@@ -117,7 +150,6 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
yespower_params.pers = NULL;
yespower_params.perslen = 0;
gate->optimizations = SSE2_OPT;
gate->get_max64 = (void*)&yespower_get_max64;
gate->scanhash = (void*)&scanhash_yespower;
gate->hash = (void*)&yespower_hash;
opt_target_factor = 65536.0;
@@ -125,21 +157,10 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
};
int64_t yescrypt_05_get_max64()
{
return 0x1ffLL;
}
int64_t yescryptr16_05_get_max64()
{
return 0xfffLL;
}
bool register_yescrypt_05_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->get_max64 = (void*)&yescrypt_05_get_max64;
yespower_params.version = YESPOWER_0_5;
yespower_params.N = 2048;
yespower_params.r = 8;
@@ -153,7 +174,6 @@ bool register_yescryptr8_05_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->get_max64 = (void*)&yescrypt_05_get_max64;
yespower_params.version = YESPOWER_0_5;
yespower_params.N = 2048;
yespower_params.r = 8;
@@ -167,7 +187,6 @@ bool register_yescryptr16_05_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->get_max64 = (void*)&yescryptr16_05_get_max64;
yespower_params.version = YESPOWER_0_5;
yespower_params.N = 4096;
yespower_params.r = 16;
@@ -181,7 +200,6 @@ bool register_yescryptr32_05_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->get_max64 = (void*)&yescryptr16_05_get_max64;
yespower_params.version = YESPOWER_0_5;
yespower_params.N = 4096;
yespower_params.r = 32;
@@ -191,3 +209,64 @@ bool register_yescryptr32_05_algo( algo_gate_t* gate )
return true;
}
bool register_power2b_algo( algo_gate_t* gate )
{
yespower_params.version = YESPOWER_1_0;
yespower_params.N = 2048;
yespower_params.r = 32;
yespower_params.pers = "Now I am become Death, the destroyer of worlds";
yespower_params.perslen = 46;
applog( LOG_NOTICE,"yespower-b2b parameters: N= %d, R= %d.", yespower_params.N,
yespower_params.r );
applog( LOG_NOTICE,"Key= \"%s\"", yespower_params.pers );
applog( LOG_NOTICE,"Key length= %d\n", yespower_params.perslen );
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_yespower_b2b;
gate->hash = (void*)&yespower_b2b_hash;
opt_target_factor = 65536.0;
return true;
};
// Generic yespower + blake2b
bool register_yespower_b2b_algo( algo_gate_t* gate )
{
yespower_params.version = YESPOWER_1_0;
if ( !( opt_param_n && opt_param_r ) )
{
applog(LOG_ERR,"Yespower-b2b N & R parameters are required");
return false;
}
yespower_params.N = opt_param_n;
yespower_params.r = opt_param_r;
if ( opt_param_key )
{
yespower_params.pers = opt_param_key;
yespower_params.perslen = strlen( opt_param_key );
}
else
{
yespower_params.pers = NULL;
yespower_params.perslen = 0;
}
applog( LOG_NOTICE,"Yespower-b2b parameters: N= %d, R= %d",
yespower_params.N, yespower_params.r );
if ( yespower_params.pers )
{
applog( LOG_NOTICE,"Key= \"%s\"", yespower_params.pers );
applog( LOG_NOTICE,"Key length= %d\n", yespower_params.perslen );
}
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_yespower_b2b;
gate->hash = (void*)&yespower_b2b_hash;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -111,6 +111,10 @@ extern int yespower(yespower_local_t *local,
const uint8_t *src, size_t srclen,
const yespower_params_t *params, yespower_binary_t *dst);
extern int yespower_b2b(yespower_local_t *local,
const uint8_t *src, size_t srclen,
const yespower_params_t *params, yespower_binary_t *dst);
/**
* yespower_tls(src, srclen, params, dst):
* Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target".
@@ -123,6 +127,9 @@ extern int yespower(yespower_local_t *local,
extern int yespower_tls(const uint8_t *src, size_t srclen,
const yespower_params_t *params, yespower_binary_t *dst);
extern int yespower_b2b_tls(const uint8_t *src, size_t srclen,
const yespower_params_t *params, yespower_binary_t *dst);
#ifdef __cplusplus
}
#endif

4
api.c
View File

@@ -32,7 +32,7 @@
#include <sys/types.h>
#include "miner.h"
#include "sysinfos.c"
#ifndef WIN32
# include <errno.h>
# include <sys/socket.h>
@@ -105,7 +105,7 @@ extern double global_hashrate;
#define USE_MONITORING
extern float cpu_temp(int);
extern uint32_t cpu_clock(int);
extern int cpu_fanpercent(void);
//extern int cpu_fanpercent(void);
/***************************************************************/

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.8.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.10.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.9.8'
PACKAGE_STRING='cpuminer-opt 3.9.8'
PACKAGE_VERSION='3.9.10'
PACKAGE_STRING='cpuminer-opt 3.9.10'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.9.8 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.9.10 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.9.8:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.9.10:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.9.8
cpuminer-opt configure 3.9.10
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.9.8, which was
It was created by cpuminer-opt $as_me 3.9.10, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.9.8'
VERSION='3.9.10'
cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.9.8, which was
This file was extended by cpuminer-opt $as_me 3.9.10, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.9.8
cpuminer-opt config.status 3.9.10
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.9.8])
AC_INIT([cpuminer-opt], [3.9.10])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

File diff suppressed because it is too large Load Diff

24
miner.h
View File

@@ -352,6 +352,7 @@ bool submit_lane_solution( struct work *work, void *hash,
void get_currentalgo( char* buf, int sz );
/*
bool has_sha();
bool has_aes_ni();
bool has_avx1();
@@ -368,6 +369,7 @@ void cpu_getmodelid(char *outbuf, size_t maxsz);
void cpu_brand_string( char* s );
float cpu_temp( int core );
*/
struct work {
uint32_t data[48] __attribute__ ((aligned (64)));
@@ -439,7 +441,7 @@ struct stratum_ctx {
struct work work __attribute__ ((aligned (64)));
pthread_mutex_t work_lock;
int bloc_height;
int block_height;
} __attribute__ ((aligned (64)));
bool stratum_socket_full(struct stratum_ctx *sctx, int timeout);
@@ -490,6 +492,8 @@ void print_hash_tests(void);
void scale_hash_for_display ( double* hashrate, char* units );
void report_summary_log( bool force );
/*
struct thr_info {
int id;
@@ -570,6 +574,7 @@ enum algos {
ALGO_PHI2,
ALGO_PLUCK,
ALGO_POLYTIMOS,
ALGO_POWER2B,
ALGO_QUARK,
ALGO_QUBIT,
ALGO_SCRYPT,
@@ -612,6 +617,7 @@ enum algos {
ALGO_YESCRYPTR32,
ALGO_YESPOWER,
ALGO_YESPOWERR16,
ALGO_YESPOWER_B2B,
ALGO_ZR5,
ALGO_COUNT
};
@@ -665,6 +671,7 @@ static const char* const algo_names[] = {
"phi2",
"pluck",
"polytimos",
"power2b",
"quark",
"qubit",
"scrypt",
@@ -707,6 +714,7 @@ static const char* const algo_names[] = {
"yescryptr32",
"yespower",
"yespowerr16",
"yespower-b2b",
"zr5",
"\0"
};
@@ -718,7 +726,6 @@ extern bool opt_debug;
extern bool opt_debug_diff;
extern bool opt_benchmark;
extern bool opt_protocol;
extern bool opt_showdiff;
extern bool opt_extranonce;
extern bool opt_quiet;
extern bool opt_redirect;
@@ -749,6 +756,7 @@ extern uint32_t opt_work_size;
extern double *thr_hashrates;
extern double global_hashrate;
extern double stratum_diff;
extern bool opt_reset_on_stale;
extern double net_diff;
extern double net_hashrate;
extern int opt_pluck_n;
@@ -823,9 +831,10 @@ Options:\n\
nist5 Nist5\n\
pentablake 5 x blake512\n\
phi1612 phi\n\
phi2 Luxcoin (LUX)\n\
phi2\n\
pluck Pluck:128 (Supcoin)\n\
polytimos\n\
power2b MicroBitcoin (MBC)\n\
quark Quark\n\
qubit Qubit\n\
scrypt scrypt(1024, 1, 1) (default)\n\
@@ -859,7 +868,7 @@ Options:\n\
x16rv2 Ravencoin (RVN)\n\
x16rt Gincoin (GIN)\n\
x16rt-veil Veil (VEIL)\n\
x16s Pigeoncoin (PGN)\n\
x16s\n\
x17\n\
x21s\n\
xevan Bitsend (BSD)\n\
@@ -869,6 +878,7 @@ Options:\n\
yescryptr32 WAVI\n\
yespower Cryply\n\
yespowerr16 Yenten (YTN)\n\
yespower-b2b generic yespower + blake2b\n\
zr5 Ziftr\n\
-N, --param-n N parameter for scrypt based algos\n\
-R, --patam-r R parameter for scrypt based algos\n\
@@ -888,10 +898,10 @@ Options:\n\
-s, --scantime=N upper bound on time spent scanning current work when\n\
long polling is unavailable, in seconds (default: 5)\n\
--randomize Randomize scan range start to reduce duplicates\n\
-f, --diff-factor Divide req. difficulty by this factor (std is 1.0)\n\
--reset-on-stale Workaround reset stratum if too many stale shares\n\
-f, --diff-factor Divide req. difficulty by this factor (std is 1.0)\n\
-m, --diff-multiplier Multiply difficulty by this factor (std is 1.0)\n\
--hash-meter Display thread hash rates\n\
--hide-diff Do not display changes in difficulty\n\
--coinbase-addr=ADDR payout address for solo mining\n\
--coinbase-sig=TEXT data to insert in the coinbase when possible\n\
--no-longpoll disable long polling support\n\
@@ -955,7 +965,6 @@ static struct option const options[] = {
{ "diff", 1, NULL, 'f' }, // deprecated (alias)
{ "diff-multiplier", 1, NULL, 'm' },
{ "hash-meter", 0, NULL, 1014 },
{ "hide-diff", 0, NULL, 1013 },
{ "help", 0, NULL, 'h' },
{ "key", 1, NULL, 'K' },
{ "no-gbt", 0, NULL, 1011 },
@@ -978,6 +987,7 @@ static struct option const options[] = {
{ "retries", 1, NULL, 'r' },
{ "retry-pause", 1, NULL, 1025 },
{ "randomize", 0, NULL, 1024 },
{ "reset-on-stale", 0, NULL, 1026 },
{ "scantime", 1, NULL, 's' },
#ifdef HAVE_SYSLOG_H
{ "syslog", 0, NULL, 'S' },

View File

@@ -36,7 +36,7 @@
// MMX: 64 bit vectors
// SSE2: 128 bit vectors (64 bit CPUs only, such as Intel Core2.
// AVX2: 256 bit vectors (Starting with Intel Haswell and AMD Ryzen)
// AVX512: 512 bit vectors (still under development)
// AVX512: 512 bit vectors (Starting with SkylakeX)
//
// Most functions are avalaible at the stated levels but in rare cases
// a higher level feature may be required with no compatible alternative.
@@ -105,59 +105,36 @@
// Ex: mm256_ror1x64_128 rotates each 128 bit lane of a 256 bit vector
// right by 64 bits.
//
// Some random thoughts about macros and inline functions, the pros and
// cons, when to use them, etc:
// Vector constants
//
// Macros are very convenient and efficient for statement functions.
// Macro args are passed by value and modifications are seen by the caller.
// Macros should not generally call regular functions unless it is for a
// special purpose such overloading a function name.
// Statement function macros that return a value should not end in ";"
// Statement function macros that return a value and don't modify input args
// may be used in function arguments and expressions.
// Macro args used in expressions should be protected ex: (x)+1
// Macros force inlining, function inlining can be overridden by the compiler.
// Inline functions are preferred when multiple statements or local variables
// are needed.
// The compiler can't do any syntax checking or type checking of args making
// macros difficult to debug.
// Although it is technically posssible to access the callers data without
// they being passed as arguments it is good practice to always define
// arguments even if they have the same name.
// Vector constants are a big problem because they technically don't exist.
// All vectors used as constants either reside in memory or must be genererated
// at run time at significant cost. The cost of generating a constant
// increases non-linearly with the number of vector elements. A 4 element
// vector costs between 7 and 11 clocks to generate, an 8 element vector
// is 15-25 clocks. There are also additional clock due to data dependency
// stalls.
//
// General guidelines for inline functions:
// Vector constants are often used as control indexes for permute, blend, etc,
// where generating the index can be over 90% of the operation. This is
// where the problem occurs. An instruction that only requires one to 3
// clocks needs may times more just to build the index argument.
//
// Inline functions should not have loops, it defeats the purpose of inlining.
// Inline functions should be short, the benefit is lost and the memory cost
// increases if the function is referenced often.
// Inline functions may call other functions, inlined or not. It is convenient
// for wrapper functions whether or not the wrapped function is itself inlined.
// Care should be taken when unrolling loops that contain calls to inlined
// functions that may be large.
// Large code blocks used only once may use function inlining to
// improve high level code readability without the penalty of function
// overhead.
// There is very little a programmer can do to avoid the worst case scenarios.
// Smaller integers can be merged to form 64 bit integers, and vectors with
// repeated elements can be generated more efficiently but they have limited
// benefit and limited application.
//
// A major restructuring is taking place shifting the focus from pointers
// to registers. Previously pointer casting used memory to provide transparency
// leaving it up to the compiler to manage everything and it does a very good
// job. The focus has shifted to register arguments for more control
// over the actual instructions assuming the data is in a register and the
// the compiler just needs to manage the registers.
// If a vector constant is to be used repeatedly it is better to define a local
// variable to generate the constant only once.
//
// Rather than use pointers to provide type transparency
// specific instructions are used to access specific data as specific types.
// Previously pointers were cast and the compiler was left to find a way
// to get the data from wherever it happened to be to the correct registers.
// If a sequence of constants is to be used it can be more efficient to
// use arithmetic with already existing constants to generate new ones.
//
// The utilities defined here make use features like register aliasing
// to optimize operations. Many operations have specialized versions as
// well as more generic versions. It is preferable to use a specialized
// version whenever possible a sthey can take advantage of certain
// optimizations not available to the generic version. Specically the generic
// version usually has a second argument used is some extra calculations.
//
///////////////////////////////////////////////////////
// ex: const __m512i one = _mm512_const1_64( 1 );
// const __m512i two = _mm512_add_epi64( one, one );
//
//////////////////////////////////////////////////////////////////////////
#include <inttypes.h>
#include <x86intrin.h>
@@ -165,9 +142,6 @@
#include <stdlib.h>
#include <stdbool.h>
// Various types and overlays
#include "simd-utils/simd-types.h"
// 64 and 128 bit integers.
#include "simd-utils/simd-int.h"
@@ -191,16 +165,16 @@
// Utilities that require AVX2 are defined in simd-256.h.
// Skylake-X has all these
#if defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// 512 bit vectors
#include "simd-utils/simd-512.h"
#endif // MMX
#endif // SSE2
#endif // AVX
#endif // AVX2
#endif // AVX512
#endif // AVX2
#endif // AVX
#endif // SSE2
#endif // MMX
#include "simd-utils/intrlv.h"

File diff suppressed because it is too large Load Diff

View File

@@ -3,196 +3,132 @@
#if defined(__SSE2__)
//////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
//
// 128 bit SSE vectors
//
// SSE2 is generally required for full 128 bit support. Some functions
// are also optimized with SSSE3 or SSE4.1.
//
// Do not call intrinsic _mm_extract directly, it isn't supported in SSE2.
// Use mm128_extr macro instead, it will select the appropriate implementation.
//
// 128 bit operations are enhanced with uint128 which adds 128 bit integer
// support for arithmetic and other operations. Casting to uint128_t is not
// free but is sometimes the only way for certain operations.
// SSE2 is required for 128 bit integer support. Some functions are also
// optimized with SSSE3, SSE4.1 or AVX. Some of these more optimized
// functions don't have SSE2 equivalents and their use would break SSE2
// compatibility.
//
// Constants are an issue with simd. Simply put, immediate constants don't
// exist. All simd constants either reside in memory or a register and
// must be loaded or generated at run time.
// must be loaded from memory or generated at run time.
//
// Due to the cost of generating constants it is often more efficient to
// Due to the cost of generating constants it is more efficient to
// define a local const for repeated references to the same constant.
//
// Some constant values can be generated using shortcuts. Zero for example
// is as simple as XORing any register with itself, and is implemented
// in the setzero instrinsic. These shortcuts must be implemented using ASM
// due to doing things the compiler would complain about. Another single
// instruction constant is -1, defined below. Others may be added as the need
// arises. Even single instruction constants are less efficient than local
// register variables so the advice above stands.
// One common use for simd constants is as a control index for vector
// instructions like blend and shuffle. Alhough the ultimate instruction
// may execute in a single clock cycle, generating the control index adds
// several more cycles to the entire operation.
//
// One common use for simd constants is as a control index for some simd
// instructions like blend and shuffle. The utilities below do not take this
// into account. Those that generate a simd constant should not be used
// repeatedly. It may be better for the application to reimplement the
// utility to better suit its usage.
// All of the utilities here assume all data is in registers except
// in rare cases where arguments are pointers.
//
// Intrinsics automatically promote from REX to VEX when AVX is available
// but ASM needs to be done manually.
//
///////////////////////////////////////////////////////////////////////////
// Efficient and convenient moving bwtween GP & low bits of XMM.
// Use VEX when available to give access to xmm8-15 and zero extend for
// larger vectors.
static inline __m128i mm128_mov64_128( const uint64_t n )
{
__m128i a;
#if defined(__AVX__)
asm( "vmovq %1, %0\n\t" : "=x"(a) : "r"(n) );
#else
asm( "movq %1, %0\n\t" : "=x"(a) : "r"(n) );
#endif
return a;
}
static inline __m128i mm128_mov32_128( const uint32_t n )
{
__m128i a;
#if defined(__AVX__)
asm( "vmovd %1, %0\n\t" : "=x"(a) : "r"(n) );
#else
asm( "movd %1, %0\n\t" : "=x"(a) : "r"(n) );
#endif
return a;
}
static inline uint64_t mm128_mov128_64( const __m128i a )
{
uint64_t n;
#if defined(__AVX__)
asm( "vmovq %1, %0\n\t" : "=r"(n) : "x"(a) );
#else
asm( "movq %1, %0\n\t" : "=r"(n) : "x"(a) );
#endif
return n;
}
static inline uint32_t mm128_mov128_32( const __m128i a )
{
uint32_t n;
#if defined(__AVX__)
asm( "vmovd %1, %0\n\t" : "=r"(n) : "x"(a) );
#else
asm( "movd %1, %0\n\t" : "=r"(n) : "x"(a) );
#endif
return n;
}
// Pseudo constants
#define m128_zero _mm_setzero_si128()
#define m128_one_128 mm128_mov64_128( 1 )
#define m128_one_64 _mm_shuffle_epi32( mm128_mov64_128( 1 ), 0x44 )
#define m128_one_32 _mm_shuffle_epi32( mm128_mov32_128( 1 ), 0x00 )
#define m128_one_16 _mm_shuffle_epi32( \
mm128_mov32_128( 0x00010001 ), 0x00 )
#define m128_one_8 _mm_shuffle_epi32( \
mm128_mov32_128( 0x01010101 ), 0x00 )
static inline __m128i m128_one_128_fn()
{
register uint64_t one = 1;
register __m128i a;
asm( "movq %1, %0\n\t"
: "=x"(a)
: "r" (one) );
return a;
}
#define m128_one_128 m128_one_128_fn()
// ASM avoids the need to initialize return variable to avoid compiler warning.
// Macro abstracts function parentheses to look like an identifier.
static inline __m128i m128_one_64_fn()
{
register uint64_t one = 1;
register __m128i a;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm_shuffle_epi32( a, 0x44 );
}
#define m128_one_64 m128_one_64_fn()
static inline __m128i m128_one_32_fn()
{
register uint32_t one = 1;
register __m128i a;
asm( "movd %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm_shuffle_epi32( a, 0x00 );
}
#define m128_one_32 m128_one_32_fn()
static inline __m128i m128_one_16_fn()
{
register uint32_t one = 0x00010001;
register __m128i a;
asm( "movd %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm_shuffle_epi32( a, 0x00 );
}
#define m128_one_16 m128_one_16_fn()
static inline __m128i m128_one_8_fn()
{
register uint32_t one = 0x01010101;
register __m128i a;
asm( "movd %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm_shuffle_epi32( a, 0x00 );
}
#define m128_one_8 m128_one_8_fn()
static inline __m128i m128_neg1_fn()
static inline __m128i mm128_neg1_fn()
{
__m128i a;
asm( "pcmpeqd %0, %0\n\t"
: "=x" (a) );
return a;
}
#define m128_neg1 m128_neg1_fn()
// move uint64_t to low bits of __m128i, zeros the rest
static inline __m128i mm128_mov64_128( uint64_t n )
{
register __m128i a;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (n) );
return a;
}
static inline __m128i mm128_mov32_128( uint32_t n )
{
register __m128i a;
asm( "movd %1, %0\n\t"
: "=x" (a)
: "r" (n) );
return a;
}
static inline uint64_t mm128_mov128_64( __m128i a )
{
register uint64_t n;
asm( "movq %1, %0\n\t"
: "=x" (n)
: "r" (a) );
return n;
}
static inline uint32_t mm128_mov128_32( __m128i a )
{
register uint32_t n;
asm( "movd %1, %0\n\t"
: "=x" (n)
: "r" (a) );
return n;
}
static inline __m128i m128_const1_64( const uint64_t n )
{
register __m128i a;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (n) );
return _mm_shuffle_epi32( a, 0x44 );
}
static inline __m128i m128_const1_32( const uint32_t n )
{
register __m128i a;
asm( "movd %1, %0\n\t"
: "=x" (a)
: "r" (n) );
return _mm_shuffle_epi32( a, 0x00 );
}
#if defined(__SSE41__)
// alternative to _mm_set_epi64x, doesn't use mem,
static inline __m128i m128_const_64( const uint64_t hi, const uint64_t lo )
{
register __m128i a;
asm( "movq %2, %0\n\t"
"pinsrq $1, %1, %0\n\t"
: "=x" (a)
: "r" (hi), "r" (lo) );
return a;
}
/*
static inline __m128i m128_const1_64( const uint64_t n )
{
register __m128i a;
asm( "movq %1, %0\n\t"
"pinsrq $1, %1, %0\n\t"
: "=x"(a)
: "r"(n) );
return a;
}
*/
#if defined(__AVX__)
asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) );
#else
asm( "pcmpeqq %0, %0\n\t" : "=x"(a) );
#endif
return a;
}
#define m128_neg1 mm128_neg1_fn()
// #define m128_one_128 _mm_set_epi64x( 0ULL, 1ULL )
// const functions work best when arguments are immediate constants or
// are known to be in registers. If data needs to loaded from memory or cache
// use set.
// Equivalent of set1, broadcast 64 bit integer to all elements.
#define m128_const1_64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
#define m128_const1_32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )
#if defined(__SSE4_1__)
// Assign 64 bit integers to respective elements: {hi, lo}
#define m128_const_64( hi, lo ) \
_mm_insert_epi64( mm128_mov64_128( lo ), hi, 1 )
#else // No insert in SSE2
#define m128_const_64 _mm_set_epi64x
// #define m128_const1_64 _mm_set1_epi64x
#endif
//
// Basic operations without equivalent SIMD intrinsic
@@ -220,18 +156,9 @@ static inline __m128i m128_const1_64( const uint64_t n )
#define mm128_xor4( a, b, c, d ) \
_mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) )
// This isn't cheap, not suitable for bulk usage.
#define mm128_extr_4x32( a0, a1, a2, a3, src ) \
do { \
a0 = _mm_extract_epi32( src, 0 ); \
a1 = _mm_extract_epi32( src, 1 ); \
a1 = _mm_extract_epi32( src, 2 ); \
a3 = _mm_extract_epi32( src, 3 ); \
} while(0)
// Horizontal vector testing
#if defined(__SSE41__)
#if defined(__SSE4_1__)
#define mm128_allbits0( a ) _mm_testz_si128( a, a )
#define mm128_allbits1( a ) _mm_testc_si128( a, m128_neg1 )
@@ -248,7 +175,7 @@ do { \
#define mm128_allbits0( a ) ( !mm128_anybits1(a) )
#define mm128_allbits1( a ) ( !mm128_anybits0(a) )
#endif // SSE41 else SSE2
#endif // SSE4.1 else SSE2
//
// Vector pointer cast
@@ -269,20 +196,6 @@ do { \
// returns pointer p+o
#define casto_m128i(p,o) (((__m128i*)(p))+(o))
// SSE2 doesn't implement extract
#if defined(__SSE4_1)
#define mm128_extr_64(a,n) _mm_extract_epi64( a, n )
#define mm128_extr_32(a,n) _mm_extract_epi32( a, n )
#else
// Doesn't work with register variables.
#define mm128_extr_64(a,n) (((uint64_t*)&a)[n])
#define mm128_extr_32(a,n) (((uint32_t*)&a)[n])
#endif
// Memory functions
// Mostly for convenience, avoids calculating bytes.
@@ -307,24 +220,46 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
//
// Bit rotations
// AVX512 has implemented bit rotation for 128 bit vectors with
// AVX512VL has implemented bit rotation for 128 bit vectors with
// 64 and 32 bit elements.
//
// Rotate each element of v by c bits
// compiler doesn't like when a variable is used for the last arg of
// _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same
// specification but works with a variable. Therefore use rol_var where
// necessary.
// sm3-hash-4way.c has one instance where mm128_rol_var_32 is required.
#define mm128_ror_64( v, c ) \
#define mm128_ror_var_64( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
#define mm128_rol_64( v, c ) \
#define mm128_rol_var_64( v, c ) \
_mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
#define mm128_ror_32( v, c ) \
#define mm128_ror_var_32( v, c ) \
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
#define mm128_rol_32( v, c ) \
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
/*
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define mm128_ror_64 _mm_ror_epi64
#define mm128_rol_64 _mm_rol_epi64
#define mm128_ror_32 _mm_ror_epi32
#define mm128_rol_32 _mm_rol_epi32
#else
*/
#define mm128_ror_64 mm128_ror_var_64
#define mm128_rol_64 mm128_rol_var_64
#define mm128_ror_32 mm128_ror_var_32
#define mm128_rol_32 mm128_rol_var_32
//#endif // AVX512 else
#define mm128_ror_16( v, c ) \
_mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) )
@@ -365,21 +300,38 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_brol( v, c ) \
_mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
// Invert vector: {3,2,1,0} -> {0,1,2,3}
#define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b )
#if defined(__SSSE3__)
#define mm128_invert_16( v ) \
_mm_shuffle_epi8( v, mm128_const_64( 0x0100030205040706, \
0x09080b0a0d0c0f0e )
#define mm128_invert_8( v ) \
_mm_shuffle_epi8( v, mm128_const_64( 0x0001020304050607, \
0x08090a0b0c0d0e0f )
#endif // SSSE3
//
// Rotate elements within lanes.
// Equivalent to mm128_ror_64( v, 32 )
#define mm128_swap32_64( v ) _mm_shuffle_epi32( v, 0xb1 )
// Equivalent to mm128_ror_64( v, 16 )
#define mm128_ror16_64( v ) _mm_shuffle_epi8( v, \
m128_const_64( 0x09080f0e0d0c0b0a, 0x0100070605040302 )
#define mm128_rol16_64( v ) _mm_shuffle_epi8( v, \
m128_const_64( 0x0dc0b0a09080f0e, 0x0504030201000706 )
#define mm128_ror16_64( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x09080f0e0d0c0b0a, \
0x0100070605040302 )
// Equivalent to mm128_ror_32( v, 16 )
#define mm128_swap16_32( v ) _mm_shuffle_epi8( v, \
m128_const_64( 0x0d0c0f0e09080b0a, 0x0504070601000302 )
#define mm128_rol16_64( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080f0e, \
0x0504030201000706 )
#define mm128_swap16_32( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0d0c0f0e09080b0a, \
0x0504070601000302 )
//
// Endian byte swap.
@@ -394,8 +346,9 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
_mm_shuffle_epi8( v, m128_const_64( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
#define mm128_bswap_16( v ) _mm_shuffle_epi8( \
m128_const_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 )
#define mm128_bswap_16( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0e0f0c0d0a0b0809, \
0x0607040502030001 )
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define mm128_block_bswap_64( d, s ) do \

View File

@@ -1,7 +1,7 @@
#if !defined(SIMD_256_H__)
#define SIMD_256_H__ 1
#if defined(__AVX__)
#if defined(__AVX2__)
/////////////////////////////////////////////////////////////////////
//
@@ -14,120 +14,68 @@
// is limited because 256 bit vectors are less likely to be used when 512
// is available.
// Move integer to low element of vector, other elements are set to zero.
#define mm256_mov64_256( n ) _mm256_castsi128_si256( mm128_mov64_128( n ) )
#define mm256_mov32_256( n ) _mm256_castsi128_si256( mm128_mov32_128( n ) )
#define mm256_mov256_64( a ) mm128_mov128_64( _mm256_castsi256_si128( a ) )
#define mm256_mov256_32( a ) mm128_mov128_32( _mm256_castsi256_si128( a ) )
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
#define m256_const1_128( v ) \
_mm256_broadcastsi128_si256( v )
// Equavalent of set, move 64 bit integer constants to respective 64 bit
// elements.
static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
{
__m128i hi, lo;
lo = mm128_mov64_128( i0 );
hi = mm128_mov64_128( i2 );
lo = _mm_insert_epi64( lo, i1, 1 );
hi = _mm_insert_epi64( hi, i3, 1 );
return mm256_concat_128( hi, lo );
}
// Broadcast 128 bits in pairs of 64 bit integer constants {i1. i0} to all
// 128 bit lanes.
#define m256_const2_64( i1, i0 ) \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( \
m128_const_64( i1, i0 ) ), 0x44 )
// Equivalent of set1, broadcast integer constant to all elements.
#define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
#define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
#define m256_const1_16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
#define m256_const1_8 ( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
//
// All SIMD constant macros are actually functions containing executable
// code and therefore can't be used as compile time initializers.
#define m256_zero _mm256_setzero_si256()
#define m256_one_256 \
_mm256_inserti128_si256( _mm256_castsi128_si256( m128_one_128 ), \
m128_zero, 1 )
#define m256_zero _mm256_setzero_si256()
#define m256_one_256 mm256_mov64_256( 1 )
#define m256_one_128 \
_mm256_inserti128_si256( _mm256_castsi128_si256( m128_one_128 ), \
m128_one_128, 1 )
_mm256_permute4x64_epi64( _mm256_castsi128_si256( \
mm128_mov64_128( 1 ) ), 0x44 )
#define m256_one_64 _mm256_broadcastq_epi64( mm128_mov64_128( 1 ) )
#define m256_one_32 _mm256_broadcastd_epi32( mm128_mov64_128( 1 ) )
#define m256_one_16 _mm256_broadcastw_epi16( mm128_mov64_128( 1 ) )
#define m256_one_8 _mm256_broadcastb_epi8 ( mm128_mov64_128( 1 ) )
// set instructions load memory resident constants, this avoids mem.
// cost 4 pinsert + 1 vinsert, estimate 8 clocks latency.
#define m256_const_64( i3, i2, i1, i0 ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( m128_const_64( i1, i0 ) ), \
m128_const_64( i3, i2 ), 1 )
static inline __m256i m256_const1_64( uint64_t i )
static inline __m256i mm256_neg1_fn()
{
register __m128i a;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (i) );
return _mm256_broadcastq_epi64( a );
}
static inline __m256i m256_const1_32( uint32_t i )
{
register __m128i a;
asm( "movd %1, %0\n\t"
: "=x" (a)
: "r" (i) );
return _mm256_broadcastd_epi32( a );
}
#if defined(__AVX2__)
// Don't call the frunction directly, use the macro to make appear like
// a constant identifier instead of a function.
// __m256i foo = m256_one_64;
static inline __m256i m256_one_64_fn()
{
register uint64_t one = 1;
register __m128i a;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm256_broadcastq_epi64( a );
}
#define m256_one_64 m256_one_64_fn()
static inline __m256i m256_one_32_fn()
{
register uint64_t one = 0x0000000100000001;
register __m128i a;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm256_broadcastq_epi64( a );
}
#define m256_one_32 m256_one_32_fn()
static inline __m256i m256_one_16_fn()
{
register uint64_t one = 0x0001000100010001;
register __m128i a;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm256_broadcastq_epi64( a );
}
#define m256_one_16 m256_one_16_fn()
static inline __m256i m256_one_8_fn()
{
register uint64_t one = 0x0101010101010101;
register __m128i a;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm256_broadcastq_epi64( a );
}
#define m256_one_8 m256_one_8_fn()
static inline __m256i m256_neg1_fn()
{
register __m256i a;
asm( "vpcmpeqq %0, %0, %0\n\t"
: "=x"(a) );
__m256i a;
asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) );
return a;
}
#define m256_neg1 m256_neg1_fn()
#else // AVX
#define m256_one_64 _mm256_set1_epi64x( 1ULL )
#define m256_one_32 _mm256_set1_epi64x( 0x0000000100000001ULL )
#define m256_one_16 _mm256_set1_epi64x( 0x0001000100010001ULL )
#define m256_one_8 _mm256_set1_epi64x( 0x0101010101010101ULL )
// AVX doesn't have inserti128 but insertf128 will do.
static inline __m256i m256_neg1_fn()
{
__m128i a = m128_neg1;
return _mm256_insertf128_si256( _mm256_castsi128_si256( a ), a, 1 );
}
#define m256_neg1 m256_neg1_fn()
#endif // AVX2 else AVX
#define m256_neg1 mm256_neg1_fn()
//
@@ -146,58 +94,32 @@ static inline __m256i m256_neg1_fn()
#define mm128_extr_hi128_256( a ) _mm256_extracti128_si256( a, 1 )
// Extract integers from 256 bit vector, ineficient, avoid if possible..
#define mm256_extr_4x64( a0, a1, a2, a3, src ) \
#define mm256_extr_4x64( a3, a2, a1, a0, src ) \
do { \
__m128i hi = _mm256_extracti128_si256( src, 1 ); \
a0 = mm256_mov256_64( src ); \
a0 = mm128_mov128_64( _mm256_castsi256_si128( src) ); \
a1 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \
a2 = mm128_mov128_64( hi ); \
a3 = _mm_extract_epi64( hi, 1 ); \
} while(0)
#define mm256_extr_8x32( a0, a1, a2, a3, a4, a5, a6, a7, src ) \
#define mm256_extr_8x32( a7, a6, a5, a4, a3, a2, a1, a0, src ) \
do { \
uint64_t t = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \
__m128i hi = _mm256_extracti128_si256( src, 1 ); \
a0 = mm256_mov256_32( src ); \
a1 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 1 ); \
a2 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 2 ); \
a3 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 3 ); \
a2 = (uint32_t)( t ); \
a3 = (uint32_t)( t<<32 ); \
t = _mm_extract_epi64( hi, 1 ); \
a4 = mm128_mov128_32( hi ); \
a5 = _mm_extract_epi32( hi, 1 ); \
a6 = _mm_extract_epi32( hi, 2 ); \
a7 = _mm_extract_epi32( hi, 3 ); \
a6 = (uint32_t)( t ); \
a7 = (uint32_t)( t<<32 ); \
} while(0)
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
// Move integer to lower bits of vector, upper bits set to zero.
static inline __m256i mm256_mov64_256( uint64_t n )
{
register __m128i a;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (n) );
return _mm256_castsi128_si256( a );
}
static inline __m256i mm256_mov32_256( uint32_t n )
{
register __m128i a;
asm( "movd %1, %0\n\t"
: "=x" (a)
: "r" (n) );
return _mm256_castsi128_si256( a );
}
// Move lo bits of vector to integer, hi bits are truncated.
#define mm256_mov256_64( a ) mm128_mov128_64( _mm256_castsi256_si128( a ) )
#define mm256_mov256_32( a ) mm128_mov128_32( _mm256_castsi256_si128( a ) )
// Horizontal vector testing
#if defined(__AVX2__)
#define mm256_allbits0( a ) _mm256_testz_si256( a, a )
#define mm256_allbits1( a ) _mm256_testc_si256( a, m256_neg1 )
@@ -205,21 +127,6 @@ static inline __m256i mm256_mov32_256( uint32_t n )
#define mm256_anybits0 mm256_allbitsne
#define mm256_anybits1 mm256_allbitsne
#else // AVX
// Bit-wise test of entire vector, useful to test results of cmp.
#define mm256_anybits0( a ) \
( (uint128_t)mm128_extr_hi128_256( a ) \
| (uint128_t)mm128_extr_lo128_256( a ) )
#define mm256_anybits1( a ) \
( ( (uint128_t)mm128_extr_hi128_256( a ) + 1 ) \
| ( (uint128_t)mm128_extr_lo128_256( a ) + 1 ) )
#define mm256_allbits0_256( a ) ( !mm256_anybits1(a) )
#define mm256_allbits1_256( a ) ( !mm256_anybits0(a) )
#endif // AVX2 else AVX
// Parallel AES, for when x is expected to be in a 256 bit register.
// Use same 128 bit key.
@@ -268,12 +175,6 @@ static inline void memset_256( __m256i *dst, const __m256i a, const int n )
static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
///////////////////////////////
//
// AVX2 needed from now on.
//
#if defined(__AVX2__)
//
// Basic operations without SIMD equivalent
@@ -310,26 +211,50 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// The only bit shift for more than 64 bits is with __int128.
//
// AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements
// but is of little value
//
// Rotate each element of v by c bits
#define mm256_ror_64( v, c ) \
// compiler doesn't like when a variable is used for the last arg of
// _mm_rol_epi32, must be "8 bit immediate". Therefore use rol_var where
// necessary.
#define mm256_ror_var_64( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
#define mm256_rol_64( v, c ) \
#define mm256_rol_var_64( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
_mm256_srli_epi64( v, 64-(c) ) )
#define mm256_ror_32( v, c ) \
#define mm256_ror_var_32( v, c ) \
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
_mm256_slli_epi32( v, 32-(c) ) )
#define mm256_rol_32( v, c ) \
#define mm256_rol_var_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
/*
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// AVX512, control must be 8 bit immediate.
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
#define mm256_ror_32 _mm256_ror_epi32
#define mm256_rol_32 _mm256_rol_epi32
#else
*/
// No AVX512, use fallback.
#define mm256_ror_64 mm256_ror_var_64
#define mm256_rol_64 mm256_rol_var_64
#define mm256_ror_32 mm256_ror_var_32
#define mm256_rol_32 mm256_rol_var_32
// #endif // AVX512 else
#define mm256_ror_16( v, c ) \
_mm256_or_si256( _mm256_srli_epi16( v, c ), \
_mm256_slli_epi16( v, 16-(c) ) )
@@ -365,12 +290,40 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
_mm256_set1_epi32( 32 ), c ) ) )
// AVX512 can do 16 bit elements.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define mm256_rorv_16( v, c ) \
_mm256_or_si256( \
_mm256_srlv_epi16( v, _mm256_set1_epi16( c ) ), \
_mm256_sllv_epi16( v, _mm256_set1_epi16( 16-(c) ) ) )
#define mm256_rolv_16( v, c ) \
_mm256_or_si256( \
_mm256_sllv_epi16( v, _mm256_set1_epi16( c ) ), \
_mm256_srlv_epi16( v, _mm256_set1_epi16( 16-(c) ) ) )
#endif // AVX512
//
// Rotate elements accross all lanes.
//
// AVX2 has no full vector permute for elements less than 32 bits.
// AVX512 has finer granularity full vector permutes.
// AVX512 has full vector alignr which might be faster, especially for 32 bit
/*
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define mm256_swap_128( v ) _mm256_alignr_epi64( v, v, 2 )
#define mm256_ror_1x64( v ) _mm256_alignr_epi64( v, v, 1 )
#define mm256_rol_1x64( v ) _mm256_alignr_epi64( v, v, 3 )
#define mm256_ror_1x32( v ) _mm256_alignr_epi32( v, v, 1 )
#define mm256_rol_1x32( v ) _mm256_alignr_epi32( v, v, 7 )
#define mm256_ror_3x32( v ) _mm256_alignr_epi32( v, v, 3 )
#define mm256_rol_3x32( v ) _mm256_alignr_epi32( v, v, 5 )
#else // AVX2
*/
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
@@ -379,7 +332,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// A little faster with avx512
// Rotate 256 bit vector by one 32 bit element.
#define mm256_ror_1x32( v ) \
_mm256_permutevar8x32_epi32( v, \
@@ -402,8 +354,11 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
m256_const_64( 0x0000000400000003, 0x0000000200000001, \
0x0000000000000007, 0x0000000600000005 )
//#endif // AVX512 else AVX2
// AVX512 can do 16 & 8 bit elements.
#if defined(__AVX512VL__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Rotate 256 bit vector by one 16 bit element.
#define mm256_ror_1x16( v ) \
@@ -416,17 +371,48 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
0x000e000d000c000b, 0x000a000900080007, \
0x0006000500040003, 0x000200010000000f ), v )
// Rotate 256 bit vector by one byte.
#define mm256_ror_1x8( v ) m256_const_64( \
0x001f1e1d1c1b1a19, 0x1817161514131211, \
0x100f0e0d0c0b0a09, 0x0807060504030201 )
#if defined (__AVX512VBMI__)
#define mm256_rol_1x8( v ) m256_const_64( \
// Rotate 256 bit vector by one byte.
#define mm256_ror_1x8( v ) _mm256_permutexvar_epi8( m256_const_64( \
0x001f1e1d1c1b1a19, 0x1817161514131211, \
0x100f0e0d0c0b0a09, 0x0807060504030201 ), v )
#define mm256_rol_1x8( v ) _mm256_permutexvar_epi16( m256_const_64( \
0x1e1d1c1b1a191817, 0x161514131211100f, \
0x0e0d0c0b0a090807, 0x060504030201001f )
0x0e0d0c0b0a090807, 0x060504030201001f ), v )
#endif // VBMI
#endif // AVX512
// Invert vector: {3,2,1,0} -> {0,1,2,3}
#define mm256_invert_64 ( v ) _mm256_permute4x64_epi64( v, 0x1b )
#define mm256_invert_32 ( v ) _mm256_permutevar8x32_epi32( v, \
m256_const_64( 0x0000000000000001, 0x0000000200000003 \
0x0000000400000005, 0x0000000600000007 )
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Invert vector: {7,6,5,4,3,2,1,0} -> {0,1,2,3,4,5,6,7}
#define mm256_invert_16 ( v ) \
_mm256_permutexvar_epi16( m256_const_64( \
0x0000000100020003, 0x0004000500060007, \
0x00080009000a000b, 0x000c000d000e000f ), v )
#if defined(__AVX512VBMI__)
#define mm256_invert_8( v ) \
_mm256_permutexvar_epi8( m256_const_64( \
0x0001020304050607, 0x08090a0b0c0d0e0f, \
0x1011121314151617, 0x18191a1b1c1d1e1f ), v )
#endif // VBMI
#endif // AVX512
//
// Rotate elements within lanes of 256 bit vector.
@@ -439,27 +425,19 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// Rotate each 128 bit lane by one 16 bit element.
#define mm256_ror1x16_128( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x01000f0e0d0c0b0a, \
0x0908070605040302, \
0x01000f0e0d0c0b0a, \
0x0908070605040302 ) )
_mm256_shuffle_epi8( v, m256_const2_64( 0x01000f0e0d0c0b0a, \
0x0908070605040302 ) )
#define mm256_rol1x16_128( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0d0c0b0a09080706, \
0x0504030201000f0e, \
0x0d0c0b0a09080706, \
0x0504030201000f0e ) )
_mm256_shuffle_epi8( v, m256_const2_64( 0x0d0c0b0a09080706, \
0x0504030201000f0e ) )
// Rotate each 128 bit lane by one byte
#define mm256_ror1x8_128( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x000f0e0d0c0b0a09, \
0x0807060504030201, \
0x000f0e0d0c0b0a09, \
0x0807060504030201 ) )
_mm256_shuffle_epi8( v, m256_const2_64( 0x000f0e0d0c0b0a09, \
0x0807060504030201 ) )
#define mm256_rol1x8_128( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0c0b0a09080f0e0d, \
0x0504030201000706, \
0x0d0c0b0a09080f0e, \
0x0504030201000706 ) )
_mm256_shuffle_epi8( v, m256_const2_64( 0x0d0c0b0a09080f0e, \
0x0504030201000706 ) )
// Rotate each 128 bit lane by c bytes.
#define mm256_bror_128( v, c ) \
@@ -473,70 +451,50 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_ror1x16_64( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x09080f0e0d0c0b0a, \
0x0100070605040302, \
0x09080f0e0d0c0b0a, \
0x0100070605040302 ) )
_mm256_shuffle_epi8( v, m256_const2_64( 0x09080f0e0d0c0b0a, \
0x0100070605040302 ) )
#define mm256_rol1x16_64( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0d0c0b0a09080f0e, \
0x0504030201000706, \
0x0d0c0b0a09080f0e, \
0x0504030201000706 ))
_mm256_shuffle_epi8( v, m256_const2_64( 0x0d0c0b0a09080f0e, \
0x0504030201000706 ) )
#define mm256_ror1x8_64( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x080f0e0d0c0b0a09, \
0x0007060504030201, \
0x080f0e0d0c0b0a09, \
0x0007060504030201 ))
_mm256_shuffle_epi8( v, m256_const2_64( 0x080f0e0d0c0b0a09, \
0x0007060504030201 ) )
#define mm256_rol1x8_64( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0e0d0c0b0a09080f, \
0x0605040302010007, \
0x0e0d0c0b0a09080f, \
0x0605040302010007 ) )
_mm256_shuffle_epi8( v, m256_const2_64( 0x0e0d0c0b0a09080f, \
0x0605040302010007 ) )
#define mm256_ror3x8_64( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0a09080f0e0d0c0b, \
0x0201000706050403, \
0x0a09080f0e0d0c0b, \
0x0201000706050403 ) )
_mm256_shuffle_epi8( v, m256_const2_64( 0x0a09080f0e0d0c0b, \
0x0201000706050403 ) )
#define mm256_rol3x8_64( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0c0b0a09080f0e0d, \
0x0403020100070605, \
0x0c0b0a09080f0e0d, \
0x0403020100070605 ) )
_mm256_shuffle_epi8( v, m256_const2_64( 0x0c0b0a09080f0e0d, \
0x0403020100070605 ) )
// Swap 16 bit elements in each 32 bit lane
#define mm256_swap16_32( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0b0a09080f0e0d0c, \
0x0302010007060504, \
0x0b0a09080f0e0d0c, \
0x0302010007060504 )
_mm256_shuffle_epi8( v, m256_const2_64( 0x0b0a09080f0e0d0c, \
0x0302010007060504 ) )
//
// Swap bytes in vector elements, endian bswap.
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x08090a0b0c0d0e0f, \
0x0001020304050607, \
0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
_mm256_shuffle_epi8( v, m256_const2_64( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
#define mm256_bswap_32( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0c0d0e0f08090a0b, \
0x0405060700010203, \
0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
_mm256_shuffle_epi8( v, m256_const2_64( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
#define mm256_bswap_16( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0e0f0c0d0a0b0809, \
0x0607040502030001, \
0x0e0f0c0d0a0b0809, \
0x0607040502030001 ) )
_mm256_shuffle_epi8( v, m256_const2_64( 0x0e0f0c0d0a0b0809, \
0x0607040502030001 ) )
// Source and destination are pointers, may point to same memory.
// 8 byte qword * 8 qwords * 4 lanes = 256 bytes
#define mm256_block_bswap_64( d, s ) do \
{ \
__m256i ctl = m256_const_64( 0x08090a0b0c0d0e0f, 0x0001020304050607, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
__m256i ctl = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
@@ -550,8 +508,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// 4 byte dword * 8 dwords * 8 lanes = 256 bytes
#define mm256_block_bswap_32( d, s ) do \
{ \
__m256i ctl = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
__m256i ctl = m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
@@ -569,6 +526,9 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// Some of these can use permute but appears to be slower. Maybe a Ryzen
// issue
// _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
// makes these macros unnecessary.
#define mm256_swap256_512 (v1, v2) \
v1 = _mm256_xor_si256(v1, v2); \
v2 = _mm256_xor_si256(v1, v2); \
@@ -576,75 +536,18 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_ror1x128_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 16 ); \
v1 = _mm256_alignr_epi8( v2, v1, 16 ); \
__m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
v1 = _mm256__mm256_permute2x128( v2, v1, 0x21 ); \
v2 = t; \
} while(0)
#define mm256_rol1x128_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 16 ); \
v2 = _mm256_alignr_epi8( v2, v1, 16 ); \
v1 = t; \
} while(0)
#define mm256_ror1x64_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 8 ); \
v1 = _mm256_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
#define mm256_rol1x64_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 24 ); \
v2 = _mm256_alignr_epi8( v2, v1, 24 ); \
v1 = t; \
} while(0)
#define mm256_ror1x32_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 4 ); \
v1 = _mm256_alignr_epi8( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm256_rol1x32_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 28 ); \
v2 = _mm256_alignr_epi8( v2, v1, 28 ); \
v1 = t; \
} while(0)
#define mm256_ror1x16_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 2 ); \
v1 = _mm256_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm256_rol1x16_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 30 ); \
v2 = _mm256_alignr_epi8( v2, v1, 30 ); \
v1 = t; \
} while(0)
#define mm256_ror1x8_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 1 ); \
v1 = _mm256_alignr_epi8( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm256_rol1x8_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 31 ); \
v2 = _mm256_alignr_epi8( v2, v1, 31 ); \
__m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
v2 = _mm256__mm256_permute2x128( v2, v1, 0x21 ); \
v1 = t; \
} while(0)
#endif // __AVX2__
#endif // __AVX__
#endif // SIMD_256_H__

View File

@@ -1,35 +1,32 @@
#if !defined(SIMD_512_H__)
#define SIMD_512_H__ 1
#if defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
//
// Some extentsions in AVX512 supporting operations on
// smaller elements in 256 bit vectors.
// AVX-512
//
// The baseline for these utilities is AVX512F, AVX512DQ, AVX512BW
// and AVX512VL, first available in quantity in Skylake-X.
// Some utilities may require additional features available in subsequent
// architectures and are noted.
// Variable rotate, each element rotates by corresponding index.
#define mm256_rorv_16( v, c ) \
_mm256_or_si256( \
_mm256_srlv_epi16( v, _mm256_set1_epi16( c ) ), \
_mm256_sllv_epi16( v, _mm256_set1_epi16( 16-(c) ) ) )
#define mm256_rolv_16( v, c ) \
_mm256_or_si256( \
_mm256_sllv_epi16( v, _mm256_set1_epi16( c ) ), \
_mm256_srlv_epi16( v, _mm256_set1_epi16( 16-(c) ) ) )
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Invert vector: {7,6,5,4,3,2,1,0} -> {0,1,2,3,4,5,6,7}
#define mm256_invert_16 ( v ) \
_mm256_permutex_epi16( v, _mm256_set_epi16( 0, 1, 2, 3, 4, 5, 6, 7, \
8, 9,10,11,12,13,14,15 ) )
#define mm256_invert_8( v ) \
_mm256_permutex_epi8( v, _mm256_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, \
8, 9,10,11,12,13,14,15, \
16,17,18,19,20,21,22,23, \
24,25,26,27,28,29,30,31 ) )
// AVX512 intrinsics have a few peculiarities with permutes and shuffles
// that are inconsistent with previous AVX2 implementations.
//
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
// usually shuffles accross all lanes.
//
// permutexvar has args reversed, index is first arg. Previously all
// permutes and shuffles have the source vector first.
//
// _mm512_permutexvar_epi8 requires AVX512-VBMI, larger elements don't.
// It also performs the same op as _mm512_shuffle_epi8.
//
// _mm512_shuffle_epi8 shuffles accross entire 512 bits. Shuffle usually
// doesn't cross 128 bit lane boundaries.
//////////////////////////////////////////////////////////////
//
@@ -40,6 +37,84 @@
//
// Experimental, not fully tested.
// Move integer to/from element 0 of vector.
#define mm512_mov64_512( n ) _mm512_castsi128_si512( mm128_mov64_128( n ) )
#define mm512_mov32_512( n ) _mm512_castsi128_si512( mm128_mov32_128( n ) )
#define mm512_mov256_64( a ) mm128_mov128_64( _mm256_castsi512_si128( a ) )
#define mm512_mov256_32( a ) mm128_mov128_32( _mm256_castsi512_si128( a ) )
// Insert and extract integers is a multistage operation.
// Insert integer into __m128i, then insert __m128i to __m256i, finally
// insert __256i into __m512i. Reverse the order for extract.
// Do not use __m512_insert_epi64 or _mm256_insert_epi64 to perform multiple
// inserts.
// Avoid small integers for multiple inserts.
// Shortcuts:
// Use castsi to reference the low bits of a vector or sub-vector. (free)
// Use mov to insert integer into low bits of vector or sub-vector. (cheap)
// Use _mm_insert only to reference the high bits of __m128i. (expensive)
// Sequence instructions to minimize data dependencies.
// Use const or const1 only when integer is either immediate or known to be in
// a GP register. Use set/set1 when data needs to be loaded from memory or
// cache.
// Concatenate two 256 bit vectors into one 512 bit vector {hi, lo}
#define mm512_concat_256( hi, lo ) \
_mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 )
// Equivalent of set, assign 64 bit integers to respective 64 bit elements.
static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
const uint64_t i5, const uint64_t i4,
const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
{
__m256i hi, lo;
__m128i hi1, lo1;
lo = mm256_mov64_256( i0 );
lo1 = mm128_mov64_128( i2 );
hi = mm256_mov64_256( i4 );
hi1 = mm128_mov64_128( i6 );
lo = _mm256_castsi128_si256(
_mm_insert_epi64( _mm256_castsi256_si128( lo ), i1, 1 ) );
lo1 = _mm_insert_epi64( lo1, i3, 1 );
hi = _mm256_castsi128_si256(
_mm_insert_epi64( _mm256_castsi256_si128( hi ), i5, 1 ) );
hi1 = _mm_insert_epi64( hi1, i7, 1 );
lo = _mm256_inserti128_si256( lo, lo1, 1 );
hi = _mm256_inserti128_si256( hi, hi1, 1 );
return mm512_concat_256( hi, lo );
}
// Equivalent of set4, broadcast 256 bits in groups of four 64 bit constants
// to all 256 bit lanes: {i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0}.
static inline __m512i mm512_const4_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
{
__m256i lo = mm256_mov64_256( i0 );
__m128i hi = mm128_mov64_128( i2 );
lo = _mm256_castsi128_si256(
_mm_insert_epi64( _mm256_castsi256_si128(
lo ), i1, 1 ) );
hi = _mm_insert_epi64( hi, i3, 1 );
return _mm512_permutex_epi64( _mm512_castsi256_si512(
_mm256_inserti128_si256( lo, hi, 1 ) ), 0xe4 );
}
// Broadcast 128 bits in pairs of 64 bit constants {i1. i0} to all
// 128 bit lanes.
#define mm512_const2_64( i1, i0 ) \
_mm512_permutex_epi64( _mm512_castsi128_si512( \
m128_const_64( i1, i0 ) ), 0x44 )
// Equivalent of set1, broadcast 64 bit constant to all 64 bit elements.
#define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
#define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
#define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
#define m512_const1_8 ( i ) _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
//
// Pseudo constants.
@@ -49,90 +124,26 @@
// initialized to zero.
#define m512_zero _mm512_setzero_si512()
#define m512_one_512 _mm512_set_epi64( 0ULL, 0ULL, 0ULL, 0ULL, \
0ULL, 0ULL, 0ULL, 1ULL )
#define m512_one_256 _mm512_set4_epi64( 0ULL, 0ULL, 0ULL, 1ULL )
#define m512_one_128 _mm512_set4_epi64( 0ULL, 1ULL, 0ULL, 1ULL )
//#define m512_one_64 _mm512_set1_epi64( 1ULL )
//#define m512_one_32 _mm512_set1_epi32( 1UL )
//#define m512_one_16 _mm512_set1_epi16( 1U )
//#define m512_one_8 _mm512_set1_epi8( 1U )
//#define m512_neg1 _mm512_set1_epi64( 0xFFFFFFFFFFFFFFFFULL )
#define m512_one_512 mm512_mov64_512( 1 )
#define m512_one_256 _mm512_broadcast_i64x4 ( mm256_mov64_256( 1 ) )
#define m512_one_128 _mm512_broadcast_i64x2 ( mm128_mov64_128( 1 ) )
#define m512_one_64 _mm512_broadcastq_epi64( mm128_mov64_128( 1 ) )
#define m512_one_32 _mm512_broadcastd_epi32( mm128_mov64_128( 1 ) )
#define m512_one_16 _mm512_broadcastw_epi16( mm128_mov64_128( 1 ) )
#define m512_one_8 _mm512_broadcastb_epi8 ( mm128_mov64_128( 1 ) )
#define m512_const_64( i7, i6, i5, i4, i3, i2, i1, i0 ) \
_mm512_inserti64x4( _mm512_castsi256_si512( m256_const_64( i3,i2,i1,i0 ) ), \
m256_const_64( i7,i6,i5,i4 ), 1 )
#define m512_neg1 mm512_const1_64( 0xffffffffffffffff )
static inline __m512i m512_const1_64( uint64_t i )
{
register __m128i a;
asm( "movq %1, %0\n\t"
: "=x"(a)
: "r"(i) );
return _mm512_broadcastq_epi64( a );
}
static inline __m512i m512_one_64_fn()
{
__m512i a;
asm( "vpxorq %0, %0, %0\n\t"
"vpcmpeqd %%zmm1, %%zmm1, %%zmm1\n\t"
"vpsubq %%zmm1, %0, %0\n\t"
:"=x"(a)
:
: "zmm1" );
return a;
}
#define m512_one_64 m512_one_64_fn()
static inline __m512i m512_one_32_fn()
{
__m512i a;
asm( "vpxord %0, %0, %0\n\t"
"vpcmpeqd %%zmm1, %%zmm1, %%zmm1\n\t"
"vpsubd %%zmm1, %0, %0\n\t"
:"=x"(a)
:
: "zmm1" );
return a;
}
#define m512_one_32 m512_one_32_fn()
static inline __m512i m512_one_16_fn()
{
__m512i a;
asm( "vpxord %0, %0, %0\n\t"
"vpcmpeqd %%zmm1, %%zmm1, %%zmm1\n\t"
"vpsubw %%zmm1, %0, %0\n\t"
:"=x"(a)
:
: "zmm1" );
return a;
}
#define m512_one_16 m512_one_16_fn()
static inline __m512i m512_one_8_fn()
{
__m512i a;
asm( "vpxord %0, %0, %0\n\t"
"vpcmpeqd %%zmm1, %%zmm1, %%zmm1\n\t"
"vpsubb %%zmm1, %0, %0\n\t"
:"=x"(a)
:
: "zmm1" );
return a;
}
#define m512_one_8 m512_one_8_fn()
static inline __m512i m512_neg1_fn()
/*
// EVEX vcmpeqq returns a bit mask instead of a vector
static inline __m512i mm512_neg1_fn()
{
__m512i a;
asm( "vpcmpeqq %0, %0, %0\n\t"
:"=x"(a) );
asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) );
return a;
}
#define m512_neg1 m512_neg1_fn()
#define m512_neg1 mm512_neg1_fn()
*/
//
// Basic operations without SIMD equivalent
@@ -142,11 +153,6 @@ static inline __m512i m512_neg1_fn()
#define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x )
#define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x )
#define mm256_extr_lo256_512( a ) _mm512_castsi512_si256( a )
#define mm256_extr_hi256_512( a ) _mm512_extracti64x4_epi64( a, 1 )
#define mm128_extr_lo128_512( a ) _mm512_castsi512_si256( a )
//
// Pointer casting
@@ -168,7 +174,7 @@ static inline __m512i m512_neg1_fn()
#define casto_m512i(p,o) (((__m512i*)(p))+(o))
// Add 4 values, fewer dependencies than sequential addition.
// Sum 4 values, fewer dependencies than sequential addition.
#define mm512_add4_64( a, b, c, d ) \
_mm512_add_epi64( _mm512_add_epi64( a, b ), _mm512_add_epi64( c, d ) )
@@ -186,16 +192,44 @@ static inline __m512i m512_neg1_fn()
_mm512_xor_si512( _mm512_xor_si256( a, b ), _mm512_xor_si256( c, d ) )
// Horizontal vector testing
// Returns bit mask
#define mm512_allbits0( a ) _mm512_cmpeq_epi64_mask( a, m512_zero )
#define mm512_allbits1( a ) _mm512_cmpeq_epi64_mask( a, m512_neg1 )
#define mm512_anybits0( a ) _mm512_cmpneq_epi64_mask( a, m512_neg1 )
#define mm512_anybits1( a ) _mm512_cmpneq_epi64_mask( a, m512_zero )
//
// Bit rotations.
// AVX512F has built-in bit fixed and variable rotation for 64 & 32 bit
// elements. There is no bit rotation or shift for larger elements.
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
// elements and can be called directly. But they only accept immediate 8
// for control arg.
//
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
//
// Here is a bit rotate for 16 bit elements:
#define mm512_ror_var_64( v, c ) \
_mm512_or_si512( _mm512_srli_epi64( v, c ), \
_mm512_slli_epi64( v, 64-(c) ) )
#define mm512_rol_var_64( v, c ) \
_mm512_or_si512( _mm512_slli_epi64( v, c ), \
_mm512_srli_epi64( v, 64-(c) ) )
#define mm512_ror_var_32( v, c ) \
_mm512_or_si512( _mm512_srli_epi32( v, c ), \
_mm512_slli_epi32( v, 32-(c) ) )
#define mm512_rol_var_32( v, c ) \
_mm512_or_si512( _mm512_slli_epi32( v, c ), \
_mm512_srli_epi32( v, 32-(c) ) )
// Here is a fixed bit rotate for 16 bit elements:
#define mm512_ror_16( v, c ) \
_mm512_or_si512( _mm512_srli_epi16( v, c ), \
_mm512_slli_epi16( v, 16-(c) )
@@ -203,6 +237,59 @@ static inline __m512i m512_neg1_fn()
_mm512_or_si512( _mm512_slli_epi16( v, c ), \
_mm512_srli_epi16( v, 16-(c) )
// Rotations using a vector control index are very slow due to overhead
// to generate the index vector. Repeated rotations using the same index
// are better handled by the calling function where the index only needs
// to be generated once then reused very efficiently.
// Permutes and shuffles using an immediate index are significantly faster.
//
// Swap bytes in vector elements, vectorized endian conversion.
#define mm512_bswap_64( v ) \
_mm512_shuffle_epi8( v, m512_const2_64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
#define mm512_bswap_32( v ) \
_mm512_shuffle_epi8( v, m512_const2_64( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
#define mm512_bswap_16( v ) \
_mm512_shuffle_epi8( v, m512_const2_64( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) )
// Source and destination are pointers, may point to same memory.
// 8 lanes of 64 bytes each
#define mm512_block_bswap_64( d, s ) do \
{ \
__m512i ctl = m512_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
} while(0)
// 16 lanes of 32 bytes each
#define mm512_block_bswap_32( d, s ) do \
{ \
__m512i ctl = m512_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
} while(0)
//
// Rotate elements in 512 bit vector.
@@ -219,63 +306,66 @@ static inline __m512i m512_neg1_fn()
// Generic for odd rotations
#define mm512_ror_x64( v, n ) _mm512_alignr_epi64( v, v, n )
#define mm512_rol_x64( v, n ) _mm512_alignr_epi64( v, v, 8-n )
#define mm512_ror_x32( v, n ) _mm512_alignr_epi32( v, v, n )
#define mm512_rol_x32( v, n ) _mm512_alignr_epi32( v, v, 16-n )
#define mm512_ror_1x16( v ) \
_mm512_permutexvar_epi16( v, m512_const_64( \
_mm512_permutexvar_epi16( m512_const_64( \
0x0000001F001E001D, 0x001C001B001A0019, \
0X0018001700160015, 0X0014001300120011, \
0X0010000F000E000D, 0X000C000B000A0009, \
0X0008000700060005, 0X0004000300020001 ) )
0X0008000700060005, 0X0004000300020001 ), v )
#define mm512_rol_1x16( v ) \
_mm512_permutexvar_epi16( v, m512_const_64( \
_mm512_permutexvar_epi16( m512_const_64( \
0x001E001D001C001B, 0x001A001900180017, \
0X0016001500140013, 0X001200110010000F, \
0X000E000D000C000B, 0X000A000900080007, \
0X0006000500040003, 0X000200010000001F ) )
0X0006000500040003, 0X000200010000001F ), v )
#define mm512_ror_1x8( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
_mm512_shuffle_epi8( v, m512_const_64( \
0x003F3E3D3C3B3A39, 0x3837363534333231, \
0x302F2E2D2C2B2A29, 0x2827262524232221, \
0x201F1E1D1C1B1A19. 0x1817161514131211, \
0x100F0E0D0C0B0A09, 0x0807060504030201 ) )
#define mm512_rol_1x8( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
_mm512_shuffle_epi8( v, m512_const_64( \
0x3E3D3C3B3A393837, 0x363534333231302F. \
0x2E2D2C2B2A292827, 0x262524232221201F, \
0x1E1D1C1B1A191817, 0x161514131211100F, \
0x0E0D0C0B0A090807, 0x060504030201003F ) )
// Invert vector: {3,2,1,0} -> {0,1,2,3}
#define mm512_invert_128( v ) _mm512_permute4f128_epi32( a, 0x1b )
#define mm512_invert_256( v ) \
_mm512_permutexvar_epi64( v, m512_const_64( 3,2,1,0,7,6,5,4 ) )
#define mm512_invert_128( v ) \
_mm512_permutexvar_epi64( v, m512_const_64( 1,0,3,2,5,4,7,6 ) )
#define mm512_invert_64( v ) \
_mm512_permutex_epi64( v, m512_const_64( 0,1,2,3,4,5,6,7 ) )
_mm512_permutexvar_epi64( v, m512_const_64( 0,1,2,3,4,5,6,7 ) )
#define mm512_invert_32( v ) \
_mm512_permutexvar_epi32( v, m512_const_64( \
_mm512_permutexvar_epi32( m512_const_64( \
0x0000000000000001,0x0000000200000003, \
0x0000000400000005,0x0000000600000007, \
0x0000000800000009,0x0000000a0000000b, \
0x0000000c0000000d,0x0000000e0000000f ) )
0x0000000c0000000d,0x0000000e0000000f ), v )
#define mm512_invert_16( v ) \
_mm512_permutexvar_epi16( v, m512_const_64( \
_mm512_permutexvar_epi16( m512_const_64( \
0x0000000100020003, 0x0004000500060007, \
0x00080009000A000B, 0x000C000D000E000F, \
0x0010001100120013, 0x0014001500160017, \
0x00180019001A001B, 0x001C001D001E001F ) )
0x00180019001A001B, 0x001C001D001E001F ), v )
#define mm512_invert_8( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
_mm512_shuffle_epi8( v, m512_const_64( \
0x0001020304050607, 0x08090A0B0C0D0E0F, \
0x1011121314151617, 0x18191A1B1C1D1E1F, \
0x2021222324252627, 0x28292A2B2C2D2E2F, \
@@ -293,84 +383,60 @@ static inline __m512i m512_neg1_fn()
// Rotate 256 bit lanes by one 32 bit element
#define mm512_ror1x32_256( v ) \
_mm512_permutexvar_epi32( v, m512_const_64( \
0x000000080000000f, 0x0000000e0000000d, \
0x0000000c0000000b, 0x0000000a00000009, \
_mm512_permutexvar_epi32( m512_const4_64( \
0x0000000000000007, 0x0000000600000005, \
0x0000000400000003, 0x0000000200000001 ) )
0x0000000400000003, 0x0000000200000001 ), v )
#define mm512_rol1x32_256( v ) \
_mm512_permutexvar_epi32( v, m512_const_64( \
0x0000000e0000000d, 0x0000000c0000000b, \
0x0000000a00000009, 0x000000080000000f, \
_mm512_permutexvar_epi32( m512_const4_64( \
0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ) )
0x0000000200000001, 0x0000000000000007 ), v )
#define mm512_ror1x16_256( v ) \
_mm512_permutexvar_epi16( v, m512_const_64( \
0x0010001F001E001D, 0x001C001B001A0019, \
0x0018001700160015, 0x0014001300120011, \
0x0000000F000E000D, 0x000C000B000A0009, \
0x0008000700060005, 0x0004000300020001 ) )
_mm512_permutexvar_epi16( m512_const4_64( \
0x0000000f000e000d, 0x000c000b000a0009, \
0x0008000700060005, 0x0004000300020001 ), v )
#define mm512_rol1x16_256( v ) \
_mm512_permutexvar_epi16( v, m512_const_64( \
0x001E001D001C001B, 0x001A001900180017, \
0x0016001500140013, 0x001200110000000F, \
0x000E000D000C000B, 0x000A000900080007, \
0x0006000500040003, 0x000200010000001F ) )
_mm512_permutexvar_epi16( m512_const4_64( \
0x000e000d000c000b, 0x000a000900080007, \
0x0006000500040003, 0x000200010000000f ), v )
#define mm512_ror1x8_256( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
0x203F3E3D3C3B3A39, 0x3837363534333231, \
0x302F2E2D2C2B2A29, 0x2827262524232221, \
0x001F1E1D1C1B1A19, 0x1817161514131211, \
0x100F0E0D0C0B0A09, 0x0807060504030201 ) )
_mm512_shuffle_epi8( v, m512_const4_64( \
0x001f1e1d1c1b1a19, 0x1817161514131211, \
0x100f0e0d0c0b0a09, 0x0807060504030201 ), v )
#define mm512_rol1x8_256( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
0x3E3D3C3B3A393837, 0x363534333231302F, \
0x2E2D2C2B2A292827, 0x262524232221203F, \
0x1E1D1C1B1A191817, 0x161514131211100F, \
0x0E0D0C0B0A090807, 0x060504030201001F ) )
_mm512_shuffle_epi8( v, m512_const4_64( \
0x1e1d1c1b1a191817, 0x161514131211100f, \
0x0e0d0c0b0a090807, 0x060504030201001f ), v )
//
// Rotate elements within 128 bit lanes of 512 bit vector.
// Swap hi & lo 64 bits in each 128 bit lane
#define mm512_swap64_128( v ) _mm512_permutex_epi64( v, 0xb1 )
#define mm512_swap64_128( v ) _mm512_shuffle_epi32( v, 0x4e )
// Rotate 128 bit lanes by one 32 bit element
#define mm512_ror1x32_128( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_rol1x32_128( v ) _mm512_shuffle_epi32( v, 0x93 )
#define mm512_ror1x16_128( v ) \
_mm512_permutexvar_epi16( v, m512_const_64( \
0x0018001F001E001D, 0x001C001B001A0019, \
0x0010001700160015, 0x0014001300120011, \
0x0008000F000E000D, 0x000C000B000A0009, \
0x0000000700060005, 0x0004000300020001 ) )
_mm512_permutexvar_epi16( m512_const2_64( \
0x0000000700060005, 0x0004000300020001 ), v )
#define mm512_rol1x16_128( v ) \
_mm512_permutexvar_epi16( v, m512_const_64( \
0x001E001D001C001B, 0x001A00190018001F, \
0x0016001500140013, 0x0012001100100017, \
0x000E000D000C000B, 0x000A00090008000F, \
0x0006000500040003, 0x0002000100000007 ) )
_mm512_permutexvar_epi16( m512_const2_64( \
0x0006000500040003, 0x0002000100000007 ), v )
#define mm512_ror1x8_128( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
0x303F3E3D3C3B3A39, 0x3837363534333231, \
0x202F2E2D2C2B2A29, 0x2827262524232221, \
0x101F1E1D1C1B1A19, 0x1817161514131211, \
0x000F0E0D0C0B0A09, 0x0807060504030201 ) )
_mm512_shuffle_epi8( v, m512_const2_64( \
0x000f0e0d0c0b0a09, 0x0807060504030201 ) )
#define mm512_rol1x8_128( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
0x3E3D3C3B3A393837, 0x363534333231303F, \
0x2E2D2C2B2A292827, 0x262524232221202F, \
0x1E1D1C1B1A191817, 0x161514131211101F, \
0x0E0D0C0B0A090807, 0x060504030201000F ) )
_mm512_shuffle_epi8( v, m512_const2_64( \
0x0e0d0c0b0a090807, 0x060504030201000f ) )
// Rotate 128 bit lanes by c bytes.
#define mm512_bror_128( v, c ) \
@@ -387,32 +453,30 @@ static inline __m512i m512_neg1_fn()
// Swap 32 bit elements in each 64 bit lane
#define mm512_swap32_64( v ) _mm512_shuffle_epi32( v, 0xb1 )
// _mm512_set_epi8 doesn't seem to work
// Rotate each 64 bit lane by one 16 bit element.
#define mm512_ror1x16_64( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
0x39383F3E3D3C3B3A, 0x3130373635343332, \
0x29282F2E2D2C2B2A, 0x2120272625242322, \
0x19181F1E1D1C1B1A, 0x1110171615141312, \
0x09080F0E0D0C0B0A, 0x0100070605040302 ) )
_mm512_permutexvar_epi16( m512_const_64( \
0x001c001f001e001d, 0x0018001b001a0019, \
0x0014001700160015, 0x0010001300120011, \
0x000c000f000e000d, 0x0008000b000a0009, \
0x0004000700060005, 0x0000000300020001, v )
#define mm512_rol1x16_64( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
0x3D3C3B3A39383F3E, 0x3534333231303736 \
0x2D2C2B2A29282F2E, 0x2524232221202726 \
0x1D1C1B1A19181F1E, 0x1514131211101716 \
0x0D0C0B0A09080F0E, 0x0504030201000706 ) )
_mm512_permutexvar_epi16( m512_const_64( \
0x001e001d001c001f, 0x001a00190018001b, \
0x0016001500140017, 0x0012001100100013, \
0x000e000d000c000f, 0x000a00090008000b, \
0x0006000500040007, 0x0002000100000003, v )
// Rotate each 64 bit lane by one byte.
#define mm512_ror1x8_64( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
_mm512_shuffle_epi8( v, m512_const_64( \
0x383F3E3D3C3B3A39, 0x3037363534333231, \
0x282F2E2D2C2B2A29, 0x2027262524232221, \
0x181F1E1D1C1B1A19, 0x1017161514131211, \
0x080F0E0D0C0B0A09, 0x0007060504030201 ) )
#define mm512_rol1x8_64( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
_mm512_shuffle( v, m512_const_64( \
0x3E3D3C3B3A39383F, 0x3635343332313037, \
0x2E2D2C2B2A29282F, 0x2625242322212027, \
0x1E1D1C1B1A19181F, 0x1615141312111017, \
@@ -422,55 +486,31 @@ static inline __m512i m512_neg1_fn()
// Rotate elements within 32 bit lanes.
#define mm512_swap16_32( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
0x001D001C001F001E, 0x00190018001B001A, \
0x0015001400170016, 0x0011001000130012, \
0x000D000C000F000E, 0x00190008000B000A, \
0x0005000400070006, 0x0011000000030002 ) )
_mm512_permutexvar_epi16( m512_const_64( \
0x001e001f001c001d, 0x001a001b00180019, \
0x0016001700140015, 0x0012001300100011, \
0x000e000f000c000d, 0x000a000b00080009, \
0x0006000700040005, 0x0002000300000001 ), v )
#define mm512_ror1x8_32( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
_mm512_shuffle_epi8( v, m512_const_64( \
0x3C3F3E3D383B3A39, 0x3437363530333231, \
0x2C2F2E2D282B2A29, 0x2427262520232221, \
0x1C1F1E1D181B1A19, 0x1417161510131211, \
0x0C0F0E0D080B0A09, 0x0407060500030201 ) )
0x0C0F0E0D080B0A09, 0x0407060500030201 ))
#define mm512_rol1x8_32( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
_mm512_shuffle_epi8( v, m512_const_64( \
0x3E3D3C3F3A39383B, 0x3635343732313033, \
0x2E2D2C2F2A29282B, 0x2625242722212023, \
0x1E1D1C1F1A19181B, 0x1615141712111013, \
0x0E0D0C0F0A09080B, 0x0605040702010003 ) )
//
// Swap bytes in vector elements, vectorized bswap.
#define mm512_bswap_64( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
0x38393A3B3C3D3E3F, 0x2031323334353637, \
0x28292A2B2C2D2E2F, 0x2021222334353637, \
0x18191A1B1C1D1E1F, 0x1011121314151617, \
0x08090A0B0C0D0E0F, 0x0001020304050607 ) )
#define mm512_bswap_32( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
0x3C3D3E3F38393A3B, 0x3435363730313233, \
0x3C3D3E3F38393A3B, 0x3435363730313233, \
0x3C3D3E3F38393A3B, 0x3435363730313233, \
0x3C3D3E3F38393A3B, 0x3435363730313233 ) )
#define mm512_bswap_16( v ) \
_mm512_permutexvar_epi8( v, m512_const_64( \
0x3E3F3C3D3A3B3839, 0x3637343532333031, \
0x2E2F2C2D2A2B2829, 0x2627242522232021, \
0x1E1F1C1D1A1B1819, 0x1617141512131011, \
0x0E0F0C0D0A0B0809, 0x0607040502030001 ) )
//
// Rotate elements from 2 512 bit vectors in place, source arguments
// are overwritten.
// These can all be done with 2 permutex2var instructions but they are
// slower than either xor or alignr.
// slower than either xor or alignr and require AVX512VBMI.
#define mm512_swap512_1024(v1, v2) \
v1 = _mm512_xor_si512(v1, v2); \
@@ -533,33 +573,5 @@ do { \
v1 = t; \
} while(0)
#define mm512_ror1x16_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi8( v1, v2, 2 ); \
v1 = _mm512_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm512_rol1x16_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi8( v1, v2, 62 ); \
v2 = _mm512_alignr_epi8( v2, v1, 62 ); \
v1 = t; \
} while(0)
#define mm512_ror1x8_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi8( v1, v2, 1 ); \
v1 = _mm512_alignr_epi8( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm512_rol1x8_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi8( v1, v2, 63 ); \
v2 = _mm512_alignr_epi8( v2, v1, 63 ); \
v1 = t; \
} while(0)
#endif // AVX512
#endif // SIMD_512_H__

View File

@@ -1,5 +1,5 @@
#if !defined(SIMD_SCALAR_H__)
#define SIMD_SCALAR_H__ 1
#if !defined(SIMD_INT_H__)
#define SIMD_INT_H__ 1
///////////////////////////////////
//
@@ -13,6 +13,8 @@
// Some utilities are also provided for smaller integers, most notably
// bit rotation.
// MMX has no extract instruction for 32 bit elements so this:
// Lo is trivial, high is a simple shift.
// Input may be uint64_t or __m64, returns uint32_t.
@@ -56,18 +58,45 @@ static inline void memset_zero_64( uint64_t *src, int n )
static inline void memset_64( uint64_t *dst, const uint64_t a, int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
#if defined (GCC_INT128)
///////////////////////////////////////
//
// 128 bit integers
//
// 128 bit integers are inneficient and not a shortcut for __m128i.
// Native type __int128 supported starting with GCC-4.8.
//
// __int128 uses two 64 bit GPRs to hold the data. The main benefits are
// for 128 bit arithmetic. Vectors are preferred when 128 bit arith
// is not required. int128 also works better with other integer sizes.
// Vectors benefit from wider registers.
//
// For safety use typecasting on all numeric arguments.
//
// Use typecasting for conversion to/from 128 bit vector:
// __m128i v128 = (__m128i)my_int128l
// __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 );
// my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );
// No real need or use.
//#define u128_neg1 ((uint128_t)(-1))
// Compiler check for __int128 support
// Configure also has a test for int128.
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
#define GCC_INT128 1
#endif
// usefull for making constants.
#if !defined(GCC_INT128)
#warning "__int128 not supported, requires GCC-4.8 or newer."
#endif
#if defined(GCC_INT128)
// Familiar looking type names
typedef __int128 int128_t;
typedef unsigned __int128 uint128_t;
// Maybe usefull for making constants.
#define mk_uint128( hi, lo ) \
( ( (uint128_t)(hi) << 64 ) | ( (uint128_t)(lo) ) )
@@ -92,6 +121,6 @@ static inline void memset_64( uint64_t *dst, const uint64_t a, int n )
#endif // GCC_INT128
#endif // SIMD_SCALAR_H__
#endif // SIMD_INT_H__

View File

@@ -1,389 +0,0 @@
//////////////////////////////////////
//
// Type abstraction overlays designed for use in highly optimized
// straight line code operating on array structures. It uses direct
// struct member access instead of indexing to access array elements.
// Ex: array.u32_3 instead of array[3].
//
// Vector types are used to represent asrrays. 64 and 128 bit vectors have
// corresponding 64 and 128 bit integer types.
//
// Data accesses are not tied to memory as arrays are. Thes structures
// can operate comfortably as reguietr variables.
//
// Although the abstraction makes for transparent usage there is overhead.
// Extra move instructins are required when an operation requires a
// different register type. Additionaly 128 bit operations, uint128_t
// and AES, can't be done in parallel with a 256 bit or lager vector.
// The require additionalmove instructions in addition to the lack of
// improvement from parallelism.
//
// Move instruction overhead is required when moving among gpr, mmx
// and xmm registers. The number of extra moves is usually the number
// of elements inthe vector. If bothe are the same size onlu one move
// is required. The number is doubled if the data is moved back.
//
// xmm and ymm resgisters are special, they are aliased. xmm registers
// overlay the lower 128 bits of the ymm registers. Accessing the data
// in the lower half of a ymm register by an xmm argument is free.
// The upper 128 bits need to be extracted and inserted like with other
// different sized data types.
//
// Integer types can be converted to differently sized integers without
// penalty.
//
// Conversions with penalty should be avoided as much possible by grouping
// operations requiring the same register set.
//
// There are two algorithms for extracting and inserting data.
//
// There isthe straightforward iterative meathod wher each element is
// extracted or inserted in turn. The compiler evidently take a different
// aproach based on assembly code generated by a set intrinsic.
// To extract 64 bit or smaller elements from a 256 bit vector the
// first extracts the upper 128 bit into a second xmm register. This
// eliminates a dependency between the upper and lower elements allowing
// the CPU more opportunity at multiple operations per clock.
// This adds one additional instruction to the process. With AVX512 an
// another stege is added by first splitting up the 512 bit vector into
// 2 256 bit vectors,
//
// xmm/ymm aliasing makes accessing low half trivial and without cost.
// Accessing the upper half requires a move from the upper half of
// the source register to the lower half of the destination.
// It's a bigger issue with GPRs as there is no aliasing.
//
// Theoretically memory resident data could bypass the move and load
// the data directly into the desired register type. However this
// ignores the overhead to ensure coherency between register and memory
// wich is significantly more.
//
// Overlay avoids pointer dereferences and favours register move over
// memory load, notwistanding compiler optimization.
//
// The syntax is ugly but can be abstracted with macros.
// Universal 64 bit overlay
// Avoids arrays and pointers, suitable as register variable.
// Conversions are transparent but not free, cost is one MOV instruction.
// Facilitates manipulating 32 bit data in 64 bit pairs.
// Allows full use of 64 bit registers for 32 bit data, effectively doubling
// the size of the register set.
// Potentially up to 50% reduction in instructions depending on rate of
// conversion.
///////////////////////////////////////////////////////
//
// 128 bit integer
//
// Native type __int128 supported starting with GCC-4.8.
//
// __int128 uses two 64 bit GPRs to hold the data. The main benefits are
// for 128 bit arithmetic. Vectors are preferred when 128 bit arith
// is not required. int128 also works better with other integer sizes.
// Vectors benefit from wider registers.
//
// For safety use typecasting on all numeric arguments.
//
// Use typecasting for conversion to/from 128 bit vector:
// __m128i v128 = (__m128i)my_int128l
// __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 );
// my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );
// Compiler check for __int128 support
// Configure also has a test for int128.
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
#define GCC_INT128 1
#endif
#if !defined(GCC_INT128)
#warning "__int128 not supported, requires GCC-4.8 or newer."
#endif
#if defined(GCC_INT128)
// Familiar looking type names
typedef __int128 int128_t;
typedef unsigned __int128 uint128_t;
#endif
/////////////////////////////////////
//
// MMX 64 bit vector
//
// Emulates uint32_t[2]
struct _regarray_u32x2
{
uint32_t _0; uint32_t _1;
};
typedef struct _regarray_u32x2 regarray_u32x2;
// Emulates uint16_t[4]
struct _regarray_u16x4
{
uint16_t _0; uint16_t _1; uint16_t _2; uint16_t _3;
};
typedef struct _regarray_u16x4 regarray_u16x4;
// Emulates uint8_t[8]
struct _regarray_u8x8
{
uint8_t _0; uint8_t _1; uint8_t _2; uint8_t _3;
uint8_t _4; uint8_t _5; uint8_t _6; uint8_t _7;
};
typedef struct _regarray_u8x8 regarray_u8x8;
// universal 64 bit overlay
union _regarray_64
{
regarray_u32x2 u32_; // uint32_t[2]
regarray_u16x4 u16_; // uint16_t[4]
regarray_u8x8 u8_; // uint8_t[8]
uint64_t u64;
__m64 v64;
};
typedef union _regarray_64 regarray_64;
/////
//
// SSE2
// Universal 128 bit overlay
//
// Avoids arrays and pointers, suitable as register variable.
// Designed for speed in straight line code with no loops.
//
// Conversions are transparent but not free, cost is one MOV instruction
// in each direction, except for lower half of ymm to/from xmm which are
// free.
//
// Facilitates two dimensional vectoring.
//
// 128 bit integer and AES can't be done in parallel. AES suffers extraction
// and insertion of the upper 128 bits. uint128_t suffers 4 times the cost
// with 2 64 bit extractions and 2 insertions for each 128 bit lane with
// single stage ymm <--> gpr for a total of 8 moves.
//
// Two stage conversion is possible which helps CPU instruction scheduling
// by removing a register dependency between the upper and lower 128 at the
// cost of two extra instructions (128 bit extract and insert. The compiler
// seems to prefer the 2 staged approach when using the set intrinsic.
// Use macros to simplify array access emulation.
// emulated array type: uint64_t a[4];
// array indexing: a[0], a[1]
// overlay emulation: a.u64_0, a.u64_1
// without macro: a.u64_._0, a.u64_._1
struct _regarray_u64x2
{
uint64_t _0; uint64_t _1;
};
typedef struct _regarray_u64x2 regarray_u64x2;
struct _regarray_v64x2
{
__m64 _0; __m64 _1;
};
typedef struct _regarray_v64x2 regarray_v64x2;
struct _regarray_u32x4
{
uint32_t _0; uint32_t _1; uint32_t _2; uint32_t _3;
};
typedef struct _regarray_u32x2 regarray_u32x4;
struct _regarray_u16x8
{
uint16_t _0; uint16_t _1; uint16_t _2; uint16_t _3;
uint16_t _4; uint16_t _5; uint16_t _6; uint16_t _7;
};
typedef struct _regarray_u16x4 regarray_u16x4;
struct _regarray_u8x16
{
uint8_t _0; uint8_t _1; uint8_t _2; uint8_t _3;
uint8_t _4; uint8_t _5; uint8_t _6; uint8_t _7;
uint8_t _8; uint8_t _9; uint8_t _a; uint8_t _b;
uint8_t _c; uint8_t _d; uint8_t _e; uint8_t _f;
};
typedef struct _regarray_u8x16 regarray_u8x16;
union _register_array_m128v
{
#if defined(GCC_INT128)
uint128_t u128;
#endif
__m128i v128;
regarray_u64x2 u64_; // uint64_t[2]
regarray_v64x2 v64_; // __m64[2]
regarray_u32x4 u32_; // uint32_t[4]
regarray_u16x4 u16_; // uint16_t[8]
regarray_u8x16 u8_; // uint8_t[16]
};
typedef union _register_array_m128v register_array_m128v;
///////////////////
//
// AVX2
//
struct _regarray_v128x2
{
__m128i _0; __m128i _1;
};
typedef struct _regarray_v128x2 regarray_v128x2;
struct _regarray_u128x2
{
uint128_t _0; uint128_t _1;
};
typedef struct _regarray_u128x2 regarray_u128x2;
struct _regarray_u64x4
{
uint64_t _0; uint64_t _1; uint64_t _2; uint64_t _3;
};
typedef struct _regarray_u64x4 regarray_u64x4;
struct _regarray_v64x4
{
__m64 _0; __m64 _1; __m64 _2; __m64 _3;
};
typedef struct _regarray_v64x4 regarray_v64x4;
struct _regarray_u32x8
{
uint32_t _0; uint32_t _1; uint32_t _2; uint32_t _3;
uint32_t _4; uint32_t _5; uint32_t _6; uint32_t _7;
};
typedef struct _regarray_u32x8 regarray_u32x8;
struct _regarray_u16x16
{
uint16_t _0; uint16_t _1; uint16_t _2; uint16_t _3;
uint16_t _4; uint16_t _5; uint16_t _6; uint16_t _7;
uint16_t _8; uint16_t _9; uint16_t _a; uint16_t _b;
uint16_t _c; uint16_t _d; uint16_t _e; uint16_t _f;
};
typedef struct _regarray_u16x16 regarray_u16x16;
struct _regarray_u8x32
{
uint8_t _00; uint8_t _01; uint8_t _02; uint8_t _03;
uint8_t _04; uint8_t _05; uint8_t _06; uint8_t _07;
uint8_t _08; uint8_t _09; uint8_t _0a; uint8_t _0b;
uint8_t _0c; uint8_t _0d; uint8_t _0e; uint8_t _0f;
uint8_t _10; uint8_t _11; uint8_t _12; uint8_t _13;
uint8_t _14; uint8_t _15; uint8_t _16; uint8_t _17;
uint8_t _18; uint8_t _19; uint8_t _1a; uint8_t _1b;
uint8_t _1c; uint8_t _1d; uint8_t _1e; uint8_t _1f;
};
typedef struct _regarray_u8x32 regarray_u8x32;
union _regarray_v256
{
__m256i v256;
#if defined(GCC_INT128)
regarray_u128x2 u128_; // uint128_t[2]
#endif
regarray_v128x2 v128_; // __m128i[2]
regarray_v64x4 v64_;
regarray_u64x4 u64_;
regarray_u32x8 u32_;
regarray_u16x16 u16_;
regarray_u8x32 u8_;
};
typedef union _regarray_v256 regarray_v256;
////////////
//
// Abstraction macros to allow easy readability.
// Users may define their own list to suit their preferences
// such as, upper case hex, leading zeros, multidimensional,
// alphabetic, day of week, etc..
#define v128_0 v128_._0
#define v128_1 v128_._1
#define u128_0 u128_._0
#define u128_1 u128_._1
#define v64_0 v64_._0
#define v64_1 v64_._1
#define v64_2 v64_._2
#define v64_3 v64_._3
#define u64_0 u64_._0
#define u64_1 u64_._1
#define u64_2 u64_._2
#define u64_3 u64_._3
#define u32_0 u32_._0
#define u32_1 u32_._1
#define u32_2 u32_._2
#define u32_3 u32_._3
#define u32_4 u32_._4
#define u32_5 u32_._5
#define u32_6 u32_._6
#define u32_7 u32_._7
#define u16_0 u16_._0
#define u16_1 u16_._1
#define u16_2 u16_._2
#define u16_3 u16_._3
#define u16_4 u16_._4
#define u16_5 u16_._5
#define u16_6 u16_._6
#define u16_7 u16_._7
#define u16_8 u16_._8
#define u16_9 u16_._9
#define u16_a u16_._a
#define u16_b u16_._b
#define u16_c u16_._c
#define u16_d u16_._d
#define u16_e u16_._e
#define u16_f u16_._f
#define u8_00 u8_._00
#define u8_01 u8_._01
#define u8_02 u8_._02
#define u8_03 u8_._03
#define u8_04 u8_._04
#define u8_05 u8_._05
#define u8_06 u8_._06
#define u8_07 u8_._07
#define u8_08 u8_._08
#define u8_09 u8_._09
#define u8_0a u8_._0a
#define u8_0b u8_._0b
#define u8_0c u8_._0c
#define u8_0d u8_._0d
#define u8_0e u8_._0e
#define u8_0f u8_._0f
#define u8_10 u8_._10
#define u8_11 u8_._11
#define u8_12 u8_._12
#define u8_13 u8_._13
#define u8_14 u8_._14
#define u8_15 u8_._15
#define u8_16 u8_._16
#define u8_17 u8_._17
#define u8_18 u8_._18
#define u8_19 u8_._19
#define u8_1a u8_._1a
#define u8_1b u8_._1b
#define u8_1c u8_._1c
#define u8_1d u8_._1d
#define u8_1e u8_._1e
#define u8_1f u8_._1f

View File

@@ -1,8 +1,13 @@
#if !defined(SYSINJFOS_C___)
#define SYSINFOS_C__
/**
* Unit to read cpu informations
*
* tpruvot 2014
*/
* JayDDee 2019
*
*/
#include <stdio.h>
#include <ctype.h>
@@ -28,7 +33,7 @@
#define HWMON_ALT5 \
"/sys/class/hwmon/hwmon0/device/temp1_input"
static float linux_cputemp(int core)
static inline float linux_cputemp(int core)
{
float tc = 0.0;
FILE *fd = fopen(HWMON_PATH, "r");
@@ -60,7 +65,7 @@ static float linux_cputemp(int core)
#define CPUFREQ_PATH \
"/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq"
static uint32_t linux_cpufreq(int core)
static inline uint32_t linux_cpufreq(int core)
{
FILE *fd = fopen(CPUFREQ_PATH, "r");
uint32_t freq = 0;
@@ -76,7 +81,7 @@ static uint32_t linux_cpufreq(int core)
#else /* WIN32 */
static float win32_cputemp(int core)
static inline float win32_cputemp(int core)
{
// todo
return 0.0;
@@ -88,7 +93,7 @@ static float win32_cputemp(int core)
/* exports */
float cpu_temp(int core)
static inline float cpu_temp(int core)
{
#ifdef WIN32
return win32_cputemp(core);
@@ -97,7 +102,7 @@ float cpu_temp(int core)
#endif
}
uint32_t cpu_clock(int core)
static inline uint32_t cpu_clock(int core)
{
#ifdef WIN32
return 0;
@@ -106,7 +111,7 @@ uint32_t cpu_clock(int core)
#endif
}
int cpu_fanpercent()
static inline int cpu_fanpercent()
{
return 0;
}
@@ -142,7 +147,7 @@ static inline void cpuid(int functionnumber, int output[4]) {
#define cpuid(fn, out) out[0] = 0;
#endif
void cpu_getname(char *outbuf, size_t maxsz)
static inline void cpu_getname(char *outbuf, size_t maxsz)
{
memset(outbuf, 0, maxsz);
#ifdef WIN32
@@ -190,7 +195,7 @@ void cpu_getname(char *outbuf, size_t maxsz)
#endif
}
void cpu_getmodelid(char *outbuf, size_t maxsz)
static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
{
memset(outbuf, 0, maxsz);
#ifdef WIN32
@@ -259,32 +264,47 @@ void cpu_getmodelid(char *outbuf, size_t maxsz)
#define CPU_BRAND_2 (0x80000003)
#define CPU_BRAND_3 (0x80000004)
// Registers
#define EAX_Reg (0)
#define EBX_Reg (1)
#define ECX_Reg (2)
#define EDX_Reg (3)
#define XSAVE_Flag (1<<26) // ECX
// Feature flags
// CPU_INFO ECX
#define XSAVE_Flag (1<<26)
#define OSXSAVE_Flag (1<<27)
#define AVX1_Flag (1<<28)
#define AVX_Flag (1<<28)
#define XOP_Flag (1<<11)
#define FMA3_Flag (1<<12)
#define AES_Flag (1<<25)
#define SSE42_Flag (1<<20)
// CPU_INFO EDX
#define SSE_Flag (1<<25) // EDX
#define SSE2_Flag (1<<26)
#define AVX2_Flag (1<< 5) // ADV EBX
// EXTENDED_FEATURES EBX
#define AVX2_Flag (1<< 5)
#define AVX512F_Flag (1<<16)
#define AVX512DQ_Flag (1<<17)
#define SHA_Flag (1<<29)
#define AVX512BW_Flag (1<<30)
#define AVX512VL_Flag (1<<31)
// EXTENDED_FEATURES ECX
#define AVX512VBMI_Flag (1<<1)
#define AVX512VBMI2_Flag (1<<6)
#define AVX512VAES_Flag (1<<9)
// Use this to detect presence of feature
#define AVX1_mask (AVX1_Flag|XSAVE_Flag|OSXSAVE_Flag)
#define FMA3_mask (FMA3_Flag|AVX1_mask)
#define AVX_mask (AVX_Flag|XSAVE_Flag|OSXSAVE_Flag)
#define FMA3_mask (FMA3_Flag|AVX_mask)
#define AVX512_mask (AVX512VL_Flag|AVX512BW_Flag|AVX512DQ_Flag|AVX512F_Flag)
static inline bool has_sha_()
static inline bool has_sha()
{
#ifdef __arm__
return false;
@@ -295,10 +315,7 @@ static inline bool has_sha_()
#endif
}
bool has_sha() { return has_sha_(); }
static inline bool has_sse2_()
static inline bool has_sse2()
{
#ifdef __arm__
return false;
@@ -309,10 +326,8 @@ static inline bool has_sse2_()
#endif
}
bool has_sse2() { return has_sse2_(); }
// nehalem and above, no AVX1 on nehalem
static inline bool has_aes_ni_()
// nehalem and above, no AVX on nehalem
static inline bool has_aes_ni()
{
#ifdef __arm__
return false;
@@ -323,24 +338,20 @@ static inline bool has_aes_ni_()
#endif
}
bool has_aes_ni() { return has_aes_ni_(); }
// westmere and above
static inline bool has_avx1_()
static inline bool has_avx()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( CPU_INFO, cpu_info );
return ( ( cpu_info[ ECX_Reg ] & AVX1_mask ) == AVX1_mask );
return ( ( cpu_info[ ECX_Reg ] & AVX_mask ) == AVX_mask );
#endif
}
bool has_avx1() { return has_avx1_(); }
// haswell and above
static inline bool has_avx2_()
static inline bool has_avx2()
{
#ifdef __arm__
return false;
@@ -351,9 +362,7 @@ static inline bool has_avx2_()
#endif
}
bool has_avx2() { return has_avx2_(); }
static inline bool has_avx512f_()
static inline bool has_avx512f()
{
#ifdef __arm__
return false;
@@ -364,24 +373,75 @@ static inline bool has_avx512f_()
#endif
}
bool has_avx512f() { return has_avx512f_(); }
static inline bool has_avx512dq()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512DQ_Flag;
#endif
}
static inline bool has_avx512bw()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512BW_Flag;
#endif
}
static inline bool has_avx512vl()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512VL_Flag;
#endif
}
// Minimum to be useful
static inline bool has_avx512()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return ( ( cpu_info[ EBX_Reg ] & AVX512_mask ) == AVX512_mask );
#endif
}
static inline bool has_avx512vaes()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512VAES_Flag;
#endif
}
// AMD only
static inline bool has_xop_()
static inline bool has_xop()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( CPU_INFO, cpu_info );
cpuid( EXTENDED_CPU_INFO, cpu_info );
return cpu_info[ ECX_Reg ] & XOP_Flag;
#endif
}
bool has_xop() { return has_xop_(); }
static inline bool has_fma3_()
static inline bool has_fma3()
{
#ifdef __arm__
return false;
@@ -392,9 +452,7 @@ static inline bool has_fma3_()
#endif
}
bool has_fma3() { return has_fma3_(); }
static inline bool has_sse42_()
static inline bool has_sse42()
{
#ifdef __arm__
return false;
@@ -405,9 +463,7 @@ static inline bool has_sse42_()
#endif
}
bool has_sse42() { return has_sse42_(); }
static inline bool has_sse_()
static inline bool has_sse()
{
#ifdef __arm__
return false;
@@ -418,16 +474,14 @@ static inline bool has_sse_()
#endif
}
bool has_sse() { return has_sse_(); }
uint32_t cpuid_get_highest_function_number()
static inline uint32_t cpuid_get_highest_function_number()
{
uint32_t cpu_info[4] = {0};
cpuid( VENDOR_ID, cpu_info);
return cpu_info[ EAX_Reg ];
}
void cpuid_get_highest_function( char* s )
static inline void cpuid_get_highest_function( char* s )
{
uint32_t fn = cpuid_get_highest_function_number();
switch (fn)
@@ -449,7 +503,7 @@ void cpuid_get_highest_function( char* s )
}
}
void cpu_bestfeature(char *outbuf, size_t maxsz)
static inline void cpu_bestfeature(char *outbuf, size_t maxsz)
{
#ifdef __arm__
sprintf(outbuf, "ARM");
@@ -459,19 +513,19 @@ void cpu_bestfeature(char *outbuf, size_t maxsz)
cpuid( CPU_INFO, cpu_info );
cpuid( EXTENDED_FEATURES, cpu_info_adv );
if ( has_avx1_() && has_avx2_() )
if ( has_avx() && has_avx2() )
sprintf(outbuf, "AVX2");
else if ( has_avx1_() )
sprintf(outbuf, "AVX1");
else if ( has_fma3_() )
else if ( has_avx() )
sprintf(outbuf, "AVX");
else if ( has_fma3() )
sprintf(outbuf, "FMA3");
else if ( has_xop_() )
else if ( has_xop() )
sprintf(outbuf, "XOP");
else if ( has_sse42_() )
else if ( has_sse42() )
sprintf(outbuf, "SSE42");
else if ( has_sse2_() )
else if ( has_sse2() )
sprintf(outbuf, "SSE2");
else if ( has_sse_() )
else if ( has_sse() )
sprintf(outbuf, "SSE");
else
*outbuf = '\0';
@@ -479,7 +533,7 @@ void cpu_bestfeature(char *outbuf, size_t maxsz)
#endif
}
void cpu_brand_string( char* s )
static inline void cpu_brand_string( char* s )
{
#ifdef __arm__
sprintf( s, "ARM" );
@@ -498,3 +552,5 @@ void cpu_brand_string( char* s )
#endif
}
#endif // SYSINFOS_C__

10
util.c
View File

@@ -24,6 +24,7 @@
#include <unistd.h>
#include <jansson.h>
#include <curl/curl.h>
#include "sysinfos.c"
#include <time.h>
#include <sys/stat.h>
#include <math.h>
@@ -1898,7 +1899,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
hex2bin(sctx->job.proofoffullnode, prooffullnode, 32);
}
sctx->bloc_height = getblocheight(sctx);
sctx->block_height = getblocheight(sctx);
for (i = 0; i < sctx->job.merkle_count; i++)
free(sctx->job.merkle[i]);
@@ -1933,13 +1934,6 @@ static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params)
pthread_mutex_lock(&sctx->work_lock);
sctx->next_diff = diff;
pthread_mutex_unlock(&sctx->work_lock);
/* store for api stats */
stratum_diff = diff;
if ( !opt_quiet )
applog(LOG_BLUE, "Stratum difficulty set to %g", diff);
return true;
}

View File

@@ -24,6 +24,8 @@ ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h
# make release directory and copy selected DLLs.
mkdir release
cp README.txt release/
cp README.md release/
cp RELEASE_NOTES release/
cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/