mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
5 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
73430b13b1 | ||
![]() |
40039386a0 | ||
![]() |
91ec6f1771 | ||
![]() |
a52c5eccf7 | ||
![]() |
86b889e1b0 |
@@ -24,18 +24,10 @@ be installed manually. There may be others, read the error messages they
|
||||
will give a clue as to the missing package.
|
||||
|
||||
The following command should install everything you need on Debian based
|
||||
distributions such as Ubuntu:
|
||||
distributions such as Ubuntu. Fedora and other distributions may have similar
|
||||
but different package names.
|
||||
|
||||
sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake zlib1g-dev
|
||||
|
||||
build-essential (Development Tools package group on Fedora)
|
||||
automake
|
||||
libjansson-dev
|
||||
libgmp-dev
|
||||
libcurl4-openssl-dev
|
||||
libssl-dev
|
||||
lib-thread
|
||||
zlib1g-dev
|
||||
sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev
|
||||
|
||||
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
|
||||
openssl 1.1.0e or higher. Add one of the following, depending on the
|
||||
|
@@ -22,14 +22,13 @@ Step by step...
|
||||
|
||||
Refer to Linux compile instructions and install required packages.
|
||||
|
||||
Additionally, install mingw-64.
|
||||
Additionally, install mingw-w64.
|
||||
|
||||
sudo apt-get install mingw-w64
|
||||
|
||||
|
||||
2. Create a local library directory for packages to be compiled in the next
|
||||
step. Recommended location is $HOME/usr/lib/
|
||||
|
||||
step. Suggested location is $HOME/usr/lib/
|
||||
|
||||
3. Download and build other packages for mingw that don't have a mingw64
|
||||
version available in the repositories.
|
||||
|
@@ -117,6 +117,7 @@ cpuminer_SOURCES = \
|
||||
algo/keccak/keccak-4way.c\
|
||||
algo/keccak/keccak-gate.c \
|
||||
algo/keccak/sse2/keccak.c \
|
||||
algo/lanehash/lane.c \
|
||||
algo/luffa/sph_luffa.c \
|
||||
algo/luffa/luffa.c \
|
||||
algo/luffa/luffa_for_sse2.c \
|
||||
@@ -173,7 +174,6 @@ cpuminer_SOURCES = \
|
||||
algo/sha/sph_sha2big.c \
|
||||
algo/sha/sha256-hash-4way.c \
|
||||
algo/sha/sha512-hash-4way.c \
|
||||
algo/sha/sha256_hash_11way.c \
|
||||
algo/sha/sha2.c \
|
||||
algo/sha/sha256t-gate.c \
|
||||
algo/sha/sha256t-4way.c \
|
||||
@@ -197,9 +197,9 @@ cpuminer_SOURCES = \
|
||||
algo/skein/skein-gate.c \
|
||||
algo/skein/skein2.c \
|
||||
algo/skein/skein2-4way.c \
|
||||
algo/skein/skein2-gate.c \
|
||||
algo/sm3/sm3.c \
|
||||
algo/sm3/sm3-hash-4way.c \
|
||||
algo/swifftx/swifftx.c \
|
||||
algo/tiger/sph_tiger.c \
|
||||
algo/whirlpool/sph_whirlpool.c \
|
||||
algo/whirlpool/whirlpool-hash-4way.c \
|
||||
@@ -279,6 +279,11 @@ cpuminer_SOURCES = \
|
||||
algo/x17/sonoa-4way.c \
|
||||
algo/x17/sonoa.c \
|
||||
algo/x20/x20r.c \
|
||||
algo/x22/x22i-4way.c \
|
||||
algo/x22/x22i.c \
|
||||
algo/x22/x22i-gate.c \
|
||||
algo/x22/x25x.c \
|
||||
algo/x22/x25x-4way.c \
|
||||
algo/yescrypt/yescrypt.c \
|
||||
algo/yescrypt/sha256_Y.c \
|
||||
algo/yescrypt/yescrypt-best.c \
|
||||
|
21
README.md
21
README.md
@@ -122,13 +122,15 @@ Supported Algorithms
|
||||
x13sm3 hsr (Hshare)
|
||||
x14 X14
|
||||
x15 X15
|
||||
x16r Ravencoin (RVN) (original algo)
|
||||
x16rv2 Ravencoin (RVN) (new algo)
|
||||
x16r
|
||||
x16rv2 Ravencoin (RVN)
|
||||
x16rt Gincoin (GIN)
|
||||
x16rt_veil Veil (VEIL)
|
||||
x16rt-veil Veil (VEIL)
|
||||
x16s Pigeoncoin (PGN)
|
||||
x17
|
||||
x21s
|
||||
x22i
|
||||
x25x
|
||||
xevan Bitsend (BSD)
|
||||
yescrypt Globalboost-Y (BSTY)
|
||||
yescryptr8 BitZeny (ZNY)
|
||||
@@ -136,12 +138,15 @@ Supported Algorithms
|
||||
yescryptr32 WAVI
|
||||
yespower Cryply
|
||||
yespowerr16 Yenten (YTN)
|
||||
yespoer-b2b generic yespower + blake2b
|
||||
yespower-b2b generic yespower + blake2b
|
||||
zr5 Ziftr
|
||||
|
||||
Errata
|
||||
------
|
||||
|
||||
Old algorithms that are no longer used frequently will not have the latest
|
||||
optimizations.
|
||||
|
||||
Cryptonight and variants are no longer supported, use another miner.
|
||||
|
||||
Neoscrypt crashes on Windows, use legacy version.
|
||||
@@ -160,10 +165,12 @@ Bugs
|
||||
----
|
||||
|
||||
Users are encouraged to post their bug reports using git issues or on the
|
||||
Bitcoin Talk forum at:
|
||||
Bitcoin Talk forum or opening an issue in git:
|
||||
|
||||
https://bitcointalk.org/index.php?topic=1326803.0
|
||||
|
||||
https://github.com/JayDDee/cpuminer-opt/issues
|
||||
|
||||
All problem reports must be accompanied by a proper problem definition.
|
||||
This should include how the problem occurred, the command line and
|
||||
output from the miner showing the startup messages and any errors.
|
||||
@@ -175,10 +182,6 @@ Donations
|
||||
cpuminer-opt has no fees of any kind but donations are accepted.
|
||||
|
||||
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
|
||||
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
|
||||
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
|
||||
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
|
||||
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ
|
||||
|
||||
Happy mining!
|
||||
|
||||
|
@@ -15,8 +15,8 @@ the features listed at cpuminer startup to ensure you are mining at
|
||||
optimum speed using the best available features.
|
||||
|
||||
Architecture names and compile options used are only provided for Intel
|
||||
Core series. Even the newest Pentium and Celeron CPUs are often missing
|
||||
features.
|
||||
Core series. Budget CPUs like Pentium and Celeron are often missing the
|
||||
latest features.
|
||||
|
||||
AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
|
||||
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
|
||||
@@ -28,7 +28,8 @@ Exe name Compile flags Arch name
|
||||
cpuminer-sse2.exe "-msse2" Core2, Nehalem
|
||||
cpuminer-aes-sse42.exe "-march=westmere" Westmere
|
||||
cpuminer-avx.exe "-march=corei7-avx" Sandy-Ivybridge
|
||||
cpuminer-avx2.exe "-march=core-avx2" Haswell, Sky-Kaby-Coffeelake
|
||||
cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell, Sky-Kaby-Coffeelake
|
||||
cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake-X
|
||||
cpuminer-zen "-march=znver1" AMD Ryzen, Threadripper
|
||||
|
||||
If you like this software feel free to donate:
|
||||
|
@@ -1,11 +1,6 @@
|
||||
cpuminer-opt is a console program run from the command line using the
|
||||
keyboard, not the mouse.
|
||||
|
||||
cpuminer-opt now supports HW SHA acceleration available on AMD Ryzen CPUs.
|
||||
This feature requires recent SW including GCC version 5 or higher and
|
||||
openssl version 1.1 or higher. It may also require using "-march=znver1"
|
||||
compile flag.
|
||||
|
||||
Security warning
|
||||
----------------
|
||||
|
||||
@@ -36,17 +31,67 @@ FreeBSD YMMV.
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.10.1
|
||||
|
||||
AVX512 for blake2b, nist5, quark, tribus.
|
||||
|
||||
More broken lane fixes.
|
||||
|
||||
Fixed buffer overflow in skein AVX512.
|
||||
|
||||
Only the highest ranking feature in a class is listed at startup, lower ranking
|
||||
features are available but no longer listed.
|
||||
|
||||
v3.10.0
|
||||
|
||||
AVX512 is now supported on selected algos, Windows binary is now available.
|
||||
AVX512 optimizations are available for argon2d, blake2s, keccak, keccakc,
|
||||
skein & skein2.
|
||||
|
||||
Fixed CPU temperature for some CPU models (Linux only).
|
||||
|
||||
Fixed a bug that caused some lanes not to submit shares.
|
||||
|
||||
Fixed some previously undetected buffer overflows.
|
||||
|
||||
Lyra2rev2 3% faster SSE2 and AVX2.
|
||||
|
||||
Added "-fno-asynchronous-unwind-tables" to AVX512 build script for Windows
|
||||
to fix known mingw issue.
|
||||
|
||||
Changed AVX2 build script to explicitly add AES to address change in
|
||||
behaviour in GCC 9.
|
||||
|
||||
v3.9.11
|
||||
|
||||
Added x22i & x25x algos.
|
||||
Blake2s 2% faster AVX2 with Intel CPU, slower with Ryzen v1, v2 ?
|
||||
|
||||
v3.9.10
|
||||
|
||||
Faster X* algos with AVX2.
|
||||
Small improvements to summary stats report.
|
||||
|
||||
v3.9.9.1
|
||||
|
||||
Fixed a day1 bug that could cause the miner to idle for up to 2 minutes
|
||||
under certain circumstances.
|
||||
|
||||
Redesigned summary stats report now includes session statistics.
|
||||
|
||||
More robust handling of statistics to reduce corruption.
|
||||
|
||||
Removed --hide-diff option.
|
||||
|
||||
Better handling of cpu-affinity with more than 64 CPUs.
|
||||
|
||||
v3.9.9
|
||||
|
||||
Added power2b algo for MicroBitcoin.
|
||||
|
||||
Added generic yespower-b2b (yespower + blake2b) algo to be used with
|
||||
the parameters introduced in v3.9.7 for yespower & yescrypt.
|
||||
|
||||
Display additional info when a share is rejected.
|
||||
|
||||
Some low level enhancements and minor tweaking of log output.
|
||||
|
||||
RELEASE_NOTES (this file) and README.md added to Windows release package.
|
||||
|
||||
v3.9.8.1
|
||||
|
@@ -116,8 +116,6 @@ void init_algo_gate( algo_gate_t* gate )
|
||||
gate->get_nonceptr = (void*)&std_get_nonceptr;
|
||||
gate->work_decode = (void*)&std_le_work_decode;
|
||||
gate->decode_extra_data = (void*)&do_nothing;
|
||||
gate->wait_for_diff = (void*)&std_wait_for_diff;
|
||||
gate->get_max64 = (void*)&get_max64_0x1fffffLL;
|
||||
gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
|
||||
gate->stratum_gen_work = (void*)&std_stratum_gen_work;
|
||||
gate->build_stratum_request = (void*)&std_le_build_stratum_request;
|
||||
@@ -240,6 +238,8 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
case ALGO_X16S: register_x16s_algo ( gate ); break;
|
||||
case ALGO_X17: register_x17_algo ( gate ); break;
|
||||
case ALGO_X21S: register_x21s_algo ( gate ); break;
|
||||
case ALGO_X22I: register_x22i_algo ( gate ); break;
|
||||
case ALGO_X25X: register_x25x_algo ( gate ); break;
|
||||
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
|
||||
/* case ALGO_YESCRYPT: register_yescrypt_05_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8: register_yescryptr8_05_algo ( gate ); break;
|
||||
@@ -278,7 +278,7 @@ bool register_json_rpc2( algo_gate_t *gate )
|
||||
applog(LOG_WARNING,"supported by cpuminer-opt. Shares submitted will");
|
||||
applog(LOG_WARNING,"likely be rejected. Proceed at your own risk.\n");
|
||||
|
||||
gate->wait_for_diff = (void*)&do_nothing;
|
||||
// gate->wait_for_diff = (void*)&do_nothing;
|
||||
gate->get_new_work = (void*)&jr2_get_new_work;
|
||||
gate->get_nonceptr = (void*)&jr2_get_nonceptr;
|
||||
gate->stratum_gen_work = (void*)&jr2_stratum_gen_work;
|
||||
|
@@ -35,7 +35,7 @@
|
||||
// 6. Determine if other non existant functions are required.
|
||||
// That is determined by the need to add code in cpu-miner.c
|
||||
// that applies only to the new algo. That is forbidden. All
|
||||
// algo specific code must be in theh algo's file.
|
||||
// algo specific code must be in the algo's file.
|
||||
//
|
||||
// 7. If new functions need to be added to the gate add the type
|
||||
// to the structure, declare a null instance in this file and define
|
||||
@@ -48,10 +48,10 @@
|
||||
// instances as they are defined by default, or unsafe functions that
|
||||
// are not needed by the algo.
|
||||
//
|
||||
// 9. Add an case entry to the switch/case in function register_gate
|
||||
// 9. Add a case entry to the switch/case in function register_gate
|
||||
// in file algo-gate-api.c for the new algo.
|
||||
//
|
||||
// 10 If a new function type was defined add an entry to ini talgo_gate
|
||||
// 10 If a new function type was defined add an entry to init algo_gate
|
||||
// to initialize the new function to its null instance described in step 7.
|
||||
//
|
||||
// 11. If the new algo has aliases add them to the alias array in
|
||||
@@ -110,14 +110,7 @@ inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
|
||||
|
||||
typedef struct
|
||||
{
|
||||
// special case, only one target, provides a callback for scanhash to
|
||||
// submit work with less overhead.
|
||||
// bool (*submit_work ) ( struct thr_info*, const struct work* );
|
||||
|
||||
// mandatory functions, must be overwritten
|
||||
// Added a 5th arg for the thread_info structure to replace the int thr id
|
||||
// in the first arg. Both will co-exist during the trasition.
|
||||
//int ( *scanhash ) ( int, struct work*, uint32_t, uint64_t* );
|
||||
int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );
|
||||
|
||||
// optional unsafe, must be overwritten if algo uses function
|
||||
@@ -125,27 +118,55 @@ void ( *hash ) ( void*, const void*, uint32_t ) ;
|
||||
void ( *hash_suw ) ( void*, const void* );
|
||||
|
||||
//optional, safe to use default in most cases
|
||||
|
||||
// Allocate thread local buffers and other initialization specific to miner
|
||||
// threads.
|
||||
bool ( *miner_thread_init ) ( int );
|
||||
|
||||
// Generate global blockheader from stratum data.
|
||||
void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* );
|
||||
|
||||
// Get thread local copy of blockheader with unique nonce.
|
||||
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t*,
|
||||
bool );
|
||||
|
||||
// Return pointer to nonce in blockheader.
|
||||
uint32_t *( *get_nonceptr ) ( uint32_t* );
|
||||
void ( *decode_extra_data ) ( struct work*, uint64_t* );
|
||||
void ( *wait_for_diff ) ( struct stratum_ctx* );
|
||||
int64_t ( *get_max64 ) ();
|
||||
|
||||
// Decode getwork blockheader
|
||||
bool ( *work_decode ) ( const json_t*, struct work* );
|
||||
|
||||
// Extra getwork data
|
||||
void ( *decode_extra_data ) ( struct work*, uint64_t* );
|
||||
|
||||
bool ( *submit_getwork_result ) ( CURL*, struct work* );
|
||||
|
||||
void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
|
||||
|
||||
// Increment extranonce
|
||||
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
|
||||
|
||||
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
|
||||
uint32_t*, uint32_t, uint32_t );
|
||||
uint32_t*, uint32_t, uint32_t );
|
||||
// Build mining.submit message
|
||||
void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
|
||||
|
||||
char* ( *malloc_txs_request ) ( struct work* );
|
||||
|
||||
// Big or little
|
||||
void ( *set_work_data_endian ) ( struct work* );
|
||||
|
||||
double ( *calc_network_diff ) ( struct work* );
|
||||
|
||||
// Wait for first work
|
||||
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
|
||||
void ( *resync_threads ) ( struct work* );
|
||||
|
||||
// Diverge mining threads
|
||||
bool ( *do_this_thread ) ( int );
|
||||
|
||||
// After do_this_thread
|
||||
void ( *resync_threads ) ( struct work* );
|
||||
|
||||
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
|
||||
bool ( *stratum_handle_response )( json_t* );
|
||||
set_t optimizations;
|
||||
@@ -200,8 +221,6 @@ void null_hash_suw();
|
||||
|
||||
// optional safe targets, default listed first unless noted.
|
||||
|
||||
void std_wait_for_diff();
|
||||
|
||||
uint32_t *std_get_nonceptr( uint32_t *work_data );
|
||||
uint32_t *jr2_get_nonceptr( uint32_t *work_data );
|
||||
|
||||
@@ -216,21 +235,13 @@ void jr2_stratum_gen_work( struct stratum_ctx *sctx, struct work *work );
|
||||
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
|
||||
void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
|
||||
|
||||
// pick your favorite or define your own
|
||||
int64_t get_max64_0x1fffffLL(); // default
|
||||
int64_t get_max64_0x40LL();
|
||||
int64_t get_max64_0x3ffff();
|
||||
int64_t get_max64_0x3fffffLL();
|
||||
int64_t get_max64_0x1ffff();
|
||||
int64_t get_max64_0xffffLL();
|
||||
|
||||
bool std_le_work_decode( const json_t *val, struct work *work );
|
||||
bool std_be_work_decode( const json_t *val, struct work *work );
|
||||
bool jr2_work_decode( const json_t *val, struct work *work );
|
||||
bool jr2_work_decode( const json_t *val, struct work *work );
|
||||
|
||||
bool std_le_submit_getwork_result( CURL *curl, struct work *work );
|
||||
bool std_be_submit_getwork_result( CURL *curl, struct work *work );
|
||||
bool jr2_submit_getwork_result( CURL *curl, struct work *work );
|
||||
bool jr2_submit_getwork_result( CURL *curl, struct work *work );
|
||||
|
||||
void std_le_build_stratum_request( char *req, struct work *work );
|
||||
void std_be_build_stratum_request( char *req, struct work *work );
|
||||
@@ -244,8 +255,8 @@ void set_work_data_big_endian( struct work *work );
|
||||
double std_calc_network_diff( struct work *work );
|
||||
|
||||
void std_build_block_header( struct work* g_work, uint32_t version,
|
||||
uint32_t *prevhash, uint32_t *merkle_root,
|
||||
uint32_t ntime, uint32_t nbits );
|
||||
uint32_t *prevhash, uint32_t *merkle_root,
|
||||
uint32_t ntime, uint32_t nbits );
|
||||
|
||||
void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
|
||||
|
||||
@@ -266,8 +277,8 @@ int std_get_work_data_size();
|
||||
// by calling the algo's register function.
|
||||
bool register_algo_gate( int algo, algo_gate_t *gate );
|
||||
|
||||
// Override any default gate functions that are applicable and do any other
|
||||
// algo-specific initialization.
|
||||
// Called by algos toverride any default gate functions that are applicable
|
||||
// and do any other algo-specific initialization.
|
||||
// The register functions for all the algos can be declared here to reduce
|
||||
// compiler warnings but that's just more work for devs adding new algos.
|
||||
bool register_algo( algo_gate_t *gate );
|
||||
@@ -280,5 +291,7 @@ bool register_json_rpc2( algo_gate_t *gate );
|
||||
// use this to call the hash function of an algo directly, ie util.c test.
|
||||
void exec_hash_function( int algo, void *output, const void *pdata );
|
||||
|
||||
void get_algo_alias( char** algo_or_alias );
|
||||
// Validate a string as a known algo and alias, updates arg to proper
|
||||
// algo name if valid alias, NULL if invalid alias or algo.
|
||||
void get_algo_alias( char **algo_or_alias );
|
||||
|
||||
|
@@ -74,18 +74,12 @@ int scanhash_argon2( struct work* work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int64_t argon2_get_max64 ()
|
||||
{
|
||||
return 0x1ffLL;
|
||||
}
|
||||
|
||||
bool register_argon2_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_argon2;
|
||||
gate->hash = (void*)&argon2hash;
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
gate->get_max64 = (void*)&argon2_get_max64;
|
||||
opt_target_factor = 65536.0;
|
||||
|
||||
return true;
|
||||
|
@@ -179,12 +179,9 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int64_t get_max64_0x1ff() { return 0x1ff; }
|
||||
|
||||
bool register_argon2d4096_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_argon2d4096;
|
||||
gate->get_max64 = (void*)&get_max64_0x1ff;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
|
@@ -21,7 +21,7 @@
|
||||
|
||||
#include "argon2.h"
|
||||
#include "core.h"
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include "../blake2/blake2.h"
|
||||
#include "../blake2/blamka-round-opt.h"
|
||||
|
||||
@@ -37,24 +37,28 @@
|
||||
|
||||
#if defined(__AVX512F__)
|
||||
|
||||
static void fill_block(__m512i *state, const block *ref_block,
|
||||
block *next_block, int with_xor) {
|
||||
static void fill_block( __m512i *state, const block *ref_block,
|
||||
block *next_block, int with_xor )
|
||||
{
|
||||
__m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK];
|
||||
unsigned int i;
|
||||
|
||||
if (with_xor) {
|
||||
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm512_xor_si512(
|
||||
state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
|
||||
block_XY[i] = _mm512_xor_si512(
|
||||
state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i));
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = _mm512_xor_si512(
|
||||
state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
|
||||
if ( with_xor )
|
||||
{
|
||||
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = _mm512_xor_si512( state[i],
|
||||
_mm512_load_si512( (const __m512i*)ref_block->v + i ) );
|
||||
block_XY[i] = _mm512_xor_si512( state[i],
|
||||
_mm512_load_si512( (const __m512i*)next_block->v + i ) );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
|
||||
block_XY[i] = state[i] = _mm512_xor_si512( state[i],
|
||||
_mm512_load_si512( (const __m512i*)ref_block->v + i ) );
|
||||
}
|
||||
|
||||
BLAKE2_ROUND_1( state[ 0], state[ 1], state[ 2], state[ 3],
|
||||
state[ 4], state[ 5], state[ 6], state[ 7] );
|
||||
@@ -66,23 +70,10 @@ static void fill_block(__m512i *state, const block *ref_block,
|
||||
BLAKE2_ROUND_2( state[ 1], state[ 3], state[ 5], state[ 7],
|
||||
state[ 9], state[11], state[13], state[15] );
|
||||
|
||||
/*
|
||||
for (i = 0; i < 2; ++i) {
|
||||
BLAKE2_ROUND_1(
|
||||
state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
|
||||
state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 2; ++i) {
|
||||
BLAKE2_ROUND_2(
|
||||
state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i],
|
||||
state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]);
|
||||
}
|
||||
*/
|
||||
|
||||
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm512_xor_si512(state[i], block_XY[i]);
|
||||
_mm512_storeu_si512((__m512i *)next_block->v + i, state[i]);
|
||||
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = _mm512_xor_si512( state[i], block_XY[i] );
|
||||
_mm512_store_si512( (__m512i*)next_block->v + i, state[i] );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -125,18 +116,6 @@ static void fill_block(__m256i *state, const block *ref_block,
|
||||
BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15],
|
||||
state[19], state[23], state[27], state[31] );
|
||||
|
||||
/*
|
||||
for (i = 0; i < 4; ++i) {
|
||||
BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
|
||||
state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 4; ++i) {
|
||||
BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
|
||||
state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
|
||||
}
|
||||
*/
|
||||
|
||||
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm256_xor_si256(state[i], block_XY[i]);
|
||||
_mm256_store_si256((__m256i *)next_block->v + i, state[i]);
|
||||
@@ -153,14 +132,14 @@ static void fill_block(__m128i *state, const block *ref_block,
|
||||
if (with_xor) {
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm_xor_si128(
|
||||
state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
|
||||
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
|
||||
block_XY[i] = _mm_xor_si128(
|
||||
state[i], _mm_loadu_si128((const __m128i *)next_block->v + i));
|
||||
state[i], _mm_load_si128((const __m128i *)next_block->v + i));
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = _mm_xor_si128(
|
||||
state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
|
||||
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,22 +177,9 @@ static void fill_block(__m128i *state, const block *ref_block,
|
||||
BLAKE2_ROUND( state[ 7], state[15], state[23], state[31],
|
||||
state[39], state[47], state[55], state[63] );
|
||||
|
||||
/*
|
||||
for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
|
||||
state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
|
||||
state[8 * i + 6], state[8 * i + 7]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
|
||||
state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
|
||||
state[8 * 6 + i], state[8 * 7 + i]);
|
||||
}
|
||||
*/
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm_xor_si128(state[i], block_XY[i]);
|
||||
_mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
|
||||
_mm_store_si128((__m128i *)next_block->v + i, state[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -427,14 +427,14 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
#define SWAP_QUARTERS(A0, A1) \
|
||||
do { \
|
||||
SWAP_HALVES(A0, A1); \
|
||||
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
|
||||
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
|
||||
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
|
||||
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
|
||||
} while((void)0, 0)
|
||||
|
||||
#define UNSWAP_QUARTERS(A0, A1) \
|
||||
do { \
|
||||
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
|
||||
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
|
||||
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
|
||||
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
|
||||
SWAP_HALVES(A0, A1); \
|
||||
} while((void)0, 0)
|
||||
|
||||
|
@@ -1,18 +1,8 @@
|
||||
#include "blake-gate.h"
|
||||
|
||||
int64_t blake_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
}
|
||||
|
||||
bool register_blake_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->get_max64 = (void*)&blake_get_max64;
|
||||
//#if defined (__AVX2__) && defined (FOUR_WAY)
|
||||
// gate->optimizations = SSE2_OPT | AVX2_OPT;
|
||||
// gate->scanhash = (void*)&scanhash_blake_8way;
|
||||
// gate->hash = (void*)&blakehash_8way;
|
||||
#if defined(BLAKE_4WAY)
|
||||
four_way_not_tested();
|
||||
gate->scanhash = (void*)&scanhash_blake_4way;
|
||||
|
@@ -59,7 +59,6 @@ extern "C"{
|
||||
typedef struct {
|
||||
unsigned char buf[64<<2];
|
||||
uint32_t H[8<<2];
|
||||
uint32_t S[4<<2];
|
||||
// __m128i buf[16] __attribute__ ((aligned (64)));
|
||||
// __m128i H[8];
|
||||
// __m128i S[4];
|
||||
@@ -93,7 +92,6 @@ void blake256r8_4way_close(void *cc, void *dst);
|
||||
typedef struct {
|
||||
__m256i buf[16] __attribute__ ((aligned (64)));
|
||||
__m256i H[8];
|
||||
__m256i S[4];
|
||||
size_t ptr;
|
||||
sph_u32 T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
@@ -120,20 +118,42 @@ void blake256r8_8way_close(void *cc, void *dst);
|
||||
// Blake-512 4 way
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16] __attribute__ ((aligned (64)));
|
||||
__m256i buf[16];
|
||||
__m256i H[8];
|
||||
__m256i S[4];
|
||||
size_t ptr;
|
||||
sph_u64 T0, T1;
|
||||
} blake_4way_big_context;
|
||||
} blake_4way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_4way_big_context blake512_4way_context;
|
||||
|
||||
void blake512_4way_init(void *cc);
|
||||
void blake512_4way(void *cc, const void *data, size_t len);
|
||||
void blake512_4way_close(void *cc, void *dst);
|
||||
void blake512_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
void blake512_4way_init( void *cc );
|
||||
void blake512_4way_update( void *cc, const void *data, size_t len );
|
||||
#define blake512_4way blake512_4way_update
|
||||
void blake512_4way_close( void *cc, void *dst );
|
||||
void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[8];
|
||||
__m512i S[4];
|
||||
size_t ptr;
|
||||
sph_u64 T0, T1;
|
||||
} blake_8way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_8way_big_context blake512_8way_context;
|
||||
|
||||
void blake512_8way_init( void *cc );
|
||||
void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_8way_close( void *cc, void *dst );
|
||||
void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
|
@@ -304,16 +304,17 @@ static const sph_u32 CS[16] = {
|
||||
|
||||
#endif
|
||||
|
||||
// Blake-256 4 way
|
||||
|
||||
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
do { \
|
||||
a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
|
||||
_mm_set1_epi32( c1 ), m0 ), b ), a ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
|
||||
_mm_xor_si128( _mm_set1_epi32( c1 ), m0 ) ); \
|
||||
d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
|
||||
_mm_set1_epi32( c0 ), m1 ), b ), a ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
|
||||
_mm_xor_si128( _mm_set1_epi32( c0 ), m1 ) ); \
|
||||
d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
|
||||
@@ -321,7 +322,8 @@ do { \
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32
|
||||
|
||||
// Blake-256 4 way
|
||||
// Not used
|
||||
#if 0
|
||||
|
||||
#define ROUND_S_4WAY(r) do { \
|
||||
GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
|
||||
@@ -342,6 +344,8 @@ do { \
|
||||
CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define ROUND_S_4WAY(r) do { \
|
||||
@@ -359,7 +363,6 @@ do { \
|
||||
|
||||
#define DECL_STATE32_4WAY \
|
||||
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m128i S0, S1, S2, S3; \
|
||||
uint32_t T0, T1;
|
||||
|
||||
#define READ_STATE32_4WAY(state) do { \
|
||||
@@ -371,10 +374,6 @@ do { \
|
||||
H5 = casti_m128i( state->H, 5 ); \
|
||||
H6 = casti_m128i( state->H, 6 ); \
|
||||
H7 = casti_m128i( state->H, 7 ); \
|
||||
S0 = casti_m128i( state->S, 0 ); \
|
||||
S1 = casti_m128i( state->S, 1 ); \
|
||||
S2 = casti_m128i( state->S, 2 ); \
|
||||
S3 = casti_m128i( state->S, 3 ); \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
@@ -388,17 +387,13 @@ do { \
|
||||
casti_m128i( state->H, 5 ) = H5; \
|
||||
casti_m128i( state->H, 6 ) = H6; \
|
||||
casti_m128i( state->H, 7 ) = H7; \
|
||||
casti_m128i( state->S, 0 ) = S0; \
|
||||
casti_m128i( state->S, 1 ) = S1; \
|
||||
casti_m128i( state->S, 2 ) = S2; \
|
||||
casti_m128i( state->S, 3 ) = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32
|
||||
// not used
|
||||
|
||||
#if 0
|
||||
#define COMPRESS32_4WAY( rounds ) do { \
|
||||
__m128i M[16]; \
|
||||
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
@@ -441,6 +436,7 @@ do { \
|
||||
H7 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S3, V7 ), VF ), H7 ); \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@@ -508,10 +504,10 @@ do { \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm_xor_si128( S0, m128_const1_64( 0x243F6A88243F6A88 ) ); \
|
||||
V9 = _mm_xor_si128( S1, m128_const1_64( 0x85A308D385A308D3 ) ); \
|
||||
VA = _mm_xor_si128( S2, m128_const1_64( 0x13198A2E13198A2E ) ); \
|
||||
VB = _mm_xor_si128( S3, m128_const1_64( 0x0370734403707344 ) ); \
|
||||
V8 = m128_const1_64( 0x243F6A88243F6A88 ); \
|
||||
V9 = m128_const1_64( 0x85A308D385A308D3 ); \
|
||||
VA = m128_const1_64( 0x13198A2E13198A2E ); \
|
||||
VB = m128_const1_64( 0x0370734403707344 ); \
|
||||
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \
|
||||
m128_const1_64( 0xA4093822A4093822 ) ); \
|
||||
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \
|
||||
@@ -538,14 +534,14 @@ do { \
|
||||
ROUND_S_4WAY(2); \
|
||||
ROUND_S_4WAY(3); \
|
||||
} \
|
||||
H0 = mm128_xor4( V8, V0, S0, H0 ); \
|
||||
H1 = mm128_xor4( V9, V1, S1, H1 ); \
|
||||
H2 = mm128_xor4( VA, V2, S2, H2 ); \
|
||||
H3 = mm128_xor4( VB, V3, S3, H3 ); \
|
||||
H4 = mm128_xor4( VC, V4, S0, H4 ); \
|
||||
H5 = mm128_xor4( VD, V5, S1, H5 ); \
|
||||
H6 = mm128_xor4( VE, V6, S2, H6 ); \
|
||||
H7 = mm128_xor4( VF, V7, S3, H7 ); \
|
||||
H0 = _mm_xor_si128( _mm_xor_si128( V8, V0 ), H0 ); \
|
||||
H1 = _mm_xor_si128( _mm_xor_si128( V9, V1 ), H1 ); \
|
||||
H2 = _mm_xor_si128( _mm_xor_si128( VA, V2 ), H2 ); \
|
||||
H3 = _mm_xor_si128( _mm_xor_si128( VB, V3 ), H3 ); \
|
||||
H4 = _mm_xor_si128( _mm_xor_si128( VC, V4 ), H4 ); \
|
||||
H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \
|
||||
H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \
|
||||
H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
@@ -556,13 +552,13 @@ do { \
|
||||
|
||||
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
do { \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
|
||||
_mm256_set1_epi32( c1 ), m0 ), b ), a ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
|
||||
_mm256_xor_si256( _mm256_set1_epi32( c1 ), m0 ) ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
|
||||
_mm256_set1_epi32( c0 ), m1 ), b ), a ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
|
||||
_mm256_xor_si256( _mm256_set1_epi32( c0 ), m1 ) ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
@@ -581,7 +577,6 @@ do { \
|
||||
|
||||
#define DECL_STATE32_8WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m256i S0, S1, S2, S3; \
|
||||
sph_u32 T0, T1;
|
||||
|
||||
#define READ_STATE32_8WAY(state) \
|
||||
@@ -594,10 +589,6 @@ do { \
|
||||
H5 = (state)->H[5]; \
|
||||
H6 = (state)->H[6]; \
|
||||
H7 = (state)->H[7]; \
|
||||
S0 = (state)->S[0]; \
|
||||
S1 = (state)->S[1]; \
|
||||
S2 = (state)->S[2]; \
|
||||
S3 = (state)->S[3]; \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
@@ -612,10 +603,6 @@ do { \
|
||||
(state)->H[5] = H5; \
|
||||
(state)->H[6] = H6; \
|
||||
(state)->H[7] = H7; \
|
||||
(state)->S[0] = S0; \
|
||||
(state)->S[1] = S1; \
|
||||
(state)->S[2] = S2; \
|
||||
(state)->S[3] = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
@@ -635,10 +622,10 @@ do { \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_xor_si256( S0, m256_const1_64( 0x243F6A88243F6A88 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, m256_const1_64( 0x85A308D385A308D3 ) ); \
|
||||
VA = _mm256_xor_si256( S2, m256_const1_64( 0x13198A2E13198A2E ) ); \
|
||||
VB = _mm256_xor_si256( S3, m256_const1_64( 0x0370734403707344 ) ); \
|
||||
V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
|
||||
V9 = m256_const1_64( 0x85A308D385A308D3 ); \
|
||||
VA = m256_const1_64( 0x13198A2E13198A2E ); \
|
||||
VB = m256_const1_64( 0x0370734403707344 ); \
|
||||
VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
|
||||
m256_const1_64( 0xA4093822A4093822 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
|
||||
@@ -682,14 +669,14 @@ do { \
|
||||
ROUND_S_8WAY(2); \
|
||||
ROUND_S_8WAY(3); \
|
||||
} \
|
||||
H0 = mm256_xor4( V8, V0, S0, H0 ); \
|
||||
H1 = mm256_xor4( V9, V1, S1, H1 ); \
|
||||
H2 = mm256_xor4( VA, V2, S2, H2 ); \
|
||||
H3 = mm256_xor4( VB, V3, S3, H3 ); \
|
||||
H4 = mm256_xor4( VC, V4, S0, H4 ); \
|
||||
H5 = mm256_xor4( VD, V5, S1, H5 ); \
|
||||
H6 = mm256_xor4( VE, V6, S2, H6 ); \
|
||||
H7 = mm256_xor4( VF, V7, S3, H7 ); \
|
||||
H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \
|
||||
H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \
|
||||
H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \
|
||||
H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \
|
||||
H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \
|
||||
H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \
|
||||
H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \
|
||||
H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
@@ -703,7 +690,6 @@ static void
|
||||
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
{
|
||||
__m128i zero = m128_zero;
|
||||
casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 );
|
||||
casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||
casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 );
|
||||
@@ -712,11 +698,6 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
|
||||
casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C );
|
||||
casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
casti_m128i( ctx->S, 0 ) = zero;
|
||||
casti_m128i( ctx->S, 1 ) = zero;
|
||||
casti_m128i( ctx->S, 2 ) = zero;
|
||||
casti_m128i( ctx->S, 3 ) = zero;
|
||||
ctx->T0 = ctx->T1 = 0;
|
||||
ctx->ptr = 0;
|
||||
ctx->rounds = rounds;
|
||||
@@ -824,7 +805,6 @@ static void
|
||||
blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
|
||||
const sph_u32 *salt, int rounds )
|
||||
{
|
||||
__m256i zero = m256_zero;
|
||||
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 );
|
||||
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 );
|
||||
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 );
|
||||
@@ -833,10 +813,6 @@ blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
|
||||
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C );
|
||||
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
casti_m256i( sc->S, 0 ) = zero;
|
||||
casti_m256i( sc->S, 1 ) = zero;
|
||||
casti_m256i( sc->S, 2 ) = zero;
|
||||
casti_m256i( sc->S, 3 ) = zero;
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
sc->rounds = rounds;
|
||||
|
@@ -4,13 +4,59 @@
|
||||
*/
|
||||
|
||||
#include "blake2b-gate.h"
|
||||
|
||||
#if defined(BLAKE2B_4WAY)
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "blake2b-hash-4way.h"
|
||||
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
blake2b_8way_init( &ctx );
|
||||
blake2b_8way_update( &ctx, vdata, 80 );
|
||||
blake2b_8way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane<<1 ] < Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
// Function not used, code inlined.
|
||||
void blake2b_4way_hash(void *output, const void *input)
|
||||
{
|
||||
|
@@ -1,24 +1,19 @@
|
||||
#include "blake2b-gate.h"
|
||||
|
||||
/*
|
||||
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
|
||||
int64_t blake2s_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
}
|
||||
*/
|
||||
|
||||
bool register_blake2b_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKE2B_4WAY)
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2b_8way;
|
||||
// gate->hash = (void*)&blake2b_8way_hash;
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2b_4way;
|
||||
gate->hash = (void*)&blake2b_4way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_blake2b;
|
||||
gate->hash = (void*)&blake2b_hash;
|
||||
#endif
|
||||
// gate->get_max64 = (void*)&blake2s_get_max64;
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -4,13 +4,21 @@
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2B_8WAY
|
||||
#elif defined(__AVX2__)
|
||||
#define BLAKE2B_4WAY
|
||||
#endif
|
||||
|
||||
bool register_blake2b_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(BLAKE2B_4WAY)
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
//void blake2b_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
void blake2b_4way_hash( void *state, const void *input );
|
||||
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||
|
@@ -33,6 +33,178 @@
|
||||
|
||||
#include "blake2b-hash-4way.h"
|
||||
|
||||
static const uint8_t sigma[12][16] =
|
||||
{
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
|
||||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
|
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
|
||||
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
|
||||
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
|
||||
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
|
||||
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
|
||||
};
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define B2B8W_G(a, b, c, d, x, y) \
|
||||
{ \
|
||||
v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), x ); \
|
||||
v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 32 ); \
|
||||
v[c] = _mm512_add_epi64( v[c], v[d] ); \
|
||||
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 24 ); \
|
||||
v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), y ); \
|
||||
v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 16 ); \
|
||||
v[c] = _mm512_add_epi64( v[c], v[d] ); \
|
||||
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
|
||||
}
|
||||
|
||||
static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
|
||||
{
|
||||
__m512i v[16], m[16];
|
||||
|
||||
v[ 0] = ctx->h[0];
|
||||
v[ 1] = ctx->h[1];
|
||||
v[ 2] = ctx->h[2];
|
||||
v[ 3] = ctx->h[3];
|
||||
v[ 4] = ctx->h[4];
|
||||
v[ 5] = ctx->h[5];
|
||||
v[ 6] = ctx->h[6];
|
||||
v[ 7] = ctx->h[7];
|
||||
v[ 8] = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
v[10] = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
v[11] = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = m512_const1_64( 0x510E527FADE682D1 );
|
||||
v[13] = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
|
||||
v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );
|
||||
|
||||
if ( last )
|
||||
v[14] = mm512_not( v[14] );
|
||||
|
||||
m[ 0] = ctx->b[ 0];
|
||||
m[ 1] = ctx->b[ 1];
|
||||
m[ 2] = ctx->b[ 2];
|
||||
m[ 3] = ctx->b[ 3];
|
||||
m[ 4] = ctx->b[ 4];
|
||||
m[ 5] = ctx->b[ 5];
|
||||
m[ 6] = ctx->b[ 6];
|
||||
m[ 7] = ctx->b[ 7];
|
||||
m[ 8] = ctx->b[ 8];
|
||||
m[ 9] = ctx->b[ 9];
|
||||
m[10] = ctx->b[10];
|
||||
m[11] = ctx->b[11];
|
||||
m[12] = ctx->b[12];
|
||||
m[13] = ctx->b[13];
|
||||
m[14] = ctx->b[14];
|
||||
m[15] = ctx->b[15];
|
||||
|
||||
for ( int i = 0; i < 12; i++ )
|
||||
{
|
||||
B2B8W_G( 0, 4, 8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
|
||||
B2B8W_G( 1, 5, 9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
|
||||
B2B8W_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] );
|
||||
B2B8W_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] );
|
||||
B2B8W_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] );
|
||||
B2B8W_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] );
|
||||
B2B8W_G( 2, 7, 8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] );
|
||||
B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
|
||||
}
|
||||
|
||||
ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] );
|
||||
ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] );
|
||||
ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] );
|
||||
ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] );
|
||||
ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] );
|
||||
ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] );
|
||||
ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] );
|
||||
ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] );
|
||||
}
|
||||
|
||||
int blake2b_8way_init( blake2b_8way_ctx *ctx )
|
||||
{
|
||||
size_t i;
|
||||
|
||||
ctx->h[0] = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = m512_const1_64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
ctx->h[0] = _mm512_xor_si512( ctx->h[0], m512_const1_64( 0x01010020 ) );
|
||||
|
||||
ctx->t[0] = 0;
|
||||
ctx->t[1] = 0;
|
||||
ctx->c = 0;
|
||||
ctx->outlen = 32;
|
||||
|
||||
for ( i = 0; i < 16; i++ )
|
||||
ctx->b[i] = m512_zero;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
|
||||
size_t inlen )
|
||||
{
|
||||
__m512i* in =(__m512i*)input;
|
||||
|
||||
size_t i, c;
|
||||
c = ctx->c >> 3;
|
||||
|
||||
for ( i = 0; i < (inlen >> 3); i++ )
|
||||
{
|
||||
if ( ctx->c == 128 )
|
||||
{
|
||||
ctx->t[0] += ctx->c;
|
||||
if ( ctx->t[0] < ctx->c )
|
||||
ctx->t[1]++;
|
||||
blake2b_8way_compress( ctx, 0 );
|
||||
ctx->c = 0;
|
||||
}
|
||||
ctx->b[ c++ ] = in[i];
|
||||
ctx->c += 8;
|
||||
}
|
||||
}
|
||||
|
||||
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
|
||||
{
|
||||
size_t c;
|
||||
c = ctx->c >> 3;
|
||||
|
||||
ctx->t[0] += ctx->c;
|
||||
if ( ctx->t[0] < ctx->c )
|
||||
ctx->t[1]++;
|
||||
|
||||
while ( ctx->c < 128 )
|
||||
{
|
||||
ctx->b[c++] = m512_zero;
|
||||
ctx->c += 8;
|
||||
}
|
||||
|
||||
blake2b_8way_compress( ctx, 1 ); // final block flag = 1
|
||||
|
||||
casti_m512i( out, 0 ) = ctx->h[0];
|
||||
casti_m512i( out, 1 ) = ctx->h[1];
|
||||
casti_m512i( out, 2 ) = ctx->h[2];
|
||||
casti_m512i( out, 3 ) = ctx->h[3];
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// G Mixing function.
|
||||
@@ -61,21 +233,6 @@ static const uint64_t blake2b_iv[8] = {
|
||||
|
||||
static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
|
||||
{
|
||||
const uint8_t sigma[12][16] = {
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
|
||||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
|
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
|
||||
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
|
||||
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
|
||||
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
|
||||
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
|
||||
};
|
||||
int i;
|
||||
__m256i v[16], m[16];
|
||||
|
||||
v[ 0] = ctx->h[0];
|
||||
@@ -118,7 +275,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
|
||||
m[14] = ctx->b[14];
|
||||
m[15] = ctx->b[15];
|
||||
|
||||
for ( i = 0; i < 12; i++ )
|
||||
for ( int i = 0; i < 12; i++ )
|
||||
{
|
||||
B2B_G( 0, 4, 8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
|
||||
B2B_G( 1, 5, 9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
|
||||
|
@@ -2,8 +2,6 @@
|
||||
#ifndef __BLAKE2B_HASH_4WAY_H__
|
||||
#define __BLAKE2B_HASH_4WAY_H__
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
@@ -16,14 +14,34 @@
|
||||
#define ALIGN(x) __attribute__((aligned(x)))
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
ALIGN(128) typedef struct {
|
||||
__m512i b[16]; // input buffer
|
||||
__m512i h[8]; // chained state
|
||||
uint64_t t[2]; // total number of bytes
|
||||
size_t c; // pointer for b[]
|
||||
size_t outlen; // digest size
|
||||
} blake2b_8way_ctx;
|
||||
|
||||
int blake2b_8way_init( blake2b_8way_ctx *ctx );
|
||||
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
|
||||
size_t inlen );
|
||||
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// state context
|
||||
ALIGN(64) typedef struct {
|
||||
ALIGN(128) typedef struct {
|
||||
__m256i b[16]; // input buffer
|
||||
__m256i h[8]; // chained state
|
||||
uint64_t t[2]; // total number of bytes
|
||||
size_t c; // pointer for b[]
|
||||
size_t outlen; // digest size
|
||||
} blake2b_4way_ctx __attribute__((aligned(64)));
|
||||
} blake2b_4way_ctx;
|
||||
|
||||
int blake2b_4way_init( blake2b_4way_ctx *ctx );
|
||||
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
|
||||
|
@@ -3,22 +3,72 @@
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(BLAKE2S_8WAY)
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
static __thread blake2s_16way_state blake2s_16w_ctx;
|
||||
|
||||
void blake2s_16way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_16way_state ctx;
|
||||
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
|
||||
blake2s_16way_update( &ctx, input + (64<<4), 16 );
|
||||
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*16] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[8*16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<4]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm512_bswap_32( _mm512_set_epi32(
|
||||
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_16way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 16;
|
||||
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
|
||||
static __thread blake2s_8way_state blake2s_8w_ctx;
|
||||
|
||||
void blake2s_8way_hash( void *output, const void *input )
|
||||
{
|
||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||
blake2s_8way_state ctx;
|
||||
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
|
||||
|
||||
blake2s_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
|
||||
|
||||
dintrlv_8x32( output, output+ 32, output+ 64, output+ 96,
|
||||
output+128, output+160, output+192, output+224,
|
||||
vhash, 256 );
|
||||
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
@@ -26,13 +76,15 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
@@ -45,16 +97,17 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
blake2s_8way_hash( hash, vdata );
|
||||
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg )
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
@@ -67,15 +120,10 @@ static __thread blake2s_4way_state blake2s_4w_ctx;
|
||||
|
||||
void blake2s_4way_hash( void *output, const void *input )
|
||||
{
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
blake2s_4way_state ctx;
|
||||
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
|
||||
|
||||
blake2s_4way_update( &ctx, input + (64<<2), 16 );
|
||||
blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
|
||||
|
||||
dintrlv_4x32( output, output+32, output+64, output+96,
|
||||
vhash, 256 );
|
||||
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
@@ -83,13 +131,15 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
|
||||
@@ -101,15 +151,16 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
blake2s_4way_hash( hash, vdata );
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg )
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
|
@@ -1,15 +1,12 @@
|
||||
#include "blake2s-gate.h"
|
||||
|
||||
|
||||
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
|
||||
int64_t blake2s_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
}
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKE2S_8WAY)
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_16way;
|
||||
gate->hash = (void*)&blake2s_16way_hash;
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
//#if defined(BLAKE2S_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_8way;
|
||||
gate->hash = (void*)&blake2s_8way_hash;
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
@@ -19,8 +16,7 @@ bool register_blake2s_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_blake2s;
|
||||
gate->hash = (void*)&blake2s_hash;
|
||||
#endif
|
||||
gate->get_max64 = (void*)&blake2s_get_max64;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -8,13 +8,26 @@
|
||||
#if defined(__SSE2__)
|
||||
#define BLAKE2S_4WAY
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define BLAKE2S_8WAY
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2S_16WAY
|
||||
#endif
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(BLAKE2S_8WAY)
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
void blake2s_16way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined (BLAKE2S_8WAY)
|
||||
|
||||
//#if defined(BLAKE2S_8WAY)
|
||||
|
||||
void blake2s_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
|
@@ -20,12 +20,13 @@
|
||||
//#if defined(__SSE4_2__)
|
||||
#if defined(__SSE2__)
|
||||
|
||||
|
||||
/*
|
||||
static const uint32_t blake2s_IV[8] =
|
||||
{
|
||||
0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
|
||||
0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
|
||||
};
|
||||
*/
|
||||
|
||||
static const uint8_t blake2s_sigma[10][16] =
|
||||
{
|
||||
@@ -41,6 +42,7 @@ static const uint8_t blake2s_sigma[10][16] =
|
||||
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
|
||||
};
|
||||
|
||||
|
||||
// define a constant for initial param.
|
||||
|
||||
int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
|
||||
@@ -88,41 +90,45 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
|
||||
memcpy_128( m, block, 16 );
|
||||
memcpy_128( v, S->h, 8 );
|
||||
|
||||
v[ 8] = _mm_set1_epi32( blake2s_IV[0] );
|
||||
v[ 9] = _mm_set1_epi32( blake2s_IV[1] );
|
||||
v[10] = _mm_set1_epi32( blake2s_IV[2] );
|
||||
v[11] = _mm_set1_epi32( blake2s_IV[3] );
|
||||
v[ 8] = m128_const1_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = m128_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
|
||||
_mm_set1_epi32( blake2s_IV[4] ) );
|
||||
m128_const1_64( 0x510E527F510E527FULL ) );
|
||||
v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
|
||||
_mm_set1_epi32( blake2s_IV[5] ) );
|
||||
m128_const1_64( 0x9B05688C9B05688CULL ) );
|
||||
v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
|
||||
_mm_set1_epi32( blake2s_IV[6] ) );
|
||||
m128_const1_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
|
||||
_mm_set1_epi32( blake2s_IV[7] ) );
|
||||
m128_const1_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
#define G4W(r,i,a,b,c,d) \
|
||||
#define G4W( sigma0, sigma1, a, b, c, d ) \
|
||||
do { \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+0] ] ); \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+1] ] ); \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \
|
||||
d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define ROUND4W(r) \
|
||||
do { \
|
||||
G4W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G4W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G4W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
|
||||
G4W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
|
||||
G4W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
|
||||
G4W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
|
||||
G4W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
|
||||
G4W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
|
||||
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
|
||||
G4W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G4W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G4W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
|
||||
G4W( sigma[ 6], sigma[ 7], v[ 3], v[ 7], v[11], v[15] ); \
|
||||
G4W( sigma[ 8], sigma[ 9], v[ 0], v[ 5], v[10], v[15] ); \
|
||||
G4W( sigma[10], sigma[11], v[ 1], v[ 6], v[11], v[12] ); \
|
||||
G4W( sigma[12], sigma[13], v[ 2], v[ 7], v[ 8], v[13] ); \
|
||||
G4W( sigma[14], sigma[15], v[ 3], v[ 4], v[ 9], v[14] ); \
|
||||
} while(0)
|
||||
|
||||
ROUND4W( 0 );
|
||||
@@ -144,26 +150,47 @@ do { \
|
||||
return 0;
|
||||
}
|
||||
|
||||
// There is a problem that can't be resolved internally.
|
||||
// If the last block is a full 64 bytes it should not be compressed in
|
||||
// update but left for final. However, when streaming, it isn't known
|
||||
// which block is last. There may be a subsequent call to update to add
|
||||
// more data.
|
||||
//
|
||||
// The reference code handled this by juggling 2 blocks at a time at
|
||||
// a significant performance penalty.
|
||||
//
|
||||
// Instead a new function is introduced called full_blocks which combines
|
||||
// update and final and is to be used in non-streaming mode where the data
|
||||
// is a multiple of 64 bytes.
|
||||
//
|
||||
// Supported:
|
||||
// 64 + 16 bytes (blake2s with midstate optimization)
|
||||
// 80 bytes (blake2s without midstate optimization)
|
||||
// Any multiple of 64 bytes in one shot (x25x)
|
||||
//
|
||||
// Unsupported:
|
||||
// Stream of full 64 byte blocks one at a time.
|
||||
|
||||
// use only when streaming more data or final block not full.
|
||||
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
|
||||
uint64_t inlen )
|
||||
{
|
||||
__m128i *input = (__m128i*)in;
|
||||
__m128i *buf = (__m128i*)S->buf;
|
||||
const int bsize = BLAKE2S_BLOCKBYTES;
|
||||
__m128i *input = (__m128i*)in;
|
||||
__m128i *buf = (__m128i*)S->buf;
|
||||
|
||||
while( inlen > 0 )
|
||||
{
|
||||
size_t left = S->buflen;
|
||||
if( inlen >= bsize - left )
|
||||
if( inlen >= BLAKE2S_BLOCKBYTES - left )
|
||||
{
|
||||
memcpy_128( buf + (left>>2), input, (bsize - left) >> 2 );
|
||||
S->buflen += bsize - left;
|
||||
memcpy_128( buf + (left>>2), input, (BLAKE2S_BLOCKBYTES - left) >> 2 );
|
||||
S->buflen += BLAKE2S_BLOCKBYTES - left;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
blake2s_4way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
input += ( bsize >> 2 );
|
||||
inlen -= bsize;
|
||||
input += ( BLAKE2S_BLOCKBYTES >> 2 );
|
||||
inlen -= BLAKE2S_BLOCKBYTES;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -195,8 +222,45 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Update and final when inlen is a multiple of 64 bytes
|
||||
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
|
||||
const void *input, uint64_t inlen )
|
||||
{
|
||||
__m128i *in = (__m128i*)input;
|
||||
__m128i *buf = (__m128i*)S->buf;
|
||||
|
||||
while( inlen > BLAKE2S_BLOCKBYTES )
|
||||
{
|
||||
memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
|
||||
S->buflen = BLAKE2S_BLOCKBYTES;
|
||||
inlen -= BLAKE2S_BLOCKBYTES;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
blake2s_4way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
in += ( BLAKE2S_BLOCKBYTES >> 2 );
|
||||
}
|
||||
|
||||
// last block
|
||||
memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
|
||||
S->buflen = BLAKE2S_BLOCKBYTES;
|
||||
S->t[0] += S->buflen;
|
||||
S->t[1] += ( S->t[0] < S->buflen );
|
||||
if ( S->last_node ) S->f[1] = ~0U;
|
||||
S->f[0] = ~0U;
|
||||
blake2s_4way_compress( S, buf );
|
||||
|
||||
for ( int i = 0; i < 8; ++i )
|
||||
casti_m128i( out, i ) = S->h[ i ];
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// The commented code below is slower on Intel but faster on
|
||||
// Zen1 AVX2. It's also faster than Zen1 AVX.
|
||||
// Ryzen gen2 is unknown at this time.
|
||||
|
||||
int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
|
||||
{
|
||||
__m256i m[16];
|
||||
@@ -205,6 +269,23 @@ int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
|
||||
memcpy_256( m, block, 16 );
|
||||
memcpy_256( v, S->h, 8 );
|
||||
|
||||
v[ 8] = m256_const1_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = m256_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
|
||||
m256_const1_64( 0x510E527F510E527FULL ) );
|
||||
|
||||
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
|
||||
m256_const1_64( 0x9B05688C9B05688CULL ) );
|
||||
|
||||
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
|
||||
m256_const1_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
|
||||
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
|
||||
m256_const1_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
/*
|
||||
v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
|
||||
v[ 9] = _mm256_set1_epi32( blake2s_IV[1] );
|
||||
v[10] = _mm256_set1_epi32( blake2s_IV[2] );
|
||||
@@ -218,6 +299,7 @@ int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
|
||||
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
|
||||
_mm256_set1_epi32( blake2s_IV[7] ) );
|
||||
|
||||
|
||||
#define G8W(r,i,a,b,c,d) \
|
||||
do { \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
|
||||
@@ -231,7 +313,36 @@ do { \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
#define G8W( sigma0, sigma1, a, b, c, d) \
|
||||
do { \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s1 ] ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
} while(0)
|
||||
|
||||
#define ROUND8W(r) \
|
||||
do { \
|
||||
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
|
||||
G8W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G8W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G8W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
|
||||
G8W( sigma[ 6], sigma[ 7], v[ 3], v[ 7], v[11], v[15] ); \
|
||||
G8W( sigma[ 8], sigma[ 9], v[ 0], v[ 5], v[10], v[15] ); \
|
||||
G8W( sigma[10], sigma[11], v[ 1], v[ 6], v[11], v[12] ); \
|
||||
G8W( sigma[12], sigma[13], v[ 2], v[ 7], v[ 8], v[13] ); \
|
||||
G8W( sigma[14], sigma[15], v[ 3], v[ 4], v[ 9], v[14] ); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
#define ROUND8W(r) \
|
||||
do { \
|
||||
G8W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
@@ -243,6 +354,7 @@ do { \
|
||||
G8W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
|
||||
G8W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
ROUND8W( 0 );
|
||||
ROUND8W( 1 );
|
||||
@@ -354,6 +466,168 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
|
||||
|
||||
#endif // __AVX2__
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Blake2s-256 16 way
|
||||
|
||||
int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
|
||||
{
|
||||
__m512i m[16];
|
||||
__m512i v[16];
|
||||
|
||||
memcpy_512( m, block, 16 );
|
||||
memcpy_512( v, S->h, 8 );
|
||||
|
||||
v[ 8] = m512_const1_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = m512_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
|
||||
m512_const1_64( 0x510E527F510E527FULL ) );
|
||||
|
||||
v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
|
||||
m512_const1_64( 0x9B05688C9B05688CULL ) );
|
||||
|
||||
v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
|
||||
m512_const1_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
|
||||
v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
|
||||
m512_const1_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
|
||||
#define G16W( sigma0, sigma1, a, b, c, d) \
|
||||
do { \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
|
||||
c = _mm512_add_epi32( c, d ); \
|
||||
b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
|
||||
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s1 ] ); \
|
||||
d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
|
||||
c = _mm512_add_epi32( c, d ); \
|
||||
b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
|
||||
} while(0)
|
||||
|
||||
#define ROUND16W(r) \
|
||||
do { \
|
||||
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
|
||||
G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
|
||||
G16W( sigma[ 6], sigma[ 7], v[ 3], v[ 7], v[11], v[15] ); \
|
||||
G16W( sigma[ 8], sigma[ 9], v[ 0], v[ 5], v[10], v[15] ); \
|
||||
G16W( sigma[10], sigma[11], v[ 1], v[ 6], v[11], v[12] ); \
|
||||
G16W( sigma[12], sigma[13], v[ 2], v[ 7], v[ 8], v[13] ); \
|
||||
G16W( sigma[14], sigma[15], v[ 3], v[ 4], v[ 9], v[14] ); \
|
||||
} while(0)
|
||||
|
||||
ROUND16W( 0 );
|
||||
ROUND16W( 1 );
|
||||
ROUND16W( 2 );
|
||||
ROUND16W( 3 );
|
||||
ROUND16W( 4 );
|
||||
ROUND16W( 5 );
|
||||
ROUND16W( 6 );
|
||||
ROUND16W( 7 );
|
||||
ROUND16W( 8 );
|
||||
ROUND16W( 9 );
|
||||
|
||||
for( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] );
|
||||
|
||||
#undef G16W
|
||||
#undef ROUND16W
|
||||
return 0;
|
||||
}
|
||||
|
||||
int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )
|
||||
{
|
||||
blake2s_nway_param P[1];
|
||||
|
||||
P->digest_length = outlen;
|
||||
P->key_length = 0;
|
||||
P->fanout = 1;
|
||||
P->depth = 1;
|
||||
P->leaf_length = 0;
|
||||
*((uint64_t*)(P->node_offset)) = 0;
|
||||
P->node_depth = 0;
|
||||
P->inner_length = 0;
|
||||
memset( P->salt, 0, sizeof( P->salt ) );
|
||||
memset( P->personal, 0, sizeof( P->personal ) );
|
||||
|
||||
memset( S, 0, sizeof( blake2s_16way_state ) );
|
||||
S->h[0] = m512_const1_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = m512_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = m512_const1_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = m512_const1_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = m512_const1_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = m512_const1_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
/* IV XOR ParamBlock */
|
||||
for ( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm512_xor_si512( S->h[i], _mm512_set1_epi32( p[i] ) );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int blake2s_16way_update( blake2s_16way_state *S, const void *in,
|
||||
uint64_t inlen )
|
||||
{
|
||||
__m512i *input = (__m512i*)in;
|
||||
__m512i *buf = (__m512i*)S->buf;
|
||||
const int bsize = BLAKE2S_BLOCKBYTES;
|
||||
|
||||
while( inlen > 0 )
|
||||
{
|
||||
size_t left = S->buflen;
|
||||
if( inlen >= bsize - left )
|
||||
{
|
||||
memcpy_512( buf + (left>>2), input, (bsize - left) >> 2 );
|
||||
S->buflen += bsize - left;
|
||||
S->t[0] += BLAKE2S_BLOCKBYTES;
|
||||
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
|
||||
blake2s_16way_compress( S, buf );
|
||||
S->buflen = 0;
|
||||
input += ( bsize >> 2 );
|
||||
inlen -= bsize;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy_512( buf + ( left>>2 ), input, inlen>>2 );
|
||||
S->buflen += (size_t) inlen;
|
||||
input += ( inlen>>2 );
|
||||
inlen -= inlen;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen )
|
||||
{
|
||||
__m512i *buf = (__m512i*)S->buf;
|
||||
|
||||
S->t[0] += S->buflen;
|
||||
S->t[1] += ( S->t[0] < S->buflen );
|
||||
if ( S->last_node )
|
||||
S->f[1] = ~0U;
|
||||
S->f[0] = ~0U;
|
||||
|
||||
memset_zero_512( buf + ( S->buflen>>2 ),
|
||||
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
|
||||
blake2s_16way_compress( S, buf );
|
||||
|
||||
for ( int i = 0; i < 8; ++i )
|
||||
casti_m512i( out, i ) = S->h[ i ];
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
#if 0
|
||||
int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
|
||||
{
|
||||
|
@@ -75,6 +75,9 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen );
|
||||
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
|
||||
uint64_t inlen );
|
||||
int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
|
||||
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
|
||||
const void *input, uint64_t inlen );
|
||||
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
@@ -92,6 +95,27 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
|
||||
int blake2s_8way_update( blake2s_8way_state *S, const void *in,
|
||||
uint64_t inlen );
|
||||
int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
|
||||
//int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
|
||||
// const void *input, uint64_t inlen );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
ALIGN( 128 ) typedef struct __blake2s_16way_state
|
||||
{
|
||||
__m512i h[8];
|
||||
uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ];
|
||||
uint32_t t[2];
|
||||
uint32_t f[2];
|
||||
size_t buflen;
|
||||
uint8_t last_node;
|
||||
} blake2s_16way_state ;
|
||||
|
||||
int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen );
|
||||
int blake2s_16way_update( blake2s_16way_state *S, const void *in,
|
||||
uint64_t inlen );
|
||||
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -70,18 +70,3 @@ int scanhash_blake2s( struct work *work,
|
||||
|
||||
return 0;
|
||||
}
|
||||
/*
|
||||
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
|
||||
int64_t blake2s_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
}
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_blake2s;
|
||||
gate->hash = (void*)&blake2s_hash;
|
||||
gate->get_max64 = (void*)&blake2s_get_max64;
|
||||
return true;
|
||||
};
|
||||
*/
|
||||
|
@@ -54,7 +54,6 @@ extern "C"{
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
|
||||
// Blake-512
|
||||
|
||||
static const sph_u64 IV512[8] = {
|
||||
@@ -64,6 +63,7 @@ static const sph_u64 IV512[8] = {
|
||||
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
|
||||
};
|
||||
|
||||
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
|
||||
|
||||
@@ -264,8 +264,6 @@ static const unsigned sigma[16][16] = {
|
||||
#define Mx_(n) Mx__(n)
|
||||
#define Mx__(n) M ## n
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
#define CBx(r, i) CBx_(Z ## r ## i)
|
||||
#define CBx_(n) CBx__(n)
|
||||
#define CBx__(n) CB ## n
|
||||
@@ -287,6 +285,7 @@ static const unsigned sigma[16][16] = {
|
||||
#define CBE SPH_C64(0x0801F2E2858EFC16)
|
||||
#define CBF SPH_C64(0x636920D871574E69)
|
||||
|
||||
/*
|
||||
#if SPH_COMPACT_BLAKE_64
|
||||
// not used
|
||||
static const sph_u64 CB[16] = {
|
||||
@@ -301,7 +300,301 @@ static const sph_u64 CB[16] = {
|
||||
};
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
#define READ_STATE64(state) do { \
|
||||
H0 = (state)->H[0]; \
|
||||
H1 = (state)->H[1]; \
|
||||
H2 = (state)->H[2]; \
|
||||
H3 = (state)->H[3]; \
|
||||
H4 = (state)->H[4]; \
|
||||
H5 = (state)->H[5]; \
|
||||
H6 = (state)->H[6]; \
|
||||
H7 = (state)->H[7]; \
|
||||
S0 = (state)->S[0]; \
|
||||
S1 = (state)->S[1]; \
|
||||
S2 = (state)->S[2]; \
|
||||
S3 = (state)->S[3]; \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE64(state) do { \
|
||||
(state)->H[0] = H0; \
|
||||
(state)->H[1] = H1; \
|
||||
(state)->H[2] = H2; \
|
||||
(state)->H[3] = H3; \
|
||||
(state)->H[4] = H4; \
|
||||
(state)->H[5] = H5; \
|
||||
(state)->H[6] = H6; \
|
||||
(state)->H[7] = H7; \
|
||||
(state)->S[0] = S0; \
|
||||
(state)->S[1] = S1; \
|
||||
(state)->S[2] = S2; \
|
||||
(state)->S[3] = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Blake-512 8 way
|
||||
|
||||
#define GB_8WAY(m0, m1, c0, c1, a, b, c, d) do { \
|
||||
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
|
||||
_mm512_set1_epi64( c1 ), m0 ), b ), a ); \
|
||||
d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \
|
||||
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
|
||||
_mm512_set1_epi64( c0 ), m1 ), b ), a ); \
|
||||
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \
|
||||
} while (0)
|
||||
|
||||
#define ROUND_B_8WAY(r) do { \
|
||||
GB_8WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
|
||||
GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
|
||||
GB_8WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
|
||||
GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
|
||||
GB_8WAY(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
|
||||
GB_8WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
|
||||
GB_8WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
|
||||
GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define DECL_STATE64_8WAY \
|
||||
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m512i S0, S1, S2, S3; \
|
||||
sph_u64 T0, T1;
|
||||
|
||||
#define COMPRESS64_8WAY do \
|
||||
{ \
|
||||
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
__m512i shuf_bswap64; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) ); \
|
||||
V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) ); \
|
||||
VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) ); \
|
||||
VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) ); \
|
||||
VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
|
||||
m512_const1_64( CB4 ) ); \
|
||||
VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
|
||||
m512_const1_64( CB5 ) ); \
|
||||
VE = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
|
||||
m512_const1_64( CB6 ) ); \
|
||||
VF = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
|
||||
m512_const1_64( CB7 ) ); \
|
||||
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
|
||||
0x28292a2b2c2d2e2f, 0x2021222324252627, \
|
||||
0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
||||
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
|
||||
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
|
||||
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
|
||||
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
|
||||
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
|
||||
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
|
||||
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
|
||||
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
|
||||
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
|
||||
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
|
||||
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
|
||||
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
|
||||
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
|
||||
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
|
||||
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
|
||||
ROUND_B_8WAY(0); \
|
||||
ROUND_B_8WAY(1); \
|
||||
ROUND_B_8WAY(2); \
|
||||
ROUND_B_8WAY(3); \
|
||||
ROUND_B_8WAY(4); \
|
||||
ROUND_B_8WAY(5); \
|
||||
ROUND_B_8WAY(6); \
|
||||
ROUND_B_8WAY(7); \
|
||||
ROUND_B_8WAY(8); \
|
||||
ROUND_B_8WAY(9); \
|
||||
ROUND_B_8WAY(0); \
|
||||
ROUND_B_8WAY(1); \
|
||||
ROUND_B_8WAY(2); \
|
||||
ROUND_B_8WAY(3); \
|
||||
ROUND_B_8WAY(4); \
|
||||
ROUND_B_8WAY(5); \
|
||||
H0 = mm512_xor4( V8, V0, S0, H0 ); \
|
||||
H1 = mm512_xor4( V9, V1, S1, H1 ); \
|
||||
H2 = mm512_xor4( VA, V2, S2, H2 ); \
|
||||
H3 = mm512_xor4( VB, V3, S3, H3 ); \
|
||||
H4 = mm512_xor4( VC, V4, S0, H4 ); \
|
||||
H5 = mm512_xor4( VD, V5, S1, H5 ); \
|
||||
H6 = mm512_xor4( VE, V6, S2, H6 ); \
|
||||
H7 = mm512_xor4( VF, V7, S3, H7 ); \
|
||||
} while (0)
|
||||
|
||||
static void
|
||||
blake64_8way_init( blake_8way_big_context *sc, const sph_u64 *iv,
|
||||
const sph_u64 *salt )
|
||||
{
|
||||
__m512i zero = m512_zero;
|
||||
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
casti_m512i( sc->S, 0 ) = zero;
|
||||
casti_m512i( sc->S, 1 ) = zero;
|
||||
casti_m512i( sc->S, 2 ) = zero;
|
||||
casti_m512i( sc->S, 3 ) = zero;
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
blake64_8way( blake_8way_big_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
__m512i *buf;
|
||||
size_t ptr;
|
||||
DECL_STATE64_8WAY
|
||||
|
||||
const int buf_size = 128; // sizeof/8
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
if ( len < (buf_size - ptr) )
|
||||
{
|
||||
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE64(sc);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
if ( ( T0 = SPH_T64(T0 + 1024) ) < 1024 )
|
||||
T1 = SPH_T64(T1 + 1);
|
||||
COMPRESS64_8WAY;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE64(sc);
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
blake64_8way_close( blake_8way_big_context *sc,
|
||||
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
|
||||
{
|
||||
__m512i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
uint64_t z, zz;
|
||||
sph_u64 th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
z = 0x80 >> n;
|
||||
zz = ((ub & -z) | z) & 0xFF;
|
||||
buf[ptr>>3] = _mm512_set1_epi64( zz );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
|
||||
sc->T1 = SPH_T64(sc->T1 - 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
}
|
||||
if ( ptr <= 104 )
|
||||
{
|
||||
memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
buf[(104>>3)] = _mm512_or_si512( buf[(104>>3)],
|
||||
m512_const1_64( 0x0100000000000000ULL ) );
|
||||
*(buf+(112>>3)) = _mm512_set1_epi64( bswap_64( th ) );
|
||||
*(buf+(120>>3)) = _mm512_set1_epi64( bswap_64( tl ) );
|
||||
|
||||
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_512( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
|
||||
|
||||
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
memset_zero_512( buf, 112>>3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
|
||||
*(buf+(112>>3)) = _mm512_set1_epi64( bswap_64( th ) );
|
||||
*(buf+(120>>3)) = _mm512_set1_epi64( bswap_64( tl ) );
|
||||
|
||||
blake64_8way( sc, buf, 128 );
|
||||
}
|
||||
mm512_block_bswap_64( (__m512i*)dst, sc->H );
|
||||
}
|
||||
|
||||
void
|
||||
blake512_8way_init(void *cc)
|
||||
{
|
||||
blake64_8way_init(cc, IV512, salt_zero_big);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake64_8way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_8way_close(void *cc, void *dst)
|
||||
{
|
||||
blake512_8way_addbits_and_close(cc, 0, 0, dst);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
blake64_8way_close(cc, ub, n, dst, 8);
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
@@ -318,29 +611,6 @@ static const sph_u64 CB[16] = {
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
|
||||
} while (0)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_64
|
||||
// not used
|
||||
#define ROUND_B_4WAY(r) do { \
|
||||
GB_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
|
||||
CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
|
||||
GB_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \
|
||||
CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \
|
||||
GB_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \
|
||||
CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \
|
||||
GB_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \
|
||||
CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \
|
||||
GB_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \
|
||||
CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \
|
||||
GB_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \
|
||||
CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \
|
||||
GB_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \
|
||||
CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \
|
||||
GB_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \
|
||||
CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
//current_impl
|
||||
#define ROUND_B_4WAY(r) do { \
|
||||
GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
|
||||
GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
|
||||
@@ -352,118 +622,11 @@ static const sph_u64 CB[16] = {
|
||||
GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
#define DECL_STATE64_4WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m256i S0, S1, S2, S3; \
|
||||
sph_u64 T0, T1;
|
||||
|
||||
#define READ_STATE64_4WAY(state) do { \
|
||||
H0 = (state)->H[0]; \
|
||||
H1 = (state)->H[1]; \
|
||||
H2 = (state)->H[2]; \
|
||||
H3 = (state)->H[3]; \
|
||||
H4 = (state)->H[4]; \
|
||||
H5 = (state)->H[5]; \
|
||||
H6 = (state)->H[6]; \
|
||||
H7 = (state)->H[7]; \
|
||||
S0 = (state)->S[0]; \
|
||||
S1 = (state)->S[1]; \
|
||||
S2 = (state)->S[2]; \
|
||||
S3 = (state)->S[3]; \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE64_4WAY(state) do { \
|
||||
(state)->H[0] = H0; \
|
||||
(state)->H[1] = H1; \
|
||||
(state)->H[2] = H2; \
|
||||
(state)->H[3] = H3; \
|
||||
(state)->H[4] = H4; \
|
||||
(state)->H[5] = H5; \
|
||||
(state)->H[6] = H6; \
|
||||
(state)->H[7] = H7; \
|
||||
(state)->S[0] = S0; \
|
||||
(state)->S[1] = S1; \
|
||||
(state)->S[2] = S2; \
|
||||
(state)->S[3] = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_64
|
||||
|
||||
// not used
|
||||
#define COMPRESS64_4WAY do { \
|
||||
__m256i M[16]; \
|
||||
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
unsigned r; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
|
||||
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
|
||||
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
|
||||
M[0x0] = mm256_bswap_64( *(buf+0) ); \
|
||||
M[0x1] = mm256_bswap_64( *(buf+1) ); \
|
||||
M[0x2] = mm256_bswap_64( *(buf+2) ); \
|
||||
M[0x3] = mm256_bswap_64( *(buf+3) ); \
|
||||
M[0x4] = mm256_bswap_64( *(buf+4) ); \
|
||||
M[0x5] = mm256_bswap_64( *(buf+5) ); \
|
||||
M[0x6] = mm256_bswap_64( *(buf+6) ); \
|
||||
M[0x7] = mm256_bswap_64( *(buf+7) ); \
|
||||
M[0x8] = mm256_bswap_64( *(buf+8) ); \
|
||||
M[0x9] = mm256_bswap_64( *(buf+9) ); \
|
||||
M[0xA] = mm256_bswap_64( *(buf+10) ); \
|
||||
M[0xB] = mm256_bswap_64( *(buf+11) ); \
|
||||
M[0xC] = mm256_bswap_64( *(buf+12) ); \
|
||||
M[0xD] = mm256_bswap_64( *(buf+13) ); \
|
||||
M[0xE] = mm256_bswap_64( *(buf+14) ); \
|
||||
M[0xF] = mm256_bswap_64( *(buf+15) ); \
|
||||
for (r = 0; r < 16; r ++) \
|
||||
ROUND_B_4WAY(r); \
|
||||
H0 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
|
||||
H1 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
|
||||
H2 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
|
||||
H3 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
|
||||
H4 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
|
||||
H5 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
|
||||
H6 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
|
||||
H7 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
|
||||
//current impl
|
||||
|
||||
#define COMPRESS64_4WAY do \
|
||||
{ \
|
||||
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
@@ -491,7 +654,7 @@ static const sph_u64 CB[16] = {
|
||||
m256_const1_64( CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
|
||||
m256_const1_64( CB7 ) ); \
|
||||
shuf_bswap64 = m256_const_64( 0x08090a0b0c0d0e0f, 0x0001020304050607, \
|
||||
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
||||
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
|
||||
@@ -535,9 +698,7 @@ static const sph_u64 CB[16] = {
|
||||
H7 = mm256_xor4( VF, V7, S3, H7 ); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
//static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
|
||||
static void
|
||||
blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
|
||||
@@ -582,7 +743,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE64_4WAY(sc);
|
||||
READ_STATE64(sc);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
@@ -602,7 +763,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE64_4WAY(sc);
|
||||
WRITE_STATE64(sc);
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
@@ -620,7 +781,7 @@ blake64_4way_close( blake_4way_big_context *sc,
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
z = 0x80 >> n;
|
||||
zz = ((ub & -z) | z) & 0xFF;
|
||||
buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
|
||||
buf[ptr>>3] = _mm256_set1_epi64x( zz );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
@@ -673,7 +834,7 @@ blake512_4way_init(void *cc)
|
||||
}
|
||||
|
||||
void
|
||||
blake512_4way(void *cc, const void *data, size_t len)
|
||||
blake512_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake64_4way(cc, data, len);
|
||||
}
|
||||
|
@@ -1,13 +1,6 @@
|
||||
#include "blakecoin-gate.h"
|
||||
#include <memory.h>
|
||||
|
||||
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
|
||||
int64_t blakecoin_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
// return 0x3fffffLL;
|
||||
}
|
||||
|
||||
// vanilla uses default gen merkle root, otherwise identical to blakecoin
|
||||
bool register_vanilla_algo( algo_gate_t* gate )
|
||||
{
|
||||
@@ -23,7 +16,6 @@ bool register_vanilla_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&blakecoinhash;
|
||||
#endif
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT;
|
||||
gate->get_max64 = (void*)&blakecoin_get_max64;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@@ -93,33 +93,3 @@ int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
|
||||
{
|
||||
SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
|
||||
}
|
||||
*/
|
||||
/*
|
||||
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
|
||||
int64_t blakecoin_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
}
|
||||
|
||||
// vanilla uses default gen merkle root, otherwise identical to blakecoin
|
||||
bool register_vanilla_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_blakecoin;
|
||||
gate->hash = (void*)&blakecoinhash;
|
||||
gate->get_max64 = (void*)&blakecoin_get_max64;
|
||||
blakecoin_init( &blake_init_ctx );
|
||||
return true;
|
||||
}
|
||||
|
||||
bool register_blakecoin_algo( algo_gate_t* gate )
|
||||
{
|
||||
register_vanilla_algo( gate );
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
@@ -38,7 +38,7 @@ void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
|
||||
if (!have_longpoll && work->height > *net_blocks + 1)
|
||||
{
|
||||
char netinfo[64] = { 0 };
|
||||
if (opt_showdiff && net_diff > 0.)
|
||||
if ( net_diff > 0. )
|
||||
{
|
||||
if (net_diff != work->targetdiff)
|
||||
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
|
||||
@@ -154,7 +154,6 @@ bool register_decred_algo( algo_gate_t* gate )
|
||||
#endif
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->get_nonceptr = (void*)&decred_get_nonceptr;
|
||||
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
|
||||
gate->decode_extra_data = (void*)&decred_decode_extradata;
|
||||
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
|
||||
gate->work_decode = (void*)&std_be_work_decode;
|
||||
|
@@ -143,7 +143,7 @@ void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
|
||||
if (!have_longpoll && work->height > *net_blocks + 1)
|
||||
{
|
||||
char netinfo[64] = { 0 };
|
||||
if (opt_showdiff && net_diff > 0.)
|
||||
if (net_diff > 0.)
|
||||
{
|
||||
if (net_diff != work->targetdiff)
|
||||
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
|
||||
@@ -269,7 +269,6 @@ bool register_decred_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_decred;
|
||||
gate->hash = (void*)&decred_hash;
|
||||
gate->get_nonceptr = (void*)&decred_get_nonceptr;
|
||||
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
|
||||
gate->decode_extra_data = (void*)&decred_decode_extradata;
|
||||
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
|
||||
gate->work_decode = (void*)&std_be_work_decode;
|
||||
|
@@ -10,7 +10,6 @@ bool register_pentablake_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&pentablakehash;
|
||||
#endif
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -78,7 +78,7 @@ void bmw256_4way_addbits_and_close(
|
||||
// BMW-256 8 way 32
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[64];
|
||||
__m256i buf[16];
|
||||
__m256i H[16];
|
||||
size_t ptr;
|
||||
uint32_t bit_count; // assume bit_count fits in 32 bits
|
||||
@@ -107,7 +107,8 @@ typedef struct {
|
||||
typedef bmw_2way_big_context bmw512_2way_context;
|
||||
|
||||
void bmw512_2way_init( bmw512_2way_context *ctx );
|
||||
void bmw512_2way( bmw512_2way_context *ctx, const void *data, size_t len );
|
||||
void bmw512_2way_update( bmw512_2way_context *ctx, const void *data,
|
||||
size_t len );
|
||||
void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );
|
||||
|
||||
#endif // __SSE2__
|
||||
@@ -121,14 +122,15 @@ typedef struct {
|
||||
__m256i H[16];
|
||||
size_t ptr;
|
||||
sph_u64 bit_count;
|
||||
} bmw_4way_big_context;
|
||||
} bmw_4way_big_context __attribute__((aligned(128)));
|
||||
|
||||
typedef bmw_4way_big_context bmw512_4way_context;
|
||||
|
||||
|
||||
void bmw512_4way_init(void *cc);
|
||||
|
||||
void bmw512_4way(void *cc, const void *data, size_t len);
|
||||
void bmw512_4way_update(void *cc, const void *data, size_t len);
|
||||
#define bmw512_4way bmw512_4way_update
|
||||
|
||||
void bmw512_4way_close(void *cc, void *dst);
|
||||
|
||||
@@ -137,6 +139,22 @@ void bmw512_4way_addbits_and_close(
|
||||
|
||||
#endif // __AVX2__
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[16];
|
||||
size_t ptr;
|
||||
uint64_t bit_count;
|
||||
} bmw512_8way_context __attribute__((aligned(128)));
|
||||
|
||||
void bmw512_8way_init( bmw512_8way_context *ctx );
|
||||
void bmw512_8way_update( bmw512_8way_context *ctx, const void *data,
|
||||
size_t len );
|
||||
void bmw512_8way_close( bmw512_8way_context *ctx, void *dst );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -137,165 +137,151 @@ static const uint32_t IV256[] = {
|
||||
ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_s( M, H, (i)-16 ) )
|
||||
|
||||
// Expressions are grouped using associativity to reduce CPU depenedencies,
|
||||
// resulting in some sign changes compared to the reference code.
|
||||
|
||||
#define Ws0 \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_xor_si128( M[13], H[13] ) ), \
|
||||
_mm_xor_si128( M[14], H[14] ) )
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_add_epi32( _mm_xor_si128( M[13], H[13] ), \
|
||||
_mm_xor_si128( M[14], H[14] ) ) )
|
||||
|
||||
#define Ws1 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ), \
|
||||
_mm_xor_si128( M[14], H[14] ) ), \
|
||||
_mm_xor_si128( M[15], H[15] ) )
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[14], H[14] ), \
|
||||
_mm_xor_si128( M[15], H[15] ) ) )
|
||||
|
||||
#define Ws2 \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_xor_si128( M[12], H[12] ) ), \
|
||||
_mm_xor_si128( M[15], H[15] ) )
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
|
||||
_mm_xor_si128( M[15], H[15] ) ) )
|
||||
|
||||
#define Ws3 \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 1], H[ 1] ) ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_xor_si128( M[13], H[13] ) )
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 1], H[ 1] ) ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[10], H[10] ), \
|
||||
_mm_xor_si128( M[13], H[13] ) ) )
|
||||
|
||||
#define Ws4 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ), \
|
||||
_mm_xor_si128( M[14], H[14] ) )
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
|
||||
_mm_xor_si128( M[14], H[14] ) ) )
|
||||
|
||||
#define Ws5 \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_xor_si128( M[12], H[12] ) ), \
|
||||
_mm_xor_si128( M[15], H[15] ) )
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
|
||||
_mm_xor_si128( M[15], H[15] ) ) )
|
||||
|
||||
#define Ws6 \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
|
||||
_mm_xor_si128( M[ 0], H[ 0] ) ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ), \
|
||||
_mm_xor_si128( M[13], H[13] ) )
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
|
||||
_mm_xor_si128( M[ 0], H[ 0] ) ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[11], H[11] ), \
|
||||
_mm_xor_si128( M[13], H[13] ) ) )
|
||||
|
||||
#define Ws7 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[12], H[12] ) ), \
|
||||
_mm_xor_si128( M[14], H[14] ) )
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_add_epi32( _mm_xor_si128( M[12], H[12] ), \
|
||||
_mm_xor_si128( M[14], H[14] ) ) )
|
||||
|
||||
#define Ws8 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_xor_si128( M[13], H[13] ) ), \
|
||||
_mm_xor_si128( M[15], H[15] ) )
|
||||
|
||||
#define Ws9 \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[14], H[14] ) )
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[13], H[13] ), \
|
||||
_mm_xor_si128( M[15], H[15] ) ) )
|
||||
#define Ws9 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
|
||||
_mm_xor_si128( M[14], H[14] ) ) )
|
||||
|
||||
#define Ws10 \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
|
||||
_mm_xor_si128( M[ 1], H[ 1] ) ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[15], H[15] ) )
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
|
||||
_mm_xor_si128( M[ 1], H[ 1] ) ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
|
||||
_mm_xor_si128( M[15], H[15] ) ) )
|
||||
|
||||
#define Ws11 \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
|
||||
_mm_xor_si128( M[ 0], H[ 0] ) ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) )
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
|
||||
_mm_xor_si128( M[ 0], H[ 0] ) ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ) )
|
||||
|
||||
#define Ws12 \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) )
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ) )
|
||||
|
||||
#define Ws13 \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) )
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_add_epi32( _mm_xor_si128( M[10], H[10] ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ) )
|
||||
|
||||
#define Ws14 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ), \
|
||||
_mm_xor_si128( M[12], H[12] ) )
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
|
||||
_mm_xor_si128( M[12], H[12] ) ) )
|
||||
|
||||
#define Ws15 \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_xor_si128( M[13], H[13] ) )
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
|
||||
_mm_xor_si128( M[ 4], H[4] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
|
||||
_mm_xor_si128( M[13], H[13] ) ) )
|
||||
|
||||
|
||||
void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
|
||||
@@ -700,163 +686,148 @@ bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
|
||||
#define W8s0 \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_add_epi32( _mm256_xor_si256( M[13], H[13] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
|
||||
#define W8s1 \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 6], H[ 6] ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 6], H[ 6] ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[14], H[14] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
|
||||
#define W8s2 \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_add_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_add_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
|
||||
#define W8s3 \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) )
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[10], H[10] ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
||||
|
||||
#define W8s4 \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
_mm256_add_epi32( \
|
||||
_mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_add_epi32( _mm256_xor_si256( M[11], H[11] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
|
||||
#define W8s5 \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
|
||||
#define W8s6 \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 4], H[ 4] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) )
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 4], H[ 4] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[11], H[11] ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
||||
|
||||
#define W8s7 \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_add_epi32( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
|
||||
#define W8s8 \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[13], H[13] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
|
||||
#define W8s9 \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 7], H[ 7] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
|
||||
#define W8s10 \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 7], H[ 7] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
|
||||
#define W8s11 \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) )
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ) )
|
||||
|
||||
#define W8s12 \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) )
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_add_epi32( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 9], H[ 9] ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ) )
|
||||
|
||||
#define W8s13 \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_add_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) )
|
||||
_mm256_add_epi32( \
|
||||
_mm256_add_epi32( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_add_epi32( _mm256_xor_si256( M[10], H[10] ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ) )
|
||||
|
||||
#define W8s14 \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) )
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_add_epi32( _mm256_xor_si256( M[11], H[11] ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ) )
|
||||
|
||||
#define W8s15 \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[ 4], H[4] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) )
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[ 4], H[4] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi32( _mm256_xor_si256( M[ 9], H[ 9] ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
||||
|
||||
|
||||
void compress_small_8way( const __m256i *M, const __m256i H[16],
|
||||
__m256i dH[16] )
|
||||
|
@@ -1,13 +1,66 @@
|
||||
#include "bmw512-gate.h"
|
||||
|
||||
#ifdef BMW512_4WAY
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
//#include "sph_keccak.h"
|
||||
#include "bmw-hash-4way.h"
|
||||
|
||||
#if defined(BMW512_8WAY)
|
||||
|
||||
void bmw512hash_8way(void *state, const void *input)
|
||||
{
|
||||
bmw512_8way_context ctx;
|
||||
bmw512_8way_init( &ctx );
|
||||
bmw512_8way_update( &ctx, input, 80 );
|
||||
bmw512_8way_close( &ctx, state );
|
||||
}
|
||||
|
||||
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[16*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
// const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0 ,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
bmw512hash_8way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#elif defined(BMW512_4WAY)
|
||||
|
||||
//#ifdef BMW512_4WAY
|
||||
|
||||
void bmw512hash_4way(void *state, const void *input)
|
||||
{
|
||||
bmw512_4way_context ctx;
|
||||
|
@@ -1,13 +1,13 @@
|
||||
#include "bmw512-gate.h"
|
||||
|
||||
int64_t bmw512_get_max64() { return 0x7ffffLL; }
|
||||
|
||||
bool register_bmw512_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->get_max64 = (void*)&bmw512_get_max64;
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
#if defined (BMW512_4WAY)
|
||||
#if defined (BMW512_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_bmw512_8way;
|
||||
gate->hash = (void*)&bmw512hash_8way;
|
||||
#elif defined (BMW512_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_bmw512_4way;
|
||||
gate->hash = (void*)&bmw512hash_4way;
|
||||
#else
|
||||
|
@@ -1,23 +1,33 @@
|
||||
#ifndef BMW512_GATE_H__
|
||||
#define BMW512_GATE_H__
|
||||
#define BMW512_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BMW512_8WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define BMW512_4WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(BMW512_4WAY)
|
||||
#if defined(BMW512_8WAY)
|
||||
|
||||
void bmw512hash_8way( void *state, const void *input );
|
||||
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(BMW512_4WAY)
|
||||
|
||||
void bmw512hash_4way( void *state, const void *input );
|
||||
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
#else
|
||||
|
||||
void bmw512hash( void *state, const void *input );
|
||||
int scanhash_bmw512( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@@ -556,18 +556,15 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
||||
compress_big_2way( buf, h, h2 );
|
||||
memcpy_128( buf, h2, 16 );
|
||||
compress_big_2way( buf, final_b2, h1 );
|
||||
memcpy( (__m128i*)dst, h1+16, 8 );
|
||||
memcpy( (__m128i*)dst, h1+8, 8 );
|
||||
}
|
||||
|
||||
#endif // __SSE2__
|
||||
|
||||
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// BMW-512 4 way 64
|
||||
|
||||
|
||||
#define sb0(x) \
|
||||
mm256_xor4( _mm256_srli_epi64( (x), 1), _mm256_slli_epi64( (x), 3), \
|
||||
mm256_rol_64( (x), 4), mm256_rol_64( (x),37) )
|
||||
@@ -636,165 +633,152 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
||||
sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b( M, H, (i)-16 ) )
|
||||
|
||||
|
||||
|
||||
#define Wb0 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[13], H[13] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
|
||||
#define Wb1 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[14], H[14] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
|
||||
#define Wb2 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
|
||||
#define Wb3 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) )
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[10], H[10] ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
||||
|
||||
#define Wb4 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
|
||||
#define Wb5 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
|
||||
#define Wb6 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) )
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[11], H[11] ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
||||
|
||||
#define Wb7 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
|
||||
#define Wb8 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[13], H[13] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
|
||||
#define Wb9 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ) )
|
||||
|
||||
#define Wb10 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 7], H[ 7] ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) ) )
|
||||
|
||||
#define Wb11 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) )
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ) )
|
||||
|
||||
#define Wb12 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) )
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ) )
|
||||
|
||||
#define Wb13 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) )
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[10], H[10] ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ) )
|
||||
|
||||
#define Wb14 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) )
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[11], H[11] ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ) )
|
||||
|
||||
#define Wb15 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[ 4], H[4] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) )
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[ 4], H[4] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 9], H[ 9] ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ) )
|
||||
|
||||
|
||||
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
{
|
||||
@@ -1060,7 +1044,7 @@ bmw512_4way_init(void *cc)
|
||||
}
|
||||
|
||||
void
|
||||
bmw512_4way(void *cc, const void *data, size_t len)
|
||||
bmw512_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
bmw64_4way(cc, data, len);
|
||||
}
|
||||
@@ -1079,6 +1063,483 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
|
||||
#endif // __AVX2__
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// BMW-512 8 WAY
|
||||
|
||||
#define s8b0(x) \
|
||||
mm512_xor4( _mm512_srli_epi64( (x), 1), _mm512_slli_epi64( (x), 3), \
|
||||
mm512_rol_64( (x), 4), mm512_rol_64( (x),37) )
|
||||
|
||||
#define s8b1(x) \
|
||||
mm512_xor4( _mm512_srli_epi64( (x), 1), _mm512_slli_epi64( (x), 2), \
|
||||
mm512_rol_64( (x),13), mm512_rol_64( (x),43) )
|
||||
|
||||
#define s8b2(x) \
|
||||
mm512_xor4( _mm512_srli_epi64( (x), 2), _mm512_slli_epi64( (x), 1), \
|
||||
mm512_rol_64( (x),19), mm512_rol_64( (x),53) )
|
||||
|
||||
#define s8b3(x) \
|
||||
mm512_xor4( _mm512_srli_epi64( (x), 2), _mm512_slli_epi64( (x), 2), \
|
||||
mm512_rol_64( (x),28), mm512_rol_64( (x),59) )
|
||||
|
||||
#define s8b4(x) \
|
||||
_mm512_xor_si512( (x), _mm512_srli_epi64( (x), 1 ) )
|
||||
|
||||
#define s8b5(x) \
|
||||
_mm512_xor_si512( (x), _mm512_srli_epi64( (x), 2 ) )
|
||||
|
||||
#define r8b1(x) mm512_rol_64( x, 5 )
|
||||
#define r8b2(x) mm512_rol_64( x, 11 )
|
||||
#define r8b3(x) mm512_rol_64( x, 27 )
|
||||
#define r8b4(x) mm512_rol_64( x, 32 )
|
||||
#define r8b5(x) mm512_rol_64( x, 37 )
|
||||
#define r8b6(x) mm512_rol_64( x, 43 )
|
||||
#define r8b7(x) mm512_rol_64( x, 53 )
|
||||
|
||||
#define rol8w_off_64( M, j, off ) \
|
||||
mm512_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
|
||||
( ( (j) + (off) ) & 0xF ) + 1 )
|
||||
|
||||
#define add_elt_b8( M, H, j ) \
|
||||
_mm512_xor_si512( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_add_epi64( rol8w_off_64( M, j, 0 ), \
|
||||
rol8w_off_64( M, j, 3 ) ), \
|
||||
rol8w_off_64( M, j, 10 ) ), \
|
||||
_mm512_set1_epi64( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
|
||||
H[ ( (j)+7 ) & 0xF ] )
|
||||
|
||||
#define expand1b8( qt, M, H, i ) \
|
||||
_mm512_add_epi64( mm512_add4_64( \
|
||||
mm512_add4_64( s8b1( qt[ (i)-16 ] ), s8b2( qt[ (i)-15 ] ), \
|
||||
s8b3( qt[ (i)-14 ] ), s8b0( qt[ (i)-13 ] )), \
|
||||
mm512_add4_64( s8b1( qt[ (i)-12 ] ), s8b2( qt[ (i)-11 ] ), \
|
||||
s8b3( qt[ (i)-10 ] ), s8b0( qt[ (i)- 9 ] )), \
|
||||
mm512_add4_64( s8b1( qt[ (i)- 8 ] ), s8b2( qt[ (i)- 7 ] ), \
|
||||
s8b3( qt[ (i)- 6 ] ), s8b0( qt[ (i)- 5 ] )), \
|
||||
mm512_add4_64( s8b1( qt[ (i)- 4 ] ), s8b2( qt[ (i)- 3 ] ), \
|
||||
s8b3( qt[ (i)- 2 ] ), s8b0( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b8( M, H, (i)-16 ) )
|
||||
|
||||
#define expand2b8( qt, M, H, i) \
|
||||
_mm512_add_epi64( mm512_add4_64( \
|
||||
mm512_add4_64( qt[ (i)-16 ], r8b1( qt[ (i)-15 ] ), \
|
||||
qt[ (i)-14 ], r8b2( qt[ (i)-13 ] ) ), \
|
||||
mm512_add4_64( qt[ (i)-12 ], r8b3( qt[ (i)-11 ] ), \
|
||||
qt[ (i)-10 ], r8b4( qt[ (i)- 9 ] ) ), \
|
||||
mm512_add4_64( qt[ (i)- 8 ], r8b5( qt[ (i)- 7 ] ), \
|
||||
qt[ (i)- 6 ], r8b6( qt[ (i)- 5 ] ) ), \
|
||||
mm512_add4_64( qt[ (i)- 4 ], r8b7( qt[ (i)- 3 ] ), \
|
||||
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b8( M, H, (i)-16 ) )
|
||||
|
||||
#define W8b0 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
|
||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
||||
_mm512_xor_si512( M[10], H[10] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[13], H[13] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
|
||||
#define W8b1 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 6], H[ 6] ), \
|
||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
||||
_mm512_xor_si512( M[11], H[11] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[14], H[14] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
|
||||
#define W8b2 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
||||
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
|
||||
#define W8b3 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
||||
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
|
||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[10], H[10] ), \
|
||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
||||
|
||||
#define W8b4 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
||||
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
|
||||
#define W8b5 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
|
||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
||||
_mm512_xor_si512( M[10], H[10] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
|
||||
#define W8b6 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 4], H[ 4] ), \
|
||||
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
|
||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[11], H[11] ), \
|
||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
||||
|
||||
#define W8b7 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
|
||||
#define W8b8 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
|
||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[13], H[13] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
|
||||
#define W8b9 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
|
||||
#define W8b10 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
|
||||
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
|
||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 7], H[ 7] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
|
||||
#define W8b11 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 8], H[ 8] ), \
|
||||
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
|
||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 5], H[ 5] ), \
|
||||
_mm512_xor_si512( M[ 9], H[ 9] ) ) )
|
||||
|
||||
#define W8b12 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
|
||||
_mm512_xor_si512( M[10], H[10] ) ) )
|
||||
|
||||
#define W8b13 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[ 2], H[ 2] ), \
|
||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[10], H[10] ), \
|
||||
_mm512_xor_si512( M[11], H[11] ) ) )
|
||||
|
||||
#define W8b14 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 3], H[ 3] ), \
|
||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
||||
_mm512_add_epi64( _mm512_xor_si512( M[11], H[11] ), \
|
||||
_mm512_xor_si512( M[12], H[12] ) ) )
|
||||
|
||||
#define W8b15 \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[ 4], H[4] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi64( _mm512_xor_si512( M[ 9], H[ 9] ), \
|
||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
||||
|
||||
void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
__m512i dH[16] )
|
||||
{
|
||||
__m512i qt[32], xl, xh;
|
||||
|
||||
qt[ 0] = _mm512_add_epi64( s8b0( W8b0 ), H[ 1] );
|
||||
qt[ 1] = _mm512_add_epi64( s8b1( W8b1 ), H[ 2] );
|
||||
qt[ 2] = _mm512_add_epi64( s8b2( W8b2 ), H[ 3] );
|
||||
qt[ 3] = _mm512_add_epi64( s8b3( W8b3 ), H[ 4] );
|
||||
qt[ 4] = _mm512_add_epi64( s8b4( W8b4 ), H[ 5] );
|
||||
qt[ 5] = _mm512_add_epi64( s8b0( W8b5 ), H[ 6] );
|
||||
qt[ 6] = _mm512_add_epi64( s8b1( W8b6 ), H[ 7] );
|
||||
qt[ 7] = _mm512_add_epi64( s8b2( W8b7 ), H[ 8] );
|
||||
qt[ 8] = _mm512_add_epi64( s8b3( W8b8 ), H[ 9] );
|
||||
qt[ 9] = _mm512_add_epi64( s8b4( W8b9 ), H[10] );
|
||||
qt[10] = _mm512_add_epi64( s8b0( W8b10), H[11] );
|
||||
qt[11] = _mm512_add_epi64( s8b1( W8b11), H[12] );
|
||||
qt[12] = _mm512_add_epi64( s8b2( W8b12), H[13] );
|
||||
qt[13] = _mm512_add_epi64( s8b3( W8b13), H[14] );
|
||||
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
|
||||
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
|
||||
qt[16] = expand1b8( qt, M, H, 16 );
|
||||
qt[17] = expand1b8( qt, M, H, 17 );
|
||||
qt[18] = expand2b8( qt, M, H, 18 );
|
||||
qt[19] = expand2b8( qt, M, H, 19 );
|
||||
qt[20] = expand2b8( qt, M, H, 20 );
|
||||
qt[21] = expand2b8( qt, M, H, 21 );
|
||||
qt[22] = expand2b8( qt, M, H, 22 );
|
||||
qt[23] = expand2b8( qt, M, H, 23 );
|
||||
qt[24] = expand2b8( qt, M, H, 24 );
|
||||
qt[25] = expand2b8( qt, M, H, 25 );
|
||||
qt[26] = expand2b8( qt, M, H, 26 );
|
||||
qt[27] = expand2b8( qt, M, H, 27 );
|
||||
qt[28] = expand2b8( qt, M, H, 28 );
|
||||
qt[29] = expand2b8( qt, M, H, 29 );
|
||||
qt[30] = expand2b8( qt, M, H, 30 );
|
||||
qt[31] = expand2b8( qt, M, H, 31 );
|
||||
|
||||
xl = _mm512_xor_si512(
|
||||
mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||
xh = _mm512_xor_si512( xl, _mm512_xor_si512(
|
||||
mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \
|
||||
_mm512_srli_epi64( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
|
||||
#define DH1R( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_srli_epi64( xh, sl ), \
|
||||
_mm512_slli_epi64( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
|
||||
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||
_mm512_add_epi64( _mm512_add_epi64( \
|
||||
mm512_rol_64( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
_mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
|
||||
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||
_mm512_add_epi64( _mm512_add_epi64( \
|
||||
mm512_rol_64( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
_mm512_xor_si512( _mm512_srli_epi64( xl, sr ), \
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
|
||||
|
||||
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
|
||||
dH[ 2] = DH1R( 2, 5, 5, 18, 26, 2 );
|
||||
dH[ 3] = DH1R( 3, 1, 5, 19, 27, 3 );
|
||||
dH[ 4] = DH1R( 4, 3, 0, 20, 28, 4 );
|
||||
dH[ 5] = DH1L( 5, 6, 6, 21, 29, 5 );
|
||||
dH[ 6] = DH1R( 6, 4, 6, 22, 30, 6 );
|
||||
dH[ 7] = DH1R( 7, 11, 2, 23, 31, 7 );
|
||||
dH[ 8] = DH2L( 8, 9, 8, 4, 24, 23, 8 );
|
||||
dH[ 9] = DH2R( 9, 10, 6, 5, 25, 16, 9 );
|
||||
dH[10] = DH2L( 10, 11, 6, 6, 26, 17, 10 );
|
||||
dH[11] = DH2L( 11, 12, 4, 7, 27, 18, 11 );
|
||||
dH[12] = DH2R( 12, 13, 3, 0, 28, 19, 12 );
|
||||
dH[13] = DH2R( 13, 14, 4, 1, 29, 20, 13 );
|
||||
dH[14] = DH2R( 14, 15, 7, 2, 30, 21, 14 );
|
||||
dH[15] = DH2R( 15, 16, 2, 3, 31, 22, 15 );
|
||||
|
||||
#undef DH1L
|
||||
#undef DH1R
|
||||
#undef DH2L
|
||||
#undef DH2R
|
||||
|
||||
}
|
||||
|
||||
static const __m512i final_b8[16] =
|
||||
{
|
||||
{ 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
|
||||
0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
|
||||
0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
|
||||
0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 },
|
||||
{ 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
|
||||
0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
|
||||
0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
|
||||
0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 },
|
||||
{ 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
|
||||
0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
|
||||
0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
|
||||
0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 },
|
||||
{ 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
|
||||
0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
|
||||
0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
|
||||
0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 },
|
||||
{ 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
|
||||
0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
|
||||
0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
|
||||
0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 },
|
||||
{ 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
|
||||
0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
|
||||
0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
|
||||
0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 },
|
||||
{ 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
|
||||
0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
|
||||
0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
|
||||
0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 },
|
||||
{ 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
|
||||
0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
|
||||
0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
|
||||
0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 },
|
||||
{ 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
|
||||
0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
|
||||
0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
|
||||
0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8 },
|
||||
{ 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
|
||||
0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
|
||||
0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
|
||||
0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9 },
|
||||
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
|
||||
0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
|
||||
0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
|
||||
0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
|
||||
{ 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
|
||||
0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
|
||||
0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
|
||||
0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab },
|
||||
{ 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
|
||||
0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
|
||||
0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
|
||||
0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac },
|
||||
{ 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
|
||||
0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
|
||||
0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
|
||||
0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad },
|
||||
{ 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
|
||||
0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
|
||||
0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
|
||||
0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae },
|
||||
{ 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
|
||||
0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
|
||||
0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
|
||||
0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf }
|
||||
};
|
||||
|
||||
|
||||
void bmw512_8way_init( bmw512_8way_context *ctx )
|
||||
//bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
|
||||
{
|
||||
ctx->H[ 0] = m512_const1_64( 0x8081828384858687 );
|
||||
ctx->H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F );
|
||||
ctx->H[ 2] = m512_const1_64( 0x9091929394959697 );
|
||||
ctx->H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F );
|
||||
ctx->H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 );
|
||||
ctx->H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF );
|
||||
ctx->H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 );
|
||||
ctx->H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF );
|
||||
ctx->H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 );
|
||||
ctx->H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF );
|
||||
ctx->H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 );
|
||||
ctx->H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF );
|
||||
ctx->H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 );
|
||||
ctx->H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF );
|
||||
ctx->H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 );
|
||||
ctx->H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF );
|
||||
ctx->ptr = 0;
|
||||
ctx->bit_count = 0;
|
||||
}
|
||||
|
||||
void bmw512_8way_update( bmw512_8way_context *ctx, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
__m512i *buf;
|
||||
__m512i htmp[16];
|
||||
__m512i *h1, *h2;
|
||||
size_t ptr;
|
||||
const int buf_size = 128; // bytes of one lane, compatible with len
|
||||
|
||||
ctx->bit_count += len << 3;
|
||||
buf = ctx->buf;
|
||||
ptr = ctx->ptr;
|
||||
h1 = ctx->H;
|
||||
h2 = htmp;
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_512( buf + (ptr>>3), vdata, clen >> 3 );
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
ptr += clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
__m512i *ht;
|
||||
compress_big_8way( buf, h1, h2 );
|
||||
ht = h1;
|
||||
h1 = h2;
|
||||
h2 = ht;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
ctx->ptr = ptr;
|
||||
if ( h1 != ctx->H )
|
||||
memcpy_512( ctx->H, h1, 16 );
|
||||
}
|
||||
|
||||
void bmw512_8way_close( bmw512_8way_context *ctx, void *dst )
|
||||
{
|
||||
__m512i *buf;
|
||||
__m512i h1[16], h2[16], *h;
|
||||
size_t ptr, u, v;
|
||||
const int buf_size = 128; // bytes of one lane, compatible with len
|
||||
|
||||
buf = ctx->buf;
|
||||
ptr = ctx->ptr;
|
||||
buf[ ptr>>3 ] = m512_const1_64( 0x80 );
|
||||
ptr += 8;
|
||||
h = ctx->H;
|
||||
|
||||
if ( ptr > (buf_size - 8) )
|
||||
{
|
||||
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
|
||||
compress_big_8way( buf, h, h1 );
|
||||
ptr = 0;
|
||||
h = h1;
|
||||
}
|
||||
memset_zero_512( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
|
||||
buf[ (buf_size - 8) >> 3 ] = _mm512_set1_epi64( ctx->bit_count );
|
||||
compress_big_8way( buf, h, h2 );
|
||||
for ( u = 0; u < 16; u ++ )
|
||||
buf[ u ] = h2[ u ];
|
||||
compress_big_8way( buf, final_b8, h1 );
|
||||
for (u = 0, v = 8; u < 8; u ++, v ++)
|
||||
casti_m512i( dst, u ) = h1[ v ];
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -363,7 +363,6 @@ bool register_cryptolight_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_cryptolight;
|
||||
gate->hash = (void*)&cryptolight_hash;
|
||||
gate->hash_suw = (void*)&cryptolight_hash;
|
||||
gate->get_max64 = (void*)&get_max64_0x40LL;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -111,7 +111,6 @@ bool register_cryptonight_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_cryptonight;
|
||||
gate->hash = (void*)&cryptonight_hash;
|
||||
gate->hash_suw = (void*)&cryptonight_hash_suw;
|
||||
gate->get_max64 = (void*)&get_max64_0x40LL;
|
||||
return true;
|
||||
};
|
||||
|
||||
@@ -123,7 +122,6 @@ bool register_cryptonightv7_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_cryptonight;
|
||||
gate->hash = (void*)&cryptonight_hash;
|
||||
gate->hash_suw = (void*)&cryptonight_hash_suw;
|
||||
gate->get_max64 = (void*)&get_max64_0x40LL;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -7,7 +7,7 @@
|
||||
|
||||
// 2x128
|
||||
|
||||
/*
|
||||
|
||||
// The result of hashing 10 rounds of initial data which consists of params
|
||||
// zero padded.
|
||||
static const uint64_t IV256[] =
|
||||
@@ -25,7 +25,181 @@ static const uint64_t IV512[] =
|
||||
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
|
||||
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
|
||||
};
|
||||
*/
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
static void transform_4way( cube_4way_context *sp )
|
||||
{
|
||||
int r;
|
||||
const int rounds = sp->rounds;
|
||||
|
||||
__m512i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
|
||||
|
||||
x0 = _mm512_load_si512( (__m512i*)sp->h );
|
||||
x1 = _mm512_load_si512( (__m512i*)sp->h + 1 );
|
||||
x2 = _mm512_load_si512( (__m512i*)sp->h + 2 );
|
||||
x3 = _mm512_load_si512( (__m512i*)sp->h + 3 );
|
||||
x4 = _mm512_load_si512( (__m512i*)sp->h + 4 );
|
||||
x5 = _mm512_load_si512( (__m512i*)sp->h + 5 );
|
||||
x6 = _mm512_load_si512( (__m512i*)sp->h + 6 );
|
||||
x7 = _mm512_load_si512( (__m512i*)sp->h + 7 );
|
||||
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x4 = _mm512_add_epi32( x0, x4 );
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x1;
|
||||
x0 = mm512_rol_32( x2, 7 );
|
||||
x1 = mm512_rol_32( x3, 7 );
|
||||
x2 = mm512_rol_32( y0, 7 );
|
||||
x3 = mm512_rol_32( y1, 7 );
|
||||
x0 = _mm512_xor_si512( x0, x4 );
|
||||
x1 = _mm512_xor_si512( x1, x5 );
|
||||
x2 = _mm512_xor_si512( x2, x6 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
x4 = mm512_swap64_128( x4 );
|
||||
x5 = mm512_swap64_128( x5 );
|
||||
x6 = mm512_swap64_128( x6 );
|
||||
x7 = mm512_swap64_128( x7 );
|
||||
x4 = _mm512_add_epi32( x0, x4 );
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x2;
|
||||
x0 = mm512_rol_32( x1, 11 );
|
||||
x1 = mm512_rol_32( y0, 11 );
|
||||
x2 = mm512_rol_32( x3, 11 );
|
||||
x3 = mm512_rol_32( y1, 11 );
|
||||
x0 = _mm512_xor_si512( x0, x4 );
|
||||
x1 = _mm512_xor_si512( x1, x5 );
|
||||
x2 = _mm512_xor_si512( x2, x6 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
x4 = mm512_swap32_64( x4 );
|
||||
x5 = mm512_swap32_64( x5 );
|
||||
x6 = mm512_swap32_64( x6 );
|
||||
x7 = mm512_swap32_64( x7 );
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)sp->h, x0 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 1, x1 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 2, x2 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 3, x3 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 4, x4 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 5, x5 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 6, x6 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 7, x7 );
|
||||
}
|
||||
|
||||
int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
|
||||
int blockbytes )
|
||||
{
|
||||
__m512i *h = (__m512i*)sp->h;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = blockbytes/16;
|
||||
sp->rounds = rounds;
|
||||
sp->pos = 0;
|
||||
|
||||
h[ 0] = m512_const1_128( iv[0] );
|
||||
h[ 1] = m512_const1_128( iv[1] );
|
||||
h[ 2] = m512_const1_128( iv[2] );
|
||||
h[ 3] = m512_const1_128( iv[3] );
|
||||
h[ 4] = m512_const1_128( iv[4] );
|
||||
h[ 5] = m512_const1_128( iv[5] );
|
||||
h[ 6] = m512_const1_128( iv[6] );
|
||||
h[ 7] = m512_const1_128( iv[7] );
|
||||
h[ 0] = m512_const1_128( iv[0] );
|
||||
h[ 1] = m512_const1_128( iv[1] );
|
||||
h[ 2] = m512_const1_128( iv[2] );
|
||||
h[ 3] = m512_const1_128( iv[3] );
|
||||
h[ 4] = m512_const1_128( iv[4] );
|
||||
h[ 5] = m512_const1_128( iv[5] );
|
||||
h[ 6] = m512_const1_128( iv[6] );
|
||||
h[ 7] = m512_const1_128( iv[7] );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size )
|
||||
{
|
||||
const int len = size >> 4;
|
||||
const __m512i *in = (__m512i*)data;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_4way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_close( cube_4way_context *sp, void *output )
|
||||
{
|
||||
__m512i *hash = (__m512i*)output;
|
||||
int i;
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
|
||||
m512_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_4way( sp );
|
||||
|
||||
sp->h[7] = _mm512_xor_si512( sp->h[7],
|
||||
m512_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_4way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<6 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size )
|
||||
{
|
||||
const int len = size >> 4;
|
||||
const __m512i *in = (__m512i*)data;
|
||||
__m512i *hash = (__m512i*)output;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_4way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
|
||||
m512_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_4way( sp );
|
||||
|
||||
sp->h[7] = _mm512_xor_si512( sp->h[7],
|
||||
m512_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_4way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
static void transform_2way( cube_2way_context *sp )
|
||||
{
|
||||
@@ -91,45 +265,35 @@ static void transform_2way( cube_2way_context *sp )
|
||||
_mm256_store_si256( (__m256i*)sp->h + 5, x5 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 6, x6 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 7, x7 );
|
||||
|
||||
}
|
||||
|
||||
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
|
||||
int blockbytes )
|
||||
{
|
||||
__m128i* h = (__m128i*)sp->h;
|
||||
__m256i *h = (__m256i*)sp->h;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = blockbytes/16;
|
||||
sp->rounds = rounds;
|
||||
sp->pos = 0;
|
||||
|
||||
if ( hashbitlen == 512 )
|
||||
{
|
||||
|
||||
h[ 0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
h[ 2] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
h[ 4] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
h[ 6] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
h[ 8] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
h[10] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
h[12] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
h[14] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
h[1] = h[ 0]; h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6];
|
||||
h[9] = h[ 8]; h[11] = h[10]; h[13] = h[12]; h[15] = h[14];
|
||||
}
|
||||
else
|
||||
{
|
||||
h[ 0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
h[ 2] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
h[ 4] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
h[ 6] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
h[ 8] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
h[10] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
h[12] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
h[14] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
h[1] = h[ 0]; h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6];
|
||||
h[9] = h[ 8]; h[11] = h[10]; h[13] = h[12]; h[15] = h[14];
|
||||
}
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -141,9 +305,6 @@ int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
|
||||
const __m256i *in = (__m256i*)data;
|
||||
int i;
|
||||
|
||||
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
|
||||
// Current usage sata is either 64 or 80 bytes.
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
|
||||
@@ -164,11 +325,11 @@ int cube_2way_close( cube_2way_context *sp, void *output )
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
_mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ) );
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
_mm256_set_epi32( 1,0,0,0, 1,0,0,0 ) );
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
|
||||
@@ -197,13 +358,13 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
_mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ) );
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
|
||||
1,0,0,0 ) );
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
||||
return 0;
|
||||
|
203
algo/cubehash/cube-hash-2way.c.save
Normal file
203
algo/cubehash/cube-hash-2way.c.save
Normal file
@@ -0,0 +1,203 @@
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <unistd.h>
|
||||
#include <memory.h>
|
||||
#include "cube-hash-2way.h"
|
||||
|
||||
// 2x128
|
||||
|
||||
|
||||
// The result of hashing 10 rounds of initial data which consists of params
|
||||
// zero padded.
|
||||
static const uint64_t IV256[] =
|
||||
{
|
||||
0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
|
||||
0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
|
||||
0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
|
||||
0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
|
||||
};
|
||||
|
||||
static const uint64_t IV512[] =
|
||||
{
|
||||
0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
|
||||
0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
|
||||
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
|
||||
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
|
||||
};
|
||||
|
||||
|
||||
static void transform_2way( cube_2way_context *sp )
|
||||
{
|
||||
int r;
|
||||
const int rounds = sp->rounds;
|
||||
|
||||
__m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
|
||||
|
||||
x0 = _mm256_load_si256( (__m256i*)sp->h );
|
||||
x1 = _mm256_load_si256( (__m256i*)sp->h + 1 );
|
||||
x2 = _mm256_load_si256( (__m256i*)sp->h + 2 );
|
||||
x3 = _mm256_load_si256( (__m256i*)sp->h + 3 );
|
||||
x4 = _mm256_load_si256( (__m256i*)sp->h + 4 );
|
||||
x5 = _mm256_load_si256( (__m256i*)sp->h + 5 );
|
||||
x6 = _mm256_load_si256( (__m256i*)sp->h + 6 );
|
||||
x7 = _mm256_load_si256( (__m256i*)sp->h + 7 );
|
||||
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x4 = _mm256_add_epi32( x0, x4 );
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x1;
|
||||
x0 = mm256_rol_32( x2, 7 );
|
||||
x1 = mm256_rol_32( x3, 7 );
|
||||
x2 = mm256_rol_32( y0, 7 );
|
||||
x3 = mm256_rol_32( y1, 7 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = mm256_swap64_128( x4 );
|
||||
x5 = mm256_swap64_128( x5 );
|
||||
x6 = mm256_swap64_128( x6 );
|
||||
x7 = mm256_swap64_128( x7 );
|
||||
x4 = _mm256_add_epi32( x0, x4 );
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x2;
|
||||
x0 = mm256_rol_32( x1, 11 );
|
||||
x1 = mm256_rol_32( y0, 11 );
|
||||
x2 = mm256_rol_32( x3, 11 );
|
||||
x3 = mm256_rol_32( y1, 11 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = mm256_swap32_64( x4 );
|
||||
x5 = mm256_swap32_64( x5 );
|
||||
x6 = mm256_swap32_64( x6 );
|
||||
x7 = mm256_swap32_64( x7 );
|
||||
}
|
||||
|
||||
_mm256_store_si256( (__m256i*)sp->h, x0 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 1, x1 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 2, x2 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 3, x3 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 4, x4 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 5, x5 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 6, x6 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 7, x7 );
|
||||
|
||||
}
|
||||
|
||||
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
|
||||
int blockbytes )
|
||||
{
|
||||
__m256i *h = (__m256i*)sp->h;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = blockbytes/16;
|
||||
sp->rounds = rounds;
|
||||
sp->pos = 0;
|
||||
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
|
||||
{
|
||||
const int len = size >> 4;
|
||||
const __m256i *in = (__m256i*)data;
|
||||
int i;
|
||||
|
||||
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
|
||||
// Current usage sata is either 64 or 80 bytes.
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_2way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_2way_close( cube_2way_context *sp, void *output )
|
||||
{
|
||||
__m256i *hash = (__m256i*)output;
|
||||
int i;
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_2way_update_close( cube_2way_context *sp, void *output,
|
||||
const void *data, size_t size )
|
||||
{
|
||||
const int len = size >> 4;
|
||||
const __m256i *in = (__m256i*)data;
|
||||
__m256i *hash = (__m256i*)output;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_2way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,11 +1,38 @@
|
||||
#ifndef CUBE_HASH_2WAY_H__
|
||||
#define CUBE_HASH_2WAY_H__
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define CUBE_HASH_2WAY_H__ 1
|
||||
|
||||
#include <stdint.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
struct _cube_4way_context
|
||||
{
|
||||
__m512i h[8];
|
||||
int hashlen;
|
||||
int rounds;
|
||||
int blocksize;
|
||||
int pos;
|
||||
} __attribute__ ((aligned (128)));
|
||||
|
||||
typedef struct _cube_4way_context cube_4way_context;
|
||||
|
||||
int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
|
||||
int blockbytes );
|
||||
// reinitialize context with same parameters, much faster.
|
||||
int cube_4way_reinit( cube_4way_context *sp );
|
||||
|
||||
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
|
||||
|
||||
int cube_4way_close( cube_4way_context *sp, void *output );
|
||||
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size );
|
||||
|
||||
#endif
|
||||
|
||||
// 2x128, 2 way parallel SSE2
|
||||
|
||||
struct _cube_2way_context
|
||||
@@ -15,7 +42,7 @@ struct _cube_2way_context
|
||||
int rounds;
|
||||
int blocksize; // __m128i
|
||||
int pos; // number of __m128i read into x from current block
|
||||
} __attribute__ ((aligned (64)));
|
||||
} __attribute__ ((aligned (128)));
|
||||
|
||||
typedef struct _cube_2way_context cube_2way_context;
|
||||
|
||||
|
36
algo/cubehash/cube-hash-2way.h.save
Normal file
36
algo/cubehash/cube-hash-2way.h.save
Normal file
@@ -0,0 +1,36 @@
|
||||
#ifndef CUBE_HASH_2WAY_H__
|
||||
#define CUBE_HASH_2WAY_H__
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include <stdint.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
// 2x128, 2 way parallel SSE2
|
||||
|
||||
struct _cube_2way_context
|
||||
{
|
||||
__m256i h[8];
|
||||
int hashlen; // __m128i
|
||||
int rounds;
|
||||
int blocksize; // __m128i
|
||||
int pos; // number of __m128i read into x from current block
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
typedef struct _cube_2way_context cube_2way_context;
|
||||
|
||||
int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
|
||||
int blockbytes );
|
||||
// reinitialize context with same parameters, much faster.
|
||||
int cube_2way_reinit( cube_2way_context *sp );
|
||||
|
||||
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );
|
||||
|
||||
int cube_2way_close( cube_2way_context *sp, void *output );
|
||||
|
||||
int cube_2way_update_close( cube_2way_context *sp, void *output,
|
||||
const void *data, size_t size );
|
||||
|
||||
|
||||
#endif
|
||||
#endif
|
@@ -100,7 +100,6 @@ bool register_dmd_gr_algo( algo_gate_t* gate )
|
||||
gate->optimizations = SSE2_OPT | AES_OPT;
|
||||
gate->scanhash = (void*)&scanhash_groestl;
|
||||
gate->hash = (void*)&groestlhash;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
@@ -88,15 +88,3 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
/*
|
||||
bool register_myriad_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT;
|
||||
init_myrgr_ctx();
|
||||
gate->scanhash = (void*)&scanhash_myriad;
|
||||
gate->hash = (void*)&myriadhash;
|
||||
// gate->hash_alt = (void*)&myriadhash;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
*/
|
||||
|
@@ -12,7 +12,6 @@ bool register_myriad_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&myriad_hash;
|
||||
#endif
|
||||
gate->optimizations = AES_OPT | AVX2_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -32,8 +32,6 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
//#include "miner.h"
|
||||
#include "hamsi-hash-4way.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
@@ -100,7 +98,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
//#include "hamsi-helper-4way.c"
|
||||
|
||||
/*
|
||||
static const sph_u32 IV512[] = {
|
||||
SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
|
||||
SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
|
||||
@@ -109,7 +107,7 @@ static const sph_u32 IV512[] = {
|
||||
SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
|
||||
SPH_C32(0x6769756d)
|
||||
};
|
||||
|
||||
*/
|
||||
static const sph_u32 alpha_n[] = {
|
||||
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
|
||||
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
|
||||
@@ -138,6 +136,7 @@ static const sph_u32 alpha_f[] = {
|
||||
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
|
||||
};
|
||||
|
||||
|
||||
// imported from hamsi helper
|
||||
|
||||
/* Note: this table lists bits within each byte from least
|
||||
@@ -529,49 +528,34 @@ static const sph_u32 T512[64][16] = {
|
||||
SPH_C32(0xe7e00a94) }
|
||||
};
|
||||
|
||||
|
||||
#define INPUT_BIG \
|
||||
do { \
|
||||
const __m256i zero = _mm256_setzero_si256(); \
|
||||
__m256i db = *buf; \
|
||||
const sph_u32 *tp = &T512[0][0]; \
|
||||
m0 = zero; \
|
||||
m1 = zero; \
|
||||
m2 = zero; \
|
||||
m3 = zero; \
|
||||
m4 = zero; \
|
||||
m5 = zero; \
|
||||
m6 = zero; \
|
||||
m7 = zero; \
|
||||
const uint64_t *tp = (uint64_t*)&T512[0][0]; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \
|
||||
for ( int u = 0; u < 64; u++ ) \
|
||||
{ \
|
||||
__m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \
|
||||
dm = mm256_negate_32( _mm256_or_si256( dm, \
|
||||
_mm256_slli_epi64( dm, 32 ) ) ); \
|
||||
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0x1], tp[0x0], tp[0x1], tp[0x0], \
|
||||
tp[0x1], tp[0x0], tp[0x1], tp[0x0] ) ) ); \
|
||||
m256_const1_64( tp[0] ) ) ); \
|
||||
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0x3], tp[0x2], tp[0x3], tp[0x2], \
|
||||
tp[0x3], tp[0x2], tp[0x3], tp[0x2] ) ) ); \
|
||||
m256_const1_64( tp[1] ) ) ); \
|
||||
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0x5], tp[0x4], tp[0x5], tp[0x4], \
|
||||
tp[0x5], tp[0x4], tp[0x5], tp[0x4] ) ) ); \
|
||||
m256_const1_64( tp[2] ) ) ); \
|
||||
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0x7], tp[0x6], tp[0x7], tp[0x6], \
|
||||
tp[0x7], tp[0x6], tp[0x7], tp[0x6] ) ) ); \
|
||||
m256_const1_64( tp[3] ) ) ); \
|
||||
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0x9], tp[0x8], tp[0x9], tp[0x8], \
|
||||
tp[0x9], tp[0x8], tp[0x9], tp[0x8] ) ) ); \
|
||||
m256_const1_64( tp[4] ) ) ); \
|
||||
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0xB], tp[0xA], tp[0xB], tp[0xA], \
|
||||
tp[0xB], tp[0xA], tp[0xB], tp[0xA] ) ) ); \
|
||||
m256_const1_64( tp[5] ) ) ); \
|
||||
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0xD], tp[0xC], tp[0xD], tp[0xC], \
|
||||
tp[0xD], tp[0xC], tp[0xD], tp[0xC] ) ) ); \
|
||||
m256_const1_64( tp[6] ) ) ); \
|
||||
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
|
||||
_mm256_set_epi32( tp[0xF], tp[0xE], tp[0xF], tp[0xE], \
|
||||
tp[0xF], tp[0xE], tp[0xF], tp[0xE] ) ) ); \
|
||||
tp += 0x10; \
|
||||
m256_const1_64( tp[7] ) ) ); \
|
||||
tp += 8; \
|
||||
db = _mm256_srli_epi64( db, 1 ); \
|
||||
} \
|
||||
} while (0)
|
||||
@@ -662,55 +646,39 @@ do { \
|
||||
|
||||
#define ROUND_BIG(rc, alpha) \
|
||||
do { \
|
||||
__m256i t0, t1, t2, t3; \
|
||||
s0 = _mm256_xor_si256( s0, _mm256_set_epi32( \
|
||||
alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00], \
|
||||
alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00] ) ); \
|
||||
s1 = _mm256_xor_si256( s1, _mm256_set_epi32( \
|
||||
alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02], \
|
||||
alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02] ) ); \
|
||||
s2 = _mm256_xor_si256( s2, _mm256_set_epi32( \
|
||||
alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04], \
|
||||
alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04] ) ); \
|
||||
s3 = _mm256_xor_si256( s3, _mm256_set_epi32( \
|
||||
alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06], \
|
||||
alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06] ) ); \
|
||||
s4 = _mm256_xor_si256( s4, _mm256_set_epi32( \
|
||||
alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08], \
|
||||
alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08] ) ); \
|
||||
s5 = _mm256_xor_si256( s5, _mm256_set_epi32( \
|
||||
alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A], \
|
||||
alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A] ) ); \
|
||||
s6 = _mm256_xor_si256( s6, _mm256_set_epi32( \
|
||||
alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C], \
|
||||
alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C] ) ); \
|
||||
s7 = _mm256_xor_si256( s7, _mm256_set_epi32( \
|
||||
alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E], \
|
||||
alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E] ) ); \
|
||||
s8 = _mm256_xor_si256( s8, _mm256_set_epi32( \
|
||||
alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10], \
|
||||
alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10] ) ); \
|
||||
s9 = _mm256_xor_si256( s9, _mm256_set_epi32( \
|
||||
alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12], \
|
||||
alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12] ) ); \
|
||||
sA = _mm256_xor_si256( sA, _mm256_set_epi32( \
|
||||
alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14], \
|
||||
alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14] ) ); \
|
||||
sB = _mm256_xor_si256( sB, _mm256_set_epi32( \
|
||||
alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16], \
|
||||
alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16] ) ); \
|
||||
sC = _mm256_xor_si256( sC, _mm256_set_epi32( \
|
||||
alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18], \
|
||||
alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18] ) ); \
|
||||
sD = _mm256_xor_si256( sD, _mm256_set_epi32( \
|
||||
alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A], \
|
||||
alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A] ) ); \
|
||||
sE = _mm256_xor_si256( sE, _mm256_set_epi32( \
|
||||
alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C], \
|
||||
alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C] ) ); \
|
||||
sF = _mm256_xor_si256( sF, _mm256_set_epi32( \
|
||||
alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E], \
|
||||
alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E] ) ); \
|
||||
__m256i t0, t1, t2, t3; \
|
||||
s0 = _mm256_xor_si256( s0, m256_const1_64( \
|
||||
( ( (uint64_t)( (rc) ^ alpha[1] ) << 32 ) ) | (uint64_t)alpha[0] ) ); \
|
||||
s1 = _mm256_xor_si256( s1, m256_const1_64( \
|
||||
( (uint64_t)alpha[ 3] << 32 ) | (uint64_t)alpha[ 2] ) ); \
|
||||
s2 = _mm256_xor_si256( s2, m256_const1_64( \
|
||||
( (uint64_t)alpha[ 5] << 32 ) | (uint64_t)alpha[ 4] ) ); \
|
||||
s3 = _mm256_xor_si256( s3, m256_const1_64( \
|
||||
( (uint64_t)alpha[ 7] << 32 ) | (uint64_t)alpha[ 6] ) ); \
|
||||
s4 = _mm256_xor_si256( s4, m256_const1_64( \
|
||||
( (uint64_t)alpha[ 9] << 32 ) | (uint64_t)alpha[ 8] ) ); \
|
||||
s5 = _mm256_xor_si256( s5, m256_const1_64( \
|
||||
( (uint64_t)alpha[11] << 32 ) | (uint64_t)alpha[10] ) ); \
|
||||
s6 = _mm256_xor_si256( s6, m256_const1_64( \
|
||||
( (uint64_t)alpha[13] << 32 ) | (uint64_t)alpha[12] ) ); \
|
||||
s7 = _mm256_xor_si256( s7, m256_const1_64( \
|
||||
( (uint64_t)alpha[15] << 32 ) | (uint64_t)alpha[14] ) ); \
|
||||
s8 = _mm256_xor_si256( s8, m256_const1_64( \
|
||||
( (uint64_t)alpha[17] << 32 ) | (uint64_t)alpha[16] ) ); \
|
||||
s9 = _mm256_xor_si256( s9, m256_const1_64( \
|
||||
( (uint64_t)alpha[19] << 32 ) | (uint64_t)alpha[18] ) ); \
|
||||
sA = _mm256_xor_si256( sA, m256_const1_64( \
|
||||
( (uint64_t)alpha[21] << 32 ) | (uint64_t)alpha[20] ) ); \
|
||||
sB = _mm256_xor_si256( sB, m256_const1_64( \
|
||||
( (uint64_t)alpha[23] << 32 ) | (uint64_t)alpha[22] ) ); \
|
||||
sC = _mm256_xor_si256( sC, m256_const1_64( \
|
||||
( (uint64_t)alpha[25] << 32 ) | (uint64_t)alpha[24] ) ); \
|
||||
sD = _mm256_xor_si256( sD, m256_const1_64( \
|
||||
( (uint64_t)alpha[27] << 32 ) | (uint64_t)alpha[26] ) ); \
|
||||
sE = _mm256_xor_si256( sE, m256_const1_64( \
|
||||
( (uint64_t)alpha[29] << 32 ) | (uint64_t)alpha[28] ) ); \
|
||||
sF = _mm256_xor_si256( sF, m256_const1_64( \
|
||||
( (uint64_t)alpha[31] << 32 ) | (uint64_t)alpha[30] ) ); \
|
||||
\
|
||||
SBOX( s0, s4, s8, sC ); \
|
||||
SBOX( s1, s5, s9, sD ); \
|
||||
@@ -864,47 +832,22 @@ void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
|
||||
void hamsi512_4way_init( hamsi_4way_big_context *sc )
|
||||
{
|
||||
sc->partial_len = 0;
|
||||
sph_u32 lo, hi;
|
||||
sc->count_high = sc->count_low = 0;
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
{
|
||||
lo = 2*i;
|
||||
hi = 2*i + 1;
|
||||
sc->h[i] = _mm256_set_epi32( IV512[hi], IV512[lo], IV512[hi], IV512[lo],
|
||||
IV512[hi], IV512[lo], IV512[hi], IV512[lo] );
|
||||
}
|
||||
|
||||
sc->h[0] = m256_const1_64( 0x6c70617273746565 );
|
||||
sc->h[1] = m256_const1_64( 0x656e62656b204172 );
|
||||
sc->h[2] = m256_const1_64( 0x302c206272672031 );
|
||||
sc->h[3] = m256_const1_64( 0x3434362c75732032 );
|
||||
sc->h[4] = m256_const1_64( 0x3030312020422d33 );
|
||||
sc->h[5] = m256_const1_64( 0x656e2d484c657576 );
|
||||
sc->h[6] = m256_const1_64( 0x6c65652c65766572 );
|
||||
sc->h[7] = m256_const1_64( 0x6769756d2042656c );
|
||||
}
|
||||
|
||||
void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
|
||||
// It looks like the only way to get in here is if core was previously called
|
||||
// with a very small len
|
||||
// That's not likely even with 80 byte input so deprecate partial len
|
||||
/*
|
||||
if ( sc->partial_len != 0 )
|
||||
{
|
||||
size_t mlen;
|
||||
|
||||
mlen = 8 - sc->partial_len;
|
||||
if ( len < mlen )
|
||||
{
|
||||
memcpy_256( sc->partial + (sc->partial_len >> 3), data, len>>3 );
|
||||
sc->partial_len += len;
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy_256( sc->partial + (sc->partial_len >> 3), data, mlen>>3 );
|
||||
len -= mlen;
|
||||
vdata += mlen>>3;
|
||||
hamsi_big( sc, sc->partial, 1 );
|
||||
sc->partial_len = 0;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
hamsi_big( sc, vdata, len>>3 );
|
||||
vdata += ( (len& ~(size_t)7) >> 3 );
|
||||
len &= (size_t)7;
|
||||
@@ -920,8 +863,9 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
|
||||
sph_enc32be( &ch, sc->count_high );
|
||||
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
|
||||
pad[0] = _mm256_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch );
|
||||
sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL,
|
||||
0UL, 0x80UL, 0UL, 0x80UL );
|
||||
sc->buf[0] = m256_const1_64( 0x80 );
|
||||
// sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL,
|
||||
// 0UL, 0x80UL, 0UL, 0x80UL );
|
||||
hamsi_big( sc, sc->buf, 1 );
|
||||
hamsi_big_final( sc, pad );
|
||||
|
||||
|
@@ -92,9 +92,41 @@ extern "C"{
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define Sb_8W(x0, x1, x2, x3, c) \
|
||||
do { \
|
||||
__m512i cc = _mm512_set1_epi64( c ); \
|
||||
x3 = mm512_not( x3 ); \
|
||||
x0 = _mm512_xor_si512( x0, _mm512_andnot_si512( x2, cc ) ); \
|
||||
tmp = _mm512_xor_si512( cc, _mm512_and_si512( x0, x1 ) ); \
|
||||
x0 = _mm512_xor_si512( x0, _mm512_and_si512( x2, x3 ) ); \
|
||||
x3 = _mm512_xor_si512( x3, _mm512_andnot_si512( x1, x2 ) ); \
|
||||
x1 = _mm512_xor_si512( x1, _mm512_and_si512( x0, x2 ) ); \
|
||||
x2 = _mm512_xor_si512( x2, _mm512_andnot_si512( x3, x0 ) ); \
|
||||
x0 = _mm512_xor_si512( x0, _mm512_or_si512( x1, x3 ) ); \
|
||||
x3 = _mm512_xor_si512( x3, _mm512_and_si512( x1, x2 ) ); \
|
||||
x1 = _mm512_xor_si512( x1, _mm512_and_si512( tmp, x0 ) ); \
|
||||
x2 = _mm512_xor_si512( x2, tmp ); \
|
||||
} while (0)
|
||||
|
||||
#define Lb_8W(x0, x1, x2, x3, x4, x5, x6, x7) \
|
||||
do { \
|
||||
x4 = _mm512_xor_si512( x4, x1 ); \
|
||||
x5 = _mm512_xor_si512( x5, x2 ); \
|
||||
x6 = _mm512_xor_si512( x6, _mm512_xor_si512( x3, x0 ) ); \
|
||||
x7 = _mm512_xor_si512( x7, x0 ); \
|
||||
x0 = _mm512_xor_si512( x0, x5 ); \
|
||||
x1 = _mm512_xor_si512( x1, x6 ); \
|
||||
x2 = _mm512_xor_si512( x2, _mm512_xor_si512( x7, x4 ) ); \
|
||||
x3 = _mm512_xor_si512( x3, x4 ); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#define Sb(x0, x1, x2, x3, c) \
|
||||
do { \
|
||||
__m256i cc = _mm256_set_epi64x( c, c, c, c ); \
|
||||
__m256i cc = _mm256_set1_epi64x( c ); \
|
||||
x3 = mm256_not( x3 ); \
|
||||
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
|
||||
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
|
||||
@@ -226,6 +258,48 @@ static const sph_u64 C[] = {
|
||||
x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
|
||||
} while (0)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define S_8W(x0, x1, x2, x3, cb, r) do { \
|
||||
Sb_8W(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \
|
||||
Sb_8W(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \
|
||||
} while (0)
|
||||
|
||||
#define L_8W(x0, x1, x2, x3, x4, x5, x6, x7) do { \
|
||||
Lb_8W(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \
|
||||
x4 ## h, x5 ## h, x6 ## h, x7 ## h); \
|
||||
Lb_8W(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \
|
||||
x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
|
||||
} while (0)
|
||||
|
||||
#define Wz_8W(x, c, n) \
|
||||
do { \
|
||||
__m512i t = _mm512_slli_epi64( _mm512_and_si512(x ## h, (c)), (n) ); \
|
||||
x ## h = _mm512_or_si512( _mm512_and_si512( \
|
||||
_mm512_srli_epi64(x ## h, (n)), (c)), t ); \
|
||||
t = _mm512_slli_epi64( _mm512_and_si512(x ## l, (c)), (n) ); \
|
||||
x ## l = _mm512_or_si512( _mm512_and_si512((x ## l >> (n)), (c)), t ); \
|
||||
} while (0)
|
||||
|
||||
#define W80(x) Wz_8W(x, m512_const1_64( 0x5555555555555555 ), 1 )
|
||||
#define W81(x) Wz_8W(x, m512_const1_64( 0x3333333333333333 ), 2 )
|
||||
#define W82(x) Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
|
||||
#define W83(x) Wz_8W(x, m512_const1_64( 0x00FF00FF00FF00FF ), 8 )
|
||||
#define W84(x) Wz_8W(x, m512_const1_64( 0x0000FFFF0000FFFF ), 16 )
|
||||
#define W85(x) Wz_8W(x, m512_const1_64( 0x00000000FFFFFFFF ), 32 )
|
||||
#define W86(x) \
|
||||
do { \
|
||||
__m512i t = x ## h; \
|
||||
x ## h = x ## l; \
|
||||
x ## l = t; \
|
||||
} while (0)
|
||||
|
||||
#define DECL_STATE_8W \
|
||||
__m512i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
|
||||
__m512i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
|
||||
__m512i tmp;
|
||||
|
||||
#endif
|
||||
|
||||
#define Wz(x, c, n) \
|
||||
do { \
|
||||
@@ -236,16 +310,6 @@ do { \
|
||||
x ## l = _mm256_or_si256( _mm256_and_si256((x ## l >> (n)), (c)), t ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
/*
|
||||
#define Wz(x, c, n) do { \
|
||||
sph_u64 t = (x ## h & (c)) << (n); \
|
||||
x ## h = ((x ## h >> (n)) & (c)) | t; \
|
||||
t = (x ## l & (c)) << (n); \
|
||||
x ## l = ((x ## l >> (n)) & (c)) | t; \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
#define W0(x) Wz(x, m256_const1_64( 0x5555555555555555 ), 1 )
|
||||
#define W1(x) Wz(x, m256_const1_64( 0x3333333333333333 ), 2 )
|
||||
#define W2(x) Wz(x, m256_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
|
||||
@@ -259,25 +323,12 @@ do { \
|
||||
x ## l = t; \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
#define W0(x) Wz(x, SPH_C64(0x5555555555555555), 1)
|
||||
#define W1(x) Wz(x, SPH_C64(0x3333333333333333), 2)
|
||||
#define W2(x) Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F), 4)
|
||||
#define W3(x) Wz(x, SPH_C64(0x00FF00FF00FF00FF), 8)
|
||||
#define W4(x) Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16)
|
||||
#define W5(x) Wz(x, SPH_C64(0x00000000FFFFFFFF), 32)
|
||||
#define W6(x) do { \
|
||||
sph_u64 t = x ## h; \
|
||||
x ## h = x ## l; \
|
||||
x ## l = t; \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
#define DECL_STATE \
|
||||
__m256i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
|
||||
__m256i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
|
||||
__m256i tmp;
|
||||
|
||||
|
||||
#define READ_STATE(state) do { \
|
||||
h0h = (state)->H[ 0]; \
|
||||
h0l = (state)->H[ 1]; \
|
||||
@@ -316,6 +367,38 @@ do { \
|
||||
(state)->H[15] = h7l; \
|
||||
} while (0)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define INPUT_BUF1_8W \
|
||||
__m512i m0h = buf[0]; \
|
||||
__m512i m0l = buf[1]; \
|
||||
__m512i m1h = buf[2]; \
|
||||
__m512i m1l = buf[3]; \
|
||||
__m512i m2h = buf[4]; \
|
||||
__m512i m2l = buf[5]; \
|
||||
__m512i m3h = buf[6]; \
|
||||
__m512i m3l = buf[7]; \
|
||||
h0h = _mm512_xor_si512( h0h, m0h ); \
|
||||
h0l = _mm512_xor_si512( h0l, m0l ); \
|
||||
h1h = _mm512_xor_si512( h1h, m1h ); \
|
||||
h1l = _mm512_xor_si512( h1l, m1l ); \
|
||||
h2h = _mm512_xor_si512( h2h, m2h ); \
|
||||
h2l = _mm512_xor_si512( h2l, m2l ); \
|
||||
h3h = _mm512_xor_si512( h3h, m3h ); \
|
||||
h3l = _mm512_xor_si512( h3l, m3l ); \
|
||||
|
||||
#define INPUT_BUF2_8W \
|
||||
h4h = _mm512_xor_si512( h4h, m0h ); \
|
||||
h4l = _mm512_xor_si512( h4l, m0l ); \
|
||||
h5h = _mm512_xor_si512( h5h, m1h ); \
|
||||
h5l = _mm512_xor_si512( h5l, m1l ); \
|
||||
h6h = _mm512_xor_si512( h6h, m2h ); \
|
||||
h6l = _mm512_xor_si512( h6l, m2l ); \
|
||||
h7h = _mm512_xor_si512( h7h, m3h ); \
|
||||
h7l = _mm512_xor_si512( h7l, m3l ); \
|
||||
|
||||
#endif
|
||||
|
||||
#define INPUT_BUF1 \
|
||||
__m256i m0h = buf[0]; \
|
||||
__m256i m0l = buf[1]; \
|
||||
@@ -344,6 +427,7 @@ do { \
|
||||
h7h = _mm256_xor_si256( h7h, m3h ); \
|
||||
h7l = _mm256_xor_si256( h7l, m3l ); \
|
||||
|
||||
|
||||
static const sph_u64 IV256[] = {
|
||||
C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
|
||||
C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
|
||||
@@ -370,6 +454,22 @@ static const sph_u64 IV512[] = {
|
||||
#else
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define SL_8W(ro) SLu_8W(r + ro, ro)
|
||||
|
||||
#define SLu_8W(r, ro) do { \
|
||||
S_8W(h0, h2, h4, h6, Ceven_, r); \
|
||||
S_8W(h1, h3, h5, h7, Codd_, r); \
|
||||
L_8W(h0, h2, h4, h6, h1, h3, h5, h7); \
|
||||
W8 ## ro(h1); \
|
||||
W8 ## ro(h3); \
|
||||
W8 ## ro(h5); \
|
||||
W8 ## ro(h7); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#define SL(ro) SLu(r + ro, ro)
|
||||
@@ -393,6 +493,23 @@ static const sph_u64 IV512[] = {
|
||||
* loop.
|
||||
*/
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define E8_8W do { \
|
||||
unsigned r; \
|
||||
for (r = 0; r < 42; r += 7) { \
|
||||
SL_8W(0); \
|
||||
SL_8W(1); \
|
||||
SL_8W(2); \
|
||||
SL_8W(3); \
|
||||
SL_8W(4); \
|
||||
SL_8W(5); \
|
||||
SL_8W(6); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#define E8 do { \
|
||||
unsigned r; \
|
||||
for (r = 0; r < 42; r += 7) { \
|
||||
@@ -419,51 +536,100 @@ static const sph_u64 IV512[] = {
|
||||
* On a "true 64-bit" architecture, we can unroll at will.
|
||||
*/
|
||||
|
||||
#define E8 do { \
|
||||
SLu( 0, 0); \
|
||||
SLu( 1, 1); \
|
||||
SLu( 2, 2); \
|
||||
SLu( 3, 3); \
|
||||
SLu( 4, 4); \
|
||||
SLu( 5, 5); \
|
||||
SLu( 6, 6); \
|
||||
SLu( 7, 0); \
|
||||
SLu( 8, 1); \
|
||||
SLu( 9, 2); \
|
||||
SLu(10, 3); \
|
||||
SLu(11, 4); \
|
||||
SLu(12, 5); \
|
||||
SLu(13, 6); \
|
||||
SLu(14, 0); \
|
||||
SLu(15, 1); \
|
||||
SLu(16, 2); \
|
||||
SLu(17, 3); \
|
||||
SLu(18, 4); \
|
||||
SLu(19, 5); \
|
||||
SLu(20, 6); \
|
||||
SLu(21, 0); \
|
||||
SLu(22, 1); \
|
||||
SLu(23, 2); \
|
||||
SLu(24, 3); \
|
||||
SLu(25, 4); \
|
||||
SLu(26, 5); \
|
||||
SLu(27, 6); \
|
||||
SLu(28, 0); \
|
||||
SLu(29, 1); \
|
||||
SLu(30, 2); \
|
||||
SLu(31, 3); \
|
||||
SLu(32, 4); \
|
||||
SLu(33, 5); \
|
||||
SLu(34, 6); \
|
||||
SLu(35, 0); \
|
||||
SLu(36, 1); \
|
||||
SLu(37, 2); \
|
||||
SLu(38, 3); \
|
||||
SLu(39, 4); \
|
||||
SLu(40, 5); \
|
||||
SLu(41, 6); \
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define E8_8W do { \
|
||||
SLu_8W( 0, 0); \
|
||||
SLu_8W( 1, 1); \
|
||||
SLu_8W( 2, 2); \
|
||||
SLu_8W( 3, 3); \
|
||||
SLu_8W( 4, 4); \
|
||||
SLu_8W( 5, 5); \
|
||||
SLu_8W( 6, 6); \
|
||||
SLu_8W( 7, 0); \
|
||||
SLu_8W( 8, 1); \
|
||||
SLu_8W( 9, 2); \
|
||||
SLu_8W(10, 3); \
|
||||
SLu_8W(11, 4); \
|
||||
SLu_8W(12, 5); \
|
||||
SLu_8W(13, 6); \
|
||||
SLu_8W(14, 0); \
|
||||
SLu_8W(15, 1); \
|
||||
SLu_8W(16, 2); \
|
||||
SLu_8W(17, 3); \
|
||||
SLu_8W(18, 4); \
|
||||
SLu_8W(19, 5); \
|
||||
SLu_8W(20, 6); \
|
||||
SLu_8W(21, 0); \
|
||||
SLu_8W(22, 1); \
|
||||
SLu_8W(23, 2); \
|
||||
SLu_8W(24, 3); \
|
||||
SLu_8W(25, 4); \
|
||||
SLu_8W(26, 5); \
|
||||
SLu_8W(27, 6); \
|
||||
SLu_8W(28, 0); \
|
||||
SLu_8W(29, 1); \
|
||||
SLu_8W(30, 2); \
|
||||
SLu_8W(31, 3); \
|
||||
SLu_8W(32, 4); \
|
||||
SLu_8W(33, 5); \
|
||||
SLu_8W(34, 6); \
|
||||
SLu_8W(35, 0); \
|
||||
SLu_8W(36, 1); \
|
||||
SLu_8W(37, 2); \
|
||||
SLu_8W(38, 3); \
|
||||
SLu_8W(39, 4); \
|
||||
SLu_8W(40, 5); \
|
||||
SLu_8W(41, 6); \
|
||||
} while (0)
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#define E8 do { \
|
||||
SLu( 0, 0); \
|
||||
SLu( 1, 1); \
|
||||
SLu( 2, 2); \
|
||||
SLu( 3, 3); \
|
||||
SLu( 4, 4); \
|
||||
SLu( 5, 5); \
|
||||
SLu( 6, 6); \
|
||||
SLu( 7, 0); \
|
||||
SLu( 8, 1); \
|
||||
SLu( 9, 2); \
|
||||
SLu(10, 3); \
|
||||
SLu(11, 4); \
|
||||
SLu(12, 5); \
|
||||
SLu(13, 6); \
|
||||
SLu(14, 0); \
|
||||
SLu(15, 1); \
|
||||
SLu(16, 2); \
|
||||
SLu(17, 3); \
|
||||
SLu(18, 4); \
|
||||
SLu(19, 5); \
|
||||
SLu(20, 6); \
|
||||
SLu(21, 0); \
|
||||
SLu(22, 1); \
|
||||
SLu(23, 2); \
|
||||
SLu(24, 3); \
|
||||
SLu(25, 4); \
|
||||
SLu(26, 5); \
|
||||
SLu(27, 6); \
|
||||
SLu(28, 0); \
|
||||
SLu(29, 1); \
|
||||
SLu(30, 2); \
|
||||
SLu(31, 3); \
|
||||
SLu(32, 4); \
|
||||
SLu(33, 5); \
|
||||
SLu(34, 6); \
|
||||
SLu(35, 0); \
|
||||
SLu(36, 1); \
|
||||
SLu(37, 2); \
|
||||
SLu(38, 3); \
|
||||
SLu(39, 4); \
|
||||
SLu(40, 5); \
|
||||
SLu(41, 6); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
|
||||
|
||||
@@ -471,6 +637,158 @@ static const sph_u64 IV512[] = {
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
void jh256_8way_init( jh_8way_context *sc )
|
||||
{
|
||||
// bswapped IV256
|
||||
sc->H[ 0] = m512_const1_64( 0xebd3202c41a398eb );
|
||||
sc->H[ 1] = m512_const1_64( 0xc145b29c7bbecd92 );
|
||||
sc->H[ 2] = m512_const1_64( 0xfac7d4609151931c );
|
||||
sc->H[ 3] = m512_const1_64( 0x038a507ed6820026 );
|
||||
sc->H[ 4] = m512_const1_64( 0x45b92677269e23a4 );
|
||||
sc->H[ 5] = m512_const1_64( 0x77941ad4481afbe0 );
|
||||
sc->H[ 6] = m512_const1_64( 0x7a176b0226abb5cd );
|
||||
sc->H[ 7] = m512_const1_64( 0xa82fff0f4224f056 );
|
||||
sc->H[ 8] = m512_const1_64( 0x754d2e7f8996a371 );
|
||||
sc->H[ 9] = m512_const1_64( 0x62e27df70849141d );
|
||||
sc->H[10] = m512_const1_64( 0x948f2476f7957627 );
|
||||
sc->H[11] = m512_const1_64( 0x6c29804757b6d587 );
|
||||
sc->H[12] = m512_const1_64( 0x6c0d8eac2d275e5c );
|
||||
sc->H[13] = m512_const1_64( 0x0f7a0557c6508451 );
|
||||
sc->H[14] = m512_const1_64( 0xea12247067d3e47b );
|
||||
sc->H[15] = m512_const1_64( 0x69d71cd313abe389 );
|
||||
sc->ptr = 0;
|
||||
sc->block_count = 0;
|
||||
}
|
||||
|
||||
void jh512_8way_init( jh_8way_context *sc )
|
||||
{
|
||||
// bswapped IV512
|
||||
sc->H[ 0] = m512_const1_64( 0x17aa003e964bd16f );
|
||||
sc->H[ 1] = m512_const1_64( 0x43d5157a052e6a63 );
|
||||
sc->H[ 2] = m512_const1_64( 0x0bef970c8d5e228a );
|
||||
sc->H[ 3] = m512_const1_64( 0x61c3b3f2591234e9 );
|
||||
sc->H[ 4] = m512_const1_64( 0x1e806f53c1a01d89 );
|
||||
sc->H[ 5] = m512_const1_64( 0x806d2bea6b05a92a );
|
||||
sc->H[ 6] = m512_const1_64( 0xa6ba7520dbcc8e58 );
|
||||
sc->H[ 7] = m512_const1_64( 0xf73bf8ba763a0fa9 );
|
||||
sc->H[ 8] = m512_const1_64( 0x694ae34105e66901 );
|
||||
sc->H[ 9] = m512_const1_64( 0x5ae66f2e8e8ab546 );
|
||||
sc->H[10] = m512_const1_64( 0x243c84c1d0a74710 );
|
||||
sc->H[11] = m512_const1_64( 0x99c15a2db1716e3b );
|
||||
sc->H[12] = m512_const1_64( 0x56f8b19decf657cf );
|
||||
sc->H[13] = m512_const1_64( 0x56b116577c8806a7 );
|
||||
sc->H[14] = m512_const1_64( 0xfb1785e6dffcc2e3 );
|
||||
sc->H[15] = m512_const1_64( 0x4bdd8ccc78465a54 );
|
||||
sc->ptr = 0;
|
||||
sc->block_count = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
jh_8way_core( jh_8way_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m512i *buf;
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
const int buf_size = 64; // 64 * _m512i
|
||||
size_t ptr;
|
||||
DECL_STATE_8W
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
|
||||
if ( len < (buf_size - ptr) )
|
||||
{
|
||||
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE(sc);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
|
||||
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata += (clen>>3);
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
INPUT_BUF1_8W;
|
||||
E8_8W;
|
||||
INPUT_BUF2_8W;
|
||||
sc->block_count ++;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE(sc);
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
|
||||
size_t out_size_w32, const void *iv )
|
||||
{
|
||||
__m512i buf[16*4];
|
||||
__m512i *dst512 = (__m512i*)dst;
|
||||
size_t numz, u;
|
||||
sph_u64 l0, l1, l0e, l1e;
|
||||
|
||||
buf[0] = m512_const1_64( 0x80ULL );
|
||||
|
||||
if ( sc->ptr == 0 )
|
||||
numz = 48;
|
||||
else
|
||||
numz = 112 - sc->ptr;
|
||||
|
||||
memset_zero_512( buf+1, (numz>>3) - 1 );
|
||||
|
||||
l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
|
||||
l1 = SPH_T64(sc->block_count >> 55);
|
||||
sph_enc64be( &l0e, l0 );
|
||||
sph_enc64be( &l1e, l1 );
|
||||
*(buf + (numz>>3) ) = _mm512_set1_epi64( l1e );
|
||||
*(buf + (numz>>3) + 1) = _mm512_set1_epi64( l0e );
|
||||
|
||||
jh_8way_core( sc, buf, numz + 16 );
|
||||
|
||||
for ( u=0; u < 8; u++ )
|
||||
buf[u] = sc->H[u+8];
|
||||
|
||||
memcpy_512( dst512, buf, 8 );
|
||||
}
|
||||
|
||||
void
|
||||
jh256_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
jh_8way_core(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
jh256_8way_close(void *cc, void *dst)
|
||||
{
|
||||
jh_8way_close(cc, 0, 0, dst, 8, IV256);
|
||||
}
|
||||
|
||||
void
|
||||
jh512_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
jh_8way_core(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
jh512_8way_close(void *cc, void *dst)
|
||||
{
|
||||
jh_8way_close(cc, 0, 0, dst, 16, IV512);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void jh256_4way_init( jh_4way_context *sc )
|
||||
{
|
||||
// bswapped IV256
|
||||
@@ -595,16 +913,8 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
|
||||
memcpy_256( dst256, buf, 8 );
|
||||
}
|
||||
|
||||
/*
|
||||
void
|
||||
jh256_4way_init(void *cc)
|
||||
{
|
||||
jhs_4way_init(cc, IV256);
|
||||
}
|
||||
*/
|
||||
|
||||
void
|
||||
jh256_4way(void *cc, const void *data, size_t len)
|
||||
jh256_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
jh_4way_core(cc, data, len);
|
||||
}
|
||||
@@ -615,16 +925,8 @@ jh256_4way_close(void *cc, void *dst)
|
||||
jh_4way_close(cc, 0, 0, dst, 8, IV256);
|
||||
}
|
||||
|
||||
/*
|
||||
void
|
||||
jh512_4way_init(void *cc)
|
||||
{
|
||||
jhb_4way_init(cc, IV512);
|
||||
}
|
||||
*/
|
||||
|
||||
void
|
||||
jh512_4way(void *cc, const void *data, size_t len)
|
||||
jh512_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
jh_4way_core(cc, data, len);
|
||||
}
|
||||
@@ -635,6 +937,7 @@ jh512_4way_close(void *cc, void *dst)
|
||||
jh_4way_close(cc, 0, 0, dst, 16, IV512);
|
||||
}
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -60,20 +60,41 @@ extern "C"{
|
||||
* can be cloned by copying the context (e.g. with a simple
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[8] __attribute__ ((aligned (64)));
|
||||
__m512i buf[8];
|
||||
__m512i H[16];
|
||||
size_t ptr;
|
||||
uint64_t block_count;
|
||||
} jh_8way_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef jh_8way_context jh256_8way_context;
|
||||
|
||||
typedef jh_8way_context jh512_8way_context;
|
||||
|
||||
void jh256_8way_init( jh_8way_context *sc);
|
||||
|
||||
void jh256_8way_update(void *cc, const void *data, size_t len);
|
||||
|
||||
void jh256_8way_close(void *cc, void *dst);
|
||||
|
||||
void jh512_8way_init( jh_8way_context *sc );
|
||||
|
||||
void jh512_8way_update(void *cc, const void *data, size_t len);
|
||||
|
||||
void jh512_8way_close(void *cc, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[8];
|
||||
__m256i H[16];
|
||||
size_t ptr;
|
||||
uint64_t block_count;
|
||||
/*
|
||||
unsigned char buf[64];
|
||||
size_t ptr;
|
||||
union {
|
||||
sph_u64 wide[16];
|
||||
} H;
|
||||
sph_u64 block_count;
|
||||
*/
|
||||
} jh_4way_context;
|
||||
} jh_4way_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef jh_4way_context jh256_4way_context;
|
||||
|
||||
@@ -81,13 +102,15 @@ typedef jh_4way_context jh512_4way_context;
|
||||
|
||||
void jh256_4way_init( jh_4way_context *sc);
|
||||
|
||||
void jh256_4way(void *cc, const void *data, size_t len);
|
||||
void jh256_4way_update(void *cc, const void *data, size_t len);
|
||||
#define jh256_4way jh256_4way_update
|
||||
|
||||
void jh256_4way_close(void *cc, void *dst);
|
||||
|
||||
void jh512_4way_init( jh_4way_context *sc );
|
||||
|
||||
void jh512_4way(void *cc, const void *data, size_t len);
|
||||
void jh512_4way_update(void *cc, const void *data, size_t len);
|
||||
#define jh512_4way jh512_4way_update
|
||||
|
||||
void jh512_4way_close(void *cc, void *dst);
|
||||
|
||||
@@ -95,6 +118,6 @@ void jh512_4way_close(void *cc, void *dst);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif // AVX2
|
||||
|
||||
#endif
|
||||
|
@@ -1,18 +1,68 @@
|
||||
#include "keccak-gate.h"
|
||||
|
||||
#ifdef KECCAK_4WAY
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "sph_keccak.h"
|
||||
#include "keccak-hash-4way.h"
|
||||
|
||||
#if defined(KECCAK_8WAY)
|
||||
|
||||
void keccakhash_8way(void *state, const void *input)
|
||||
{
|
||||
keccak256_8way_context ctx;
|
||||
keccak256_8way_init( &ctx );
|
||||
keccak256_8way_update( &ctx, input, 80 );
|
||||
keccak256_8way_close( &ctx, state );
|
||||
}
|
||||
|
||||
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[16*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
keccakhash_8way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane<<1 ] < Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(KECCAK_4WAY)
|
||||
|
||||
void keccakhash_4way(void *state, const void *input)
|
||||
{
|
||||
keccak256_4way_context ctx;
|
||||
keccak256_4way_init( &ctx );
|
||||
keccak256_4way( &ctx, input, 80 );
|
||||
keccak256_4way_update( &ctx, input, 80 );
|
||||
keccak256_4way_close( &ctx, state );
|
||||
}
|
||||
|
||||
@@ -28,8 +78,8 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
// const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
do {
|
||||
@@ -39,7 +89,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
|
||||
keccakhash_4way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 )
|
||||
if ( hash7[ lane<<1 ] < Htarg )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
|
@@ -1,35 +1,38 @@
|
||||
#include "keccak-gate.h"
|
||||
|
||||
int64_t keccak_get_max64() { return 0x7ffffLL; }
|
||||
|
||||
bool register_keccak_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
gate->get_max64 = (void*)&keccak_get_max64;
|
||||
opt_target_factor = 128.0;
|
||||
#if defined (KECCAK_4WAY)
|
||||
#if defined (KECCAK_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_keccak_8way;
|
||||
gate->hash = (void*)&keccakhash_8way;
|
||||
#elif defined (KECCAK_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_keccak_4way;
|
||||
gate->hash = (void*)&keccakhash_4way;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_keccak;
|
||||
gate->hash = (void*)&keccakhash;
|
||||
gate->scanhash = (void*)&scanhash_keccak;
|
||||
gate->hash = (void*)&keccakhash;
|
||||
#endif
|
||||
return true;
|
||||
};
|
||||
|
||||
bool register_keccakc_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT;
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
|
||||
gate->get_max64 = (void*)&keccak_get_max64;
|
||||
opt_target_factor = 256.0;
|
||||
#if defined (KECCAK_4WAY)
|
||||
#if defined (KECCAK_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_keccak_8way;
|
||||
gate->hash = (void*)&keccakhash_8way;
|
||||
#elif defined (KECCAK_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_keccak_4way;
|
||||
gate->hash = (void*)&keccakhash_4way;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_keccak;
|
||||
gate->hash = (void*)&keccakhash;
|
||||
gate->scanhash = (void*)&scanhash_keccak;
|
||||
gate->hash = (void*)&keccakhash;
|
||||
#endif
|
||||
return true;
|
||||
};
|
||||
|
@@ -1,23 +1,33 @@
|
||||
#ifndef KECCAK_GATE_H__
|
||||
#define KECCAK_GATE_H__
|
||||
#define KECCAK_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define KECCAK_4WAY
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define KECCAK_8WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define KECCAK_4WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(KECCAK_4WAY)
|
||||
#if defined(KECCAK_8WAY)
|
||||
|
||||
void keccakhash_8way( void *state, const void *input );
|
||||
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(KECCAK_4WAY)
|
||||
|
||||
void keccakhash_4way( void *state, const void *input );
|
||||
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
#else
|
||||
|
||||
void keccakhash( void *state, const void *input );
|
||||
int scanhash_keccak( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@@ -1,23 +1,24 @@
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include "keccak-hash-4way.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
static const sph_u64 RC[] = {
|
||||
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
|
||||
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
|
||||
SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
|
||||
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
|
||||
SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
|
||||
SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
|
||||
SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
|
||||
SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
|
||||
SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
|
||||
SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
|
||||
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
|
||||
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
|
||||
static const uint64_t RC[] = {
|
||||
0x0000000000000001, 0x0000000000008082,
|
||||
0x800000000000808A, 0x8000000080008000,
|
||||
0x000000000000808B, 0x0000000080000001,
|
||||
0x8000000080008081, 0x8000000000008009,
|
||||
0x000000000000008A, 0x0000000000000088,
|
||||
0x0000000080008009, 0x000000008000000A,
|
||||
0x000000008000808B, 0x800000000000008B,
|
||||
0x8000000000008089, 0x8000000000008003,
|
||||
0x8000000000008002, 0x8000000000000080,
|
||||
0x000000000000800A, 0x800000008000000A,
|
||||
0x8000000080008081, 0x8000000000008080,
|
||||
0x0000000080000001, 0x8000000080008008
|
||||
};
|
||||
|
||||
// generic macros
|
||||
|
||||
#define a00 (kc->w[ 0])
|
||||
#define a10 (kc->w[ 1])
|
||||
#define a20 (kc->w[ 2])
|
||||
@@ -48,6 +49,197 @@ static const sph_u64 RC[] = {
|
||||
#define READ_STATE(sc)
|
||||
#define WRITE_STATE(sc)
|
||||
|
||||
#define MOV64(d, s) (d = s)
|
||||
#define XOR64_IOTA XOR64
|
||||
|
||||
#define LPAR (
|
||||
#define RPAR )
|
||||
|
||||
#define DO(x) x
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define INPUT_BUF(size) do { \
|
||||
size_t j; \
|
||||
for (j = 0; j < (size>>3); j++ ) \
|
||||
kc->w[j ] = _mm512_xor_si512( kc->w[j], buf[j] ); \
|
||||
} while (0)
|
||||
|
||||
// Targetted macros, keccak-macros.h is included for each target.
|
||||
|
||||
#define DECL64(x) __m512i x
|
||||
#define XOR64(d, a, b) (d = _mm512_xor_si512(a,b))
|
||||
#define AND64(d, a, b) (d = _mm512_and_si512(a,b))
|
||||
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
|
||||
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
|
||||
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
|
||||
|
||||
#include "keccak-macros.c"
|
||||
|
||||
#define KECCAK_F_1600 DO(KECCAK_F_1600_512)
|
||||
|
||||
#define KECCAK_F_1600_512 do { \
|
||||
int j; \
|
||||
for (j = 0; j < 24; j += 8) \
|
||||
{ \
|
||||
KF_ELT( 0, 1, _mm512_set1_epi64( RC[j + 0] ) ); \
|
||||
KF_ELT( 1, 2, _mm512_set1_epi64( RC[j + 1] ) ); \
|
||||
KF_ELT( 2, 3, _mm512_set1_epi64( RC[j + 2] ) ); \
|
||||
KF_ELT( 3, 4, _mm512_set1_epi64( RC[j + 3] ) ); \
|
||||
KF_ELT( 4, 5, _mm512_set1_epi64( RC[j + 4] ) ); \
|
||||
KF_ELT( 5, 6, _mm512_set1_epi64( RC[j + 5] ) ); \
|
||||
KF_ELT( 6, 7, _mm512_set1_epi64( RC[j + 6] ) ); \
|
||||
KF_ELT( 7, 8, _mm512_set1_epi64( RC[j + 7] ) ); \
|
||||
P8_TO_P0; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
static void keccak64_8way_init( keccak64_ctx_m512i *kc, unsigned out_size )
|
||||
{
|
||||
__m512i zero = m512_zero;
|
||||
__m512i neg1 = m512_neg1;
|
||||
|
||||
// Initialization for the "lane complement".
|
||||
kc->w[ 0] = zero; kc->w[ 1] = neg1;
|
||||
kc->w[ 2] = neg1; kc->w[ 3] = zero;
|
||||
kc->w[ 4] = zero; kc->w[ 5] = zero;
|
||||
kc->w[ 6] = zero; kc->w[ 7] = zero;
|
||||
kc->w[ 8] = neg1; kc->w[ 9] = zero;
|
||||
kc->w[10] = zero; kc->w[11] = zero;
|
||||
kc->w[12] = neg1; kc->w[13] = zero;
|
||||
kc->w[14] = zero; kc->w[15] = zero;
|
||||
kc->w[16] = zero; kc->w[17] = neg1;
|
||||
kc->w[18] = zero; kc->w[19] = zero;
|
||||
kc->w[20] = neg1; kc->w[21] = zero;
|
||||
kc->w[22] = zero; kc->w[23] = zero;
|
||||
kc->w[24] = zero; kc->ptr = 0;
|
||||
kc->lim = 200 - (out_size >> 2);
|
||||
}
|
||||
|
||||
static void
|
||||
keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
|
||||
size_t lim )
|
||||
{
|
||||
__m512i *buf;
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
size_t ptr;
|
||||
DECL_STATE
|
||||
|
||||
buf = kc->buf;
|
||||
ptr = kc->ptr;
|
||||
|
||||
if ( len < (lim - ptr) )
|
||||
{
|
||||
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
|
||||
kc->ptr = ptr + len;
|
||||
return;
|
||||
}
|
||||
READ_STATE( kc );
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = (lim - ptr);
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
if ( ptr == lim )
|
||||
{
|
||||
INPUT_BUF( lim );
|
||||
KECCAK_F_1600;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE( kc );
|
||||
kc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
|
||||
size_t byte_len, size_t lim )
|
||||
{
|
||||
unsigned eb;
|
||||
union {
|
||||
__m512i tmp[lim + 1];
|
||||
sph_u64 dummy; /* for alignment */
|
||||
} u;
|
||||
size_t j;
|
||||
size_t m512_len = byte_len >> 3;
|
||||
|
||||
eb = 0x100 >> 8;
|
||||
if ( kc->ptr == (lim - 8) )
|
||||
{
|
||||
const uint64_t t = eb | 0x8000000000000000;
|
||||
u.tmp[0] = m512_const1_64( t );
|
||||
j = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
j = lim - kc->ptr;
|
||||
u.tmp[0] = m512_const1_64( eb );
|
||||
memset_zero_512( u.tmp + 1, (j>>3) - 2 );
|
||||
u.tmp[ (j>>3) - 1] = m512_const1_64( 0x8000000000000000 );
|
||||
}
|
||||
keccak64_8way_core( kc, u.tmp, j, lim );
|
||||
/* Finalize the "lane complement" */
|
||||
NOT64( kc->w[ 1], kc->w[ 1] );
|
||||
NOT64( kc->w[ 2], kc->w[ 2] );
|
||||
NOT64( kc->w[ 8], kc->w[ 8] );
|
||||
NOT64( kc->w[12], kc->w[12] );
|
||||
NOT64( kc->w[17], kc->w[17] );
|
||||
NOT64( kc->w[20], kc->w[20] );
|
||||
memcpy_512( dst, kc->w, m512_len );
|
||||
}
|
||||
|
||||
void keccak256_8way_init( void *kc )
|
||||
{
|
||||
keccak64_8way_init( kc, 256 );
|
||||
}
|
||||
|
||||
void
|
||||
keccak256_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
keccak64_8way_core(cc, data, len, 136);
|
||||
}
|
||||
|
||||
void
|
||||
keccak256_8way_close(void *cc, void *dst)
|
||||
{
|
||||
keccak64_8way_close(cc, dst, 32, 136);
|
||||
}
|
||||
|
||||
void keccak512_8way_init( void *kc )
|
||||
{
|
||||
keccak64_8way_init( kc, 512 );
|
||||
}
|
||||
|
||||
void
|
||||
keccak512_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
keccak64_8way_core(cc, data, len, 72);
|
||||
}
|
||||
|
||||
void
|
||||
keccak512_8way_close(void *cc, void *dst)
|
||||
{
|
||||
keccak64_8way_close(cc, dst, 64, 72);
|
||||
}
|
||||
|
||||
#undef INPUT_BUF
|
||||
#undef DECL64
|
||||
#undef XOR64
|
||||
#undef AND64
|
||||
#undef OR64
|
||||
#undef NOT64
|
||||
#undef ROL64
|
||||
#undef KECCAK_F_1600
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#define INPUT_BUF(size) do { \
|
||||
size_t j; \
|
||||
for (j = 0; j < (size>>3); j++ ) \
|
||||
@@ -55,314 +247,28 @@ static const sph_u64 RC[] = {
|
||||
} while (0)
|
||||
|
||||
#define DECL64(x) __m256i x
|
||||
#define MOV64(d, s) (d = s)
|
||||
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
|
||||
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
|
||||
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
|
||||
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
|
||||
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
|
||||
#define XOR64_IOTA XOR64
|
||||
|
||||
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
|
||||
DECL64(tt0); \
|
||||
DECL64(tt1); \
|
||||
DECL64(tt2); \
|
||||
DECL64(tt3); \
|
||||
XOR64(tt0, d0, d1); \
|
||||
XOR64(tt1, d2, d3); \
|
||||
XOR64(tt0, tt0, d4); \
|
||||
XOR64(tt0, tt0, tt1); \
|
||||
ROL64(tt0, tt0, 1); \
|
||||
XOR64(tt2, c0, c1); \
|
||||
XOR64(tt3, c2, c3); \
|
||||
XOR64(tt0, tt0, c4); \
|
||||
XOR64(tt2, tt2, tt3); \
|
||||
XOR64(t, tt0, tt2); \
|
||||
} while (0)
|
||||
#include "keccak-macros.c"
|
||||
|
||||
#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
|
||||
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
|
||||
b40, b41, b42, b43, b44) \
|
||||
do { \
|
||||
DECL64(t0); \
|
||||
DECL64(t1); \
|
||||
DECL64(t2); \
|
||||
DECL64(t3); \
|
||||
DECL64(t4); \
|
||||
TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
|
||||
TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
|
||||
TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
|
||||
TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
|
||||
TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
|
||||
XOR64(b00, b00, t0); \
|
||||
XOR64(b01, b01, t0); \
|
||||
XOR64(b02, b02, t0); \
|
||||
XOR64(b03, b03, t0); \
|
||||
XOR64(b04, b04, t0); \
|
||||
XOR64(b10, b10, t1); \
|
||||
XOR64(b11, b11, t1); \
|
||||
XOR64(b12, b12, t1); \
|
||||
XOR64(b13, b13, t1); \
|
||||
XOR64(b14, b14, t1); \
|
||||
XOR64(b20, b20, t2); \
|
||||
XOR64(b21, b21, t2); \
|
||||
XOR64(b22, b22, t2); \
|
||||
XOR64(b23, b23, t2); \
|
||||
XOR64(b24, b24, t2); \
|
||||
XOR64(b30, b30, t3); \
|
||||
XOR64(b31, b31, t3); \
|
||||
XOR64(b32, b32, t3); \
|
||||
XOR64(b33, b33, t3); \
|
||||
XOR64(b34, b34, t3); \
|
||||
XOR64(b40, b40, t4); \
|
||||
XOR64(b41, b41, t4); \
|
||||
XOR64(b42, b42, t4); \
|
||||
XOR64(b43, b43, t4); \
|
||||
XOR64(b44, b44, t4); \
|
||||
} while (0)
|
||||
#define KECCAK_F_1600 DO(KECCAK_F_1600_256)
|
||||
|
||||
#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
|
||||
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
|
||||
b40, b41, b42, b43, b44) \
|
||||
do { \
|
||||
/* ROL64(b00, b00, 0); */ \
|
||||
ROL64(b01, b01, 36); \
|
||||
ROL64(b02, b02, 3); \
|
||||
ROL64(b03, b03, 41); \
|
||||
ROL64(b04, b04, 18); \
|
||||
ROL64(b10, b10, 1); \
|
||||
ROL64(b11, b11, 44); \
|
||||
ROL64(b12, b12, 10); \
|
||||
ROL64(b13, b13, 45); \
|
||||
ROL64(b14, b14, 2); \
|
||||
ROL64(b20, b20, 62); \
|
||||
ROL64(b21, b21, 6); \
|
||||
ROL64(b22, b22, 43); \
|
||||
ROL64(b23, b23, 15); \
|
||||
ROL64(b24, b24, 61); \
|
||||
ROL64(b30, b30, 28); \
|
||||
ROL64(b31, b31, 55); \
|
||||
ROL64(b32, b32, 25); \
|
||||
ROL64(b33, b33, 21); \
|
||||
ROL64(b34, b34, 56); \
|
||||
ROL64(b40, b40, 27); \
|
||||
ROL64(b41, b41, 20); \
|
||||
ROL64(b42, b42, 39); \
|
||||
ROL64(b43, b43, 8); \
|
||||
ROL64(b44, b44, 14); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* The KHI macro integrates the "lane complement" optimization. On input,
|
||||
* some words are complemented:
|
||||
* a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
|
||||
* On output, the following words are complemented:
|
||||
* a04 a10 a20 a22 a23 a31
|
||||
*
|
||||
* The (implicit) permutation and the theta expansion will bring back
|
||||
* the input mask for the next round.
|
||||
*/
|
||||
|
||||
#define KHI_XO(d, a, b, c) do { \
|
||||
DECL64(kt); \
|
||||
OR64(kt, b, c); \
|
||||
XOR64(d, a, kt); \
|
||||
} while (0)
|
||||
|
||||
#define KHI_XA(d, a, b, c) do { \
|
||||
DECL64(kt); \
|
||||
AND64(kt, b, c); \
|
||||
XOR64(d, a, kt); \
|
||||
} while (0)
|
||||
|
||||
#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
|
||||
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
|
||||
b40, b41, b42, b43, b44) \
|
||||
do { \
|
||||
DECL64(c0); \
|
||||
DECL64(c1); \
|
||||
DECL64(c2); \
|
||||
DECL64(c3); \
|
||||
DECL64(c4); \
|
||||
DECL64(bnn); \
|
||||
NOT64(bnn, b20); \
|
||||
KHI_XO(c0, b00, b10, b20); \
|
||||
KHI_XO(c1, b10, bnn, b30); \
|
||||
KHI_XA(c2, b20, b30, b40); \
|
||||
KHI_XO(c3, b30, b40, b00); \
|
||||
KHI_XA(c4, b40, b00, b10); \
|
||||
MOV64(b00, c0); \
|
||||
MOV64(b10, c1); \
|
||||
MOV64(b20, c2); \
|
||||
MOV64(b30, c3); \
|
||||
MOV64(b40, c4); \
|
||||
NOT64(bnn, b41); \
|
||||
KHI_XO(c0, b01, b11, b21); \
|
||||
KHI_XA(c1, b11, b21, b31); \
|
||||
KHI_XO(c2, b21, b31, bnn); \
|
||||
KHI_XO(c3, b31, b41, b01); \
|
||||
KHI_XA(c4, b41, b01, b11); \
|
||||
MOV64(b01, c0); \
|
||||
MOV64(b11, c1); \
|
||||
MOV64(b21, c2); \
|
||||
MOV64(b31, c3); \
|
||||
MOV64(b41, c4); \
|
||||
NOT64(bnn, b32); \
|
||||
KHI_XO(c0, b02, b12, b22); \
|
||||
KHI_XA(c1, b12, b22, b32); \
|
||||
KHI_XA(c2, b22, bnn, b42); \
|
||||
KHI_XO(c3, bnn, b42, b02); \
|
||||
KHI_XA(c4, b42, b02, b12); \
|
||||
MOV64(b02, c0); \
|
||||
MOV64(b12, c1); \
|
||||
MOV64(b22, c2); \
|
||||
MOV64(b32, c3); \
|
||||
MOV64(b42, c4); \
|
||||
NOT64(bnn, b33); \
|
||||
KHI_XA(c0, b03, b13, b23); \
|
||||
KHI_XO(c1, b13, b23, b33); \
|
||||
KHI_XO(c2, b23, bnn, b43); \
|
||||
KHI_XA(c3, bnn, b43, b03); \
|
||||
KHI_XO(c4, b43, b03, b13); \
|
||||
MOV64(b03, c0); \
|
||||
MOV64(b13, c1); \
|
||||
MOV64(b23, c2); \
|
||||
MOV64(b33, c3); \
|
||||
MOV64(b43, c4); \
|
||||
NOT64(bnn, b14); \
|
||||
KHI_XA(c0, b04, bnn, b24); \
|
||||
KHI_XO(c1, bnn, b24, b34); \
|
||||
KHI_XA(c2, b24, b34, b44); \
|
||||
KHI_XO(c3, b34, b44, b04); \
|
||||
KHI_XA(c4, b44, b04, b14); \
|
||||
MOV64(b04, c0); \
|
||||
MOV64(b14, c1); \
|
||||
MOV64(b24, c2); \
|
||||
MOV64(b34, c3); \
|
||||
MOV64(b44, c4); \
|
||||
} while (0)
|
||||
|
||||
#define IOTA(r) XOR64_IOTA(a00, a00, r)
|
||||
|
||||
#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
|
||||
a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
|
||||
#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
|
||||
a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
|
||||
#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
|
||||
a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
|
||||
#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
|
||||
a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
|
||||
#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
|
||||
a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
|
||||
#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
|
||||
a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
|
||||
#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
|
||||
a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
|
||||
#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
|
||||
a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
|
||||
#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
|
||||
a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
|
||||
#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
|
||||
a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
|
||||
#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
|
||||
a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
|
||||
#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
|
||||
a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
|
||||
#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
|
||||
a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
|
||||
#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
|
||||
a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
|
||||
#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
|
||||
a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
|
||||
#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
|
||||
a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
|
||||
#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
|
||||
a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
|
||||
#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
|
||||
a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
|
||||
#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
|
||||
a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
|
||||
#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
|
||||
a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
|
||||
#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
|
||||
a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
|
||||
#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
|
||||
a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
|
||||
#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
|
||||
a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
|
||||
#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
|
||||
a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
|
||||
|
||||
#define P8_TO_P0 do { \
|
||||
DECL64(t); \
|
||||
MOV64(t, a01); \
|
||||
MOV64(a01, a11); \
|
||||
MOV64(a11, a43); \
|
||||
MOV64(a43, t); \
|
||||
MOV64(t, a02); \
|
||||
MOV64(a02, a22); \
|
||||
MOV64(a22, a31); \
|
||||
MOV64(a31, t); \
|
||||
MOV64(t, a03); \
|
||||
MOV64(a03, a33); \
|
||||
MOV64(a33, a24); \
|
||||
MOV64(a24, t); \
|
||||
MOV64(t, a04); \
|
||||
MOV64(a04, a44); \
|
||||
MOV64(a44, a12); \
|
||||
MOV64(a12, t); \
|
||||
MOV64(t, a10); \
|
||||
MOV64(a10, a32); \
|
||||
MOV64(a32, a13); \
|
||||
MOV64(a13, t); \
|
||||
MOV64(t, a14); \
|
||||
MOV64(a14, a21); \
|
||||
MOV64(a21, a20); \
|
||||
MOV64(a20, t); \
|
||||
MOV64(t, a23); \
|
||||
MOV64(a23, a42); \
|
||||
MOV64(a42, a40); \
|
||||
MOV64(a40, t); \
|
||||
MOV64(t, a30); \
|
||||
MOV64(a30, a41); \
|
||||
MOV64(a41, a34); \
|
||||
MOV64(a34, t); \
|
||||
} while (0)
|
||||
|
||||
#define LPAR (
|
||||
#define RPAR )
|
||||
|
||||
#define KF_ELT(r, s, k) do { \
|
||||
THETA LPAR P ## r RPAR; \
|
||||
RHO LPAR P ## r RPAR; \
|
||||
KHI LPAR P ## s RPAR; \
|
||||
IOTA(k); \
|
||||
} while (0)
|
||||
|
||||
#define DO(x) x
|
||||
|
||||
#define KECCAK_F_1600 DO(KECCAK_F_1600_)
|
||||
|
||||
#define KECCAK_F_1600_ do { \
|
||||
#define KECCAK_F_1600_256 do { \
|
||||
int j; \
|
||||
for (j = 0; j < 24; j += 8) \
|
||||
{ \
|
||||
KF_ELT( 0, 1, (_mm256_set_epi64x( RC[j + 0], RC[j + 0], \
|
||||
RC[j + 0], RC[j + 0])) ); \
|
||||
KF_ELT( 1, 2, (_mm256_set_epi64x( RC[j + 1], RC[j + 1], \
|
||||
RC[j + 1], RC[j + 1])) ); \
|
||||
KF_ELT( 2, 3, (_mm256_set_epi64x( RC[j + 2], RC[j + 2], \
|
||||
RC[j + 2], RC[j + 2])) ); \
|
||||
KF_ELT( 3, 4, (_mm256_set_epi64x( RC[j + 3], RC[j + 3], \
|
||||
RC[j + 3], RC[j + 3])) ); \
|
||||
KF_ELT( 4, 5, (_mm256_set_epi64x( RC[j + 4], RC[j + 4], \
|
||||
RC[j + 4], RC[j + 4])) ); \
|
||||
KF_ELT( 5, 6, (_mm256_set_epi64x( RC[j + 5], RC[j + 5], \
|
||||
RC[j + 5], RC[j + 5])) ); \
|
||||
KF_ELT( 6, 7, (_mm256_set_epi64x( RC[j + 6], RC[j + 6], \
|
||||
RC[j + 6], RC[j + 6])) ); \
|
||||
KF_ELT( 7, 8, (_mm256_set_epi64x( RC[j + 7], RC[j + 7], \
|
||||
RC[j + 7], RC[j + 7])) ); \
|
||||
KF_ELT( 0, 1, _mm256_set1_epi64x( RC[j + 0] ) ); \
|
||||
KF_ELT( 1, 2, _mm256_set1_epi64x( RC[j + 1] ) ); \
|
||||
KF_ELT( 2, 3, _mm256_set1_epi64x( RC[j + 2] ) ); \
|
||||
KF_ELT( 3, 4, _mm256_set1_epi64x( RC[j + 3] ) ); \
|
||||
KF_ELT( 4, 5, _mm256_set1_epi64x( RC[j + 4] ) ); \
|
||||
KF_ELT( 5, 6, _mm256_set1_epi64x( RC[j + 5] ) ); \
|
||||
KF_ELT( 6, 7, _mm256_set1_epi64x( RC[j + 6] ) ); \
|
||||
KF_ELT( 7, 8, _mm256_set1_epi64x( RC[j + 7] ) ); \
|
||||
P8_TO_P0; \
|
||||
} \
|
||||
} while (0)
|
||||
@@ -453,7 +359,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
|
||||
else
|
||||
{
|
||||
j = lim - kc->ptr;
|
||||
u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb );
|
||||
u.tmp[0] = m256_const1_64( eb );
|
||||
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
|
||||
u.tmp[ (j>>3) - 1] = m256_const1_64( 0x8000000000000000 );
|
||||
}
|
||||
@@ -474,7 +380,7 @@ void keccak256_4way_init( void *kc )
|
||||
}
|
||||
|
||||
void
|
||||
keccak256_4way(void *cc, const void *data, size_t len)
|
||||
keccak256_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
keccak64_core(cc, data, len, 136);
|
||||
}
|
||||
@@ -491,15 +397,24 @@ void keccak512_4way_init( void *kc )
|
||||
}
|
||||
|
||||
void
|
||||
keccak512_4way(void *cc, const void *data, size_t len)
|
||||
keccak512_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
keccak64_core(cc, data, len, 72);
|
||||
keccak64_core(cc, data, len, 72);
|
||||
}
|
||||
|
||||
void
|
||||
keccak512_4way_close(void *cc, void *dst)
|
||||
{
|
||||
keccak64_close(cc, dst, 64, 72);
|
||||
keccak64_close(cc, dst, 64, 72);
|
||||
}
|
||||
|
||||
#endif
|
||||
#undef INPUT_BUF
|
||||
#undef DECL64
|
||||
#undef XOR64
|
||||
#undef AND64
|
||||
#undef OR64
|
||||
#undef NOT64
|
||||
#undef ROL64
|
||||
#undef KECCAK_F_1600
|
||||
|
||||
#endif // AVX2
|
||||
|
@@ -64,26 +64,49 @@ extern "C"{
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[144*8]; /* first field, for alignment */
|
||||
__m512i buf[144*8];
|
||||
__m512i w[25];
|
||||
size_t ptr, lim;
|
||||
} keccak64_ctx_m512i __attribute__((aligned(128)));
|
||||
|
||||
typedef keccak64_ctx_m512i keccak256_8way_context;
|
||||
typedef keccak64_ctx_m512i keccak512_8way_context;
|
||||
|
||||
void keccak256_8way_init(void *cc);
|
||||
void keccak256_8way_update(void *cc, const void *data, size_t len);
|
||||
void keccak256_8way_close(void *cc, void *dst);
|
||||
|
||||
void keccak512_8way_init(void *cc);
|
||||
void keccak512_8way_update(void *cc, const void *data, size_t len);
|
||||
void keccak512_8way_close(void *cc, void *dst);
|
||||
void keccak512_8way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[144*8];
|
||||
__m256i w[25];
|
||||
size_t ptr, lim;
|
||||
// sph_u64 wide[25];
|
||||
} keccak64_ctx_m256i;
|
||||
} keccak64_ctx_m256i __attribute__((aligned(128)));
|
||||
|
||||
typedef keccak64_ctx_m256i keccak256_4way_context;
|
||||
typedef keccak64_ctx_m256i keccak512_4way_context;
|
||||
|
||||
void keccak256_4way_init(void *cc);
|
||||
void keccak256_4way(void *cc, const void *data, size_t len);
|
||||
void keccak256_4way_update(void *cc, const void *data, size_t len);
|
||||
void keccak256_4way_close(void *cc, void *dst);
|
||||
|
||||
#define keccak256_4way keccak256_4way_update
|
||||
|
||||
void keccak512_4way_init(void *cc);
|
||||
void keccak512_4way(void *cc, const void *data, size_t len);
|
||||
void keccak512_4way_update(void *cc, const void *data, size_t len);
|
||||
void keccak512_4way_close(void *cc, void *dst);
|
||||
void keccak512_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
#define keccak512_4way keccak512_4way_update
|
||||
|
||||
#endif
|
||||
|
||||
|
324
algo/keccak/keccak-macros.c
Normal file
324
algo/keccak/keccak-macros.c
Normal file
@@ -0,0 +1,324 @@
|
||||
#ifdef TH_ELT
|
||||
#undef TH_ELT
|
||||
#endif
|
||||
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
|
||||
DECL64(tt0); \
|
||||
DECL64(tt1); \
|
||||
DECL64(tt2); \
|
||||
DECL64(tt3); \
|
||||
XOR64(tt0, d0, d1); \
|
||||
XOR64(tt1, d2, d3); \
|
||||
XOR64(tt0, tt0, d4); \
|
||||
XOR64(tt0, tt0, tt1); \
|
||||
ROL64(tt0, tt0, 1); \
|
||||
XOR64(tt2, c0, c1); \
|
||||
XOR64(tt3, c2, c3); \
|
||||
XOR64(tt0, tt0, c4); \
|
||||
XOR64(tt2, tt2, tt3); \
|
||||
XOR64(t, tt0, tt2); \
|
||||
} while (0)
|
||||
|
||||
#ifdef THETA
|
||||
#undef THETA
|
||||
#endif
|
||||
#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
|
||||
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
|
||||
b40, b41, b42, b43, b44) \
|
||||
do { \
|
||||
DECL64(t0); \
|
||||
DECL64(t1); \
|
||||
DECL64(t2); \
|
||||
DECL64(t3); \
|
||||
DECL64(t4); \
|
||||
TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
|
||||
TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
|
||||
TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
|
||||
TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
|
||||
TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
|
||||
XOR64(b00, b00, t0); \
|
||||
XOR64(b01, b01, t0); \
|
||||
XOR64(b02, b02, t0); \
|
||||
XOR64(b03, b03, t0); \
|
||||
XOR64(b04, b04, t0); \
|
||||
XOR64(b10, b10, t1); \
|
||||
XOR64(b11, b11, t1); \
|
||||
XOR64(b12, b12, t1); \
|
||||
XOR64(b13, b13, t1); \
|
||||
XOR64(b14, b14, t1); \
|
||||
XOR64(b20, b20, t2); \
|
||||
XOR64(b21, b21, t2); \
|
||||
XOR64(b22, b22, t2); \
|
||||
XOR64(b23, b23, t2); \
|
||||
XOR64(b24, b24, t2); \
|
||||
XOR64(b30, b30, t3); \
|
||||
XOR64(b31, b31, t3); \
|
||||
XOR64(b32, b32, t3); \
|
||||
XOR64(b33, b33, t3); \
|
||||
XOR64(b34, b34, t3); \
|
||||
XOR64(b40, b40, t4); \
|
||||
XOR64(b41, b41, t4); \
|
||||
XOR64(b42, b42, t4); \
|
||||
XOR64(b43, b43, t4); \
|
||||
XOR64(b44, b44, t4); \
|
||||
} while (0)
|
||||
|
||||
#ifdef RHO
|
||||
#undef RHO
|
||||
#endif
|
||||
#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
|
||||
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
|
||||
b40, b41, b42, b43, b44) \
|
||||
do { \
|
||||
/* ROL64(b00, b00, 0); */ \
|
||||
ROL64(b01, b01, 36); \
|
||||
ROL64(b02, b02, 3); \
|
||||
ROL64(b03, b03, 41); \
|
||||
ROL64(b04, b04, 18); \
|
||||
ROL64(b10, b10, 1); \
|
||||
ROL64(b11, b11, 44); \
|
||||
ROL64(b12, b12, 10); \
|
||||
ROL64(b13, b13, 45); \
|
||||
ROL64(b14, b14, 2); \
|
||||
ROL64(b20, b20, 62); \
|
||||
ROL64(b21, b21, 6); \
|
||||
ROL64(b22, b22, 43); \
|
||||
ROL64(b23, b23, 15); \
|
||||
ROL64(b24, b24, 61); \
|
||||
ROL64(b30, b30, 28); \
|
||||
ROL64(b31, b31, 55); \
|
||||
ROL64(b32, b32, 25); \
|
||||
ROL64(b33, b33, 21); \
|
||||
ROL64(b34, b34, 56); \
|
||||
ROL64(b40, b40, 27); \
|
||||
ROL64(b41, b41, 20); \
|
||||
ROL64(b42, b42, 39); \
|
||||
ROL64(b43, b43, 8); \
|
||||
ROL64(b44, b44, 14); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* The KHI macro integrates the "lane complement" optimization. On input,
|
||||
* some words are complemented:
|
||||
* a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
|
||||
* On output, the following words are complemented:
|
||||
* a04 a10 a20 a22 a23 a31
|
||||
*
|
||||
* The (implicit) permutation and the theta expansion will bring back
|
||||
* the input mask for the next round.
|
||||
*/
|
||||
|
||||
#ifdef KHI_XO
|
||||
#undef KHI_XO
|
||||
#endif
|
||||
#define KHI_XO(d, a, b, c) do { \
|
||||
DECL64(kt); \
|
||||
OR64(kt, b, c); \
|
||||
XOR64(d, a, kt); \
|
||||
} while (0)
|
||||
|
||||
#ifdef KHI_XA
|
||||
#undef KHI_XA
|
||||
#endif
|
||||
#define KHI_XA(d, a, b, c) do { \
|
||||
DECL64(kt); \
|
||||
AND64(kt, b, c); \
|
||||
XOR64(d, a, kt); \
|
||||
} while (0)
|
||||
|
||||
#ifdef KHI
|
||||
#undef KHI
|
||||
#endif
|
||||
#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
|
||||
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
|
||||
b40, b41, b42, b43, b44) \
|
||||
do { \
|
||||
DECL64(c0); \
|
||||
DECL64(c1); \
|
||||
DECL64(c2); \
|
||||
DECL64(c3); \
|
||||
DECL64(c4); \
|
||||
DECL64(bnn); \
|
||||
NOT64(bnn, b20); \
|
||||
KHI_XO(c0, b00, b10, b20); \
|
||||
KHI_XO(c1, b10, bnn, b30); \
|
||||
KHI_XA(c2, b20, b30, b40); \
|
||||
KHI_XO(c3, b30, b40, b00); \
|
||||
KHI_XA(c4, b40, b00, b10); \
|
||||
MOV64(b00, c0); \
|
||||
MOV64(b10, c1); \
|
||||
MOV64(b20, c2); \
|
||||
MOV64(b30, c3); \
|
||||
MOV64(b40, c4); \
|
||||
NOT64(bnn, b41); \
|
||||
KHI_XO(c0, b01, b11, b21); \
|
||||
KHI_XA(c1, b11, b21, b31); \
|
||||
KHI_XO(c2, b21, b31, bnn); \
|
||||
KHI_XO(c3, b31, b41, b01); \
|
||||
KHI_XA(c4, b41, b01, b11); \
|
||||
MOV64(b01, c0); \
|
||||
MOV64(b11, c1); \
|
||||
MOV64(b21, c2); \
|
||||
MOV64(b31, c3); \
|
||||
MOV64(b41, c4); \
|
||||
NOT64(bnn, b32); \
|
||||
KHI_XO(c0, b02, b12, b22); \
|
||||
KHI_XA(c1, b12, b22, b32); \
|
||||
KHI_XA(c2, b22, bnn, b42); \
|
||||
KHI_XO(c3, bnn, b42, b02); \
|
||||
KHI_XA(c4, b42, b02, b12); \
|
||||
MOV64(b02, c0); \
|
||||
MOV64(b12, c1); \
|
||||
MOV64(b22, c2); \
|
||||
MOV64(b32, c3); \
|
||||
MOV64(b42, c4); \
|
||||
NOT64(bnn, b33); \
|
||||
KHI_XA(c0, b03, b13, b23); \
|
||||
KHI_XO(c1, b13, b23, b33); \
|
||||
KHI_XO(c2, b23, bnn, b43); \
|
||||
KHI_XA(c3, bnn, b43, b03); \
|
||||
KHI_XO(c4, b43, b03, b13); \
|
||||
MOV64(b03, c0); \
|
||||
MOV64(b13, c1); \
|
||||
MOV64(b23, c2); \
|
||||
MOV64(b33, c3); \
|
||||
MOV64(b43, c4); \
|
||||
NOT64(bnn, b14); \
|
||||
KHI_XA(c0, b04, bnn, b24); \
|
||||
KHI_XO(c1, bnn, b24, b34); \
|
||||
KHI_XA(c2, b24, b34, b44); \
|
||||
KHI_XO(c3, b34, b44, b04); \
|
||||
KHI_XA(c4, b44, b04, b14); \
|
||||
MOV64(b04, c0); \
|
||||
MOV64(b14, c1); \
|
||||
MOV64(b24, c2); \
|
||||
MOV64(b34, c3); \
|
||||
MOV64(b44, c4); \
|
||||
} while (0)
|
||||
|
||||
#ifdef IOTA
|
||||
#undef IOTA
|
||||
#endif
|
||||
#define IOTA(r) XOR64_IOTA(a00, a00, r)
|
||||
|
||||
#ifdef P0
|
||||
#undef P1
|
||||
#undef P2
|
||||
#undef P3
|
||||
#undef P4
|
||||
#undef P5
|
||||
#undef P6
|
||||
#undef P7
|
||||
#undef P8
|
||||
#undef P9
|
||||
#undef P10
|
||||
#undef p11
|
||||
#undef P12
|
||||
#undef P13
|
||||
#undef P14
|
||||
#undef P15
|
||||
#undef P16
|
||||
#undef P17
|
||||
#undef P18
|
||||
#undef P19
|
||||
#undef P20
|
||||
#undef P21
|
||||
#undef P22
|
||||
#undef P23
|
||||
#endif
|
||||
|
||||
#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
|
||||
a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
|
||||
#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
|
||||
a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
|
||||
#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
|
||||
a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
|
||||
#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
|
||||
a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
|
||||
#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
|
||||
a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
|
||||
#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
|
||||
a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
|
||||
#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
|
||||
a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
|
||||
#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
|
||||
a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
|
||||
#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
|
||||
a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
|
||||
#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
|
||||
a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
|
||||
#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
|
||||
a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
|
||||
#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
|
||||
a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
|
||||
#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
|
||||
a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
|
||||
#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
|
||||
a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
|
||||
#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
|
||||
a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
|
||||
#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
|
||||
a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
|
||||
#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
|
||||
a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
|
||||
#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
|
||||
a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
|
||||
#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
|
||||
a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
|
||||
#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
|
||||
a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
|
||||
#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
|
||||
a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
|
||||
#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
|
||||
a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
|
||||
#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
|
||||
a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
|
||||
#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
|
||||
a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
|
||||
|
||||
#ifdef P8_TO_P0
|
||||
#undef P8_TO_P0
|
||||
#endif
|
||||
#define P8_TO_P0 do { \
|
||||
DECL64(t); \
|
||||
MOV64(t, a01); \
|
||||
MOV64(a01, a11); \
|
||||
MOV64(a11, a43); \
|
||||
MOV64(a43, t); \
|
||||
MOV64(t, a02); \
|
||||
MOV64(a02, a22); \
|
||||
MOV64(a22, a31); \
|
||||
MOV64(a31, t); \
|
||||
MOV64(t, a03); \
|
||||
MOV64(a03, a33); \
|
||||
MOV64(a33, a24); \
|
||||
MOV64(a24, t); \
|
||||
MOV64(t, a04); \
|
||||
MOV64(a04, a44); \
|
||||
MOV64(a44, a12); \
|
||||
MOV64(a12, t); \
|
||||
MOV64(t, a10); \
|
||||
MOV64(a10, a32); \
|
||||
MOV64(a32, a13); \
|
||||
MOV64(a13, t); \
|
||||
MOV64(t, a14); \
|
||||
MOV64(a14, a21); \
|
||||
MOV64(a21, a20); \
|
||||
MOV64(a20, t); \
|
||||
MOV64(t, a23); \
|
||||
MOV64(a23, a42); \
|
||||
MOV64(a42, a40); \
|
||||
MOV64(a40, t); \
|
||||
MOV64(t, a30); \
|
||||
MOV64(a30, a41); \
|
||||
MOV64(a41, a34); \
|
||||
MOV64(a34, t); \
|
||||
} while (0)
|
||||
|
||||
#define KF_ELT(r, s, k) do { \
|
||||
THETA LPAR P ## r RPAR; \
|
||||
RHO LPAR P ## r RPAR; \
|
||||
KHI LPAR P ## s RPAR; \
|
||||
IOTA(k); \
|
||||
} while (0)
|
||||
|
||||
|
2156
algo/lanehash/lane.c
Normal file
2156
algo/lanehash/lane.c
Normal file
File diff suppressed because it is too large
Load Diff
50
algo/lanehash/lane.h
Normal file
50
algo/lanehash/lane.h
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Copyright (c) 2008 Sebastiaan Indesteege
|
||||
* <sebastiaan.indesteege@esat.kuleuven.be>
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Optimised ANSI-C implementation of LANE
|
||||
*/
|
||||
|
||||
#ifndef LANE_H
|
||||
#define LANE_H
|
||||
|
||||
#include <string.h>
|
||||
//#include "algo/sha/sha3-defs.h"
|
||||
#include <stdint.h>
|
||||
|
||||
typedef unsigned char BitSequence;
|
||||
typedef unsigned long long DataLength;
|
||||
|
||||
//typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2, BAD_DATABITLEN = 3 } HashReturn;
|
||||
|
||||
//typedef unsigned char u8;
|
||||
//typedef unsigned int u32;
|
||||
//typedef unsigned long long u64;
|
||||
|
||||
typedef struct {
|
||||
int hashbitlen;
|
||||
uint64_t ctr;
|
||||
uint32_t h[16];
|
||||
uint8_t buffer[128];
|
||||
} hashState;
|
||||
|
||||
void laneInit (hashState *state, int hashbitlen);
|
||||
void laneUpdate (hashState *state, const BitSequence *data, DataLength databitlen);
|
||||
void laneFinal (hashState *state, BitSequence *hashval);
|
||||
void laneHash (int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
|
||||
|
||||
#endif /* LANE_H */
|
File diff suppressed because it is too large
Load Diff
573
algo/luffa/luffa-hash-2way.c.save
Normal file
573
algo/luffa/luffa-hash-2way.c.save
Normal file
@@ -0,0 +1,573 @@
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
#include "luffa-hash-2way.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define cns(i) m256_const1_128( ( (__m128i*)CNS_INIT)[i] )
|
||||
|
||||
#define ADD_CONSTANT(a,b,c0,c1)\
|
||||
a = _mm256_xor_si256(a,c0);\
|
||||
b = _mm256_xor_si256(b,c1);\
|
||||
|
||||
#define MULT2( a0, a1, mask ) \
|
||||
do { \
|
||||
__m256i b = _mm256_xor_si256( a0, \
|
||||
_mm256_shuffle_epi32( _mm256_and_si256(a1,mask), 16 ) ); \
|
||||
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
|
||||
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
|
||||
} while(0)
|
||||
|
||||
// confirm pointer arithmetic
|
||||
// ok but use array indexes
|
||||
#define STEP_PART(x,c0,c1,t)\
|
||||
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
|
||||
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
|
||||
MIXWORD(*x,*(x+4),*t,*(t+1));\
|
||||
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
|
||||
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
|
||||
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
|
||||
ADD_CONSTANT(*x, *(x+4), c0, c1);
|
||||
|
||||
#define SUBCRUMB(a0,a1,a2,a3,t)\
|
||||
t = _mm256_load_si256(&a0);\
|
||||
a0 = _mm256_or_si256(a0,a1);\
|
||||
a2 = _mm256_xor_si256(a2,a3);\
|
||||
a1 = _mm256_andnot_si256(a1, m256_neg1 );\
|
||||
a0 = _mm256_xor_si256(a0,a3);\
|
||||
a3 = _mm256_and_si256(a3,t);\
|
||||
a1 = _mm256_xor_si256(a1,a3);\
|
||||
a3 = _mm256_xor_si256(a3,a2);\
|
||||
a2 = _mm256_and_si256(a2,a0);\
|
||||
a0 = _mm256_andnot_si256(a0, m256_neg1 );\
|
||||
a2 = _mm256_xor_si256(a2,a1);\
|
||||
a1 = _mm256_or_si256(a1,a3);\
|
||||
t = _mm256_xor_si256(t,a1);\
|
||||
a3 = _mm256_xor_si256(a3,a2);\
|
||||
a2 = _mm256_and_si256(a2,a1);\
|
||||
a1 = _mm256_xor_si256(a1,a0);\
|
||||
a0 = _mm256_load_si256(&t);\
|
||||
|
||||
#define MIXWORD(a,b,t1,t2)\
|
||||
b = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(a,2);\
|
||||
t2 = _mm256_srli_epi32(a,30);\
|
||||
a = _mm256_or_si256(t1,t2);\
|
||||
a = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(b,14);\
|
||||
t2 = _mm256_srli_epi32(b,18);\
|
||||
b = _mm256_or_si256(t1,t2);\
|
||||
b = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(a,10);\
|
||||
t2 = _mm256_srli_epi32(a,22);\
|
||||
a = _mm256_or_si256(t1,t2);\
|
||||
a = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(b,1);\
|
||||
t2 = _mm256_srli_epi32(b,31);\
|
||||
b = _mm256_or_si256(t1,t2);
|
||||
|
||||
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
|
||||
a1 = _mm256_shuffle_epi32(a1,147);\
|
||||
t0 = _mm256_load_si256(&a1);\
|
||||
a1 = _mm256_unpacklo_epi32(a1,a0);\
|
||||
t0 = _mm256_unpackhi_epi32(t0,a0);\
|
||||
t1 = _mm256_shuffle_epi32(t0,78);\
|
||||
a0 = _mm256_shuffle_epi32(a1,78);\
|
||||
SUBCRUMB(t1,t0,a0,a1,tmp0);\
|
||||
t0 = _mm256_unpacklo_epi32(t0,t1);\
|
||||
a1 = _mm256_unpacklo_epi32(a1,a0);\
|
||||
a0 = _mm256_load_si256(&a1);\
|
||||
a0 = _mm256_unpackhi_epi64(a0,t0);\
|
||||
a1 = _mm256_unpacklo_epi64(a1,t0);\
|
||||
a1 = _mm256_shuffle_epi32(a1,57);\
|
||||
MIXWORD(a0,a1,tmp0,tmp1);\
|
||||
ADD_CONSTANT(a0,a1,c0,c1);
|
||||
|
||||
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
|
||||
s2 = _mm256_load_si256(&r1);\
|
||||
q2 = _mm256_load_si256(&p1);\
|
||||
r2 = _mm256_shuffle_epi32(r2,216);\
|
||||
p2 = _mm256_shuffle_epi32(p2,216);\
|
||||
r1 = _mm256_unpacklo_epi32(r1,r0);\
|
||||
p1 = _mm256_unpacklo_epi32(p1,p0);\
|
||||
s2 = _mm256_unpackhi_epi32(s2,r0);\
|
||||
q2 = _mm256_unpackhi_epi32(q2,p0);\
|
||||
s0 = _mm256_load_si256(&r2);\
|
||||
q0 = _mm256_load_si256(&p2);\
|
||||
r2 = _mm256_unpacklo_epi64(r2,r1);\
|
||||
p2 = _mm256_unpacklo_epi64(p2,p1);\
|
||||
s1 = _mm256_load_si256(&s0);\
|
||||
q1 = _mm256_load_si256(&q0);\
|
||||
s0 = _mm256_unpackhi_epi64(s0,r1);\
|
||||
q0 = _mm256_unpackhi_epi64(q0,p1);\
|
||||
r2 = _mm256_shuffle_epi32(r2,225);\
|
||||
p2 = _mm256_shuffle_epi32(p2,225);\
|
||||
r0 = _mm256_load_si256(&s1);\
|
||||
p0 = _mm256_load_si256(&q1);\
|
||||
s0 = _mm256_shuffle_epi32(s0,225);\
|
||||
q0 = _mm256_shuffle_epi32(q0,225);\
|
||||
s1 = _mm256_unpacklo_epi64(s1,s2);\
|
||||
q1 = _mm256_unpacklo_epi64(q1,q2);\
|
||||
r0 = _mm256_unpackhi_epi64(r0,s2);\
|
||||
p0 = _mm256_unpackhi_epi64(p0,q2);\
|
||||
s2 = _mm256_load_si256(&r0);\
|
||||
q2 = _mm256_load_si256(&p0);\
|
||||
s3 = _mm256_load_si256(&r2);\
|
||||
q3 = _mm256_load_si256(&p2);\
|
||||
|
||||
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
|
||||
s0 = _mm256_load_si256(&r0);\
|
||||
q0 = _mm256_load_si256(&p0);\
|
||||
s1 = _mm256_load_si256(&r2);\
|
||||
q1 = _mm256_load_si256(&p2);\
|
||||
r0 = _mm256_unpackhi_epi32(r0,r1);\
|
||||
p0 = _mm256_unpackhi_epi32(p0,p1);\
|
||||
r2 = _mm256_unpackhi_epi32(r2,r3);\
|
||||
p2 = _mm256_unpackhi_epi32(p2,p3);\
|
||||
s0 = _mm256_unpacklo_epi32(s0,r1);\
|
||||
q0 = _mm256_unpacklo_epi32(q0,p1);\
|
||||
s1 = _mm256_unpacklo_epi32(s1,r3);\
|
||||
q1 = _mm256_unpacklo_epi32(q1,p3);\
|
||||
r1 = _mm256_load_si256(&r0);\
|
||||
p1 = _mm256_load_si256(&p0);\
|
||||
r0 = _mm256_unpackhi_epi64(r0,r2);\
|
||||
p0 = _mm256_unpackhi_epi64(p0,p2);\
|
||||
s0 = _mm256_unpackhi_epi64(s0,s1);\
|
||||
q0 = _mm256_unpackhi_epi64(q0,q1);\
|
||||
r1 = _mm256_unpacklo_epi64(r1,r2);\
|
||||
p1 = _mm256_unpacklo_epi64(p1,p2);\
|
||||
s2 = _mm256_load_si256(&r0);\
|
||||
q2 = _mm256_load_si256(&p0);\
|
||||
s1 = _mm256_load_si256(&r1);\
|
||||
q1 = _mm256_load_si256(&p1);\
|
||||
|
||||
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
s1 = _mm256_load_si256(&r3);\
|
||||
q1 = _mm256_load_si256(&p3);\
|
||||
s3 = _mm256_load_si256(&r3);\
|
||||
q3 = _mm256_load_si256(&p3);\
|
||||
s1 = _mm256_unpackhi_epi32(s1,r2);\
|
||||
q1 = _mm256_unpackhi_epi32(q1,p2);\
|
||||
s3 = _mm256_unpacklo_epi32(s3,r2);\
|
||||
q3 = _mm256_unpacklo_epi32(q3,p2);\
|
||||
s0 = _mm256_load_si256(&s1);\
|
||||
q0 = _mm256_load_si256(&q1);\
|
||||
s2 = _mm256_load_si256(&s3);\
|
||||
q2 = _mm256_load_si256(&q3);\
|
||||
r3 = _mm256_load_si256(&r1);\
|
||||
p3 = _mm256_load_si256(&p1);\
|
||||
r1 = _mm256_unpacklo_epi32(r1,r0);\
|
||||
p1 = _mm256_unpacklo_epi32(p1,p0);\
|
||||
r3 = _mm256_unpackhi_epi32(r3,r0);\
|
||||
p3 = _mm256_unpackhi_epi32(p3,p0);\
|
||||
s0 = _mm256_unpackhi_epi64(s0,r3);\
|
||||
q0 = _mm256_unpackhi_epi64(q0,p3);\
|
||||
s1 = _mm256_unpacklo_epi64(s1,r3);\
|
||||
q1 = _mm256_unpacklo_epi64(q1,p3);\
|
||||
s2 = _mm256_unpackhi_epi64(s2,r1);\
|
||||
q2 = _mm256_unpackhi_epi64(q2,p1);\
|
||||
s3 = _mm256_unpacklo_epi64(s3,r1);\
|
||||
q3 = _mm256_unpacklo_epi64(q3,p1);
|
||||
|
||||
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
|
||||
|
||||
/* initial values of chaining variables */
|
||||
static const uint32 IV[40] __attribute((aligned(32))) = {
|
||||
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
|
||||
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
|
||||
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
|
||||
0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
|
||||
0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
|
||||
0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
|
||||
0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
|
||||
0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
|
||||
0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
|
||||
0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
|
||||
};
|
||||
|
||||
/* Round Constants */
|
||||
static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
|
||||
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
|
||||
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
|
||||
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
|
||||
0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
|
||||
0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
|
||||
0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
|
||||
0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
|
||||
0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
|
||||
0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
|
||||
0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
|
||||
0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
|
||||
0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
|
||||
0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
|
||||
0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
|
||||
0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
|
||||
0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
|
||||
0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
|
||||
0x00000000,0x00000000,0x00000000,0x5090d577,
|
||||
0x00000000,0x00000000,0x00000000,0xac11d7fa,
|
||||
0x00000000,0x00000000,0x00000000,0x2d1925ab,
|
||||
0x00000000,0x00000000,0x00000000,0x1bcb66f2,
|
||||
0x00000000,0x00000000,0x00000000,0xb46496ac,
|
||||
0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
|
||||
0x00000000,0x00000000,0x00000000,0xd1925ab0,
|
||||
0x00000000,0x00000000,0x00000000,0x78602649,
|
||||
0x00000000,0x00000000,0x00000000,0x29131ab6,
|
||||
0x00000000,0x00000000,0x00000000,0x8edae952,
|
||||
0x00000000,0x00000000,0x00000000,0x0fc053c3,
|
||||
0x00000000,0x00000000,0x00000000,0x3b6ba548,
|
||||
0x00000000,0x00000000,0x00000000,0x3f014f0c,
|
||||
0x00000000,0x00000000,0x00000000,0xedae9520,
|
||||
0x00000000,0x00000000,0x00000000,0xfc053c31
|
||||
};
|
||||
|
||||
|
||||
|
||||
/***************************************************/
|
||||
/* Round function */
|
||||
/* state: hash context */
|
||||
|
||||
void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
||||
{
|
||||
__m256i t0, t1;
|
||||
__m256i *chainv = state->chainv;
|
||||
__m256i msg0, msg1;
|
||||
__m256i tmp[2];
|
||||
__m256i x[8];
|
||||
const __m256i MASK = m256_const2_64( 0, 0x00000000ffffffff );
|
||||
|
||||
t0 = chainv[0];
|
||||
t1 = chainv[1];
|
||||
|
||||
t0 = _mm256_xor_si256( t0, chainv[2] );
|
||||
t1 = _mm256_xor_si256( t1, chainv[3] );
|
||||
t0 = _mm256_xor_si256( t0, chainv[4] );
|
||||
t1 = _mm256_xor_si256( t1, chainv[5] );
|
||||
t0 = _mm256_xor_si256( t0, chainv[6] );
|
||||
t1 = _mm256_xor_si256( t1, chainv[7] );
|
||||
t0 = _mm256_xor_si256( t0, chainv[8] );
|
||||
t1 = _mm256_xor_si256( t1, chainv[9] );
|
||||
|
||||
MULT2( t0, t1, MASK );
|
||||
|
||||
msg0 = _mm256_shuffle_epi32( msg[0], 27 );
|
||||
msg1 = _mm256_shuffle_epi32( msg[1], 27 );
|
||||
|
||||
chainv[0] = _mm256_xor_si256( chainv[0], t0 );
|
||||
chainv[1] = _mm256_xor_si256( chainv[1], t1 );
|
||||
chainv[2] = _mm256_xor_si256( chainv[2], t0 );
|
||||
chainv[3] = _mm256_xor_si256( chainv[3], t1 );
|
||||
chainv[4] = _mm256_xor_si256( chainv[4], t0 );
|
||||
chainv[5] = _mm256_xor_si256( chainv[5], t1 );
|
||||
chainv[6] = _mm256_xor_si256( chainv[6], t0 );
|
||||
chainv[7] = _mm256_xor_si256( chainv[7], t1 );
|
||||
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
|
||||
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
|
||||
|
||||
t0 = chainv[0];
|
||||
t1 = chainv[1];
|
||||
|
||||
MULT2( chainv[0], chainv[1], MASK );
|
||||
chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
|
||||
chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
|
||||
|
||||
MULT2( chainv[2], chainv[3], MASK );
|
||||
chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
|
||||
chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
|
||||
|
||||
MULT2( chainv[4], chainv[5], MASK );
|
||||
chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
|
||||
chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
|
||||
|
||||
MULT2( chainv[6], chainv[7], MASK );
|
||||
chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
|
||||
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
|
||||
|
||||
MULT2( chainv[8], chainv[9], MASK );
|
||||
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
|
||||
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
|
||||
|
||||
t0 = chainv[8];
|
||||
t1 = chainv[9];
|
||||
|
||||
MULT2( chainv[8], chainv[9], MASK );
|
||||
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
|
||||
chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
|
||||
|
||||
MULT2( chainv[6], chainv[7], MASK );
|
||||
chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
|
||||
chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
|
||||
|
||||
MULT2( chainv[4], chainv[5], MASK );
|
||||
chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
|
||||
chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
|
||||
|
||||
MULT2( chainv[2], chainv[3], MASK );
|
||||
chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
|
||||
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
|
||||
|
||||
MULT2( chainv[0], chainv[1], MASK );
|
||||
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
|
||||
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
|
||||
chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
|
||||
chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
|
||||
chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
|
||||
chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
|
||||
chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ),
|
||||
_mm256_srli_epi32( chainv[3], 31 ) );
|
||||
chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ),
|
||||
_mm256_srli_epi32( chainv[5], 30 ) );
|
||||
chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ),
|
||||
_mm256_srli_epi32( chainv[7], 29 ) );
|
||||
chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ),
|
||||
_mm256_srli_epi32( chainv[9], 28 ) );
|
||||
|
||||
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
|
||||
x[0], x[1], x[2], x[3],
|
||||
chainv[1],chainv[3],chainv[5],chainv[7],
|
||||
x[4], x[5], x[6], x[7] );
|
||||
|
||||
STEP_PART( &x[0], cns( 0), cns( 1), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 2), cns( 3), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 4), cns( 5), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 6), cns( 7), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 8), cns( 9), &tmp[0] );
|
||||
STEP_PART( &x[0], cns(10), cns(11), &tmp[0] );
|
||||
STEP_PART( &x[0], cns(12), cns(13), &tmp[0] );
|
||||
STEP_PART( &x[0], cns(14), cns(15), &tmp[0] );
|
||||
|
||||
MIXTON1024( x[0], x[1], x[2], x[3],
|
||||
chainv[0], chainv[2], chainv[4],chainv[6],
|
||||
x[4], x[5], x[6], x[7],
|
||||
chainv[1],chainv[3],chainv[5],chainv[7]);
|
||||
|
||||
/* Process last 256-bit block */
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31),
|
||||
tmp[0], tmp[1] );
|
||||
}
|
||||
|
||||
/***************************************************/
|
||||
/* Finalization function */
|
||||
/* state: hash context */
|
||||
/* b[8]: hash values */
|
||||
|
||||
void finalization512_2way( luffa_2way_context *state, uint32 *b )
|
||||
{
|
||||
uint32 hash[8] __attribute((aligned(64)));
|
||||
__m256i* chainv = state->chainv;
|
||||
__m256i t[2];
|
||||
__m256i zero[2];
|
||||
zero[0] = zero[1] = m256_zero;
|
||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
/*---- blank round with m=0 ----*/
|
||||
rnd512_2way( state, zero );
|
||||
|
||||
t[0] = chainv[0];
|
||||
t[1] = chainv[1];
|
||||
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[2] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[3] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[4] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[5] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[6] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[7] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[8] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[9] );
|
||||
|
||||
t[0] = _mm256_shuffle_epi32( t[0], 27 );
|
||||
t[1] = _mm256_shuffle_epi32( t[1], 27 );
|
||||
|
||||
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
|
||||
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
|
||||
|
||||
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
||||
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 1 ), shuff_bswap32 );
|
||||
|
||||
rnd512_2way( state, zero );
|
||||
|
||||
t[0] = chainv[0];
|
||||
t[1] = chainv[1];
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[2] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[3] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[4] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[5] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[6] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[7] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[8] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[9] );
|
||||
|
||||
t[0] = _mm256_shuffle_epi32( t[0], 27 );
|
||||
t[1] = _mm256_shuffle_epi32( t[1], 27 );
|
||||
|
||||
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
|
||||
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
|
||||
|
||||
casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
||||
casti_m256i( b, 3 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 1 ), shuff_bswap32 );
|
||||
}
|
||||
|
||||
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
|
||||
{
|
||||
state->hashbitlen = hashbitlen;
|
||||
__m128i *iv = (__m128i*)IV;
|
||||
|
||||
state->chainv[0] = m256_const1_128( iv[0] );
|
||||
state->chainv[1] = m256_const1_128( iv[1] );
|
||||
state->chainv[2] = m256_const1_128( iv[2] );
|
||||
state->chainv[3] = m256_const1_128( iv[3] );
|
||||
state->chainv[4] = m256_const1_128( iv[4] );
|
||||
state->chainv[5] = m256_const1_128( iv[5] );
|
||||
state->chainv[6] = m256_const1_128( iv[6] );
|
||||
state->chainv[7] = m256_const1_128( iv[7] );
|
||||
state->chainv[8] = m256_const1_128( iv[8] );
|
||||
state->chainv[9] = m256_const1_128( iv[9] );
|
||||
|
||||
((__m256i*)state->buffer)[0] = m256_zero;
|
||||
((__m256i*)state->buffer)[1] = m256_zero;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Do not call luffa_update_close after having called luffa_update.
|
||||
// Once luffa_update has been called only call luffa_update or luffa_close.
|
||||
int luffa_2way_update( luffa_2way_context *state, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
__m256i *buffer = (__m256i*)state->buffer;
|
||||
__m256i msg[2];
|
||||
int i;
|
||||
int blocks = (int)len >> 5;
|
||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
state-> rembytes = (int)len & 0x1F;
|
||||
|
||||
// full blocks
|
||||
for ( i = 0; i < blocks; i++, vdata+=2 )
|
||||
{
|
||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
|
||||
// 16 byte partial block exists for 80 byte len
|
||||
// store in buffer for transform in final for midstate to work
|
||||
if ( state->rembytes )
|
||||
{
|
||||
// remaining data bytes
|
||||
buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
|
||||
buffer[1] = m256_const2_64( 0, 0x0000000080000000 );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int luffa_2way_close( luffa_2way_context *state, void *hashval )
|
||||
{
|
||||
__m256i *buffer = (__m256i*)state->buffer;
|
||||
__m256i msg[2];
|
||||
|
||||
// transform pad block
|
||||
if ( state->rembytes )
|
||||
// not empty, data is in buffer
|
||||
rnd512_2way( state, buffer );
|
||||
else
|
||||
{ // empty pad block, constant data
|
||||
msg[0] = m256_const2_64( 0, 0x0000000080000000 );
|
||||
msg[1] = m256_zero;
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
finalization512_2way( state, (uint32*)hashval );
|
||||
|
||||
if ( state->hashbitlen > 512 )
|
||||
finalization512_2way( state, (uint32*)( hashval+32 ) );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int luffa_2way_update_close( luffa_2way_context *state,
|
||||
void *output, const void *data, size_t inlen )
|
||||
{
|
||||
// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
|
||||
const __m256i *vdata = (__m256i*)data;
|
||||
__m256i msg[2];
|
||||
int i;
|
||||
const int blocks = (int)( inlen >> 5 );
|
||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
|
||||
state->rembytes = inlen & 0x1F;
|
||||
|
||||
// full blocks
|
||||
for ( i = 0; i < blocks; i++, vdata+=2 )
|
||||
{
|
||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
|
||||
// 16 byte partial block exists for 80 byte len
|
||||
if ( state->rembytes )
|
||||
{
|
||||
// padding of partial block
|
||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||
msg[1] = m256_const2_64( 0, 0x0000000080000000 );
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
else
|
||||
{
|
||||
// empty pad block
|
||||
msg[0] = m256_const2_64( 0, 0x0000000080000000 );
|
||||
msg[1] = m256_zero;
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
|
||||
finalization512_2way( state, (uint32*)output );
|
||||
if ( state->hashbitlen > 512 )
|
||||
finalization512_2way( state, (uint32*)( output+32 ) );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
@@ -51,12 +51,30 @@
|
||||
#define LIMIT_512 128
|
||||
/*********************************/
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
uint32 buffer[8*2] __attribute((aligned(64)));
|
||||
__m256i chainv[10] __attribute((aligned(32))); /* Chaining values */
|
||||
uint32 buffer[8*4];
|
||||
__m512i chainv[10]; /* Chaining values */
|
||||
int hashbitlen;
|
||||
int rembytes;
|
||||
} luffa_2way_context;
|
||||
} luffa_4way_context __attribute((aligned(128)));
|
||||
|
||||
int luffa_4way_init( luffa_4way_context *state, int hashbitlen );
|
||||
int luffa_4way_update( luffa_4way_context *state, const void *data,
|
||||
size_t len );
|
||||
int luffa_4way_close( luffa_4way_context *state, void *hashval );
|
||||
int luffa_4way_update_close( luffa_4way_context *state, void *output,
|
||||
const void *data, size_t inlen );
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32 buffer[8*2];
|
||||
__m256i chainv[10]; /* Chaining values */
|
||||
int hashbitlen;
|
||||
int rembytes;
|
||||
} luffa_2way_context __attribute((aligned(128)));
|
||||
|
||||
int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
|
||||
int luffa_2way_update( luffa_2way_context *state, const void *data,
|
||||
|
69
algo/luffa/luffa-hash-2way.h.save
Normal file
69
algo/luffa/luffa-hash-2way.h.save
Normal file
@@ -0,0 +1,69 @@
|
||||
#if !defined(LUFFA_HASH_2WAY_H__)
|
||||
#define LUFFA_HASH_2WAY_H__ 1
|
||||
/*
|
||||
* luffa_for_sse2.h
|
||||
* Version 2.0 (Sep 15th 2009)
|
||||
*
|
||||
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
|
||||
*
|
||||
* Hitachi, Ltd. is the owner of this software and hereby grant
|
||||
* the U.S. Government and any interested party the right to use
|
||||
* this software for the purposes of the SHA-3 evaluation process,
|
||||
* notwithstanding that this software is copyrighted.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*/
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include <immintrin.h>
|
||||
#include "algo/sha/sha3-defs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
/* The length of digests*/
|
||||
#define DIGEST_BIT_LEN_224 224
|
||||
#define DIGEST_BIT_LEN_256 256
|
||||
#define DIGEST_BIT_LEN_384 384
|
||||
#define DIGEST_BIT_LEN_512 512
|
||||
|
||||
/*********************************/
|
||||
/* The parameters of Luffa */
|
||||
#define MSG_BLOCK_BIT_LEN 256 /*The bit length of a message block*/
|
||||
#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length
|
||||
* of a message block*/
|
||||
|
||||
/* The number of blocks in Luffa */
|
||||
#define WIDTH_224 3
|
||||
#define WIDTH_256 3
|
||||
#define WIDTH_384 4
|
||||
#define WIDTH_512 5
|
||||
|
||||
/* The limit of the length of message */
|
||||
#define LIMIT_224 64
|
||||
#define LIMIT_256 64
|
||||
#define LIMIT_384 128
|
||||
#define LIMIT_512 128
|
||||
/*********************************/
|
||||
|
||||
typedef struct {
|
||||
uint32 buffer[8*2] __attribute((aligned(64)));
|
||||
__m256i chainv[10] __attribute((aligned(32))); /* Chaining values */
|
||||
int hashbitlen;
|
||||
int rembytes;
|
||||
} luffa_2way_context;
|
||||
|
||||
int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
|
||||
int luffa_2way_update( luffa_2way_context *state, const void *data,
|
||||
size_t len );
|
||||
int luffa_2way_close( luffa_2way_context *state, void *hashval );
|
||||
int luffa_2way_update_close( luffa_2way_context *state, void *output,
|
||||
const void *data, size_t inlen );
|
||||
|
||||
#endif
|
||||
#endif
|
@@ -541,7 +541,11 @@ static void finalization512( hashState_luffa *state, uint32 *b )
|
||||
uint32 hash[8] __attribute((aligned(64)));
|
||||
__m256i* chainv = (__m256i*)state->chainv;
|
||||
__m256i t;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i zero = m128_zero;
|
||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
|
||||
rnd512( state, zero, zero );
|
||||
|
||||
@@ -555,7 +559,9 @@ static void finalization512( hashState_luffa *state, uint32 *b )
|
||||
|
||||
_mm256_store_si256( (__m256i*)hash, t );
|
||||
|
||||
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
|
||||
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
||||
// casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
|
||||
|
||||
rnd512( state, zero, zero );
|
||||
|
||||
@@ -568,7 +574,9 @@ static void finalization512( hashState_luffa *state, uint32 *b )
|
||||
|
||||
_mm256_store_si256( (__m256i*)hash, t );
|
||||
|
||||
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
|
||||
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
||||
// casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
|
||||
}
|
||||
|
||||
#else
|
||||
|
@@ -127,7 +127,6 @@ bool register_lyra2z_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&lyra2z_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0xffffLL;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
@@ -147,15 +146,12 @@ bool register_lyra2h_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&lyra2h_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0xffffLL;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
||||
/////////////////////////////////
|
||||
|
||||
int64_t allium_get_max64_0xFFFFLL() { return 0xFFFFLL; }
|
||||
|
||||
bool register_allium_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (ALLIUM_4WAY)
|
||||
@@ -168,7 +164,6 @@ bool register_allium_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&allium_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
|
||||
gate->get_max64 = (void*)&allium_get_max64_0xFFFFLL;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
@@ -214,7 +209,6 @@ bool register_phi2_algo( algo_gate_t* gate )
|
||||
gate->get_work_data_size = (void*)&phi2_get_work_data_size;
|
||||
gate->decode_extra_data = (void*)&phi2_decode_extra_data;
|
||||
gate->build_extraheader = (void*)&phi2_build_extraheader;
|
||||
gate->get_max64 = (void*)&get_max64_0xffffLL;
|
||||
opt_target_factor = 256.0;
|
||||
#if defined(PHI2_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_phi2_4way;
|
||||
|
@@ -113,18 +113,12 @@ int scanhash_lyra2re( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int64_t lyra2re_get_max64 ()
|
||||
{
|
||||
return 0xffffLL;
|
||||
}
|
||||
|
||||
bool register_lyra2re_algo( algo_gate_t* gate )
|
||||
{
|
||||
init_lyra2re_ctx();
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_lyra2re;
|
||||
gate->hash = (void*)&lyra2re_hash;
|
||||
gate->get_max64 = (void*)&lyra2re_get_max64;
|
||||
opt_target_factor = 128.0;
|
||||
return true;
|
||||
};
|
||||
|
@@ -5,7 +5,6 @@
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
|
||||
|
||||
#if defined (LYRA2REV3_8WAY)
|
||||
|
||||
typedef struct {
|
||||
@@ -14,7 +13,7 @@ typedef struct {
|
||||
bmw256_8way_context bmw;
|
||||
} lyra2v3_8way_ctx_holder;
|
||||
|
||||
static lyra2v3_8way_ctx_holder l2v3_8way_ctx;
|
||||
static __thread lyra2v3_8way_ctx_holder l2v3_8way_ctx;
|
||||
|
||||
bool init_lyra2rev3_8way_ctx()
|
||||
{
|
||||
@@ -38,7 +37,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )
|
||||
lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
|
||||
|
||||
blake256_8way( &ctx.blake, input, 80 );
|
||||
blake256_8way( &ctx.blake, input + (64*8), 16 );
|
||||
blake256_8way_close( &ctx.blake, vhash );
|
||||
|
||||
dintrlv_8x32( hash0, hash1, hash2, hash3,
|
||||
@@ -91,7 +90,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t *hash7 = &hash[7<<3];
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
@@ -99,12 +98,15 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
const int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
const int thr_id = mythr->id;
|
||||
|
||||
if ( opt_benchmark )
|
||||
( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
|
||||
blake256_8way_init( &l2v3_8way_ctx.blake );
|
||||
blake256_8way( &l2v3_8way_ctx.blake, vdata, 64 );
|
||||
|
||||
do
|
||||
{
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
@@ -113,17 +115,18 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
|
||||
lyra2rev3_8way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
} while ( likely( (n < max_nonce-8) && !work_restart[thr_id].restart ) );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
@@ -132,14 +135,14 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
#if defined (LYRA2REV3_4WAY)
|
||||
|
||||
|
||||
typedef struct {
|
||||
blake256_4way_context blake;
|
||||
cubehashParam cube;
|
||||
bmw256_4way_context bmw;
|
||||
} lyra2v3_4way_ctx_holder;
|
||||
|
||||
static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
|
||||
//static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
|
||||
static __thread lyra2v3_4way_ctx_holder l2v3_4way_ctx;
|
||||
|
||||
bool init_lyra2rev3_4way_ctx()
|
||||
{
|
||||
@@ -159,7 +162,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
|
||||
lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );
|
||||
|
||||
blake256_4way( &ctx.blake, input, 80 );
|
||||
// blake256_4way( &ctx.blake, input, 80 );
|
||||
blake256_4way( &ctx.blake, input + (64*4), 16 );
|
||||
blake256_4way_close( &ctx.blake, vhash );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
|
||||
@@ -205,6 +209,10 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
|
||||
( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
|
||||
blake256_4way_init( &l2v3_4way_ctx.blake );
|
||||
blake256_4way( &l2v3_4way_ctx.blake, vdata, 64 );
|
||||
|
||||
do
|
||||
{
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
|
@@ -70,7 +70,6 @@ bool register_lyra2z330_algo( algo_gate_t* gate )
|
||||
gate->miner_thread_init = (void*)&lyra2z330_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z330;
|
||||
gate->hash = (void*)&lyra2z330_hash;
|
||||
gate->get_max64 = (void*)&get_max64_0xffffLL;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
@@ -296,8 +296,6 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
||||
|
||||
pdata[19] = n;
|
||||
|
||||
// can this be skipped after finding a share? Seems to work ok.
|
||||
//out:
|
||||
mpf_set_prec_raw(magifpi, prec0);
|
||||
mpf_set_prec_raw(magifpi0, prec0);
|
||||
mpf_set_prec_raw(mptmp, prec0);
|
||||
@@ -323,7 +321,6 @@ bool register_m7m_algo( algo_gate_t *gate )
|
||||
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
|
||||
gate->work_decode = (void*)&std_be_work_decode;
|
||||
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
|
||||
gate->get_max64 = (void*)&get_max64_0x1ffff;
|
||||
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
|
@@ -3,22 +3,129 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#if defined(NIST5_4WAY)
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
|
||||
void nist5hash_4way( void *out, const void *input )
|
||||
#if defined(NIST5_8WAY)
|
||||
|
||||
void nist5hash_8way( void *out, const void *input )
|
||||
{
|
||||
uint64_t vhash[8*16] __attribute__ ((aligned (128)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||
|
||||
blake512_8way_context ctx_blake;
|
||||
hashState_groestl ctx_groestl;
|
||||
jh512_8way_context ctx_jh;
|
||||
skein512_8way_context ctx_skein;
|
||||
keccak512_8way_context ctx_keccak;
|
||||
|
||||
blake512_8way_init( &ctx_blake );
|
||||
blake512_8way_update( &ctx_blake, input, 80 );
|
||||
blake512_8way_close( &ctx_blake, vhash );
|
||||
|
||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash, 512 );
|
||||
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash0,
|
||||
(const char*)hash0, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash1,
|
||||
(const char*)hash1, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash2,
|
||||
(const char*)hash2, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash3,
|
||||
(const char*)hash3, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash4,
|
||||
(const char*)hash4, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash5,
|
||||
(const char*)hash5, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash6,
|
||||
(const char*)hash6, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash7,
|
||||
(const char*)hash7, 512 );
|
||||
|
||||
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
hash7, 512 );
|
||||
|
||||
jh512_8way_init( &ctx_jh );
|
||||
jh512_8way_update( &ctx_jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx_jh, vhash );
|
||||
|
||||
keccak512_8way_init( &ctx_keccak );
|
||||
keccak512_8way_update( &ctx_keccak, vhash, 64 );
|
||||
keccak512_8way_close( &ctx_keccak, vhash );
|
||||
|
||||
skein512_8way_init( &ctx_skein );
|
||||
skein512_8way_update( &ctx_skein, vhash, 64 );
|
||||
skein512_8way_close( &ctx_skein, out );
|
||||
}
|
||||
|
||||
int scanhash_nist5_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[16*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
nist5hash_8way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane<<1 ] < Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(NIST5_4WAY)
|
||||
|
||||
void nist5hash_4way( void *out, const void *input )
|
||||
{
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
blake512_4way_context ctx_blake;
|
||||
hashState_groestl ctx_groestl;
|
||||
jh512_4way_context ctx_jh;
|
||||
@@ -62,62 +169,39 @@ void nist5hash_4way( void *out, const void *input )
|
||||
int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[4*24] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[4*16] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[25]);
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
uint64_t htmax[] = { 0,
|
||||
0xF,
|
||||
0xFF,
|
||||
0xFFF,
|
||||
0xFFFF,
|
||||
0x10000000 };
|
||||
|
||||
uint32_t masks[] = { 0xFFFFFFFF,
|
||||
0xFFFFFFF0,
|
||||
0xFFFFFF00,
|
||||
0xFFFFF000,
|
||||
0xFFFF0000,
|
||||
0 };
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
for ( int m=0; m < 6; m++ )
|
||||
{
|
||||
if (Htarg <= htmax[m])
|
||||
do {
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
nist5hash_4way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash7[ lane<<1 ] < Htarg )
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
|
||||
do {
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
nist5hash_4way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( hash7[ lane ] & mask ) == 0 )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
|
||||
break;
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
}
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
n += 4;
|
||||
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -2,8 +2,11 @@
|
||||
|
||||
bool register_nist5_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
||||
#if defined (NIST5_4WAY)
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined (NIST5_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_nist5_8way;
|
||||
gate->hash = (void*)&nist5hash_8way;
|
||||
#elif defined (NIST5_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_nist5_4way;
|
||||
gate->hash = (void*)&nist5hash_4way;
|
||||
#else
|
||||
|
@@ -1,14 +1,23 @@
|
||||
#ifndef __NIST5_GATE_H__
|
||||
#define __NIST5_GATE_H__
|
||||
#define __NIST5_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
#define NIST5_4WAY
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define NIST5_8WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
#define NIST5_4WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(NIST5_4WAY)
|
||||
#if defined(NIST5_8WAY)
|
||||
|
||||
void nist5hash_8way( void *state, const void *input );
|
||||
|
||||
int scanhash_nist5_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(NIST5_4WAY)
|
||||
|
||||
void nist5hash_4way( void *state, const void *input );
|
||||
|
||||
|
@@ -208,12 +208,6 @@ void zr5_get_new_work( struct work* work, struct work* g_work, int thr_id,
|
||||
++(*nonceptr);
|
||||
}
|
||||
|
||||
int64_t zr5_get_max64 ()
|
||||
{
|
||||
// return 0x1ffffLL;
|
||||
return 0x1fffffLL;
|
||||
}
|
||||
|
||||
void zr5_display_pok( struct work* work )
|
||||
{
|
||||
if ( work->data[0] & 0x00008000 )
|
||||
@@ -229,7 +223,6 @@ bool register_zr5_algo( algo_gate_t* gate )
|
||||
gate->get_new_work = (void*)&zr5_get_new_work;
|
||||
gate->scanhash = (void*)&scanhash_zr5;
|
||||
gate->hash = (void*)&zr5hash;
|
||||
gate->get_max64 = (void*)&zr5_get_max64;
|
||||
gate->decode_extra_data = (void*)&zr5_display_pok;
|
||||
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
|
||||
gate->work_decode = (void*)&std_be_work_decode;
|
||||
|
@@ -1,12 +1,8 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "quark-gate.h"
|
||||
|
||||
#if defined (QUARK_4WAY)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
@@ -14,6 +10,258 @@
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
|
||||
#if defined (QUARK_8WAY)
|
||||
|
||||
typedef struct {
|
||||
blake512_8way_context blake;
|
||||
bmw512_8way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
jh512_8way_context jh;
|
||||
skein512_8way_context skein;
|
||||
keccak512_8way_context keccak;
|
||||
} quark_8way_ctx_holder;
|
||||
|
||||
quark_8way_ctx_holder quark_8way_ctx __attribute__ ((aligned (128)));
|
||||
|
||||
void init_quark_8way_ctx()
|
||||
{
|
||||
blake512_8way_init( &quark_8way_ctx.blake );
|
||||
bmw512_8way_init( &quark_8way_ctx.bmw );
|
||||
init_groestl( &quark_8way_ctx.groestl, 64 );
|
||||
skein512_8way_init( &quark_8way_ctx.skein );
|
||||
jh512_8way_init( &quark_8way_ctx.jh );
|
||||
keccak512_8way_init( &quark_8way_ctx.keccak );
|
||||
}
|
||||
|
||||
void quark_8way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||
__m512i* vh = (__m512i*)vhash;
|
||||
__m512i* vhA = (__m512i*)vhashA;
|
||||
__m512i* vhB = (__m512i*)vhashB;
|
||||
__mmask8 vh_mask;
|
||||
quark_8way_ctx_holder ctx;
|
||||
const uint32_t mask = 8;
|
||||
const __m512i bit3_mask = m512_const1_64( mask );
|
||||
const __m512i zero = _mm512_setzero_si512();
|
||||
|
||||
memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );
|
||||
|
||||
blake512_8way_update( &ctx.blake, input, 80 );
|
||||
blake512_8way_close( &ctx.blake, vhash );
|
||||
|
||||
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_8way_close( &ctx.bmw, vhash );
|
||||
|
||||
// AVX 512 cmpeq returns a bit mask instead of a vector mask.
|
||||
// This should simplify things but the logic doesn't seem to be working.
|
||||
// The problem appears to be related to the test to skip a hash if it isn't
|
||||
// to be used. Skipping the test for all 8 way hashes seems to have
|
||||
// fixed it. The hash selection blending works if the hash is produced
|
||||
// but the hash wasn't being produced when it should.
|
||||
// Both decisions are based on the same data, the __mmask8. It works
|
||||
// as a blend mask but not in a logical comparison, maybe the type is the
|
||||
// problem. Maybe a cast to int or movm is needed to make it work.
|
||||
// It's now moot because the hash can only be skipped 1 in 256 iterations
|
||||
// when hashing parallel 8 ways.
|
||||
// The performance impact of the workaround should be negligible.
|
||||
// It's a problem for another day.
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
|
||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash, 512 );
|
||||
|
||||
if ( hash0[0] & mask )
|
||||
{
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0,
|
||||
(char*)hash0, 512 );
|
||||
}
|
||||
if ( hash1[0] & mask )
|
||||
{
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1,
|
||||
(char*)hash1, 512 );
|
||||
}
|
||||
if ( hash2[0] & mask )
|
||||
{
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2,
|
||||
(char*)hash2, 512 );
|
||||
}
|
||||
if ( hash3[0] & mask )
|
||||
{
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3,
|
||||
(char*)hash3, 512 );
|
||||
}
|
||||
if ( hash4[0] & mask )
|
||||
{
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash4,
|
||||
(char*)hash4, 512 );
|
||||
}
|
||||
if ( hash5[0] & mask )
|
||||
{
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash5,
|
||||
(char*)hash5, 512 );
|
||||
}
|
||||
if ( hash6[0] & mask )
|
||||
{
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash6,
|
||||
(char*)hash6, 512 );
|
||||
}
|
||||
if ( hash7[0] & mask )
|
||||
{
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash7,
|
||||
(char*)hash7, 512 );
|
||||
}
|
||||
|
||||
intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
hash7, 512 );
|
||||
|
||||
if ( vh_mask & 0xff )
|
||||
{
|
||||
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||
skein512_8way_close( &ctx.skein, vhashB );
|
||||
}
|
||||
|
||||
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash, 512 );
|
||||
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||
|
||||
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
512 );
|
||||
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
{
|
||||
blake512_8way_init( &ctx.blake );
|
||||
blake512_8way_update( &ctx.blake, vhash, 64 );
|
||||
blake512_8way_close( &ctx.blake, vhashA );
|
||||
}
|
||||
|
||||
if ( vh_mask & 0xff )
|
||||
{
|
||||
bmw512_8way_init( &ctx.bmw );
|
||||
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_8way_close( &ctx.bmw, vhashB );
|
||||
}
|
||||
|
||||
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||
keccak512_8way_close( &ctx.keccak, vhash );
|
||||
|
||||
skein512_8way_init( &ctx.skein );
|
||||
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||
skein512_8way_close( &ctx.skein, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
{
|
||||
keccak512_8way_init( &ctx.keccak );
|
||||
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||
keccak512_8way_close( &ctx.keccak, vhashA );
|
||||
}
|
||||
|
||||
if ( vh_mask & 0xff )
|
||||
{
|
||||
jh512_8way_init( &ctx.jh );
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhashB );
|
||||
}
|
||||
|
||||
// Final blend, directly to state, only need 32 bytes.
|
||||
casti_m512i( state,0 ) = _mm512_mask_blend_epi64( vh_mask, vhA[0], vhB[0] );
|
||||
casti_m512i( state,1 ) = _mm512_mask_blend_epi64( vh_mask, vhA[1], vhB[1] );
|
||||
casti_m512i( state,2 ) = _mm512_mask_blend_epi64( vh_mask, vhA[2], vhB[2] );
|
||||
casti_m512i( state,3 ) = _mm512_mask_blend_epi64( vh_mask, vhA[3], vhB[3] );
|
||||
}
|
||||
|
||||
int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
do
|
||||
{
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
quark_8way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, i, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, lane_hash, mythr, i );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#elif defined (QUARK_4WAY)
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
@@ -91,7 +339,7 @@ void quark_4way_hash( void *state, const void *input )
|
||||
|
||||
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
if ( mm256_anybits0( vh_mask ) )
|
||||
if ( mm256_anybits1( vh_mask ) )
|
||||
{
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhashB );
|
||||
@@ -117,14 +365,14 @@ void quark_4way_hash( void *state, const void *input )
|
||||
|
||||
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
|
||||
|
||||
if ( mm256_anybits1( vh_mask ) )
|
||||
if ( mm256_anybits0( vh_mask ) )
|
||||
{
|
||||
blake512_4way_init( &ctx.blake );
|
||||
blake512_4way( &ctx.blake, vhash, 64 );
|
||||
blake512_4way_close( &ctx.blake, vhashA );
|
||||
}
|
||||
|
||||
if ( mm256_anybits0( vh_mask ) )
|
||||
if ( mm256_anybits1( vh_mask ) )
|
||||
{
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
@@ -142,14 +390,14 @@ void quark_4way_hash( void *state, const void *input )
|
||||
|
||||
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
|
||||
|
||||
if ( mm256_anybits1( vh_mask ) )
|
||||
if ( mm256_anybits0( vh_mask ) )
|
||||
{
|
||||
keccak512_4way_init( &ctx.keccak );
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhashA );
|
||||
}
|
||||
|
||||
if ( mm256_anybits0( vh_mask ) )
|
||||
if ( mm256_anybits1( vh_mask ) )
|
||||
{
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
|
@@ -2,7 +2,11 @@
|
||||
|
||||
bool register_quark_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (QUARK_4WAY)
|
||||
#if defined (QUARK_8WAY)
|
||||
init_quark_8way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_quark_8way;
|
||||
gate->hash = (void*)&quark_8way_hash;
|
||||
#elif defined (QUARK_4WAY)
|
||||
init_quark_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_quark_4way;
|
||||
gate->hash = (void*)&quark_4way_hash;
|
||||
@@ -11,7 +15,7 @@ bool register_quark_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_quark;
|
||||
gate->hash = (void*)&quark_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -4,13 +4,22 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
#define QUARK_4WAY
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define QUARK_8WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
#define QUARK_4WAY 1
|
||||
#endif
|
||||
|
||||
bool register_quark_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(QUARK_4WAY)
|
||||
#if defined(QUARK_8WAY)
|
||||
|
||||
void quark_8way_hash( void *state, const void *input );
|
||||
int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_quark_8way_ctx();
|
||||
|
||||
#elif defined(QUARK_4WAY)
|
||||
|
||||
void quark_4way_hash( void *state, const void *input );
|
||||
int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
|
||||
|
@@ -1,7 +1,4 @@
|
||||
#include "qubit-gate.h"
|
||||
|
||||
#if defined(QUBIT_2WAY)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
@@ -12,6 +9,160 @@
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
#if defined(QUBIT_4WAY)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
luffa_4way_context luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
simd_4way_context simd;
|
||||
hashState_echo echo;
|
||||
} qubit_4way_ctx_holder;
|
||||
|
||||
qubit_4way_ctx_holder qubit_4way_ctx;
|
||||
|
||||
void init_qubit_4way_ctx()
|
||||
{
|
||||
cubehashInit(&qubit_4way_ctx.cube,512,16,32);
|
||||
sph_shavite512_init(&qubit_4way_ctx.shavite);
|
||||
simd_4way_init( &qubit_4way_ctx.simd, 512 );
|
||||
init_echo(&qubit_4way_ctx.echo, 512);
|
||||
};
|
||||
|
||||
void qubit_4way_hash( void *output, const void *input )
|
||||
{
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
qubit_4way_ctx_holder ctx;
|
||||
|
||||
memcpy( &ctx, &qubit_4way_ctx, sizeof(qubit_4way_ctx) );
|
||||
luffa_4way_update( &ctx.luffa, input + (64<<2), 16 );
|
||||
luffa_4way_close( &ctx.luffa, vhash );
|
||||
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
memcpy( output, hash0, 32 );
|
||||
memcpy( output+32, hash1, 32 );
|
||||
memcpy( output+64, hash2, 32 );
|
||||
memcpy( output+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[4*16] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[4*24] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *noncep = vdata + 64+3; // 4*16 + 3
|
||||
int thr_id = mythr->id;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
0xFFF, 0xFFFF, 0x10000000 };
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
casti_m512i( endiandata, 0 ) = mm512_bswap_32( casti_m512i( pdata, 0 ) );
|
||||
casti_m512i( endiandata, 1 ) = mm512_bswap_32( casti_m512i( pdata, 1 ) );
|
||||
casti_m512i( endiandata, 4 ) = mm512_bswap_32( casti_m512i( pdata, 4 ) );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
intrlv_4x128( (uint64_t*)vdata, edata, edata, 640 );
|
||||
|
||||
luffa_4way_init( &qubit_4way_ctx.luffa, 512 );
|
||||
luffa_4way_update( &qubit_4way_ctx.luffa, vdata, 64 );
|
||||
|
||||
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+4, n+1 );
|
||||
be32enc( noncep+8, n+2 );
|
||||
be32enc( noncep+12, n+3 );
|
||||
qubit_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( !( hash[7] & mask ) )
|
||||
if ( fulltest( hash, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_lane_solution( work, hash, mythr, 0 );
|
||||
}
|
||||
if ( !( (hash+8)[7] & mask ) )
|
||||
if ( fulltest( hash+8, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_lane_solution( work, hash+8, mythr, 1 );
|
||||
}
|
||||
if ( !( hash+16[7] & mask ) )
|
||||
if ( fulltest( hash, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+2;
|
||||
submit_lane_solution( work, hash, mythr, 2 );
|
||||
}
|
||||
if ( !( (hash+24)[7] & mask ) )
|
||||
if ( fulltest( hash+8, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+3;
|
||||
submit_lane_solution( work, hash+8, mythr, 3 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(QUBIT_2WAY)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
luffa_2way_context luffa;
|
||||
|
@@ -2,6 +2,13 @@
|
||||
|
||||
bool register_qubit_algo( algo_gate_t* gate )
|
||||
{
|
||||
/*
|
||||
#if defined (QUBIT_4WAY)
|
||||
init_qubit_2way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_qubit_4way;
|
||||
gate->hash = (void*)&qubit_4way_hash;
|
||||
#elif defined (QUBIT_4WAY)
|
||||
*/
|
||||
#if defined (QUBIT_2WAY)
|
||||
init_qubit_2way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_qubit_2way;
|
||||
|
@@ -4,12 +4,26 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
/*
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define QUBIT_2WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
*/
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
#define QUBIT_2WAY
|
||||
#define QUBIT_2WAY 1
|
||||
#endif
|
||||
|
||||
bool register_qubit_algo( algo_gate_t* gate );
|
||||
/*
|
||||
#if defined(QUBIT_4WAY)
|
||||
|
||||
void qubit_4way_hash( void *state, const void *input );
|
||||
int scanhash_qubit_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_qubit_4way_ctx();
|
||||
|
||||
#elif defined(QUBIT_2WAY)
|
||||
*/
|
||||
#if defined(QUBIT_2WAY)
|
||||
|
||||
void qubit_2way_hash( void *state, const void *input );
|
||||
|
@@ -94,8 +94,6 @@ void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
g_work->data[28] = 0x80000000;
|
||||
}
|
||||
|
||||
int64_t lbry_get_max64() { return 0x1ffffLL; }
|
||||
|
||||
int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
|
||||
|
||||
bool register_lbry_algo( algo_gate_t* gate )
|
||||
@@ -112,7 +110,6 @@ bool register_lbry_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&lbry_hash;
|
||||
#endif
|
||||
gate->calc_network_diff = (void*)&lbry_calc_network_diff;
|
||||
gate->get_max64 = (void*)&lbry_get_max64;
|
||||
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
|
||||
// gate->build_block_header = (void*)&build_block_header;
|
||||
gate->build_extraheader = (void*)&lbry_build_extraheader;
|
||||
|
@@ -1070,17 +1070,6 @@ int scanhash_neoscrypt( struct work *work,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int64_t get_neoscrypt_max64() { return 0x3ffff; }
|
||||
|
||||
void neoscrypt_wait_for_diff( struct stratum_ctx *stratum )
|
||||
{
|
||||
while ( !stratum->job.diff )
|
||||
{
|
||||
// applog(LOG_DEBUG, "Waiting for Stratum to set the job difficulty");
|
||||
sleep(1);
|
||||
}
|
||||
}
|
||||
|
||||
int neoscrypt_get_work_data_size () { return 80; }
|
||||
|
||||
bool register_neoscrypt_algo( algo_gate_t* gate )
|
||||
@@ -1088,8 +1077,6 @@ bool register_neoscrypt_algo( algo_gate_t* gate )
|
||||
gate->optimizations = SSE2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_neoscrypt;
|
||||
gate->hash = (void*)&neoscrypt;
|
||||
gate->get_max64 = (void*)&get_neoscrypt_max64;
|
||||
gate->wait_for_diff = (void*)&neoscrypt_wait_for_diff;
|
||||
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
|
||||
gate->work_decode = (void*)&std_be_work_decode;
|
||||
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
|
||||
|
@@ -483,11 +483,6 @@ int scanhash_pluck( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int64_t pluck_get_max64 ()
|
||||
{
|
||||
return 0x1ffLL;
|
||||
}
|
||||
|
||||
bool pluck_miner_thread_init( int thr_id )
|
||||
{
|
||||
scratchbuf = malloc( 128 * 1024 );
|
||||
@@ -503,7 +498,6 @@ bool register_pluck_algo( algo_gate_t* gate )
|
||||
gate->miner_thread_init = (void*)&pluck_miner_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_pluck;
|
||||
gate->hash = (void*)&pluck_hash;
|
||||
gate->get_max64 = (void*)&pluck_get_max64;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
};
|
||||
|
@@ -766,8 +766,6 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int64_t scrypt_get_max64() { return 0xfff; }
|
||||
|
||||
bool scrypt_miner_thread_init( int thr_id )
|
||||
{
|
||||
scratchbuf = scrypt_buffer_alloc( scratchbuf_size );
|
||||
@@ -783,10 +781,8 @@ bool register_scrypt_algo( algo_gate_t* gate )
|
||||
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_scrypt;
|
||||
// gate->hash = (void*)&scrypt_1024_1_1_256_24way;
|
||||
gate->get_max64 = (void*)&scrypt_get_max64;
|
||||
opt_target_factor = 65536.0;
|
||||
|
||||
|
||||
if ( !opt_param_n )
|
||||
{
|
||||
opt_param_n = 1024;
|
||||
|
@@ -240,7 +240,6 @@ bool register_scryptjane_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_scryptjane;
|
||||
gate->hash = (void*)&scryptjanehash;
|
||||
gate->get_max64 = (void*)&get_max64_0x40LL;
|
||||
opt_target_factor = 65536.0;
|
||||
|
||||
// figure out if arg in N or Nfactor
|
||||
|
@@ -305,9 +305,11 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] =
|
||||
mm128_bswap_32( _mm_set1_epi32( high ) );
|
||||
mm128_bswap_32( m128_const1_32( high ) );
|
||||
// mm128_bswap_32( _mm_set1_epi32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] =
|
||||
mm128_bswap_32( _mm_set1_epi32( low ) );
|
||||
mm128_bswap_32( m128_const1_32( low ) );
|
||||
// mm128_bswap_32( _mm_set1_epi32( low ) );
|
||||
sha256_4way_round( sc, sc->buf, sc->val );
|
||||
|
||||
mm128_block_bswap_32( dst, sc->val );
|
||||
@@ -538,9 +540,9 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] =
|
||||
mm256_bswap_32( _mm256_set1_epi32( high ) );
|
||||
mm256_bswap_32( m256_const1_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] =
|
||||
mm256_bswap_32( _mm256_set1_epi32( low ) );
|
||||
mm256_bswap_32( m256_const1_32( low ) );
|
||||
|
||||
sha256_8way_round( sc, sc->buf, sc->val );
|
||||
|
||||
|
@@ -1,538 +0,0 @@
|
||||
#if 0
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "sha2-hash-4way.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// naming convention for variables and macros
|
||||
// VARx: AVX2 8 way 32 bit
|
||||
// VARy: MMX 2 way 32 bit
|
||||
// VARz: scalar integer 32 bit
|
||||
|
||||
|
||||
static const uint32_t H256[8] =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
|
||||
static const uint32_t K256[64] =
|
||||
{
|
||||
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
|
||||
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
|
||||
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
|
||||
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
|
||||
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
|
||||
0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
|
||||
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
|
||||
0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
|
||||
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
|
||||
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
|
||||
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
|
||||
0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
|
||||
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
|
||||
0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
|
||||
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
|
||||
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
|
||||
};
|
||||
|
||||
#define CHx(X, Y, Z) \
|
||||
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
|
||||
|
||||
#define CHy(X, Y, Z) \
|
||||
_mm_xor_si64( _mm_and_si64( _mm_xor_si64( Y, Z ), X ), Z )
|
||||
|
||||
#define CHz(X, Y, Z) ((( (Y) ^ (Z) ) & (X) ) ^ (Z) )
|
||||
|
||||
|
||||
#define MAJx(X, Y, Z) \
|
||||
_mm256_or_si256( _mm256_and_si256( X, Y ), \
|
||||
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
|
||||
|
||||
#define MAJy(X, Y, Z) \
|
||||
_mm_or_si64( _mm_and_si64( X, Y ), \
|
||||
_mm_and_si64( _mm_or_si64( X, Y ), Z ) )
|
||||
|
||||
#define MAJz(X, Y, Z) ( ( (X) & (Y) ) | ( ( (X) | (Y) ) & (Z) ) )
|
||||
|
||||
#define BSG2_0x(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_32(x,2), mm256_ror_32(x,13) ), _mm256_srli_epi32(x,22) )
|
||||
|
||||
#define BSG2_0y(x) \
|
||||
_mm_xor_si64( _mm_xor_si64( \
|
||||
mm64_ror_32(x,2), mm64_ror_32(x,13) ), _mm_srli_pi32(x,22) )
|
||||
|
||||
#define BSG2_0z(x) ( u32_ror_32(x,2) ^ u32_ror_32(x,13) ^ ((x)>>22) )
|
||||
|
||||
#define BSG2_1x(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_32(x,6), mm256_ror_32(x,11) ), _mm256_srli_epi32(x,25) )
|
||||
|
||||
#define BSG2_1y(x) \
|
||||
_mm_xor_si64( _mm_xor_si64( \
|
||||
mm64_ror_32(x,6), mm64_ror_32(x,11) ), _mm_srli_pi32(x,25) )
|
||||
|
||||
#define BSG2_1z(x) ( u32_ror_32(x,6) ^ u32_ror_32(x,11) ^ ((x)>>25) )
|
||||
|
||||
#define SSG2_0x(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_32(x,7), mm256_ror_32(x,18) ), _mm256_srli_epi32(x,3) )
|
||||
|
||||
#define SSG2_0y(x) \
|
||||
_mm_xor_si64( _mm_xor_si64( \
|
||||
mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm_srli_pi32(x,3) )
|
||||
|
||||
#define SSG2_0z(x) (( u32_ror_32(x,7) ^ u32_ror_32(x,18) ) ^ ((x)>>3) )
|
||||
|
||||
#define SSG2_1x(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_32(x,17), mm256_ror_32(x,19) ), _mm256_srli_epi32(x,10) )
|
||||
|
||||
#define SSG2_1y(x) \
|
||||
_mm_xor_si64( _mm_xor_si64( \
|
||||
mm64_ror_32(x,17), mm64_ror_32(x,19) ), _mm_srli_pi32(x,10) )
|
||||
|
||||
#define SSG2_1z(x) ( u32_ror_32(x,17) ^ u32_ror_32(x,19) ^ ((x)>>10) )
|
||||
|
||||
#define SHA2x_MEXP( a, b, c, d ) \
|
||||
_mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
|
||||
SSG2_1x( Wx[a] ), Wx[b] ), SSG2_0x( Wx[c] ) ), Wx[d] )
|
||||
|
||||
#define SHA2y_MEXP( a, b, c, d ) \
|
||||
_mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
|
||||
SSG2_1y( Wy[a] ), Wy[b] ), SSG2_0y( Wy[c] ) ), Wy[d] )
|
||||
|
||||
#define SHA2z_MEXP( a, b, c, d ) \
|
||||
( SSG2_1z( Wz[a] ) + Wz[b] + SSG2_0z( Wz[c] ) + Wz[d] )
|
||||
|
||||
|
||||
#define SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
|
||||
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
|
||||
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, i, j) \
|
||||
do { \
|
||||
__m256i T1x, T2x; \
|
||||
__m64 T1y, T2y; \
|
||||
uint32_t T1z, T2z; \
|
||||
T1x = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
|
||||
_mm256_add_epi32( Hx, BSG2_1x(Ex) ), CHx(Ex, Fx, Gx) ), \
|
||||
_mm256_set1_epi32( K256[( (j)+(i) )] ) ), Wx[i] ); \
|
||||
T1y = _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
|
||||
_mm_add_pi32( Hy, BSG2_1y(Ey) ), CHy(Ey, Fy, Gy) ), \
|
||||
_mm_set1_pi32( K256[( (j)+(i) )] ) ), Wy[i] ); \
|
||||
T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + K256[ ((j)+(i)) ] + Wz[i]; \
|
||||
T2x = _mm256_add_epi32( BSG2_0x(Ax), MAJx(Ax, Bx, Cx) ); \
|
||||
T2y = _mm_add_pi32( BSG2_0y(Ay), MAJy(Ay, By, Cy) ); \
|
||||
T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz ); \
|
||||
Dx = _mm256_add_epi32( Dx, T1x ); \
|
||||
Dy = _mm_add_pi32( Dy, T1y ); \
|
||||
Dz = Dz + T1z; \
|
||||
Hx = _mm256_add_epi32( T1x, T2x ); \
|
||||
Hy = _mm_add_pi32( T1y, T2y ); \
|
||||
Hz = T1z + T2z; \
|
||||
} while (0)
|
||||
|
||||
void sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8],
|
||||
uint32_t *inz, uint32_t rz[8] )
|
||||
{
|
||||
__m256i Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx;
|
||||
__m256i Wx[16];
|
||||
__m64 Ay, By, Cy, Dy, Ey, Fy, Gy, Hy;
|
||||
__m64 Wy[16];
|
||||
uint32_t Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz;
|
||||
uint32_t Wz[16];
|
||||
|
||||
Wx[ 0] = mm256_bswap_32( inx[ 0] );
|
||||
Wy[ 0] = mm64_bswap_32( iny[ 0] );
|
||||
Wz[ 0] = bswap_32( inz[ 0] );
|
||||
|
||||
Wx[ 1] = mm256_bswap_32( inx[ 1] );
|
||||
Wy[ 1] = mm64_bswap_32( iny[ 1] );
|
||||
Wz[ 1] = bswap_32( inz[ 1] );
|
||||
|
||||
Wx[ 2] = mm256_bswap_32( inx[ 2] );
|
||||
Wy[ 2] = mm64_bswap_32( iny[ 2] );
|
||||
Wz[ 2] = bswap_32( inz[ 2] );
|
||||
|
||||
Wx[ 3] = mm256_bswap_32( inx[ 3] );
|
||||
Wy[ 3] = mm64_bswap_32( iny[ 3] );
|
||||
Wz[ 3] = bswap_32( inz[ 3] );
|
||||
|
||||
Wx[ 4] = mm256_bswap_32( inx[ 4] );
|
||||
Wy[ 4] = mm64_bswap_32( iny[ 4] );
|
||||
Wz[ 4] = bswap_32( inz[ 4] );
|
||||
|
||||
Wx[ 5] = mm256_bswap_32( inx[ 5] );
|
||||
Wy[ 5] = mm64_bswap_32( iny[ 5] );
|
||||
Wz[ 5] = bswap_32( inz[ 5] );
|
||||
|
||||
Wx[ 6] = mm256_bswap_32( inx[ 6] );
|
||||
Wy[ 6] = mm64_bswap_32( iny[ 6] );
|
||||
Wz[ 6] = bswap_32( inz[ 6] );
|
||||
|
||||
Wx[ 7] = mm256_bswap_32( inx[ 7] );
|
||||
Wy[ 7] = mm64_bswap_32( iny[ 7] );
|
||||
Wz[ 7] = bswap_32( inz[ 7] );
|
||||
|
||||
Wx[ 8] = mm256_bswap_32( inx[ 8] );
|
||||
Wy[ 8] = mm64_bswap_32( iny[ 8] );
|
||||
Wz[ 8] = bswap_32( inz[ 8] );
|
||||
|
||||
Wx[ 9] = mm256_bswap_32( inx[ 9] );
|
||||
Wy[ 9] = mm64_bswap_32( iny[ 9] );
|
||||
Wz[ 9] = bswap_32( inz[ 9] );
|
||||
|
||||
Wx[10] = mm256_bswap_32( inx[10] );
|
||||
Wy[10] = mm64_bswap_32( iny[10] );
|
||||
Wz[10] = bswap_32( inz[10] );
|
||||
|
||||
Wx[11] = mm256_bswap_32( inx[11] );
|
||||
Wy[11] = mm64_bswap_32( iny[11] );
|
||||
Wz[11] = bswap_32( inz[11] );
|
||||
|
||||
Wx[12] = mm256_bswap_32( inx[12] );
|
||||
Wy[12] = mm64_bswap_32( iny[12] );
|
||||
Wz[12] = bswap_32( inz[12] );
|
||||
|
||||
Wx[13] = mm256_bswap_32( inx[13] );
|
||||
Wy[13] = mm64_bswap_32( iny[13] );
|
||||
Wz[13] = bswap_32( inz[13] );
|
||||
|
||||
Wx[14] = mm256_bswap_32( inx[14] );
|
||||
Wy[14] = mm64_bswap_32( iny[14] );
|
||||
Wz[14] = bswap_32( inz[14] );
|
||||
|
||||
Wx[15] = mm256_bswap_32( inx[15] );
|
||||
Wy[15] = mm64_bswap_32( iny[15] );
|
||||
Wz[15] = bswap_32( inz[15] );
|
||||
|
||||
Ax = rx[0]; Ay = ry[0]; Az = rz[0];
|
||||
Bx = rx[1]; By = ry[1]; Bz = rz[1];
|
||||
Cx = rx[2]; Cy = ry[2]; Cz = rz[2];
|
||||
Dx = rx[3]; Dy = ry[3]; Dz = rz[3];
|
||||
Ex = rx[4]; Ey = ry[4]; Ez = rz[4];
|
||||
Fx = rx[5]; Fy = ry[5]; Fz = rz[5];
|
||||
Gx = rx[6]; Gy = ry[6]; Gz = rz[6];
|
||||
Hx = rx[7]; Hy = ry[7]; Hz = rz[7];
|
||||
|
||||
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
|
||||
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
|
||||
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 0, 0 );
|
||||
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
|
||||
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
|
||||
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 1, 0 );
|
||||
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
|
||||
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
|
||||
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 2, 0 );
|
||||
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
|
||||
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
|
||||
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 3, 0 );
|
||||
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
|
||||
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
|
||||
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 4, 0 );
|
||||
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
|
||||
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
|
||||
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 5, 0 );
|
||||
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
|
||||
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
|
||||
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 6, 0 );
|
||||
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
|
||||
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
|
||||
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 7, 0 );
|
||||
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
|
||||
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
|
||||
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 8, 0 );
|
||||
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
|
||||
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
|
||||
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 9, 0 );
|
||||
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
|
||||
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
|
||||
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, 0 );
|
||||
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
|
||||
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
|
||||
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, 0 );
|
||||
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
|
||||
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
|
||||
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, 0 );
|
||||
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
|
||||
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
|
||||
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, 0 );
|
||||
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
|
||||
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
|
||||
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, 0 );
|
||||
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
|
||||
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
|
||||
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, 0 );
|
||||
|
||||
for ( int j = 16; j < 64; j += 16 )
|
||||
{
|
||||
Wx[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
|
||||
Wy[ 0] = SHA2y_MEXP( 14, 9, 1, 0 );
|
||||
Wz[ 0] = SHA2z_MEXP( 14, 9, 1, 0 );
|
||||
|
||||
Wx[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
|
||||
Wy[ 1] = SHA2y_MEXP( 15, 10, 2, 1 );
|
||||
Wz[ 1] = SHA2z_MEXP( 15, 10, 2, 1 );
|
||||
|
||||
Wx[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
|
||||
Wy[ 2] = SHA2y_MEXP( 0, 11, 3, 2 );
|
||||
Wz[ 2] = SHA2z_MEXP( 0, 11, 3, 2 );
|
||||
|
||||
Wx[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
|
||||
Wy[ 3] = SHA2y_MEXP( 1, 12, 4, 3 );
|
||||
Wz[ 3] = SHA2z_MEXP( 1, 12, 4, 3 );
|
||||
|
||||
Wx[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
|
||||
Wy[ 4] = SHA2y_MEXP( 2, 13, 5, 4 );
|
||||
Wz[ 4] = SHA2z_MEXP( 2, 13, 5, 4 );
|
||||
|
||||
Wx[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
|
||||
Wy[ 5] = SHA2y_MEXP( 3, 14, 6, 5 );
|
||||
Wz[ 5] = SHA2z_MEXP( 3, 14, 6, 5 );
|
||||
|
||||
Wx[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
|
||||
Wy[ 6] = SHA2y_MEXP( 4, 15, 7, 6 );
|
||||
Wz[ 6] = SHA2z_MEXP( 4, 15, 7, 6 );
|
||||
|
||||
Wx[ 7] = SHA2x_MEXP( 5, 0, 8, 7);
|
||||
Wy[ 7] = SHA2y_MEXP( 5, 0, 8, 7);
|
||||
Wz[ 7] = SHA2z_MEXP( 5, 0, 8, 7);
|
||||
|
||||
Wx[ 8] = SHA2x_MEXP( 6, 1, 9, 8);
|
||||
Wy[ 8] = SHA2y_MEXP( 6, 1, 9, 8);
|
||||
Wz[ 8] = SHA2z_MEXP( 6, 1, 9, 8);
|
||||
|
||||
Wx[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
|
||||
Wy[ 9] = SHA2y_MEXP( 7, 2, 10, 9);
|
||||
Wz[ 9] = SHA2z_MEXP( 7, 2, 10, 9);
|
||||
|
||||
Wx[10] = SHA2x_MEXP( 8, 3, 11, 10 );
|
||||
Wy[10] = SHA2y_MEXP( 8, 3, 11, 10);
|
||||
Wz[10] = SHA2z_MEXP( 8, 3, 11, 10);
|
||||
|
||||
Wx[11] = SHA2x_MEXP( 9, 4, 12, 11);
|
||||
Wy[11] = SHA2y_MEXP( 9, 4, 12, 11);
|
||||
Wz[11] = SHA2z_MEXP( 9, 4, 12, 11 );
|
||||
|
||||
Wx[12] = SHA2x_MEXP( 10, 5, 13, 12 );
|
||||
Wy[12] = SHA2y_MEXP( 10, 5, 13, 12 );
|
||||
Wz[12] = SHA2z_MEXP( 10, 5, 13, 12 );
|
||||
|
||||
Wx[13] = SHA2x_MEXP( 11, 6, 14, 13 );
|
||||
Wy[13] = SHA2y_MEXP( 11, 6, 14, 13 );
|
||||
Wz[13] = SHA2z_MEXP( 11, 6, 14, 13 );
|
||||
|
||||
Wx[14] = SHA2x_MEXP( 12, 7, 15, 14 );
|
||||
Wy[14] = SHA2y_MEXP( 12, 7, 15, 14 );
|
||||
Wz[14] = SHA2z_MEXP( 12, 7, 15, 14 );
|
||||
|
||||
Wx[15] = SHA2x_MEXP( 13, 8, 0, 15 );
|
||||
Wy[15] = SHA2y_MEXP( 13, 8, 0, 15 );
|
||||
Wz[15] = SHA2z_MEXP( 13, 8, 0, 15 );
|
||||
|
||||
|
||||
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
|
||||
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
|
||||
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 0, j );
|
||||
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
|
||||
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
|
||||
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 1, j );
|
||||
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
|
||||
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
|
||||
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 2, j );
|
||||
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
|
||||
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
|
||||
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 3, j );
|
||||
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
|
||||
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
|
||||
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 4, j );
|
||||
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
|
||||
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
|
||||
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 5, j );
|
||||
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
|
||||
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
|
||||
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 6, j );
|
||||
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
|
||||
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
|
||||
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 7, j );
|
||||
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
|
||||
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
|
||||
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 8, j );
|
||||
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
|
||||
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
|
||||
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 9, j );
|
||||
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
|
||||
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
|
||||
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, j );
|
||||
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
|
||||
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
|
||||
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, j );
|
||||
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
|
||||
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
|
||||
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, j );
|
||||
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
|
||||
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
|
||||
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, j );
|
||||
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
|
||||
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
|
||||
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, j );
|
||||
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
|
||||
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
|
||||
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, j );
|
||||
}
|
||||
|
||||
rx[0] = _mm256_add_epi32( rx[0], Ax );
|
||||
ry[0] = _mm_add_pi32( ry[0], Ay );
|
||||
rz[0] = rz[0]+ Az;
|
||||
rx[1] = _mm256_add_epi32( rx[1], Bx );
|
||||
ry[1] = _mm_add_pi32( ry[1], By );
|
||||
rz[1] = rz[1]+ Bz;
|
||||
rx[2] = _mm256_add_epi32( rx[2], Cx );
|
||||
ry[2] = _mm_add_pi32( ry[2], Cy );
|
||||
rz[3] = rz[3]+ Dz;
|
||||
rx[4] = _mm256_add_epi32( rx[4], Ex );
|
||||
ry[4] = _mm_add_pi32( ry[4], Ey );
|
||||
rz[4] = rz[4]+ Ez;
|
||||
rx[5] = _mm256_add_epi32( rx[5], Fx );
|
||||
ry[5] = _mm_add_pi32( ry[5], Fy );
|
||||
rz[5] = rz[5]+ Fz;
|
||||
rx[6] = _mm256_add_epi32( rx[6], Gx );
|
||||
ry[6] = _mm_add_pi32( ry[6], Gy );
|
||||
rz[6] = rz[6]+ Gz;
|
||||
rx[7] = _mm256_add_epi32( rx[7], Hx );
|
||||
ry[7] = _mm_add_pi32( ry[7], Hy );
|
||||
rz[7] = rz[7]+ Hz;
|
||||
|
||||
}
|
||||
|
||||
void sha256_11way_init( sha256_11way_context *ctx )
|
||||
{
|
||||
ctx->count_high = ctx->count_low = 0;
|
||||
ctx->valx[0] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[0] = _mm_set1_pi32( H256[0] );
|
||||
ctx->valx[1] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[1] = _mm_set1_pi32( H256[0] );
|
||||
ctx->valx[2] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[2] = _mm_set1_pi32( H256[0] );
|
||||
ctx->valx[3] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[3] = _mm_set1_pi32( H256[0] );
|
||||
ctx->valx[4] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[4] = _mm_set1_pi32( H256[0] );
|
||||
ctx->valx[5] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[5] = _mm_set1_pi32( H256[0] );
|
||||
ctx->valx[6] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[6] = _mm_set1_pi32( H256[0] );
|
||||
ctx->valx[7] = _mm256_set1_epi32( H256[0] );
|
||||
ctx->valy[7] = _mm_set1_pi32( H256[0] );
|
||||
memcpy( ctx->valz, H256, 32 );
|
||||
}
|
||||
|
||||
|
||||
void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
|
||||
const void *datay, const void *dataz, size_t len )
|
||||
{
|
||||
__m256i *vdatax = (__m256i*) datax;
|
||||
__m64 *vdatay = (__m64*) datay;
|
||||
uint32_t *idataz = (uint32_t*)dataz;
|
||||
size_t ptr;
|
||||
const int buf_size = 64;
|
||||
|
||||
ptr = (unsigned)ctx->count_low & (buf_size - 1U);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
uint32_t clow, clow2;
|
||||
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_256( ctx->bufx + (ptr>>2), vdatax + (ptr>>2), clen>>2 );
|
||||
memcpy_m64( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 );
|
||||
memcpy ( ctx->bufz + ptr, idataz + ptr, clen );
|
||||
ptr += clen;
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
sha256_11way_round( ctx->bufx, ctx->valx,
|
||||
ctx->bufy, ctx->valy,
|
||||
ctx->bufz, ctx->valz );
|
||||
ptr = 0;
|
||||
}
|
||||
clow = ctx->count_low;
|
||||
clow2 = clow + clen;
|
||||
ctx->count_low = clow2;
|
||||
if ( clow2 < clow )
|
||||
ctx->count_high++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
|
||||
void *dstz)
|
||||
{
|
||||
unsigned ptr, u;
|
||||
uint32_t low, high;
|
||||
const int buf_size = 64;
|
||||
const int pad = buf_size - 8;
|
||||
|
||||
ptr = (unsigned)ctx->count_low & (buf_size - 1U);
|
||||
ctx->bufx[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
|
||||
ctx->bufy[ ptr>>2 ] = _mm_set1_pi32( 0x80 );
|
||||
ctx->bufz[ ptr>>2 ] = 0x80;
|
||||
ptr += 4;
|
||||
|
||||
if ( ptr > pad )
|
||||
{
|
||||
memset_zero_256( ctx->bufx + (ptr>>2), (buf_size - ptr) >> 2 );
|
||||
memset_zero_m64( ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 );
|
||||
memset( ctx->bufz + (ptr>>2), 0, (buf_size - ptr) >> 2 );
|
||||
sha256_11way_round( ctx->bufx, ctx->valx,
|
||||
ctx->bufy, ctx->valy,
|
||||
ctx->bufz, ctx->valz );
|
||||
memset_zero_256( ctx->bufx, pad >> 2 );
|
||||
memset_zero_m64( ctx->bufy, pad >> 2 );
|
||||
memset( ctx->bufz, 0, pad >> 2 );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_256( ctx->bufx + (ptr>>2), (pad - ptr) >> 2 );
|
||||
memset_zero_m64( ctx->bufy + (ptr>>2), (pad - ptr) >> 2 );
|
||||
memset( ctx->bufz + (ptr>>2), 0, (pad - ptr) >> 2 );
|
||||
}
|
||||
|
||||
low = ctx->count_low;
|
||||
high = (ctx->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
ctx->bufx[ pad >> 2 ] =
|
||||
mm256_bswap_32( _mm256_set1_epi32( high ) );
|
||||
ctx->bufy[ pad >> 2 ] =
|
||||
mm64_bswap_32( _mm_set1_pi32( high ) );
|
||||
ctx->bufz[ pad >> 2 ] =
|
||||
bswap_32( high );
|
||||
|
||||
|
||||
ctx->bufx[ ( pad+4 ) >> 2 ] =
|
||||
mm256_bswap_32( _mm256_set1_epi32( low ) );
|
||||
ctx->bufy[ ( pad+4 ) >> 2 ] =
|
||||
mm64_bswap_32( _mm_set1_pi32( low ) );
|
||||
ctx->bufz[ ( pad+4 ) >> 2 ] =
|
||||
bswap_32( low );
|
||||
|
||||
sha256_11way_round( ctx->bufx, ctx->valx,
|
||||
ctx->bufy, ctx->valy,
|
||||
ctx->bufz, ctx->valz );
|
||||
|
||||
for ( u = 0; u < 8; u ++ )
|
||||
{
|
||||
casti_m256i( dstx, u ) = mm256_bswap_32( ctx->valx[u] );
|
||||
casti_m64 ( dsty, u ) = mm64_bswap_32( ctx->valy[u] );
|
||||
((uint32_t*)dstz)[u] = bswap_32( ctx->valz[u] );
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // 0
|
@@ -5,137 +5,6 @@
|
||||
#include <stdio.h>
|
||||
#include "sha-hash-4way.h"
|
||||
|
||||
#if defined(SHA256T_11WAY)
|
||||
|
||||
static __thread sha256_11way_context sha256_ctx11 __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
|
||||
const void *inpy, const void*inpz )
|
||||
{
|
||||
uint32_t hashx[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hashy[8*2] __attribute__ ((aligned (64)));
|
||||
uint32_t hashz[8] __attribute__ ((aligned (64)));
|
||||
sha256_11way_context ctx;
|
||||
const void *inpx64 = inpx+(64<<3);
|
||||
const void *inpy64 = inpy+(64<<1);
|
||||
const void *inpz64 = inpz+ 64;
|
||||
|
||||
memcpy( &ctx, &sha256_ctx11, sizeof ctx );
|
||||
sha256_11way_update( &ctx, inpx64, inpy64, inpz64, 16 );
|
||||
sha256_11way_close( &ctx, hashx, hashy, hashz );
|
||||
|
||||
sha256_11way_init( &ctx );
|
||||
sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
|
||||
sha256_11way_close( &ctx, hashx, hashy, hashz );
|
||||
|
||||
sha256_11way_init( &ctx );
|
||||
sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
|
||||
sha256_11way_close( &ctx, outx, outy, outz );
|
||||
}
|
||||
|
||||
int scanhash_sha256t_11way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t datax[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t datay[20*2] __attribute__ ((aligned (32)));
|
||||
uint32_t dataz[20] __attribute__ ((aligned (32)));
|
||||
uint32_t hashx[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t hashy[8*2] __attribute__ ((aligned (32)));
|
||||
uint32_t hashz[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7;
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncex = (__m256i*) datax + 19;
|
||||
__m64 *noncey = (__m64*) datay + 19;
|
||||
uint32_t *noncez = (uint32_t*)dataz + 19;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
int i;
|
||||
const uint64_t htmax[] = { 0,
|
||||
0xF,
|
||||
0xFF,
|
||||
0xFFF,
|
||||
0xFFFF,
|
||||
0x10000000 };
|
||||
const uint32_t masks[] = { 0xFFFFFFFF,
|
||||
0xFFFFFFF0,
|
||||
0xFFFFFF00,
|
||||
0xFFFFF000,
|
||||
0xFFFF0000,
|
||||
0 };
|
||||
|
||||
// Use dataz (scalar) to stage bswapped data for the vectors.
|
||||
casti_m256i( dataz, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
|
||||
casti_m256i( dataz, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
|
||||
casti_m128i( dataz, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
intrlv_8x32( datax, dataz, dataz, dataz, dataz,
|
||||
dataz, dataz, dataz, dataz, 640 );
|
||||
mm64_interleave_2x32( datay, dataz, dataz, 640 );
|
||||
|
||||
sha256_11way_init( &sha256_ctx11 );
|
||||
sha256_11way_update( &sha256_ctx11, datax, datay, dataz, 64 );
|
||||
|
||||
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
*noncex = mm256_bswap_32(
|
||||
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
|
||||
*noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) );
|
||||
*noncez = bswap_32( n+10 );
|
||||
|
||||
pdata[19] = n;
|
||||
|
||||
sha256t_11way_hash( hashx, hashy, hashz, datax, datay, dataz );
|
||||
|
||||
if ( opt_benchmark ) { n += 11; continue; }
|
||||
|
||||
hash7 = &(hashx[7<<3]);
|
||||
for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
|
||||
{
|
||||
// deinterleave hash for lane
|
||||
extr_lane_8x32( lane_hash, hashx, i, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = n + i;
|
||||
submit_lane_solution( work, lane_hash, mythr, i );
|
||||
}
|
||||
}
|
||||
|
||||
hash7 = &(hashy[7<<1]);
|
||||
for( i = 0; i < 2; i++ ) if ( !(hash7[ 0] & mask ) )
|
||||
|
||||
{
|
||||
mm64_extr_lane_2x32( lane_hash, hashy, i, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = n + 8 + i;
|
||||
submit_lane_solution( work, lane_hash, mythr, i+8 );
|
||||
}
|
||||
}
|
||||
|
||||
if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) )
|
||||
{
|
||||
pdata[19] = n+10;
|
||||
submit_lane_solution( work, hashz, mythr, 10 );
|
||||
}
|
||||
n += 11;
|
||||
|
||||
} while ( (n < max_nonce-12) && !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256T_8WAY)
|
||||
|
||||
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
|
||||
|
@@ -15,7 +15,6 @@ bool register_sha256t_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_sha256t;
|
||||
gate->hash = (void*)&sha256t_hash;
|
||||
#endif
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -34,7 +33,6 @@ bool register_sha256q_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_sha256q;
|
||||
gate->hash = (void*)&sha256q_hash;
|
||||
#endif
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
|
||||
}
|
||||
|
@@ -252,16 +252,6 @@ void sha512_4way_init( sha512_4way_context *sc )
|
||||
{
|
||||
sc->initialized = false;
|
||||
sc->count = 0;
|
||||
/*
|
||||
sc->val[0] = _mm256_set1_epi64x( H512[0] );
|
||||
sc->val[1] = _mm256_set1_epi64x( H512[1] );
|
||||
sc->val[2] = _mm256_set1_epi64x( H512[2] );
|
||||
sc->val[3] = _mm256_set1_epi64x( H512[3] );
|
||||
sc->val[4] = _mm256_set1_epi64x( H512[4] );
|
||||
sc->val[5] = _mm256_set1_epi64x( H512[5] );
|
||||
sc->val[6] = _mm256_set1_epi64x( H512[6] );
|
||||
sc->val[7] = _mm256_set1_epi64x( H512[7] );
|
||||
*/
|
||||
}
|
||||
|
||||
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
|
||||
@@ -295,6 +285,10 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
|
||||
unsigned ptr;
|
||||
const int buf_size = 128;
|
||||
const int pad = buf_size - 16;
|
||||
const __m256i shuff_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f,
|
||||
0x1011121314151617,
|
||||
0x08090a0b0c0d0e0f,
|
||||
0x0001020304050607 );
|
||||
|
||||
ptr = (unsigned)sc->count & (buf_size - 1U);
|
||||
sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
|
||||
@@ -308,10 +302,10 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
|
||||
else
|
||||
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
|
||||
|
||||
sc->buf[ pad >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
|
||||
sc->buf[ ( pad+8 ) >> 3 ] =
|
||||
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
|
||||
sc->buf[ pad >> 3 ] = _mm256_shuffle_epi8(
|
||||
_mm256_set1_epi64x( sc->count >> 61 ), shuff_bswap64 );
|
||||
sc->buf[ ( pad+8 ) >> 3 ] = _mm256_shuffle_epi8(
|
||||
_mm256_set1_epi64x( sc->count << 3 ), shuff_bswap64 );
|
||||
sha512_4way_round( sc, sc->buf, sc->val );
|
||||
|
||||
mm256_block_bswap_64( dst, sc->val );
|
||||
|
@@ -5,6 +5,7 @@
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
|
||||
static const uint32_t IV512[] =
|
||||
{
|
||||
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
|
||||
@@ -13,6 +14,7 @@ static const uint32_t IV512[] =
|
||||
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
|
||||
};
|
||||
|
||||
|
||||
#define mm256_ror2x256hi_1x32( a, b ) \
|
||||
_mm256_blend_epi32( mm256_ror1x32_128( a ), \
|
||||
mm256_ror1x32_128( b ), 0x88 )
|
||||
@@ -232,18 +234,14 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
|
||||
void shavite512_2way_init( shavite512_2way_context *ctx )
|
||||
{
|
||||
casti_m256i( ctx->h, 0 ) =
|
||||
_mm256_set_epi32( IV512[ 3], IV512[ 2], IV512[ 1], IV512[ 0],
|
||||
IV512[ 3], IV512[ 2], IV512[ 1], IV512[ 0] );
|
||||
casti_m256i( ctx->h, 1 ) =
|
||||
_mm256_set_epi32( IV512[ 7], IV512[ 6], IV512[ 5], IV512[ 4],
|
||||
IV512[ 7], IV512[ 6], IV512[ 5], IV512[ 4] );
|
||||
casti_m256i( ctx->h, 2 ) =
|
||||
_mm256_set_epi32( IV512[11], IV512[10], IV512[ 9], IV512[ 8],
|
||||
IV512[11], IV512[10], IV512[ 9], IV512[ 8] );
|
||||
casti_m256i( ctx->h, 3 ) =
|
||||
_mm256_set_epi32( IV512[15], IV512[14], IV512[13], IV512[12],
|
||||
IV512[15], IV512[14], IV512[13], IV512[12] );
|
||||
__m256i *h = (__m256i*)ctx->h;
|
||||
__m128i *iv = (__m128i*)IV512;
|
||||
|
||||
h[0] = m256_const1_128( iv[0] );
|
||||
h[1] = m256_const1_128( iv[1] );
|
||||
h[2] = m256_const1_128( iv[2] );
|
||||
h[3] = m256_const1_128( iv[3] );
|
||||
|
||||
ctx->ptr = 0;
|
||||
ctx->count0 = 0;
|
||||
ctx->count1 = 0;
|
||||
@@ -251,6 +249,7 @@ void shavite512_2way_init( shavite512_2way_context *ctx )
|
||||
ctx->count3 = 0;
|
||||
}
|
||||
|
||||
// not tested, use update_close
|
||||
void shavite512_2way_update( shavite512_2way_context *ctx, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
@@ -287,6 +286,7 @@ void shavite512_2way_update( shavite512_2way_context *ctx, const void *data,
|
||||
ctx->ptr = ptr;
|
||||
}
|
||||
|
||||
// not tested
|
||||
void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
|
||||
{
|
||||
unsigned char *buf;
|
||||
@@ -300,7 +300,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
|
||||
uint32_t vp = ctx->ptr>>5;
|
||||
|
||||
// Terminating byte then zero pad
|
||||
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
|
||||
casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 );
|
||||
|
||||
// Zero pad full vectors up to count
|
||||
for ( ; vp < 6; vp++ )
|
||||
@@ -314,14 +314,12 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
|
||||
count.u32[2] = ctx->count2;
|
||||
count.u32[3] = ctx->count3;
|
||||
|
||||
casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0,
|
||||
count.u16[0], 0,0,0,0,0,0,0 );
|
||||
casti_m256i( buf, 7 ) = _mm256_set_epi16(
|
||||
0x0200 , count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1],
|
||||
0x0200 , count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] );
|
||||
|
||||
casti_m256i( buf, 6 ) = m256_const1_128(
|
||||
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
|
||||
casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
|
||||
0x0200, count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
|
||||
|
||||
c512_2way( ctx, buf);
|
||||
|
||||
casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 );
|
||||
@@ -382,23 +380,21 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
|
||||
|
||||
if ( vp == 0 ) // empty buf, xevan.
|
||||
{
|
||||
casti_m256i( buf, 0 ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
|
||||
casti_m256i( buf, 0 ) = m256_const2_64( 0, 0x0000000000000080 );
|
||||
memset_zero_256( (__m256i*)buf + 1, 5 );
|
||||
ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
|
||||
}
|
||||
else // half full buf, everyone else.
|
||||
{
|
||||
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
|
||||
casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 );
|
||||
memset_zero_256( (__m256i*)buf + vp, 6 - vp );
|
||||
}
|
||||
|
||||
casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0,
|
||||
count.u16[0], 0,0,0,0,0,0,0 );
|
||||
casti_m256i( buf, 7 ) = _mm256_set_epi16(
|
||||
0x0200 , count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1],
|
||||
0x0200 , count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] );
|
||||
casti_m256i( buf, 6 ) = m256_const1_128(
|
||||
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
|
||||
casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
|
||||
0x0200, count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
|
||||
|
||||
c512_2way( ctx, buf);
|
||||
|
||||
|
@@ -83,13 +83,14 @@ HashReturn init_sd(hashState_sd *state, int hashbitlen) {
|
||||
char *init;
|
||||
|
||||
#ifndef NO_PRECOMPUTED_IV
|
||||
if (hashbitlen == 224)
|
||||
r=InitIV(state, hashbitlen, IV_224);
|
||||
else if (hashbitlen == 256)
|
||||
r=InitIV(state, hashbitlen, IV_256);
|
||||
else if (hashbitlen == 384)
|
||||
r=InitIV(state, hashbitlen, IV_384);
|
||||
else if (hashbitlen == 512)
|
||||
// if (hashbitlen == 224)
|
||||
// r=InitIV(state, hashbitlen, IV_224);
|
||||
// else if (hashbitlen == 256)
|
||||
// r=InitIV(state, hashbitlen, IV_256);
|
||||
// else if (hashbitlen == 384)
|
||||
// r=InitIV(state, hashbitlen, IV_384);
|
||||
// else
|
||||
if (hashbitlen == 512)
|
||||
r=InitIV(state, hashbitlen, IV_512);
|
||||
else
|
||||
#endif
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -7,15 +7,37 @@
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
uint32_t A[ 32*2 ] __attribute__((aligned(64)));
|
||||
uint8_t buffer[ 128*2 ] __attribute__((aligned(64)));
|
||||
uint32_t A[ 32*4 ];
|
||||
uint8_t buffer[ 128*4 ];
|
||||
uint64_t count;
|
||||
unsigned int hashbitlen;
|
||||
unsigned int blocksize;
|
||||
unsigned int n_feistels;
|
||||
|
||||
} simd_4way_context __attribute__((aligned(128)));
|
||||
|
||||
int simd_4way_init( simd_4way_context *state, int hashbitlen );
|
||||
int simd_4way_update( simd_4way_context *state, const void *data,
|
||||
int databitlen );
|
||||
int simd_4way_close( simd_4way_context *state, void *hashval );
|
||||
int simd_4way_update_close( simd_4way_context *state, void *hashval,
|
||||
const void *data, int databitlen );
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32_t A[ 32*2 ];
|
||||
uint8_t buffer[ 128*2 ];
|
||||
uint64_t count;
|
||||
unsigned int hashbitlen;
|
||||
unsigned int blocksize;
|
||||
unsigned int n_feistels;
|
||||
|
||||
} simd_2way_context;
|
||||
} simd_2way_context __attribute__((aligned(128)));
|
||||
|
||||
int simd_2way_init( simd_2way_context *state, int hashbitlen );
|
||||
int simd_2way_update( simd_2way_context *state, const void *data,
|
||||
|
@@ -2,17 +2,140 @@
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "skein-hash-4way.h"
|
||||
|
||||
// 8 way is faster than SHA on Icelake
|
||||
// SHA is faster than 4 way on Ryzen
|
||||
//
|
||||
#if defined(__SHA__)
|
||||
#include <openssl/sha.h>
|
||||
#else
|
||||
#include "algo/sha/sha-hash-4way.h"
|
||||
#endif
|
||||
#include "algo/sha/sha-hash-4way.h"
|
||||
|
||||
#if defined (SKEIN_4WAY)
|
||||
#if defined (SKEIN_8WAY)
|
||||
|
||||
void skeinhash_8way( void *state, const void *input )
|
||||
{
|
||||
uint64_t vhash64[8*8] __attribute__ ((aligned (128)));
|
||||
skein512_8way_context ctx_skein;
|
||||
|
||||
//#if defined(__SHA__)
|
||||
// uint32_t hash0[16] __attribute__ ((aligned (64)));
|
||||
// uint32_t hash1[16] __attribute__ ((aligned (64)));
|
||||
// uint32_t hash2[16] __attribute__ ((aligned (64)));
|
||||
// uint32_t hash3[16] __attribute__ ((aligned (64)));
|
||||
// uint32_t hash4[16] __attribute__ ((aligned (64)));
|
||||
// uint32_t hash5[16] __attribute__ ((aligned (64)));
|
||||
// uint32_t hash6[16] __attribute__ ((aligned (64)));
|
||||
// uint32_t hash7[16] __attribute__ ((aligned (64)));
|
||||
// SHA256_CTX ctx_sha256;
|
||||
//#else
|
||||
uint32_t vhash32[16*8] __attribute__ ((aligned (128)));
|
||||
sha256_8way_context ctx_sha256;
|
||||
//#endif
|
||||
|
||||
skein512_8way_init( &ctx_skein );
|
||||
skein512_8way_update( &ctx_skein, input, 80 );
|
||||
skein512_8way_close( &ctx_skein, vhash64 );
|
||||
/*
|
||||
#if defined(__SHA__)
|
||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash64, 512 );
|
||||
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
|
||||
SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
|
||||
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
|
||||
SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
|
||||
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
|
||||
SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
|
||||
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
|
||||
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
|
||||
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, (unsigned char*)hash4, 64 );
|
||||
SHA256_Final( (unsigned char*)hash4, &ctx_sha256 );
|
||||
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, (unsigned char*)hash5, 64 );
|
||||
SHA256_Final( (unsigned char*)hash5, &ctx_sha256 );
|
||||
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, (unsigned char*)hash6, 64 );
|
||||
SHA256_Final( (unsigned char*)hash6, &ctx_sha256 );
|
||||
|
||||
SHA256_Init( &ctx_sha256 );
|
||||
SHA256_Update( &ctx_sha256, (unsigned char*)hash7, 64 );
|
||||
SHA256_Final( (unsigned char*)hash7, &ctx_sha256 );
|
||||
|
||||
intrlv_8x32( state, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
hash7, 256 );
|
||||
#else
|
||||
*/
|
||||
|
||||
rintrlv_8x64_8x32( vhash32, vhash64, 512 );
|
||||
// dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
// vhash64, 512 );
|
||||
// intrlv_8x32( vhash32, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
// hash7, 512 );
|
||||
|
||||
sha256_8way_init( &ctx_sha256 );
|
||||
sha256_8way( &ctx_sha256, vhash32, 64 );
|
||||
sha256_8way_close( &ctx_sha256, state );
|
||||
//#endif
|
||||
}
|
||||
|
||||
int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[16*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
do
|
||||
{
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
skeinhash_8way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane ] <= Htarg )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (SKEIN_4WAY)
|
||||
|
||||
void skeinhash_4way( void *state, const void *input )
|
||||
{
|
||||
uint64_t vhash64[16*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash64[8*4] __attribute__ ((aligned (128)));
|
||||
skein512_4way_context ctx_skein;
|
||||
#if defined(__SHA__)
|
||||
uint32_t hash0[16] __attribute__ ((aligned (64)));
|
||||
@@ -26,7 +149,7 @@ void skeinhash_4way( void *state, const void *input )
|
||||
#endif
|
||||
|
||||
skein512_4way_init( &ctx_skein );
|
||||
skein512_4way( &ctx_skein, input, 80 );
|
||||
skein512_4way_update( &ctx_skein, input, 80 );
|
||||
skein512_4way_close( &ctx_skein, vhash64 );
|
||||
|
||||
#if defined(__SHA__)
|
||||
@@ -71,7 +194,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
do
|
||||
@@ -92,9 +215,9 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -2,19 +2,36 @@
|
||||
#include "sph_skein.h"
|
||||
#include "skein-hash-4way.h"
|
||||
|
||||
int64_t skein_get_max64() { return 0x7ffffLL; }
|
||||
|
||||
bool register_skein_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT | SHA_OPT;
|
||||
#if defined (SKEIN_4WAY)
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
|
||||
#if defined (SKEIN_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_skein_8way;
|
||||
gate->hash = (void*)&skeinhash_8way;
|
||||
#elif defined (SKEIN_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_skein_4way;
|
||||
gate->hash = (void*)&skeinhash_4way;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_skein;
|
||||
gate->hash = (void*)&skeinhash;
|
||||
#endif
|
||||
gate->get_max64 = (void*)&skein_get_max64;
|
||||
return true;
|
||||
};
|
||||
|
||||
bool register_skein2_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
#if defined (SKEIN_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_skein2_8way;
|
||||
gate->hash = (void*)&skein2hash_8way;
|
||||
#elif defined (SKEIN_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_skein2_4way;
|
||||
gate->hash = (void*)&skein2hash_4way;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_skein2;
|
||||
gate->hash = (void*)&skein2hash;
|
||||
#endif
|
||||
return true;
|
||||
};
|
||||
|
||||
|
||||
|
@@ -1,23 +1,44 @@
|
||||
#ifndef __SKEIN_GATE_H__
|
||||
#define __SKEIN_GATE_H__
|
||||
#define __SKEIN_GATE_H__ 1
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define SKEIN_4WAY
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SKEIN_8WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define SKEIN_4WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(SKEIN_4WAY)
|
||||
#if defined(SKEIN_8WAY)
|
||||
|
||||
void skeinhash_8way( void *output, const void *input );
|
||||
int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void skein2hash_8way( void *output, const void *input );
|
||||
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t* hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(SKEIN_4WAY)
|
||||
|
||||
void skeinhash_4way( void *output, const void *input );
|
||||
|
||||
int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
void skein2hash_4way( void *output, const void *input );
|
||||
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t* hashes_done, struct thr_info *mythr );
|
||||
|
||||
#else
|
||||
|
||||
void skeinhash( void *output, const void *input );
|
||||
|
||||
int scanhash_skein( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void skein2hash( void *output, const void *input );
|
||||
int scanhash_skein2( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@@ -36,7 +36,6 @@
|
||||
#include <string.h>
|
||||
#include "skein-hash-4way.h"
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
@@ -45,6 +44,22 @@ extern "C"{
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
/*
|
||||
static const sph_u64 IV256[] = {
|
||||
SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
|
||||
SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
|
||||
SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
|
||||
SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
|
||||
};
|
||||
|
||||
static const sph_u64 IV512[] = {
|
||||
SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
|
||||
SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
|
||||
SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
|
||||
SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
|
||||
};
|
||||
*/
|
||||
|
||||
/*
|
||||
* M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
|
||||
*/
|
||||
@@ -270,8 +285,151 @@ extern "C"{
|
||||
#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
|
||||
#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
|
||||
|
||||
#define READ_STATE_BIG(sc) do { \
|
||||
h0 = (sc)->h0; \
|
||||
h1 = (sc)->h1; \
|
||||
h2 = (sc)->h2; \
|
||||
h3 = (sc)->h3; \
|
||||
h4 = (sc)->h4; \
|
||||
h5 = (sc)->h5; \
|
||||
h6 = (sc)->h6; \
|
||||
h7 = (sc)->h7; \
|
||||
bcount = sc->bcount; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE_BIG(sc) do { \
|
||||
(sc)->h0 = h0; \
|
||||
(sc)->h1 = h1; \
|
||||
(sc)->h2 = h2; \
|
||||
(sc)->h3 = h3; \
|
||||
(sc)->h4 = h4; \
|
||||
(sc)->h5 = h5; \
|
||||
(sc)->h6 = h6; \
|
||||
(sc)->h7 = h7; \
|
||||
sc->bcount = bcount; \
|
||||
} while (0)
|
||||
|
||||
// AVX2 all scalar vars are now vectors representing 4 nonces in parallel
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
|
||||
do { \
|
||||
k8 = _mm512_xor_si512( _mm512_xor_si512( \
|
||||
_mm512_xor_si512( _mm512_xor_si512( k0, k1 ), \
|
||||
_mm512_xor_si512( k2, k3 ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( k4, k5 ), \
|
||||
_mm512_xor_si512( k6, k7 ) ) ), \
|
||||
m512_const1_64( 0x1BD11BDAA9FC1A22) ); \
|
||||
t2 = t0 ^ t1; \
|
||||
} while (0)
|
||||
|
||||
#define TFBIG_ADDKEY_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \
|
||||
do { \
|
||||
w0 = _mm512_add_epi64( w0, SKBI(k,s,0) ); \
|
||||
w1 = _mm512_add_epi64( w1, SKBI(k,s,1) ); \
|
||||
w2 = _mm512_add_epi64( w2, SKBI(k,s,2) ); \
|
||||
w3 = _mm512_add_epi64( w3, SKBI(k,s,3) ); \
|
||||
w4 = _mm512_add_epi64( w4, SKBI(k,s,4) ); \
|
||||
w5 = _mm512_add_epi64( w5, _mm512_add_epi64( SKBI(k,s,5), \
|
||||
m512_const1_64( SKBT(t,s,0) ) ) ); \
|
||||
w6 = _mm512_add_epi64( w6, _mm512_add_epi64( SKBI(k,s,6), \
|
||||
m512_const1_64( SKBT(t,s,1) ) ) ); \
|
||||
w7 = _mm512_add_epi64( w7, _mm512_add_epi64( SKBI(k,s,7), \
|
||||
m512_const1_64( s ) ) ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define TFBIG_MIX_8WAY(x0, x1, rc) \
|
||||
do { \
|
||||
x0 = _mm512_add_epi64( x0, x1 ); \
|
||||
x1 = _mm512_xor_si512( mm512_rol_64( x1, rc ), x0 ); \
|
||||
} while (0)
|
||||
|
||||
#define TFBIG_MIX8_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \
|
||||
TFBIG_MIX_8WAY(w0, w1, rc0); \
|
||||
TFBIG_MIX_8WAY(w2, w3, rc1); \
|
||||
TFBIG_MIX_8WAY(w4, w5, rc2); \
|
||||
TFBIG_MIX_8WAY(w6, w7, rc3); \
|
||||
} while (0)
|
||||
|
||||
#define TFBIG_8WAY_4e(s) do { \
|
||||
TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
|
||||
TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
|
||||
TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
|
||||
TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
|
||||
TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \
|
||||
} while (0)
|
||||
|
||||
#define TFBIG_8WAY_4o(s) do { \
|
||||
TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
|
||||
TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
|
||||
TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
|
||||
TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
|
||||
TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \
|
||||
} while (0)
|
||||
|
||||
#define UBI_BIG_8WAY(etype, extra) \
|
||||
do { \
|
||||
sph_u64 t0, t1, t2; \
|
||||
__m512i h8; \
|
||||
__m512i m0 = buf[0]; \
|
||||
__m512i m1 = buf[1]; \
|
||||
__m512i m2 = buf[2]; \
|
||||
__m512i m3 = buf[3]; \
|
||||
__m512i m4 = buf[4]; \
|
||||
__m512i m5 = buf[5]; \
|
||||
__m512i m6 = buf[6]; \
|
||||
__m512i m7 = buf[7]; \
|
||||
\
|
||||
__m512i p0 = m0; \
|
||||
__m512i p1 = m1; \
|
||||
__m512i p2 = m2; \
|
||||
__m512i p3 = m3; \
|
||||
__m512i p4 = m4; \
|
||||
__m512i p5 = m5; \
|
||||
__m512i p6 = m6; \
|
||||
__m512i p7 = m7; \
|
||||
t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
|
||||
t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
|
||||
TFBIG_KINIT_8WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
|
||||
TFBIG_8WAY_4e(0); \
|
||||
TFBIG_8WAY_4o(1); \
|
||||
TFBIG_8WAY_4e(2); \
|
||||
TFBIG_8WAY_4o(3); \
|
||||
TFBIG_8WAY_4e(4); \
|
||||
TFBIG_8WAY_4o(5); \
|
||||
TFBIG_8WAY_4e(6); \
|
||||
TFBIG_8WAY_4o(7); \
|
||||
TFBIG_8WAY_4e(8); \
|
||||
TFBIG_8WAY_4o(9); \
|
||||
TFBIG_8WAY_4e(10); \
|
||||
TFBIG_8WAY_4o(11); \
|
||||
TFBIG_8WAY_4e(12); \
|
||||
TFBIG_8WAY_4o(13); \
|
||||
TFBIG_8WAY_4e(14); \
|
||||
TFBIG_8WAY_4o(15); \
|
||||
TFBIG_8WAY_4e(16); \
|
||||
TFBIG_8WAY_4o(17); \
|
||||
TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \
|
||||
h0 = _mm512_xor_si512( m0, p0 );\
|
||||
h1 = _mm512_xor_si512( m1, p1 );\
|
||||
h2 = _mm512_xor_si512( m2, p2 );\
|
||||
h3 = _mm512_xor_si512( m3, p3 );\
|
||||
h4 = _mm512_xor_si512( m4, p4 );\
|
||||
h5 = _mm512_xor_si512( m5, p5 );\
|
||||
h6 = _mm512_xor_si512( m6, p6 );\
|
||||
h7 = _mm512_xor_si512( m7, p7 );\
|
||||
} while (0)
|
||||
|
||||
#define DECL_STATE_BIG_8WAY \
|
||||
__m512i h0, h1, h2, h3, h4, h5, h6, h7; \
|
||||
sph_u64 bcount;
|
||||
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#define TFBIG_KINIT_4WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
|
||||
do { \
|
||||
k8 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
@@ -298,39 +456,34 @@ do { \
|
||||
m256_const1_64( s ) ) ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define TFBIG_MIX_4WAY(x0, x1, rc) \
|
||||
do { \
|
||||
x0 = _mm256_add_epi64( x0, x1 ); \
|
||||
x1 = _mm256_xor_si256( mm256_rol_64( x1, rc ), x0 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
// typeless
|
||||
#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \
|
||||
TFBIG_MIX_4WAY(w0, w1, rc0); \
|
||||
TFBIG_MIX_4WAY(w2, w3, rc1); \
|
||||
TFBIG_MIX_4WAY(w4, w5, rc2); \
|
||||
TFBIG_MIX_4WAY(w6, w7, rc3); \
|
||||
} while (0)
|
||||
#define TFBIG_MIX8_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \
|
||||
TFBIG_MIX_4WAY(w0, w1, rc0); \
|
||||
TFBIG_MIX_4WAY(w2, w3, rc1); \
|
||||
TFBIG_MIX_4WAY(w4, w5, rc2); \
|
||||
TFBIG_MIX_4WAY(w6, w7, rc3); \
|
||||
} while (0)
|
||||
|
||||
#define TFBIG_4WAY_4e(s) do { \
|
||||
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
|
||||
TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
|
||||
TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
|
||||
TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
|
||||
TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \
|
||||
} while (0)
|
||||
|
||||
#define TFBIG_4e(s) do { \
|
||||
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
|
||||
TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
|
||||
TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
|
||||
TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
|
||||
TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \
|
||||
} while (0)
|
||||
|
||||
#define TFBIG_4o(s) do { \
|
||||
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
|
||||
TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
|
||||
TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
|
||||
TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
|
||||
TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \
|
||||
} while (0)
|
||||
|
||||
#define TFBIG_4WAY_4o(s) do { \
|
||||
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
|
||||
TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
|
||||
TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
|
||||
TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
|
||||
TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \
|
||||
} while (0)
|
||||
|
||||
// scale buf offset by 4
|
||||
#define UBI_BIG_4WAY(etype, extra) \
|
||||
@@ -357,24 +510,24 @@ do { \
|
||||
t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
|
||||
t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
|
||||
TFBIG_KINIT_4WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
|
||||
TFBIG_4e(0); \
|
||||
TFBIG_4o(1); \
|
||||
TFBIG_4e(2); \
|
||||
TFBIG_4o(3); \
|
||||
TFBIG_4e(4); \
|
||||
TFBIG_4o(5); \
|
||||
TFBIG_4e(6); \
|
||||
TFBIG_4o(7); \
|
||||
TFBIG_4e(8); \
|
||||
TFBIG_4o(9); \
|
||||
TFBIG_4e(10); \
|
||||
TFBIG_4o(11); \
|
||||
TFBIG_4e(12); \
|
||||
TFBIG_4o(13); \
|
||||
TFBIG_4e(14); \
|
||||
TFBIG_4o(15); \
|
||||
TFBIG_4e(16); \
|
||||
TFBIG_4o(17); \
|
||||
TFBIG_4WAY_4e(0); \
|
||||
TFBIG_4WAY_4o(1); \
|
||||
TFBIG_4WAY_4e(2); \
|
||||
TFBIG_4WAY_4o(3); \
|
||||
TFBIG_4WAY_4e(4); \
|
||||
TFBIG_4WAY_4o(5); \
|
||||
TFBIG_4WAY_4e(6); \
|
||||
TFBIG_4WAY_4o(7); \
|
||||
TFBIG_4WAY_4e(8); \
|
||||
TFBIG_4WAY_4o(9); \
|
||||
TFBIG_4WAY_4e(10); \
|
||||
TFBIG_4WAY_4o(11); \
|
||||
TFBIG_4WAY_4e(12); \
|
||||
TFBIG_4WAY_4o(13); \
|
||||
TFBIG_4WAY_4e(14); \
|
||||
TFBIG_4WAY_4o(15); \
|
||||
TFBIG_4WAY_4e(16); \
|
||||
TFBIG_4WAY_4o(17); \
|
||||
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \
|
||||
h0 = _mm256_xor_si256( m0, p0 );\
|
||||
h1 = _mm256_xor_si256( m1, p1 );\
|
||||
@@ -391,45 +544,142 @@ do { \
|
||||
__m256i h0, h1, h2, h3, h4, h5, h6, h7; \
|
||||
sph_u64 bcount;
|
||||
|
||||
#define READ_STATE_BIG(sc) do { \
|
||||
h0 = (sc)->h0; \
|
||||
h1 = (sc)->h1; \
|
||||
h2 = (sc)->h2; \
|
||||
h3 = (sc)->h3; \
|
||||
h4 = (sc)->h4; \
|
||||
h5 = (sc)->h5; \
|
||||
h6 = (sc)->h6; \
|
||||
h7 = (sc)->h7; \
|
||||
bcount = sc->bcount; \
|
||||
} while (0)
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define WRITE_STATE_BIG(sc) do { \
|
||||
(sc)->h0 = h0; \
|
||||
(sc)->h1 = h1; \
|
||||
(sc)->h2 = h2; \
|
||||
(sc)->h3 = h3; \
|
||||
(sc)->h4 = h4; \
|
||||
(sc)->h5 = h5; \
|
||||
(sc)->h6 = h6; \
|
||||
(sc)->h7 = h7; \
|
||||
sc->bcount = bcount; \
|
||||
} while (0)
|
||||
void skein256_8way_init( skein256_8way_context *sc )
|
||||
{
|
||||
sc->h0 = m512_const1_64( 0xCCD044A12FDB3E13 );
|
||||
sc->h1 = m512_const1_64( 0xE83590301A79A9EB );
|
||||
sc->h2 = m512_const1_64( 0x55AEA0614F816E6F );
|
||||
sc->h3 = m512_const1_64( 0x2A2767A4AE9B94DB );
|
||||
sc->h4 = m512_const1_64( 0xEC06025E74DD7683 );
|
||||
sc->h5 = m512_const1_64( 0xE7A436CDC4746251 );
|
||||
sc->h6 = m512_const1_64( 0xC36FBAF9393AD185 );
|
||||
sc->h7 = m512_const1_64( 0x3EEDBA1833EDFC13 );
|
||||
sc->bcount = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
static const sph_u64 IV256[] = {
|
||||
SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
|
||||
SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
|
||||
SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
|
||||
SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
|
||||
};
|
||||
void skein512_8way_init( skein512_8way_context *sc )
|
||||
{
|
||||
sc->h0 = m512_const1_64( 0x4903ADFF749C51CE );
|
||||
sc->h1 = m512_const1_64( 0x0D95DE399746DF03 );
|
||||
sc->h2 = m512_const1_64( 0x8FD1934127C79BCE );
|
||||
sc->h3 = m512_const1_64( 0x9A255629FF352CB1 );
|
||||
sc->h4 = m512_const1_64( 0x5DB62599DF6CA7B0 );
|
||||
sc->h5 = m512_const1_64( 0xEABE394CA9D5C3F4 );
|
||||
sc->h6 = m512_const1_64( 0x991112C71A75B523 );
|
||||
sc->h7 = m512_const1_64( 0xAE18A40B660FCC33 );
|
||||
sc->bcount = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
skein_big_core_8way( skein512_8way_context *sc, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
__m512i *buf;
|
||||
size_t ptr;
|
||||
unsigned first;
|
||||
DECL_STATE_BIG_8WAY
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
const int buf_size = 64; // 64 * _m256i
|
||||
|
||||
if ( len <= buf_size - ptr )
|
||||
{
|
||||
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
|
||||
sc->ptr = ptr + len;
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE_BIG( sc );
|
||||
first = ( bcount == 0 ) << 7;
|
||||
do {
|
||||
size_t clen;
|
||||
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
bcount ++;
|
||||
UBI_BIG_8WAY( 96 + first, 0 );
|
||||
first = 0;
|
||||
ptr = 0;
|
||||
}
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata += (clen>>3);
|
||||
len -= clen;
|
||||
} while ( len > 0 );
|
||||
WRITE_STATE_BIG( sc );
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
skein_big_close_8way( skein512_8way_context *sc, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_len )
|
||||
{
|
||||
__m512i *buf;
|
||||
size_t ptr;
|
||||
unsigned et;
|
||||
DECL_STATE_BIG_8WAY
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
const int buf_size = 64;
|
||||
|
||||
READ_STATE_BIG(sc);
|
||||
|
||||
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
|
||||
et = 352 + ((bcount == 0) << 7);
|
||||
UBI_BIG_8WAY( et, ptr );
|
||||
|
||||
memset_zero_512( buf, buf_size >> 3 );
|
||||
bcount = 0;
|
||||
UBI_BIG_8WAY( 510, 8 );
|
||||
|
||||
buf[0] = h0;
|
||||
buf[1] = h1;
|
||||
buf[2] = h2;
|
||||
buf[3] = h3;
|
||||
buf[4] = h4;
|
||||
buf[5] = h5;
|
||||
buf[6] = h6;
|
||||
buf[7] = h7;
|
||||
|
||||
memcpy_512( dst, buf, out_len >> 3 );
|
||||
}
|
||||
|
||||
void
|
||||
skein256_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
skein_big_core_8way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
skein256_8way_close(void *cc, void *dst)
|
||||
{
|
||||
skein_big_close_8way(cc, 0, 0, dst, 32);
|
||||
}
|
||||
|
||||
void
|
||||
skein512_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
skein_big_core_8way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
skein512_8way_close(void *cc, void *dst)
|
||||
{
|
||||
skein_big_close_8way(cc, 0, 0, dst, 64);
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
static const sph_u64 IV512[] = {
|
||||
SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
|
||||
SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
|
||||
SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
|
||||
SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
|
||||
};
|
||||
*/
|
||||
|
||||
void skein256_4way_init( skein256_4way_context *sc )
|
||||
{
|
||||
@@ -517,66 +767,30 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
|
||||
ptr = sc->ptr;
|
||||
const int buf_size = 64;
|
||||
|
||||
/*
|
||||
* At that point, if ptr == 0, then the message was empty;
|
||||
* otherwise, there is between 1 and 64 bytes (inclusive) which
|
||||
* are yet to be processed. Either way, we complete the buffer
|
||||
* to a full block with zeros (the Skein specification mandates
|
||||
* that an empty message is padded so that there is at least
|
||||
* one block to process).
|
||||
*
|
||||
* Once this block has been processed, we do it again, with
|
||||
* a block full of zeros, for the output (that block contains
|
||||
* the encoding of "0", over 8 bytes, then padded with zeros).
|
||||
*/
|
||||
|
||||
READ_STATE_BIG(sc);
|
||||
|
||||
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
|
||||
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
|
||||
et = 352 + ((bcount == 0) << 7);
|
||||
UBI_BIG_4WAY( et, ptr );
|
||||
UBI_BIG_4WAY( et, ptr );
|
||||
|
||||
memset_zero_256( buf, buf_size >> 3 );
|
||||
bcount = 0;
|
||||
UBI_BIG_4WAY( 510, 8 );
|
||||
memset_zero_256( buf, buf_size >> 3 );
|
||||
bcount = 0;
|
||||
UBI_BIG_4WAY( 510, 8 );
|
||||
|
||||
buf[0] = h0;
|
||||
buf[1] = h1;
|
||||
buf[2] = h2;
|
||||
buf[3] = h3;
|
||||
buf[4] = h4;
|
||||
buf[5] = h5;
|
||||
buf[6] = h6;
|
||||
buf[7] = h7;
|
||||
buf[0] = h0;
|
||||
buf[1] = h1;
|
||||
buf[2] = h2;
|
||||
buf[3] = h3;
|
||||
buf[4] = h4;
|
||||
buf[5] = h5;
|
||||
buf[6] = h6;
|
||||
buf[7] = h7;
|
||||
|
||||
memcpy_256( dst, buf, out_len >> 3 );
|
||||
memcpy_256( dst, buf, out_len >> 3 );
|
||||
}
|
||||
|
||||
/*
|
||||
static const sph_u64 IV256[] = {
|
||||
SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
|
||||
SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
|
||||
SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
|
||||
SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
|
||||
};
|
||||
|
||||
static const sph_u64 IV512[] = {
|
||||
SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
|
||||
SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
|
||||
SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
|
||||
SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
|
||||
};
|
||||
*/
|
||||
/*
|
||||
void
|
||||
skein256_4way_init(void *cc)
|
||||
{
|
||||
skein_big_init_4way(cc, IV256);
|
||||
}
|
||||
*/
|
||||
|
||||
void
|
||||
skein256_4way(void *cc, const void *data, size_t len)
|
||||
skein256_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
skein_big_core_4way(cc, data, len);
|
||||
}
|
||||
@@ -587,16 +801,8 @@ skein256_4way_close(void *cc, void *dst)
|
||||
skein_big_close_4way(cc, 0, 0, dst, 32);
|
||||
}
|
||||
|
||||
/*
|
||||
void
|
||||
skein512_4way_init(void *cc)
|
||||
{
|
||||
skein_big_init_4way(cc, IV512);
|
||||
}
|
||||
*/
|
||||
|
||||
void
|
||||
skein512_4way(void *cc, const void *data, size_t len)
|
||||
skein512_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
skein_big_core_4way(cc, data, len);
|
||||
}
|
||||
|
@@ -55,29 +55,50 @@ extern "C"{
|
||||
#define SPH_SIZE_skein256 256
|
||||
#define SPH_SIZE_skein512 512
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m256i buf[8] __attribute__ ((aligned (64)));
|
||||
__m512i buf[8];
|
||||
__m512i h0, h1, h2, h3, h4, h5, h6, h7;
|
||||
size_t ptr;
|
||||
sph_u64 bcount;
|
||||
} sph_skein_8way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef sph_skein_8way_big_context skein512_8way_context;
|
||||
typedef sph_skein_8way_big_context skein256_8way_context;
|
||||
|
||||
void skein512_8way_init( skein512_8way_context *sc );
|
||||
void skein512_8way_update( void *cc, const void *data, size_t len );
|
||||
void skein512_8way_close( void *cc, void *dst );
|
||||
|
||||
void skein256_8way_init( skein256_8way_context *sc );
|
||||
void skein256_8way_update( void *cc, const void *data, size_t len );
|
||||
void skein256_8way_close( void *cc, void *dst );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m256i buf[8];
|
||||
__m256i h0, h1, h2, h3, h4, h5, h6, h7;
|
||||
size_t ptr;
|
||||
sph_u64 bcount;
|
||||
} sph_skein_4way_big_context;
|
||||
} sph_skein_4way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef sph_skein_4way_big_context skein512_4way_context;
|
||||
typedef sph_skein_4way_big_context skein256_4way_context;
|
||||
|
||||
void skein512_4way_init( skein512_4way_context *sc );
|
||||
void skein512_4way( void *cc, const void *data, size_t len );
|
||||
void skein512_4way_update( void *cc, const void *data, size_t len );
|
||||
void skein512_4way_close( void *cc, void *dst );
|
||||
//void sph_skein512_addbits_and_close(
|
||||
// void *cc, unsigned ub, unsigned n, void *dst);
|
||||
#define skein512_4way skein512_4way_update
|
||||
|
||||
void skein256_4way_init( skein256_4way_context *sc );
|
||||
void skein256_4way( void *cc, const void *data, size_t len );
|
||||
void skein256_4way_update( void *cc, const void *data, size_t len );
|
||||
void skein256_4way_close( void *cc, void *dst );
|
||||
//void sph_skein256_addbits_and_close(
|
||||
// void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#define skein256_4way skein256_4way_update
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@@ -1,9 +1,66 @@
|
||||
#include "skein2-gate.h"
|
||||
#include "skein-gate.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "skein-hash-4way.h"
|
||||
|
||||
#if defined(SKEIN2_4WAY)
|
||||
#if defined(SKEIN_8WAY)
|
||||
|
||||
void skein2hash_8way( void *output, const void *input )
|
||||
{
|
||||
skein512_8way_context ctx;
|
||||
uint64_t hash[16*8] __attribute__ ((aligned (128)));
|
||||
|
||||
skein512_8way_init( &ctx );
|
||||
skein512_8way_update( &ctx, input, 80 );
|
||||
skein512_8way_close( &ctx, hash );
|
||||
|
||||
skein512_8way_init( &ctx );
|
||||
skein512_8way_update( &ctx, hash, 64 );
|
||||
skein512_8way_close( &ctx, output );
|
||||
}
|
||||
|
||||
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[16*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
do
|
||||
{
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
skein2hash_8way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(SKEIN_4WAY)
|
||||
|
||||
void skein2hash_4way( void *output, const void *input )
|
||||
{
|
||||
|
@@ -1,23 +0,0 @@
|
||||
#include "skein2-gate.h"
|
||||
#include <stdint.h>
|
||||
#include "sph_skein.h"
|
||||
|
||||
int64_t skein2_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
}
|
||||
|
||||
bool register_skein2_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT;
|
||||
#if defined (SKEIN2_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_skein2_4way;
|
||||
gate->hash = (void*)&skein2hash_4way;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_skein2;
|
||||
gate->hash = (void*)&skein2hash;
|
||||
#endif
|
||||
gate->get_max64 = (void*)&skein2_get_max64;
|
||||
return true;
|
||||
};
|
||||
|
@@ -1,20 +0,0 @@
|
||||
#ifndef __SKEIN2GATE_H__
|
||||
#define __SKEIN2_GATE_H__
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define SKEIN2_4WAY
|
||||
#endif
|
||||
|
||||
#if defined(SKEIN2_4WAY)
|
||||
void skein2hash_4way( void *output, const void *input );
|
||||
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t* hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
void skein2hash( void *output, const void *input );
|
||||
int scanhash_skein2( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user