Compare commits

...

7 Commits

Author SHA1 Message Date
Jay D Dee
72330eb5a7 v3.9.9 2019-10-10 19:58:34 -04:00
Jay D Dee
789c8b70bc v3.9.8.1 2019-10-01 14:17:36 -04:00
Jay D Dee
01550d94a2 v3.9.8 2019-09-26 22:37:26 -04:00
Jay D Dee
a042fb7612 v3.9.7 2019-08-03 10:39:54 -04:00
Jay D Dee
9d49e0be7a v3.9.6.2 2019-07-30 10:16:43 -04:00
Jay D Dee
a51f59086b v3.9.6.1 2019-07-18 19:46:57 -04:00
Jay D Dee
6f49ba09b7 v3.9.6 2019-07-17 17:54:38 -04:00
130 changed files with 10669 additions and 4185 deletions

View File

@@ -18,7 +18,6 @@ dist_man_MANS = cpuminer.1
cpuminer_SOURCES = \
cpu-miner.c \
util.c \
uint256.cpp \
api.c \
sysinfos.c \
algo-gate-api.c\
@@ -51,12 +50,15 @@ cpuminer_SOURCES = \
algo/blake/blake.c \
algo/blake/blake-4way.c \
algo/blake/sph_blake2b.c \
algo/blake/blake2b.c \
algo/blake/sph-blake2s.c \
algo/blake/blake2s-hash-4way.c \
algo/blake/blake2s.c \
algo/blake/blake2s-gate.c \
algo/blake/blake2s-4way.c \
algo/blake/blake2b-hash-4way.c \
algo/blake/blake2b.c \
algo/blake/blake2b-gate.c \
algo/blake/blake2b-4way.c \
algo/blake/blakecoin-gate.c \
algo/blake/mod_blakecoin.c \
algo/blake/blakecoin.c \
@@ -71,6 +73,9 @@ cpuminer_SOURCES = \
algo/bmw/bmw256-hash-4way.c \
algo/bmw/bmw512-hash-4way.c \
algo/bmw/bmw256.c \
algo/bmw/bmw512-gate.c \
algo/bmw/bmw512.c \
algo/bmw/bmw512-4way.c \
algo/cryptonight/cryptolight.c \
algo/cryptonight/cryptonight-common.c\
algo/cryptonight/cryptonight-aesni.c\
@@ -166,7 +171,8 @@ cpuminer_SOURCES = \
algo/scryptjane/scrypt-jane.c \
algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \
algo/sha/sha2-hash-4way.c \
algo/sha/sha256-hash-4way.c \
algo/sha/sha512-hash-4way.c \
algo/sha/sha256_hash_11way.c \
algo/sha/sha2.c \
algo/sha/sha256t-gate.c \
@@ -238,6 +244,8 @@ cpuminer_SOURCES = \
algo/x13/skunk-4way.c \
algo/x13/skunk.c \
algo/x13/drop.c \
algo/x13/x13bcd-4way.c \
algo/x13/x13bcd.c \
algo/x14/x14-gate.c \
algo/x14/x14.c \
algo/x14/x14-4way.c \
@@ -254,6 +262,13 @@ cpuminer_SOURCES = \
algo/x16/x16r-gate.c \
algo/x16/x16r.c \
algo/x16/x16r-4way.c \
algo/x16/x16rv2.c \
algo/x16/x16rv2-4way.c \
algo/x16/x16rt.c \
algo/x16/x16rt-4way.c \
algo/x16/hex.c \
algo/x16/x21s-4way.c \
algo/x16/x21s.c \
algo/x17/x17-gate.c \
algo/x17/x17.c \
algo/x17/x17-4way.c \
@@ -267,7 +282,9 @@ cpuminer_SOURCES = \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-best.c \
algo/yespower/yespower.c \
algo/yespower/yespower-gate.c \
algo/yespower/yespower-blake2b.c \
algo/yespower/crypto/blake2b-yp.c \
algo/yespower/sha256_p.c \
algo/yespower/yespower-opt.c

View File

@@ -24,7 +24,7 @@ Requirements
1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
optimizations a CPU with AES_NI is required. This includes Intel Westbridge
optimizations a CPU with AES_NI is required. This includes Intel Westmere
and newer and AMD equivalents. Further optimizations are available on some
algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
@@ -55,9 +55,11 @@ Supported Algorithms
axiom Shabal-256 MemoHash
bastion
blake Blake-256 (SFR)
blakecoin blake256r8
blake2b Blake2b 256
blake2s Blake-2 S
blakecoin blake256r8
bmw BMW 256
bmw512 BMW 512
c11 Chaincoin
decred
deep Deepcoin (DCN)
@@ -66,6 +68,7 @@ Supported Algorithms
fresh Fresh
groestl Groestl coin
heavy Heavy
hex x16r-hex
hmq1725 Espers
hodl Hodlcoin
jha Jackpotcoin
@@ -84,10 +87,12 @@ Supported Algorithms
neoscrypt NeoScrypt(128, 2, 1)
nist5 Nist5
pentablake Pentablake
phi1612 phi, LUX coin (original algo)
phi2 LUX coin (new algo)
phi1612 phi
phi2 Luxcoin (LUX)
phi2-lux identical to phi2
pluck Pluck:128 (Supcoin)
polytimos Ninja
power2b MicroBitcoin (MBC)
quark Quark
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)
@@ -113,12 +118,17 @@ Supported Algorithms
x11gost sib (SibCoin)
x12 Galaxie Cash (GCH)
x13 X13
x13bcd bcd
x13sm3 hsr (Hshare)
x14 X14
x15 X15
x16r Ravencoin (RVN)
x16s pigeoncoin (PGN)
x16r Ravencoin (RVN) (original algo)
x16rv2 Ravencoin (RVN) (new algo)
x16rt Gincoin (GIN)
x16rt_veil Veil (VEIL)
x16s Pigeoncoin (PGN)
x17
x21s
xevan Bitsend (BSD)
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)
@@ -126,6 +136,7 @@ Supported Algorithms
yescryptr32 WAVI
yespower Cryply
yespowerr16 Yenten (YTN)
yespoer-b2b generic yespower + blake2b
zr5 Ziftr
Errata
@@ -148,14 +159,15 @@ Benchmark testing does not work for x11evo.
Bugs
----
Users are encouraged to post their bug reports on the Bitcoin Talk
forum at:
Users are encouraged to post their bug reports using git issues or on the
Bitcoin Talk forum at:
https://bitcointalk.org/index.php?topic=1326803.0
All problem reports must be accompanied by a proper definition.
All problem reports must be accompanied by a proper problem definition.
This should include how the problem occurred, the command line and
output from the miner showing the startup and any errors.
output from the miner showing the startup messages and any errors.
A history is also useful, ie did it work before.
Donations
---------

View File

@@ -6,9 +6,6 @@ This feature requires recent SW including GCC version 5 or higher and
openssl version 1.1 or higher. It may also require using "-march=znver1"
compile flag.
cpuminer-opt is a console program, if you're using a mouse you're doing it
wrong.
Security warning
----------------
@@ -34,10 +31,79 @@ Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
64 bit Linux or Windows operating system. Apple and Android are not supported.
FreeBSD YMMV.
Change Log
----------
v3.9.9
Added power2b algo for MicroBitcoin.
Added generic yespower-b2b (yespower + blake2b) algo to be used with
the parameters introduced in v3.9.7 for yespower & yescrypt.
Display additional info when a share is rejected.
Some low level enhancements and minor tweaking of log output.
RELEASE_NOTES (this file) and README.md added to Windows release package.
v3.9.8.1
Summary log report will be generated on stratum diff change or after 5 minutes,
whichever comes first, to prevent incorrect data in the report.
Removed phi2-lux alias (introduced in v3.9.8) due to Luxcoin's planned fork
to a new algo. The new Luxcoin algo is not supported by cpuminer-opt.
Until the fork Luxcoin can be mined using phi2 algo.
--hide-diff option is deprecated and has no effect. It will be removed in a
future release.
v3.9.8
Changes to log output to provide data more relevant to actual mining
performance.
phi2 can now handle pools with a mix of coins that use and don't use roots.
phi2-lux added as an alias for phi2 as they are identical except for roots.
Add x16rv2 algo for Ravencoin fork.
v3.9.7
Command line option changes:
"-R" is no longer used as a shortcut for "--retry-pause", users must
use the long option.
New options:
-N, --param-n: set the N parameter for yescrypt, yespower or scrypt algos
-R, --param-r: set the R parameter for yescrypt or yespower algos, scrypt is
hardcoded with R=1
-K, --param-key: set the client key/pers parameter for yescrypt/yespower algos.
These options can be used to mine yescrypt or yespower variations using
the generic yescrypt or yespower algo name and specifying the parameters
manually. They can even be used to mine variations that aren't formally
supported by a unique algo name. Existing algos can continue to to be mined
using their original name without parameters.
v3.9.6.2
New algo blake2b.
Faster myr-gr on Ryzen using SHA.
Faster blake2s SSE2.
Small speedup of around 1% for several other algos.
v3.9.6.1
New algos: x21s, hex (alias x16r-hex).
v3.9.6
New algos: bmw512, x16rt, x16rt-veil (alias veil), x13bcd (alias bcd).
v3.9.5.4
Fixed sha256q AVX2 poor performance.

View File

@@ -122,7 +122,6 @@ void init_algo_gate( algo_gate_t* gate )
gate->stratum_gen_work = (void*)&std_stratum_gen_work;
gate->build_stratum_request = (void*)&std_le_build_stratum_request;
gate->malloc_txs_request = (void*)&std_malloc_txs_request;
gate->set_target = (void*)&std_set_target;
gate->submit_getwork_result = (void*)&std_le_submit_getwork_result;
gate->build_block_header = (void*)&std_build_block_header;
gate->build_extraheader = (void*)&std_build_extraheader;
@@ -167,9 +166,10 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
case ALGO_BASTION: register_bastion_algo ( gate ); break;
case ALGO_BLAKE: register_blake_algo ( gate ); break;
case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
// case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break;
case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break;
case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break;
case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
case ALGO_BMW512: register_bmw512_algo ( gate ); break;
case ALGO_C11: register_c11_algo ( gate ); break;
case ALGO_CRYPTOLIGHT: register_cryptolight_algo ( gate ); break;
case ALGO_CRYPTONIGHT: register_cryptonight_algo ( gate ); break;
@@ -181,6 +181,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_FRESH: register_fresh_algo ( gate ); break;
case ALGO_GROESTL: register_groestl_algo ( gate ); break;
case ALGO_HEAVY: register_heavy_algo ( gate ); break;
case ALGO_HEX: register_hex_algo ( gate ); break;
case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break;
case ALGO_HODL: register_hodl_algo ( gate ); break;
case ALGO_JHA: register_jha_algo ( gate ); break;
@@ -203,6 +204,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_PHI2: register_phi2_algo ( gate ); break;
case ALGO_PLUCK: register_pluck_algo ( gate ); break;
case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break;
case ALGO_POWER2B: register_power2b_algo ( gate ); break;
case ALGO_QUARK: register_quark_algo ( gate ); break;
case ALGO_QUBIT: register_qubit_algo ( gate ); break;
case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
@@ -227,12 +229,17 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
case ALGO_X12: register_x12_algo ( gate ); break;
case ALGO_X13: register_x13_algo ( gate ); break;
case ALGO_X13BCD: register_x13bcd_algo ( gate ); break;
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
case ALGO_X14: register_x14_algo ( gate ); break;
case ALGO_X15: register_x15_algo ( gate ); break;
case ALGO_X16R: register_x16r_algo ( gate ); break;
case ALGO_X16RV2: register_x16rv2_algo ( gate ); break;
case ALGO_X16RT: register_x16rt_algo ( gate ); break;
case ALGO_X16RT_VEIL: register_x16rt_veil_algo ( gate ); break;
case ALGO_X16S: register_x16s_algo ( gate ); break;
case ALGO_X17: register_x17_algo ( gate ); break;
case ALGO_X21S: register_x21s_algo ( gate ); break;
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
/* case ALGO_YESCRYPT: register_yescrypt_05_algo ( gate ); break;
case ALGO_YESCRYPTR8: register_yescryptr8_05_algo ( gate ); break;
@@ -245,6 +252,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
case ALGO_YESPOWER: register_yespower_algo ( gate ); break;
case ALGO_YESPOWERR16: register_yespowerr16_algo ( gate ); break;
case ALGO_YESPOWER_B2B: register_yespower_b2b_algo ( gate ); break;
case ALGO_ZR5: register_zr5_algo ( gate ); break;
default:
applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
@@ -327,19 +335,16 @@ const char* const algo_alias_map[][2] =
{ "lyra2", "lyra2re" },
{ "lyra2v2", "lyra2rev2" },
{ "lyra2v3", "lyra2rev3" },
{ "lyra2zoin", "lyra2z330" },
{ "myrgr", "myr-gr" },
{ "myriad", "myr-gr" },
{ "neo", "neoscrypt" },
{ "phi", "phi1612" },
// { "sia", "blake2b" },
{ "sib", "x11gost" },
{ "timetravel8", "timetravel" },
{ "ziftr", "zr5" },
{ "veil", "x16rt-veil" },
{ "x16r-hex", "hex" },
{ "yenten", "yescryptr16" },
{ "yescryptr8k", "yescrypt" },
{ "zcoin", "lyra2z" },
{ "zoin", "lyra2z330" },
{ "ziftr", "zr5" },
{ NULL, NULL }
};
@@ -361,40 +366,3 @@ void get_algo_alias( char** algo_or_alias )
#undef ALIAS
#undef PROPER
bool submit_solution( struct work *work, void *hash,
struct thr_info *thr )
{
work_set_target_ratio( work, hash );
if ( submit_work( thr, work ) )
{
if ( !opt_quiet )
applog( LOG_BLUE, "Share %d submitted by thread %d, job %s.",
accepted_share_count + rejected_share_count + 1,
thr->id, work->job_id );
return true;
}
else
applog( LOG_WARNING, "Failed to submit share." );
return false;
}
bool submit_lane_solution( struct work *work, void *hash,
struct thr_info *thr, int lane )
{
work_set_target_ratio( work, hash );
if ( submit_work( thr, work ) )
{
if ( !opt_quiet )
// applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d.",
// accepted_share_count + rejected_share_count + 1,
// thr->id, lane );
applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d, job %s.",
accepted_share_count + rejected_share_count + 1, thr->id,
lane, work->job_id );
return true;
}
else
applog( LOG_WARNING, "Failed to submit share." );
return false;
}

View File

@@ -85,14 +85,16 @@
typedef uint32_t set_t;
#define EMPTY_SET 0
#define SSE2_OPT 1
#define AES_OPT 2
#define SSE42_OPT 4
#define AVX_OPT 8
#define AVX2_OPT 0x10
#define SHA_OPT 0x20
#define AVX512_OPT 0x40
#define EMPTY_SET 0
#define SSE2_OPT 1
#define AES_OPT 2
#define SSE42_OPT 4
#define AVX_OPT 8 // Sandybridge
#define AVX2_OPT 0x10 // Haswell
#define SHA_OPT 0x20 // sha256 (Ryzen, Ice Lake)
#define AVX512_OPT 0x40 // AVX512- F, VL, DQ, BW (Skylake-X)
#define VAES_OPT 0x80 // VAES (Ice Lake)
// return set containing all elements from sets a & b
inline set_t set_union ( set_t a, set_t b ) { return a | b; }
@@ -132,7 +134,6 @@ void ( *decode_extra_data ) ( struct work*, uint64_t* );
void ( *wait_for_diff ) ( struct stratum_ctx* );
int64_t ( *get_max64 ) ();
bool ( *work_decode ) ( const json_t*, struct work* );
void ( *set_target) ( struct work*, double );
bool ( *submit_getwork_result ) ( CURL*, struct work* );
void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
@@ -193,15 +194,6 @@ void four_way_not_tested();
// allways returns failure
int null_scanhash();
// Allow algos to submit from scanhash loop.
bool submit_solution( struct work *work, void *hash,
struct thr_info *thr );
bool submit_lane_solution( struct work *work, void *hash,
struct thr_info *thr, int lane );
bool submit_work( struct thr_info *thr, const struct work *work_in );
// displays warning
void null_hash ();
void null_hash_suw();
@@ -232,10 +224,6 @@ int64_t get_max64_0x3fffffLL();
int64_t get_max64_0x1ffff();
int64_t get_max64_0xffffLL();
void std_set_target( struct work *work, double job_diff );
void alt_set_target( struct work* work, double job_diff );
void scrypt_set_target( struct work *work, double job_diff );
bool std_le_work_decode( const json_t *val, struct work *work );
bool std_be_work_decode( const json_t *val, struct work *work );
bool jr2_work_decode( const json_t *val, struct work *work );

View File

@@ -85,8 +85,9 @@ bool register_argon2_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_argon2;
gate->hash = (void*)&argon2hash;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->set_target = (void*)&scrypt_set_target;
gate->get_max64 = (void*)&argon2_get_max64;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -36,43 +36,39 @@ void argon2d_crds_hash( void *output, const void *input )
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) endiandata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t _ALIGN(64) endiandata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
swab32_array( endiandata, pdata, 20 );
uint32_t nonce = first_nonce;
do {
be32enc(&endiandata[19], nonce);
argon2d_crds_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash, mythr );
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
swab32_array( endiandata, pdata, 20 );
do {
be32enc(&endiandata[19], nonce);
argon2d_crds_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio(work, hash);
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
bool register_argon2d_crds_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d_crds;
gate->hash = (void*)&argon2d_crds_hash;
gate->set_target = (void*)&scrypt_set_target;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 65536.0;
return true;
}
@@ -107,43 +103,40 @@ void argon2d_dyn_hash( void *output, const void *input )
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) endiandata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t _ALIGN(64) endiandata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
swab32_array( endiandata, pdata, 20 );
uint32_t nonce = first_nonce;
do
{
be32enc(&endiandata[19], nonce);
argon2d_dyn_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash, mythr );
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
swab32_array( endiandata, pdata, 20 );
do {
be32enc(&endiandata[19], nonce);
argon2d_dyn_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio(work, hash);
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
bool register_argon2d_dyn_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d_dyn;
gate->hash = (void*)&argon2d_dyn_hash;
gate->set_target = (void*)&scrypt_set_target;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 65536.0;
return true;
}
@@ -171,11 +164,10 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
be32enc( &endiandata[19], n );
argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80,
(char*) endiandata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) )
if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) && !opt_benchmark )
{
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
submit_solution( work, vhash, mythr );
}
n++;
@@ -192,9 +184,9 @@ int64_t get_max64_0x1ff() { return 0x1ff; }
bool register_argon2d4096_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d4096;
gate->set_target = (void*)&scrypt_set_target;
gate->get_max64 = (void*)&get_max64_0x1ff;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 65536.0;
return true;
}

View File

@@ -28,6 +28,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mm_malloc.h>
#include "core.h"
#include "argon2d_thread.h"
@@ -99,7 +100,8 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
if (context->allocate_cbk) {
(context->allocate_cbk)(memory, memory_size);
} else {
*memory = malloc(memory_size);
*memory = _mm_malloc( memory_size, 64 );
// *memory = malloc(memory_size);
}
if (*memory == NULL) {
@@ -116,7 +118,8 @@ void free_memory(const argon2_context *context, uint8_t *memory,
if (context->free_cbk) {
(context->free_cbk)(memory, memory_size);
} else {
free(memory);
// free(memory);
_mm_free( memory );
}
}

View File

@@ -96,14 +96,14 @@ static void fill_block(__m256i *state, const block *ref_block,
if (with_xor) {
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
state[i] = _mm256_xor_si256(
state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
state[i], _mm256_load_si256((const __m256i *)ref_block->v + i));
block_XY[i] = _mm256_xor_si256(
state[i], _mm256_loadu_si256((const __m256i *)next_block->v + i));
state[i], _mm256_load_si256((const __m256i *)next_block->v + i));
}
} else {
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
block_XY[i] = state[i] = _mm256_xor_si256(
state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
state[i], _mm256_load_si256((const __m256i *)ref_block->v + i));
}
}
@@ -139,7 +139,7 @@ static void fill_block(__m256i *state, const block *ref_block,
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
state[i] = _mm256_xor_si256(state[i], block_XY[i]);
_mm256_storeu_si256((__m256i *)next_block->v + i, state[i]);
_mm256_store_si256((__m256i *)next_block->v + i, state[i]);
}
}

View File

@@ -29,6 +29,8 @@
#include <x86intrin.h>
#endif
#include "simd-utils.h"
#if !defined(__AVX512F__)
#if !defined(__AVX2__)
#if !defined(__XOP__)
@@ -182,64 +184,63 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
#include <immintrin.h>
#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
#define rotr32 mm256_swap32_64
#define rotr24 mm256_ror3x8_64
#define rotr16 mm256_ror1x16_64
#define rotr63( x ) mm256_rol_64( x, 1 )
//#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
//#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
//#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
//#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i ml = _mm256_mul_epu32(A0, B0); \
ml = _mm256_add_epi64(ml, ml); \
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
__m256i ml0, ml1; \
ml0 = _mm256_mul_epu32(A0, B0); \
ml1 = _mm256_mul_epu32(A1, B1); \
ml0 = _mm256_add_epi64(ml0, ml0); \
ml1 = _mm256_add_epi64(ml1, ml1); \
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml0)); \
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml1)); \
D0 = _mm256_xor_si256(D0, A0); \
D0 = rotr32(D0); \
\
ml = _mm256_mul_epu32(C0, D0); \
ml = _mm256_add_epi64(ml, ml); \
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
\
B0 = _mm256_xor_si256(B0, C0); \
B0 = rotr24(B0); \
\
ml = _mm256_mul_epu32(A1, B1); \
ml = _mm256_add_epi64(ml, ml); \
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
D1 = _mm256_xor_si256(D1, A1); \
D0 = rotr32(D0); \
D1 = rotr32(D1); \
\
ml = _mm256_mul_epu32(C1, D1); \
ml = _mm256_add_epi64(ml, ml); \
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
\
ml0 = _mm256_mul_epu32(C0, D0); \
ml1 = _mm256_mul_epu32(C1, D1); \
ml0 = _mm256_add_epi64(ml0, ml0); \
ml1 = _mm256_add_epi64(ml1, ml1); \
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml0)); \
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml1)); \
B0 = _mm256_xor_si256(B0, C0); \
B1 = _mm256_xor_si256(B1, C1); \
B0 = rotr24(B0); \
B1 = rotr24(B1); \
} while((void)0, 0);
#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i ml = _mm256_mul_epu32(A0, B0); \
ml = _mm256_add_epi64(ml, ml); \
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
__m256i ml0, ml1; \
ml0 = _mm256_mul_epu32(A0, B0); \
ml1 = _mm256_mul_epu32(A1, B1); \
ml0 = _mm256_add_epi64(ml0, ml0); \
ml1 = _mm256_add_epi64(ml1, ml1); \
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml0)); \
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml1)); \
D0 = _mm256_xor_si256(D0, A0); \
D0 = rotr16(D0); \
\
ml = _mm256_mul_epu32(C0, D0); \
ml = _mm256_add_epi64(ml, ml); \
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
B0 = _mm256_xor_si256(B0, C0); \
B0 = rotr63(B0); \
\
ml = _mm256_mul_epu32(A1, B1); \
ml = _mm256_add_epi64(ml, ml); \
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
D1 = _mm256_xor_si256(D1, A1); \
D0 = rotr16(D0); \
D1 = rotr16(D1); \
\
ml = _mm256_mul_epu32(C1, D1); \
ml = _mm256_add_epi64(ml, ml); \
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
ml0 = _mm256_mul_epu32(C0, D0); \
ml1 = _mm256_mul_epu32(C1, D1); \
ml0 = _mm256_add_epi64(ml0, ml0); \
ml1 = _mm256_add_epi64(ml1, ml1); \
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml0)); \
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml1)); \
B0 = _mm256_xor_si256(B0, C0); \
B1 = _mm256_xor_si256(B1, C1); \
B0 = rotr63(B0); \
B1 = rotr63(B1); \
} while((void)0, 0);
@@ -259,16 +260,14 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
\
tmp1 = C0; \
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
C0 = C1; \
C1 = tmp1; \
\
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
C1 = tmp1; \
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
} while(0);
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -287,16 +286,14 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
\
tmp1 = C0; \
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
C0 = C1; \
C1 = tmp1; \
\
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
C1 = tmp1; \
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
} while((void)0, 0);
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \

View File

@@ -308,12 +308,12 @@ static const sph_u32 CS[16] = {
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
do { \
a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
_mm_set_epi32( c1, c1, c1, c1 ), m0 ), b ), a ); \
_mm_set1_epi32( c1 ), m0 ), b ), a ); \
d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
_mm_set_epi32( c0, c0, c0, c0 ), m1 ), b ), a ); \
_mm_set1_epi32( c0 ), m1 ), b ), a ); \
d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
@@ -508,14 +508,18 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
V8 = _mm_xor_si128( S0, m128_const1_64( 0x243F6A88243F6A88 ) ); \
V9 = _mm_xor_si128( S1, m128_const1_64( 0x85A308D385A308D3 ) ); \
VA = _mm_xor_si128( S2, m128_const1_64( 0x13198A2E13198A2E ) ); \
VB = _mm_xor_si128( S3, m128_const1_64( 0x0370734403707344 ) ); \
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \
m128_const1_64( 0xA4093822A4093822 ) ); \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \
m128_const1_64( 0x299F31D0299F31D0 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), \
m128_const1_64( 0x082EFA98082EFA98 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), \
m128_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
BLAKE256_4WAY_BLOCK_BSWAP32; \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
@@ -631,16 +635,20 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set1_epi32( CS0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set1_epi32( CS1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set1_epi32( CS2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set1_epi32( CS3 ) ); \
VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS4 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
shuf_bswap32 = _mm256_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
V8 = _mm256_xor_si256( S0, m256_const1_64( 0x243F6A88243F6A88 ) ); \
V9 = _mm256_xor_si256( S1, m256_const1_64( 0x85A308D385A308D3 ) ); \
VA = _mm256_xor_si256( S2, m256_const1_64( 0x13198A2E13198A2E ) ); \
VB = _mm256_xor_si256( S3, m256_const1_64( 0x0370734403707344 ) ); \
VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
m256_const1_64( 0xA4093822A4093822 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
m256_const1_64( 0x299F31D0299F31D0 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
m256_const1_64( 0x082EFA98082EFA98 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
shuf_bswap32 = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
@@ -696,14 +704,14 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
{
__m128i zero = m128_zero;
casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 );
casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 );
casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 );
casti_m128i( ctx->H, 3 ) = m128_const1_64( 0xA54FF53AA54FF53A );
casti_m128i( ctx->H, 4 ) = m128_const1_64( 0x510E527F510E527F );
casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C );
casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB );
casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 );
casti_m128i( ctx->S, 0 ) = zero;
casti_m128i( ctx->S, 1 ) = zero;
@@ -778,12 +786,13 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
else
ctx->T0 -= 512 - bit_len;
buf[vptr] = _mm_set1_epi32( 0x80 );
buf[vptr] = m128_const1_64( 0x0000008000000080 );
if ( vptr < 12 )
{
memset_zero_128( buf + vptr + 1, 13 - vptr );
buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
m128_const1_64( 0x0100000001000000ULL ) );
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( ctx, buf + vptr, 64 - ptr );
@@ -795,7 +804,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
ctx->T0 = 0xFFFFFE00UL;
ctx->T1 = 0xFFFFFFFFUL;
memset_zero_128( buf, 56>>2 );
buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
buf[ 13 ] = _mm_or_si128( buf[ 13 ],
m128_const1_64( 0x0100000001000000ULL ) );
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( ctx, buf, 64 );
@@ -815,20 +825,18 @@ blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt, int rounds )
{
__m256i zero = m256_zero;
casti_m256i( sc->H, 0 ) = _mm256_set1_epi32( iv[0] );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi32( iv[1] );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi32( iv[2] );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi32( iv[3] );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi32( iv[4] );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi32( iv[5] );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi32( iv[6] );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi32( iv[7] );
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53AA54FF53A );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527F510E527F );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD195BE0CD19 );
casti_m256i( sc->S, 0 ) = zero;
casti_m256i( sc->S, 1 ) = zero;
casti_m256i( sc->S, 2 ) = zero;
casti_m256i( sc->S, 3 ) = zero;
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
sc->rounds = rounds;
@@ -887,7 +895,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = _mm256_set1_epi32( 0x80 );
buf[ptr>>2] = m256_const1_64( 0x0000008000000080ULL );
tl = sc->T0 + bit_len;
th = sc->T1;
@@ -909,7 +917,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if ( out_size_w32 == 8 )
buf[52>>2] = _mm256_or_si256( buf[52>>2],
_mm256_set1_epi32( 0x01000000UL ) );
m256_const1_64( 0x0100000001000000ULL ) );
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
@@ -922,7 +930,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_256( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
buf[52>>2] = m256_const1_64( 0x0100000001000000ULL );
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, buf, 64 );

View File

@@ -1,322 +0,0 @@
// convert blake256 32 bit to use 64 bit with serial vectoring
//
// cut calls to GS in half
//
// combine V
// v0 = {V0,V1}
// v1 = {V2,V3}
// v2 = {V4,V5}
// v3 = {V6,V7}
// v4 = {V8,V9}
// v5 = {VA,VB}
// v6 = {VC,VD}
// v7 = {CE,VF}
//
// v6x = {VD,VC} swap(VC,VD) swap(v6)
// v7x = {VF,VE} swap(VE,VF) swap(v7)
//
// V0 = v1v0
// V1 = v3v2
// V2 = v5v4
// V3 = v7v6
// V4 = v9v8
// V5 = vbva
// V6 = vdvc
// V7 = vfve
//
// The rotate in ROUND is to effect straddle and unstraddle for the third
// and 4th iteration of GS.
// It concatenates 2 contiguous 256 bit vectors and extracts the middle
// 256 bits. After the transform they must be restored with only the
// chosen bits modified in the original 2 vectors.
// ror1x128 achieves this by putting the chosen bits in arg1, the "low"
// 256 bit vector and saves the untouched bits temporailly in arg0, the
// "high" 256 bit vector. Simply reverse the process to restore data back
// to original positions.
// Use standard 4way when AVX2 is not available use x2 mode with AVX2.
//
// Data is organised the same as 32 bit 4 way, in effect serial vectoring
// on top of parallel vectoring. Same data in the same place just taking
// two chunks at a time.
//
// Transparent to user, x2 mode used when AVX2 detected.
// Use existing 4way context but revert to scalar types.
// Same interleave function (128 bit) or x2 with 256 bit?
// User trsnaparency would have to apply to interleave as well.
//
// Use common 4way update and close
/*
typedef struct {
unsigned char buf[64<<2];
uint32_t H[8<<2];
uint32_t S[4<<2];
size_t ptr;
uint32_t T0, T1;
int rounds; // 14 for blake, 8 for blakecoin & vanilla
} blakex2_4way_small_context __attribute__ ((aligned (64)));
*/
static void
blake32x2_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
{
casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
casti_m128i( ctx->S, 0 ) = m128_zero;
casti_m128i( ctx->S, 1 ) = m128_zero;
casti_m128i( ctx->S, 2 ) = m128_zero;
casti_m128i( ctx->S, 3 ) = m128_zero;
/*
sc->S[0] = _mm_set1_epi32( salt[0] );
sc->S[1] = _mm_set1_epi32( salt[1] );
sc->S[2] = _mm_set1_epi32( salt[2] );
sc->S[3] = _mm_set1_epi32( salt[3] );
*/
ctx->T0 = ctx->T1 = 0;
ctx->ptr = 0;
ctx->rounds = rounds;
}
static void
blake32x2( blake_4way_small_context *ctx, const void *data, size_t len )
{
__m128i *buf = (__m256i*)ctx->buf;
size_t bptr = ctx->ptr << 2;
size_t vptr = ctx->ptr >> 3;
size_t blen = len << 2;
// unsigned char *buf = ctx->buf;
// size_t ptr = ctx->ptr<<4; // repurposed
DECL_STATE32x2
// buf = sc->buf;
// ptr = sc->ptr;
// adjust len for use with ptr, clen, all absolute bytes.
// int blen = len<<2;
if ( blen < (sizeof ctx->buf) - bptr )
{
memcpy( buf + vptr, data, blen );
ptr += blen;
ctx->ptr = bptr >> 2;;
return;
}
READ_STATE32( ctx );
while ( blen > 0 )
{
size_t clen;
clen = ( sizeof sc->buf ) - ptr;
if ( clen > blen )
clen = blen;
memcpy( buf + vptr, data, clen );
bptr += clen;
vptr = bptr >> 5;
data = (const unsigned char *)data + clen;
blen -= clen;
if ( bptr == sizeof ctx->buf )
{
if ( ( T0 = T0 + 512 ) < 512 ) // not needed, will never rollover
T1 += 1;
COMPRESS32x2_4WAY( ctx->rounds );
ptr = 0;
}
}
WRITE_STATE32x2( ctx );
ctx->ptr = bptr >> 2;
}
static void
blake32x2_4way_close( blake_4way_small_context *ctx, void *dst )
{
__m256i buf[8] __attribute__ ((aligned (64)));
size_t ptr = ctx->ptr;
size_t vptr = ctx->ptr>>2;
unsigned bit_len = ( (unsigned)ptr << 3 ); // one lane
uint32_t th = ctx->T1;
uint32_t tl = ctx->T0 + bit_len;
if ( ptr == 0 )
{
ctx->T0 = 0xFFFFFE00UL;
ctx->T1 = 0xFFFFFFFFUL;
}
else if ( ctx->T0 == 0 )
{
ctx->T0 = 0xFFFFFE00UL + bit_len;
ctx->T1 -= 1;
}
else
ctx->T0 -= 512 - bit_len;
// memset doesn't do ints
buf[ vptr ] = _mm256_set_epi32( 0,0,0,0, 0x80, 0x80, 0x80, 0x80 );
if ( vptr < 5 )
{
memset_zero_256( buf + vptr + 1, 6 - vptr );
buf[ 6 ] = _mm256_or_si256( vbuf[ 6 ], _mm256_set_epi32(
0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL, 0,0,0,0 ) );
buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl,tl,tl,tl,
th,th,th,th ) );
blake32x2_4way( ctx, buf + vptr, 64 - ptr );
}
else
{
memset_zero_256( vbuf + vptr + 1, 7 - vptr );
blake32x2_4way( ctx, vbuf + ptr, 64 - ptr );
ctx->T0 = 0xFFFFFE00UL;
ctx->T1 = 0xFFFFFFFFUL;
buf[ 6 ] = mm256_zero;
buf[ 6 ] = _mm256_set_epi32( 0,0,0,0,
0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL );
buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl, tl, tl, tl,
th, th, th, th );
blake32x2_4way( ctx, buf, 64 );
}
casti_m256i( dst, 0 ) = mm256_bswap_32( casti_m256i( ctx->H, 0 ) );
casti_m256i( dst, 1 ) = mm256_bswap_32( casti_m256i( ctx->H, 1 ) );
casti_m256i( dst, 2 ) = mm256_bswap_32( casti_m256i( ctx->H, 2 ) );
casti_m256i( dst, 3 ) = mm256_bswap_32( casti_m256i( ctx->H, 3 ) );
}
#define DECL_STATE32x2_4WAY \
__m256i H0, H1, H2, H3; \
__m256i S0, S1; \
uint32_t T0, T1;
#define READ_STATE32x2_4WAY(state) do \
{ \
H0 = casti_m256i( state->H, 0 ); \
H1 = casti_m256i( state->H, 1 ); \
H2 = casti_m256i( state->H, 2 ); \
H3 = casti_m256i( state->H, 3 ); \
S0 = casti_m256i( state->S, 0 ); \
S1 = casti_m256i( state->S, 1 ); \
T0 = state->T0; \
T1 = state->T1; \
#define WRITE_STATE32x2_4WAY(state) do { \
casti_m256i( state->H, 0 ) = H0; \
casti_m256i( state->H, 1 ) = H1; \
casti_m256i( state->H, 2 ) = H2; \
casti_m256i( state->H, 3 ) = H3; \
casti_m256i( state->S, 0 ) = S0; \
casti_m256i( state->S, 1 ) = S1; \
state->T0 = T0; \
state->T1 = T1; \
} while (0)
#define GSx2_4WAY( m0m2, m1m3, c0c2, c1c3, a, b, c, d ) do \
{ \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set_epi32( c1,c3, c1,c3, c1,c3, c1,c3 ), \
_mm256_set_epi32( m0,m2, m0,m2, m0,m2, m0,m2 ) ), b ), a ); \
d = mm256_ror_32( _mm_xor_si128( d, a ), 16 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set_epi32( c0,c2, c0,c2, c0,c2, c0,c2 ), \
_mm256_set_epi32( m1,m3, m1,m3, m1,m3, m1,m3 ) ), b ), a ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
} while (0)
#define ROUND_Sx2_4WAY(r) do \
{ \
GS2_4WAY( Mx(r, 0), Mx(r, 1), Mx(r, 2), Mx(r, 3), \
CSx(r, 0), CSx(r, 1), CSx(r, 2), CSx(r, 3), V0, V2, V4, V6 ); \
GS2_4WAY( Mx(r, 4), Mx(r, 5), Mx(r, 6), Mx(r, 7), \
CSx(r, 4), CSx(r, 5), CSx(r, 6), CSx(r, 7), V1, V3, V5, V7 ); \
mm256_ror1x128_512( V3, V2 ); \
mm256_ror1x128_512( V6, V7 ); \
GS2_4WAY( Mx(r, 8), Mx(r, 9), Mx(r, A), Mx(r, B), \
CSx(r, 8), CSx(r, 9), CSx(r, A), CSx(r, B), V0, V2, V5, V7 ); \
GS2_4WAY( Mx(r, C), Mx(r, D), Mx(r, C), Mx(r, D), \
CSx(r, C), CSx(r, D), CSx(r, C), CSx(r, D), V1, V3, V4, V6 ); \
mm256_rol1x128_512( V2, V3 ); \
mm256_rol1x128_512( V7, V6 );
#define COMPRESS32x2_4WAY( rounds ) do \
{ \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
unsigned r; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = _mm256_xor_si256( S0, _mm256_set_epi32( CS1, CS1, CS1, CS1, \
CS0, CS0, CS0, CS0 ) ); \
V5 = _mm256_xor_si256( S1, _mm256_set_epi32( CS3, CS3, CS3, CS3, \
CS2, CS2, CS2, CS2 ) ); \
V6 = _mm256_xor_si256( _mm256_set1_epi32( T0 ), \
_mm256_set_epi32( CS5, CS5, CS5, CS5, \
CS4, CS4, CS4, CS4 ) ); \
V7 = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
_mm256_set_epi32( CS7, CS7, CS7, CS7, \
CS6, CS6, CS6, CS6 ) ); \
M0 = mm256_bswap_32( buf[ 0] ); \
M1 = mm256_bswap_32( buf[ 1] ); \
M2 = mm256_bswap_32( buf[ 2] ); \
M3 = mm256_bswap_32( buf[ 3] ); \
M4 = mm256_bswap_32( buf[ 4] ); \
M5 = mm256_bswap_32( buf[ 5] ); \
M6 = mm256_bswap_32( buf[ 6] ); \
M7 = mm256_bswap_32( buf[ 7] ); \
ROUND_Sx2_4WAY(0); \
ROUND_Sx2_4WAY(1); \
ROUND_Sx2_4WAY(2); \
ROUND_Sx2_4WAY(3); \
ROUND_Sx2_4WAY(4); \
ROUND_Sx2_4WAY(5); \
ROUND_Sx2_4WAY(6); \
ROUND_Sx2_4WAY(7); \
if (rounds == 14) \
{ \
ROUND_Sx2_4WAY(8); \
ROUND_Sx2_4WAY(9); \
ROUND_Sx2_4WAY(0); \
ROUND_Sx2_4WAY(1); \
ROUND_Sx2_4WAY(2); \
ROUND_Sx2_4WAY(3); \
} \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( V8, V0 ), S0 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( V9, V1 ), S1 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( VA, V2 ), S2 ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( VB, V3 ), S3 ), H3 ); \
} while (0)

67
algo/blake/blake2b-4way.c Normal file
View File

@@ -0,0 +1,67 @@
/**
* Blake2-B Implementation
* tpruvot@github 2015-2016
*/
#include "blake2b-gate.h"
#if defined(BLAKE2B_4WAY)
#include <string.h>
#include <stdint.h>
#include "blake2b-hash-4way.h"
// Function not used, code inlined.
void blake2b_4way_hash(void *output, const void *input)
{
blake2b_4way_ctx ctx;
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, input, 80 );
blake2b_4way_final( &ctx, output );
}
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
do {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, vdata, 80 );
blake2b_4way_final( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] < Htarg )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

25
algo/blake/blake2b-gate.c Normal file
View File

@@ -0,0 +1,25 @@
#include "blake2b-gate.h"
/*
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blake2s_get_max64 ()
{
return 0x7ffffLL;
}
*/
bool register_blake2b_algo( algo_gate_t* gate )
{
#if defined(BLAKE2B_4WAY)
gate->scanhash = (void*)&scanhash_blake2b_4way;
gate->hash = (void*)&blake2b_4way_hash;
#else
gate->scanhash = (void*)&scanhash_blake2b;
gate->hash = (void*)&blake2b_hash;
#endif
// gate->get_max64 = (void*)&blake2s_get_max64;
gate->optimizations = AVX2_OPT;
return true;
};

26
algo/blake/blake2b-gate.h Normal file
View File

@@ -0,0 +1,26 @@
#ifndef __BLAKE2B_GATE_H__
#define __BLAKE2B_GATE_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX2__)
#define BLAKE2B_4WAY
#endif
bool register_blake2b_algo( algo_gate_t* gate );
#if defined(BLAKE2B_4WAY)
void blake2b_4way_hash( void *state, const void *input );
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void blake2b_hash( void *state, const void *input );
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif

View File

@@ -0,0 +1,215 @@
/*
* Copyright 2009 Colin Percival, 2014 savale
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "blake2b-hash-4way.h"
#if defined(__AVX2__)
// G Mixing function.
#define B2B_G(a, b, c, d, x, y) \
{ \
v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \
v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \
v[c] = _mm256_add_epi64( v[c], v[d] ); \
v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \
v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \
v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \
v[c] = _mm256_add_epi64( v[c], v[d] ); \
v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \
}
// Initialization Vector.
/*
static const uint64_t blake2b_iv[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
*/
static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
{
const uint8_t sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
int i;
__m256i v[16], m[16];
v[ 0] = ctx->h[0];
v[ 1] = ctx->h[1];
v[ 2] = ctx->h[2];
v[ 3] = ctx->h[3];
v[ 4] = ctx->h[4];
v[ 5] = ctx->h[5];
v[ 6] = ctx->h[6];
v[ 7] = ctx->h[7];
v[ 8] = m256_const1_64( 0x6A09E667F3BCC908 );
v[ 9] = m256_const1_64( 0xBB67AE8584CAA73B );
v[10] = m256_const1_64( 0x3C6EF372FE94F82B );
v[11] = m256_const1_64( 0xA54FF53A5F1D36F1 );
v[12] = m256_const1_64( 0x510E527FADE682D1 );
v[13] = m256_const1_64( 0x9B05688C2B3E6C1F );
v[14] = m256_const1_64( 0x1F83D9ABFB41BD6B );
v[15] = m256_const1_64( 0x5BE0CD19137E2179 );
v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );
if ( last )
v[14] = mm256_not( v[14] );
m[ 0] = ctx->b[ 0];
m[ 1] = ctx->b[ 1];
m[ 2] = ctx->b[ 2];
m[ 3] = ctx->b[ 3];
m[ 4] = ctx->b[ 4];
m[ 5] = ctx->b[ 5];
m[ 6] = ctx->b[ 6];
m[ 7] = ctx->b[ 7];
m[ 8] = ctx->b[ 8];
m[ 9] = ctx->b[ 9];
m[10] = ctx->b[10];
m[11] = ctx->b[11];
m[12] = ctx->b[12];
m[13] = ctx->b[13];
m[14] = ctx->b[14];
m[15] = ctx->b[15];
for ( i = 0; i < 12; i++ )
{
B2B_G( 0, 4, 8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
B2B_G( 1, 5, 9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
B2B_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] );
B2B_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] );
B2B_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] );
B2B_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] );
B2B_G( 2, 7, 8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] );
B2B_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
}
ctx->h[0] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[0], v[0] ), v[ 8] );
ctx->h[1] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[1], v[1] ), v[ 9] );
ctx->h[2] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[2], v[2] ), v[10] );
ctx->h[3] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[3], v[3] ), v[11] );
ctx->h[4] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[4], v[4] ), v[12] );
ctx->h[5] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[5], v[5] ), v[13] );
ctx->h[6] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[6], v[6] ), v[14] );
ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
}
int blake2b_4way_init( blake2b_4way_ctx *ctx )
{
size_t i;
ctx->h[0] = m256_const1_64( 0x6A09E667F3BCC908 );
ctx->h[1] = m256_const1_64( 0xBB67AE8584CAA73B );
ctx->h[2] = m256_const1_64( 0x3C6EF372FE94F82B );
ctx->h[3] = m256_const1_64( 0xA54FF53A5F1D36F1 );
ctx->h[4] = m256_const1_64( 0x510E527FADE682D1 );
ctx->h[5] = m256_const1_64( 0x9B05688C2B3E6C1F );
ctx->h[6] = m256_const1_64( 0x1F83D9ABFB41BD6B );
ctx->h[7] = m256_const1_64( 0x5BE0CD19137E2179 );
ctx->h[0] = _mm256_xor_si256( ctx->h[0], m256_const1_64( 0x01010020 ) );
ctx->t[0] = 0;
ctx->t[1] = 0;
ctx->c = 0;
ctx->outlen = 32;
for ( i = 0; i < 16; i++ )
ctx->b[i] = m256_zero;
return 0;
}
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
size_t inlen )
{
__m256i* in =(__m256i*)input;
size_t i, c;
c = ctx->c >> 3;
for ( i = 0; i < (inlen >> 3); i++ )
{
if ( ctx->c == 128 )
{
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_4way_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
ctx->c += 8;
}
}
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
while ( ctx->c < 128 )
{
ctx->b[c++] = m256_zero;
ctx->c += 8;
}
blake2b_4way_compress( ctx, 1 ); // final block flag = 1
casti_m256i( out, 0 ) = ctx->h[0];
casti_m256i( out, 1 ) = ctx->h[1];
casti_m256i( out, 2 ) = ctx->h[2];
casti_m256i( out, 3 ) = ctx->h[3];
}
#endif

View File

@@ -0,0 +1,35 @@
#pragma once
#ifndef __BLAKE2B_HASH_4WAY_H__
#define __BLAKE2B_HASH_4WAY_H__
#if defined(__AVX2__)
#include "simd-utils.h"
#include <stddef.h>
#include <stdint.h>
#if defined(_MSC_VER)
#include <inttypes.h>
#define inline __inline
#define ALIGN(x) __declspec(align(x))
#else
#define ALIGN(x) __attribute__((aligned(x)))
#endif
// state context
ALIGN(64) typedef struct {
__m256i b[16]; // input buffer
__m256i h[8]; // chained state
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_4way_ctx __attribute__((aligned(64)));
int blake2b_4way_init( blake2b_4way_ctx *ctx );
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
size_t inlen );
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
#endif
#endif

View File

@@ -3,13 +3,11 @@
* tpruvot@github 2015-2016
*/
#include "algo-gate-api.h"
#include "blake2b-gate.h"
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake2b.h"
//static __thread sph_blake2b_ctx s_midstate;
//static __thread sph_blake2b_ctx s_ctx;
#define MIDLEN 76
#define A 64
@@ -25,16 +23,6 @@ void blake2b_hash(void *output, const void *input)
memcpy(output, hash, 32);
}
/*
static void blake2b_hash_end(uint32_t *output, const uint32_t *input)
{
s_ctx.outlen = MIDLEN;
memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
sph_blake2b_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
sph_blake2b_final(&s_ctx, (uint8_t*) output);
}
*/
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -45,7 +33,7 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[8];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
@@ -53,179 +41,23 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
be32enc(&endiandata[i], pdata[i]);
}
// midstate (untested yet)
//blake2b_init(&s_midstate, 32, NULL, 0);
//blake2b_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
//memcpy(&s_ctx, &s_midstate, sizeof(blake2b_ctx));
do {
be32enc(&endiandata[8], n);
be32enc(&endiandata[19], n);
//blake2b_hash_end(vhashcpu, endiandata);
blake2b_hash(vhashcpu, endiandata);
if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) {
work_set_target_ratio(work, vhashcpu);
*hashes_done = n - first_nonce + 1;
pdata[8] = n;
pdata[19] = n;
return 1;
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[8] = n;
pdata[19] = n;
return 0;
}
static inline void swab256(void *dest_p, const void *src_p)
{
uint32_t *dest = (uint32_t *)dest_p;
const uint32_t *src = (uint32_t *)src_p;
dest[0] = swab32(src[7]);
dest[1] = swab32(src[6]);
dest[2] = swab32(src[5]);
dest[3] = swab32(src[4]);
dest[4] = swab32(src[3]);
dest[5] = swab32(src[2]);
dest[6] = swab32(src[1]);
dest[7] = swab32(src[0]);
}
/* compute nbits to get the network diff */
void blake2b_calc_network_diff(struct work *work)
{
// sample for diff 43.281 : 1c05ea29
uint32_t nbits = work->data[11]; // unsure if correct
uint32_t bits = (nbits & 0xffffff);
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
double d = (double)0x0000ffff / (double)bits;
for (int m=shift; m < 29; m++) d *= 256.0;
for (int m=29; m < shift; m++) d /= 256.0;
if (opt_debug_diff)
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
net_diff = d;
}
void blake2b_be_build_stratum_request( char *req, struct work *work )
{
unsigned char *xnonce2str;
uint32_t ntime, nonce;
char ntimestr[9], noncestr[9];
be32enc( &ntime, work->data[ algo_gate.ntime_index ] );
be32enc( &nonce, work->data[ algo_gate.nonce_index ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
uint16_t high_nonce = swab32(work->data[9]) >> 16;
xnonce2str = abin2hex((unsigned char*)(&high_nonce), 2);
snprintf( req, JSON_BUF_LEN,
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free( xnonce2str );
}
#define min(a,b) (a>b ? (b) :(a))
// merkle root handled here, no need for gen_merkle_root gate target
void blake2b_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
uchar merkle_root[64] = { 0 };
uint32_t extraheader[32] = { 0 };
int headersize = 0;
size_t t;
int i;
// merkle root
memcpy( merkle_root, sctx->job.coinbase, 32 );
headersize = min( (int)sctx->job.coinbase_size - 32, sizeof(extraheader) );
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
// Increment extranonce2
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
// g_work->data[0] = le32dec( sctx->job.version );
// for ( i = 0; i < 8; i++ )
// g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
for ( i = 0; i < 8; i++ )
g_work->data[i] = ((uint32_t*)sctx->job.prevhash)[7-i];
// for ( i = 0; i < 8; i++ )
// g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
g_work->data[8] = 0; // nonce
g_work->data[9] = swab32( extraheader[0] ) | ( rand() & 0xf0 );
g_work->data[10] = be32dec( sctx->job.ntime );
g_work->data[11] = be32dec( sctx->job.nbits );
for ( i = 0; i < 8; i++ )
g_work->data[12+i] = ( (uint32_t*)merkle_root )[i];
}
#undef min
void blake2b_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t* end_nonce_ptr, bool clean_job )
{
const int wkcmp_sz = 32; // bytes
const int wkcmp_off = 32 + 16;
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
if ( memcmp( &work->data[ wkcmp_off ], &g_work->data[ wkcmp_off ], wkcmp_sz )
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr )
|| strcmp( work->job_id, g_work->job_id ) ) )
{
work_free( work );
work_copy( work, g_work );
*nonceptr = ( 0xffffffffU / opt_n_threads ) * thr_id;
if ( opt_randomize )
*nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
*end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
}
else
++(*nonceptr);
// suprnova job_id check without data/target/height change...
// we just may have copied new g_wwork to work so why this test here?
// if ( have_stratum && strcmp( work->job_id, g_work->job_id ) )
// exit thread loop
// continue;
// else
// {
// nonceptr[1] += 0x10;
// nonceptr[1] |= thr_id;
// }
}
bool blake2b_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id )
{
if ( have_stratum && strcmp( stratum->job.job_id, work->job_id ) )
// need to regen g_work..
return false;
// extradata: prevent duplicates
work->data[ 8 ] += 0x10;
work->data[ 8 + 1 ] |= thr_id;
return true;
}
double blake2b_get_max64() { return 0x1fffffLL; }
bool register_blake2b_algo( algo_gate_t* gate )
{
algo_not_tested();
gate->ntime_index = 10;
gate->nbits_index = 11;
gate->nonce_index = 8;
gate->work_cmp_size = 32;
gate->scanhash = (void*)&scanhash_blake2b;
gate->hash = (void*)&blake2b_hash;
gate->calc_network_diff = (void*)&blake2b_calc_network_diff;
gate->build_stratum_request = (void*)&blake2b_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->build_extraheader = (void*)&blake2b_build_extraheader;
gate->get_new_work = (void*)&blake2b_get_new_work;
gate->get_max64 = (void*)&blake2b_get_max64;
gate->ready_to_mine = (void*)&blake2b_ready_to_mine;
have_gbt = false;
return true;
}

View File

@@ -20,7 +20,7 @@ bool register_blake2s_algo( algo_gate_t* gate )
gate->hash = (void*)&blake2s_hash;
#endif
gate->get_max64 = (void*)&blake2s_get_max64;
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT;
return true;
};

View File

@@ -4,7 +4,8 @@
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__SSE4_2__)
//#if defined(__SSE4_2__)
#if defined(__SSE2__)
#define BLAKE2S_4WAY
#endif
#if defined(__AVX2__)

View File

@@ -17,7 +17,9 @@
#include <string.h>
#include <stdio.h>
#if defined(__SSE4_2__)
//#if defined(__SSE4_2__)
#if defined(__SSE2__)
static const uint32_t blake2s_IV[8] =
{
@@ -57,8 +59,18 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
memset( P->personal, 0, sizeof( P->personal ) );
memset( S, 0, sizeof( blake2s_4way_state ) );
for( int i = 0; i < 8; ++i )
S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
S->h[0] = m128_const1_64( 0x6A09E6676A09E667ULL );
S->h[1] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
S->h[2] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
S->h[3] = m128_const1_64( 0xA54FF53AA54FF53AULL );
S->h[4] = m128_const1_64( 0x510E527F510E527FULL );
S->h[5] = m128_const1_64( 0x9B05688C9B05688CULL );
S->h[6] = m128_const1_64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = m128_const1_64( 0x5BE0CD195BE0CD19ULL );
// for( int i = 0; i < 8; ++i )
// S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
uint32_t *p = ( uint32_t * )( P );
@@ -267,8 +279,18 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
memset( P->personal, 0, sizeof( P->personal ) );
memset( S, 0, sizeof( blake2s_8way_state ) );
for( int i = 0; i < 8; ++i )
S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
S->h[0] = m256_const1_64( 0x6A09E6676A09E667ULL );
S->h[1] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
S->h[2] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
S->h[3] = m256_const1_64( 0xA54FF53AA54FF53AULL );
S->h[4] = m256_const1_64( 0x510E527F510E527FULL );
S->h[5] = m256_const1_64( 0x9B05688C9B05688CULL );
S->h[6] = m256_const1_64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = m256_const1_64( 0x5BE0CD195BE0CD19ULL );
// for( int i = 0; i < 8; ++i )
// S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
uint32_t *p = ( uint32_t * )( P );

View File

@@ -14,7 +14,8 @@
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
#if defined(__SSE4_2__)
//#if defined(__SSE4_2__)
#if defined(__SSE2__)
#include "simd-utils.h"

View File

@@ -307,12 +307,12 @@ static const sph_u64 CB[16] = {
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \
_mm256_set1_epi64x( c1 ), m0 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set_epi64x( c0, c0, c0, c0 ), m1 ), b ), a ); \
_mm256_set1_epi64x( c0 ), m1 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
@@ -479,20 +479,20 @@ static const sph_u64 CB[16] = {
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set1_epi64x( CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set1_epi64x( CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set1_epi64x( CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set1_epi64x( CB3 ) ); \
V8 = _mm256_xor_si256( S0, m256_const1_64( CB0 ) ); \
V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) ); \
VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) ); \
VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
_mm256_set1_epi64x( CB4 ) ); \
m256_const1_64( CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
_mm256_set1_epi64x( CB5 ) ); \
m256_const1_64( CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
_mm256_set1_epi64x( CB6 ) ); \
m256_const1_64( CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
_mm256_set1_epi64x( CB7 ) ); \
shuf_bswap64 = _mm256_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
m256_const1_64( CB7 ) ); \
shuf_bswap64 = m256_const_64( 0x08090a0b0c0d0e0f, 0x0001020304050607, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
@@ -544,14 +544,14 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
const sph_u64 *salt )
{
__m256i zero = m256_zero;
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( iv[0] );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( iv[1] );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( iv[2] );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( iv[3] );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( iv[4] );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( iv[5] );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( iv[6] );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( iv[7] );
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
casti_m256i( sc->S, 0 ) = zero;
casti_m256i( sc->S, 1 ) = zero;
@@ -642,11 +642,9 @@ blake64_4way_close( blake_4way_big_context *sc,
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
*(buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
m256_const1_64( 0x0100000000000000ULL ) );
*(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
*(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
}
@@ -659,11 +657,9 @@ blake64_4way_close( blake_4way_big_context *sc,
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
memset_zero_256( buf, 112>>3 );
if ( out_size_w64 == 8 )
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
*(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
*(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );
blake64_4way( sc, buf, 128 );
}

View File

@@ -116,7 +116,7 @@ void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
// block header suffix from coinb2 (stake version)
memcpy( &g_work->data[44],
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
sctx->bloc_height = g_work->data[32];
sctx->block_height = g_work->data[32];
//applog_hex(work->data, 180);
//applog_hex(&work->data[36], 36);
}

View File

@@ -103,7 +103,6 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
v[13] ^= ctx->t[1]; // high 64 bits
if (last) // last block flag set ?
v[14] = ~v[14];
for (i = 0; i < 16; i++) // get little-endian words
m[i] = B2B_GET64(&ctx->b[8 * i]);
@@ -184,7 +183,8 @@ void sph_blake2b_final( sph_blake2b_ctx *ctx, void *out )
while (ctx->c < 128) // fill up with zeros
ctx->b[ctx->c++] = 0;
blake2b_compress(ctx, 1); // final block flag = 1
blake2b_compress(ctx, 1); // final block flag = 1
// little endian convert and store
for (i = 0; i < ctx->outlen; i++) {

View File

@@ -62,7 +62,7 @@ typedef struct {
typedef bmw_4way_small_context bmw256_4way_context;
void bmw256_4way_init(void *cc);
void bmw256_4way_init( bmw256_4way_context *ctx );
void bmw256_4way(void *cc, const void *data, size_t len);

View File

@@ -48,7 +48,7 @@ extern "C"{
#if defined(__SSE2__)
// BMW-256 4 way 32
/*
static const uint32_t IV256[] = {
0x40414243, 0x44454647,
0x48494A4B, 0x4C4D4E4F,
@@ -59,6 +59,7 @@ static const uint32_t IV256[] = {
0x70717273, 0x74757677,
0x78797A7B, 0x7C7D7E7F
};
*/
#define ss0(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
@@ -462,13 +463,30 @@ static const __m128i final_s[16] =
{ 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
};
*/
static void
bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
void bmw256_4way_init( bmw256_4way_context *ctx )
{
for ( int i = 0; i < 16; i++ )
sc->H[i] = _mm_set1_epi32( iv[i] );
sc->ptr = 0;
sc->bit_count = 0;
ctx->H[ 0] = m128_const1_64( 0x4041424340414243 );
ctx->H[ 1] = m128_const1_64( 0x4445464744454647 );
ctx->H[ 2] = m128_const1_64( 0x48494A4B48494A4B );
ctx->H[ 3] = m128_const1_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = m128_const1_64( 0x5051525350515253 );
ctx->H[ 5] = m128_const1_64( 0x5455565754555657 );
ctx->H[ 6] = m128_const1_64( 0x58595A5B58595A5B );
ctx->H[ 7] = m128_const1_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = m128_const1_64( 0x6061626360616263 );
ctx->H[ 9] = m128_const1_64( 0x6465666764656667 );
ctx->H[10] = m128_const1_64( 0x68696A6B68696A6B );
ctx->H[11] = m128_const1_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = m128_const1_64( 0x7071727370717273 );
ctx->H[13] = m128_const1_64( 0x7475767774757677 );
ctx->H[14] = m128_const1_64( 0x78797A7B78797A7B );
ctx->H[15] = m128_const1_64( 0x7C7D7E7F7C7D7E7F );
// for ( int i = 0; i < 16; i++ )
// sc->H[i] = _mm_set1_epi32( iv[i] );
ctx->ptr = 0;
ctx->bit_count = 0;
}
static void
@@ -525,7 +543,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );
buf[ ptr>>2 ] = m128_const1_64( 0x0000008000000080 );
ptr += 4;
h = sc->H;
@@ -551,11 +569,13 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
casti_m128i( dst, u ) = h1[v];
}
/*
void
bmw256_4way_init(void *cc)
{
bmw32_4way_init(cc, IV256);
}
*/
void
bmw256_4way(void *cc, const void *data, size_t len)
@@ -1003,25 +1023,24 @@ static const __m256i final_s8[16] =
void bmw256_8way_init( bmw256_8way_context *ctx )
{
ctx->H[ 0] = _mm256_set1_epi32( IV256[ 0] );
ctx->H[ 1] = _mm256_set1_epi32( IV256[ 1] );
ctx->H[ 2] = _mm256_set1_epi32( IV256[ 2] );
ctx->H[ 3] = _mm256_set1_epi32( IV256[ 3] );
ctx->H[ 4] = _mm256_set1_epi32( IV256[ 4] );
ctx->H[ 5] = _mm256_set1_epi32( IV256[ 5] );
ctx->H[ 6] = _mm256_set1_epi32( IV256[ 6] );
ctx->H[ 7] = _mm256_set1_epi32( IV256[ 7] );
ctx->H[ 8] = _mm256_set1_epi32( IV256[ 8] );
ctx->H[ 9] = _mm256_set1_epi32( IV256[ 9] );
ctx->H[10] = _mm256_set1_epi32( IV256[10] );
ctx->H[11] = _mm256_set1_epi32( IV256[11] );
ctx->H[12] = _mm256_set1_epi32( IV256[12] );
ctx->H[13] = _mm256_set1_epi32( IV256[13] );
ctx->H[14] = _mm256_set1_epi32( IV256[14] );
ctx->H[15] = _mm256_set1_epi32( IV256[15] );
ctx->H[ 0] = m256_const1_64( 0x4041424340414243 );
ctx->H[ 1] = m256_const1_64( 0x4445464744454647 );
ctx->H[ 2] = m256_const1_64( 0x48494A4B48494A4B );
ctx->H[ 3] = m256_const1_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = m256_const1_64( 0x5051525350515253 );
ctx->H[ 5] = m256_const1_64( 0x5455565754555657 );
ctx->H[ 6] = m256_const1_64( 0x58595A5B58595A5B );
ctx->H[ 7] = m256_const1_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = m256_const1_64( 0x6061626360616263 );
ctx->H[ 9] = m256_const1_64( 0x6465666764656667 );
ctx->H[10] = m256_const1_64( 0x68696A6B68696A6B );
ctx->H[11] = m256_const1_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = m256_const1_64( 0x7071727370717273 );
ctx->H[13] = m256_const1_64( 0x7475767774757677 );
ctx->H[14] = m256_const1_64( 0x78797A7B78797A7B );
ctx->H[15] = m256_const1_64( 0x7C7D7E7F7C7D7E7F );
ctx->ptr = 0;
ctx->bit_count = 0;
}
void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
@@ -1074,7 +1093,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
buf = ctx->buf;
ptr = ctx->ptr;
buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
buf[ ptr>>2 ] = m256_const1_64( 0x0000008000000080 );
ptr += 4;
h = ctx->H;
@@ -1089,7 +1108,6 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
buf[ (buf_size - 8) >> 2 ] = _mm256_set1_epi32( ctx->bit_count );
buf[ (buf_size - 4) >> 2 ] = m256_zero;
compress_small_8way( buf, h, h2 );
for ( u = 0; u < 16; u ++ )

59
algo/bmw/bmw512-4way.c Normal file
View File

@@ -0,0 +1,59 @@
#include "bmw512-gate.h"
#ifdef BMW512_4WAY
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
//#include "sph_keccak.h"
#include "bmw-hash-4way.h"
void bmw512hash_4way(void *state, const void *input)
{
bmw512_4way_context ctx;
bmw512_4way_init( &ctx );
bmw512_4way( &ctx, input, 80 );
bmw512_4way_close( &ctx, state );
}
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t hash[16*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
// const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
mm256_bswap32_intrlv80_4x64( vdata, pdata );
do {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
bmw512hash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

20
algo/bmw/bmw512-gate.c Normal file
View File

@@ -0,0 +1,20 @@
#include "bmw512-gate.h"
int64_t bmw512_get_max64() { return 0x7ffffLL; }
bool register_bmw512_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT;
gate->get_max64 = (void*)&bmw512_get_max64;
opt_target_factor = 256.0;
#if defined (BMW512_4WAY)
gate->scanhash = (void*)&scanhash_bmw512_4way;
gate->hash = (void*)&bmw512hash_4way;
#else
gate->scanhash = (void*)&scanhash_bmw512;
gate->hash = (void*)&bmw512hash;
#endif
return true;
};

23
algo/bmw/bmw512-gate.h Normal file
View File

@@ -0,0 +1,23 @@
#ifndef BMW512_GATE_H__
#define BMW512_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__)
#define BMW512_4WAY 1
#endif
#if defined(BMW512_4WAY)
void bmw512hash_4way( void *state, const void *input );
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void bmw512hash( void *state, const void *input );
int scanhash_bmw512( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -961,8 +961,22 @@ static const __m256i final_b[16] =
static void
bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
{
for ( int i = 0; i < 16; i++ )
sc->H[i] = _mm256_set1_epi64x( iv[i] );
sc->H[ 0] = m256_const1_64( 0x8081828384858687 );
sc->H[ 1] = m256_const1_64( 0x88898A8B8C8D8E8F );
sc->H[ 2] = m256_const1_64( 0x9091929394959697 );
sc->H[ 3] = m256_const1_64( 0x98999A9B9C9D9E9F );
sc->H[ 4] = m256_const1_64( 0xA0A1A2A3A4A5A6A7 );
sc->H[ 5] = m256_const1_64( 0xA8A9AAABACADAEAF );
sc->H[ 6] = m256_const1_64( 0xB0B1B2B3B4B5B6B7 );
sc->H[ 7] = m256_const1_64( 0xB8B9BABBBCBDBEBF );
sc->H[ 8] = m256_const1_64( 0xC0C1C2C3C4C5C6C7 );
sc->H[ 9] = m256_const1_64( 0xC8C9CACBCCCDCECF );
sc->H[10] = m256_const1_64( 0xD0D1D2D3D4D5D6D7 );
sc->H[11] = m256_const1_64( 0xD8D9DADBDCDDDEDF );
sc->H[12] = m256_const1_64( 0xE0E1E2E3E4E5E6E7 );
sc->H[13] = m256_const1_64( 0xE8E9EAEBECEDEEEF );
sc->H[14] = m256_const1_64( 0xF0F1F2F3F4F5F6F7 );
sc->H[15] = m256_const1_64( 0xF8F9FAFBFCFDFEFF );
sc->ptr = 0;
sc->bit_count = 0;
}
@@ -1014,13 +1028,11 @@ bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
__m256i *buf;
__m256i h1[16], h2[16], *h;
size_t ptr, u, v;
unsigned z;
const int buf_size = 128; // bytes of one lane, compatible with len
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
buf[ ptr>>3 ] = _mm256_set1_epi64x( z );
buf[ ptr>>3 ] = m256_const1_64( 0x80 );
ptr += 8;
h = sc->H;

53
algo/bmw/bmw512.c Normal file
View File

@@ -0,0 +1,53 @@
#include "algo-gate-api.h"
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_bmw.h"
void bmw512hash(void *state, const void *input)
{
sph_bmw512_context ctx;
uint32_t hash[32];
sph_bmw512_init( &ctx );
sph_bmw512( &ctx,input, 80 );
sph_bmw512_close( &ctx, hash );
memcpy( state, hash, 32 );
}
int scanhash_bmw512( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
//const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t _ALIGN(32) hash64[8];
uint32_t endiandata[32];
for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
bmw512hash(hash64, endiandata);
if (((hash64[7]&0xFFFFFF00)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}

View File

@@ -7,6 +7,7 @@
// 2x128
/*
// The result of hashing 10 rounds of initial data which consists of params
// zero padded.
static const uint64_t IV256[] =
@@ -24,13 +25,14 @@ static const uint64_t IV512[] =
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};
*/
static void transform_2way( cube_2way_context *sp )
{
int r;
const int rounds = sp->rounds;
__m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
__m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
x0 = _mm256_load_si256( (__m256i*)sp->h );
x1 = _mm256_load_si256( (__m256i*)sp->h + 1 );
@@ -47,18 +49,12 @@ static void transform_2way( cube_2way_context *sp )
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
y0 = x2;
y1 = x3;
y2 = x0;
y3 = x1;
x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
_mm256_srli_epi32( y0, 25 ) );
x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 7 ),
_mm256_srli_epi32( y1, 25 ) );
x2 = _mm256_xor_si256( _mm256_slli_epi32( y2, 7 ),
_mm256_srli_epi32( y2, 25 ) );
x3 = _mm256_xor_si256( _mm256_slli_epi32( y3, 7 ),
_mm256_srli_epi32( y3, 25 ) );
y0 = x0;
y1 = x1;
x0 = mm256_rol_32( x2, 7 );
x1 = mm256_rol_32( x3, 7 );
x2 = mm256_rol_32( y0, 7 );
x3 = mm256_rol_32( y1, 7 );
x0 = _mm256_xor_si256( x0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
@@ -71,18 +67,12 @@ static void transform_2way( cube_2way_context *sp )
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
y0 = x1;
y1 = x0;
y2 = x3;
y3 = x2;
x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
_mm256_srli_epi32( y0, 21 ) );
x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
_mm256_srli_epi32( y1, 21 ) );
x2 = _mm256_xor_si256( _mm256_slli_epi32( y2, 11 ),
_mm256_srli_epi32( y2, 21 ) );
x3 = _mm256_xor_si256( _mm256_slli_epi32( y3, 11 ),
_mm256_srli_epi32( y3, 21 ) );
y0 = x0;
y1 = x2;
x0 = mm256_rol_32( x1, 11 );
x1 = mm256_rol_32( y0, 11 );
x2 = mm256_rol_32( x3, 11 );
x3 = mm256_rol_32( y1, 11 );
x0 = _mm256_xor_si256( x0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
@@ -107,23 +97,40 @@ static void transform_2way( cube_2way_context *sp )
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
int blockbytes )
{
const uint64_t* iv = hashbitlen == 512 ? IV512 : IV256;
__m128i* h = (__m128i*)sp->h;
sp->hashlen = hashbitlen/128;
sp->blocksize = blockbytes/16;
sp->rounds = rounds;
sp->pos = 0;
__m256i* h = (__m256i*)sp->h;
h[0] = _mm256_set_epi64x( iv[ 1], iv[ 0], iv[ 1], iv[ 0] );
h[1] = _mm256_set_epi64x( iv[ 3], iv[ 2], iv[ 3], iv[ 2] );
h[2] = _mm256_set_epi64x( iv[ 5], iv[ 4], iv[ 5], iv[ 4] );
h[3] = _mm256_set_epi64x( iv[ 7], iv[ 6], iv[ 7], iv[ 6] );
h[4] = _mm256_set_epi64x( iv[ 9], iv[ 8], iv[ 9], iv[ 8] );
h[5] = _mm256_set_epi64x( iv[11], iv[10], iv[11], iv[10] );
h[6] = _mm256_set_epi64x( iv[13], iv[12], iv[13], iv[12] );
h[7] = _mm256_set_epi64x( iv[15], iv[14], iv[15], iv[14] );
if ( hashbitlen == 512 )
{
h[ 0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
h[ 2] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
h[ 4] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
h[ 6] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
h[ 8] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
h[10] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
h[12] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
h[14] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
h[1] = h[ 0]; h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6];
h[9] = h[ 8]; h[11] = h[10]; h[13] = h[12]; h[15] = h[14];
}
else
{
h[ 0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
h[ 2] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
h[ 4] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
h[ 6] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
h[ 8] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
h[10] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
h[12] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
h[14] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
h[1] = h[ 0]; h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6];
h[9] = h[ 8]; h[11] = h[10]; h[13] = h[12]; h[15] = h[14];
}
return 0;
}
@@ -165,7 +172,7 @@ int cube_2way_close( cube_2way_context *sp, void *output )
for ( i = 0; i < 10; ++i ) transform_2way( sp );
for ( i = 0; i < sp->hashlen; i++ ) hash[i] = sp->h[i];
memcpy( hash, sp->h, sp->hashlen<<5 );
return 0;
}
@@ -198,7 +205,7 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,
for ( i = 0; i < 10; ++i ) transform_2way( sp );
for ( i = 0; i < sp->hashlen; i++ ) hash[i] = sp->h[i];
memcpy( hash, sp->h, sp->hashlen<<5 );
return 0;
}

View File

@@ -16,24 +16,6 @@
#include "simd-utils.h"
#include <stdio.h>
// The result of hashing 10 rounds of initial data which is params and
// mostly zeros.
static const uint64_t IV256[] =
{
0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
};
static const uint64_t IV512[] =
{
0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};
static void transform( cubehashParam *sp )
{
int r;
@@ -53,26 +35,22 @@ static void transform( cubehashParam *sp )
x2 = _mm256_add_epi32( x0, x2 );
x3 = _mm256_add_epi32( x1, x3 );
y0 = x0;
x0 = _mm256_xor_si256( _mm256_slli_epi32( x1, 7 ),
_mm256_srli_epi32( x1, 25 ) );
x1 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
_mm256_srli_epi32( y0, 25 ) );
x0 = mm256_rol_32( x1, 7 );
x1 = mm256_rol_32( y0, 7 );
x0 = _mm256_xor_si256( x0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = _mm256_shuffle_epi32( x2, 0x4e );
x3 = _mm256_shuffle_epi32( x3, 0x4e );
x2 = mm256_swap64_128( x2 );
x3 = mm256_swap64_128( x3 );
x2 = _mm256_add_epi32( x0, x2 );
x3 = _mm256_add_epi32( x1, x3 );
y0 = _mm256_permute4x64_epi64( x0, 0x4e );
y1 = _mm256_permute4x64_epi64( x1, 0x4e );
x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
_mm256_srli_epi32( y0, 21 ) );
x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
_mm256_srli_epi32( y1, 21 ) );
y0 = mm256_swap_128( x0 );
y1 = mm256_swap_128( x1 );
x0 = mm256_rol_32( y0, 11 );
x1 = mm256_rol_32( y1, 11 );
x0 = _mm256_xor_si256( x0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = _mm256_shuffle_epi32( x2, 0xb1 );
x3 = _mm256_shuffle_epi32( x3, 0xb1 );
x2 = mm256_swap32_64( x2 );
x3 = mm256_swap32_64( x3 );
}
_mm256_store_si256( (__m256i*)sp->x, x0 );
@@ -147,37 +125,58 @@ static void transform( cubehashParam *sp )
#endif
} // transform
/*
// The result of hashing 10 rounds of initial data which is params and
// mostly zeros.
static const uint64_t IV256[] =
{
0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
};
static const uint64_t IV512[] =
{
0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};
*/
int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
{
const uint64_t* iv = hashbitlen == 512 ? IV512 : IV256;
__m128i *x = (__m128i*)sp->x;
sp->hashlen = hashbitlen/128;
sp->blocksize = blockbytes/16;
sp->rounds = rounds;
sp->pos = 0;
#if defined(__AVX2__)
__m256i* x = (__m256i*)sp->x;
if ( hashbitlen == 512 )
{
x[0] = _mm256_set_epi64x( iv[ 3], iv[ 2], iv[ 1], iv[ 0] );
x[1] = _mm256_set_epi64x( iv[ 7], iv[ 6], iv[ 5], iv[ 4] );
x[2] = _mm256_set_epi64x( iv[11], iv[10], iv[ 9], iv[ 8] );
x[3] = _mm256_set_epi64x( iv[15], iv[14], iv[13], iv[12] );
x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
}
else
{
x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
}
#else
__m128i* x = (__m128i*)sp->x;
x[0] = _mm_set_epi64x( iv[ 1], iv[ 0] );
x[1] = _mm_set_epi64x( iv[ 3], iv[ 2] );
x[2] = _mm_set_epi64x( iv[ 5], iv[ 4] );
x[3] = _mm_set_epi64x( iv[ 7], iv[ 6] );
x[4] = _mm_set_epi64x( iv[ 9], iv[ 8] );
x[5] = _mm_set_epi64x( iv[11], iv[10] );
x[6] = _mm_set_epi64x( iv[13], iv[12] );
x[7] = _mm_set_epi64x( iv[15], iv[14] );
#endif
return SUCCESS;
}

View File

@@ -94,19 +94,14 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce,
return 0;
}
void groestl_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_dmd_gr_algo( algo_gate_t* gate )
{
init_groestl_ctx();
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_groestl;
gate->hash = (void*)&groestlhash;
gate->set_target = (void*)&groestl_set_target;
gate->get_max64 = (void*)&get_max64_0x3ffff;
opt_target_factor = 256.0;
return true;
};

View File

@@ -10,7 +10,7 @@
#else
#include "aes_ni/hash-groestl.h"
#endif
#include "algo/sha/sph_sha2.h"
#include <openssl/sha.h>
typedef struct {
#ifdef NO_AES_NI
@@ -18,7 +18,7 @@ typedef struct {
#else
hashState_groestl groestl;
#endif
sph_sha256_context sha;
SHA256_CTX sha;
} myrgr_ctx_holder;
myrgr_ctx_holder myrgr_ctx;
@@ -28,15 +28,15 @@ void init_myrgr_ctx()
#ifdef NO_AES_NI
sph_groestl512_init( &myrgr_ctx.groestl );
#else
init_groestl (&myrgr_ctx.groestl, 64 );
init_groestl ( &myrgr_ctx.groestl, 64 );
#endif
sph_sha256_init(&myrgr_ctx.sha);
SHA256_Init( &myrgr_ctx.sha );
}
void myriad_hash(void *output, const void *input)
{
myrgr_ctx_holder ctx;
memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
myrgr_ctx_holder ctx;
memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
uint32_t _ALIGN(32) hash[16];
@@ -44,23 +44,22 @@ void myriad_hash(void *output, const void *input)
sph_groestl512(&ctx.groestl, input, 80);
sph_groestl512_close(&ctx.groestl, hash);
#else
update_groestl( &ctx.groestl, (char*)input, 640 );
final_groestl( &ctx.groestl, (char*)hash);
update_groestl( &ctx.groestl, (char*)input, 640 );
final_groestl( &ctx.groestl, (char*)hash);
#endif
sph_sha256(&ctx.sha, hash, 64);
sph_sha256_close(&ctx.sha, hash);
SHA256_Update( &ctx.sha, (unsigned char*)hash, 64 );
SHA256_Final( (unsigned char*)hash, &ctx.sha );
memcpy(output, hash, 32);
}
int scanhash_myriad( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
int scanhash_myriad( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated

View File

@@ -8,7 +8,7 @@
#include <string.h>
#include "aes_ni/hash-groestl.h"
#include "algo/sha/sha2-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
typedef struct {
hashState_groestl groestl;

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#if defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
#define MYRGR_4WAY
#endif

View File

@@ -15,11 +15,6 @@ pthread_barrier_t hodl_barrier;
// need to be passed.
unsigned char *hodl_scratchbuf = NULL;
void hodl_set_target( struct work* work, double diff )
{
diff_to_target(work->target, diff / 8388608.0 );
}
void hodl_le_build_stratum_request( char* req, struct work* work,
struct stratum_ctx *sctx )
{
@@ -170,7 +165,6 @@ bool register_hodl_algo( algo_gate_t* gate )
gate->scanhash = (void*)&hodl_scanhash;
gate->get_new_work = (void*)&hodl_get_new_work;
gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call;
gate->set_target = (void*)&hodl_set_target;
gate->build_stratum_request = (void*)&hodl_le_build_stratum_request;
gate->malloc_txs_request = (void*)&hodl_malloc_txs_request;
gate->build_block_header = (void*)&hodl_build_block_header;
@@ -179,6 +173,7 @@ bool register_hodl_algo( algo_gate_t* gate )
gate->work_cmp_size = 76;
hodl_scratchbuf = (unsigned char*)malloc( 1 << 30 );
allow_getwork = false;
opt_target_factor = 8388608.0;
return ( hodl_scratchbuf != NULL );
}

View File

@@ -246,18 +246,12 @@ do { \
} while (0)
*/
#define W0(x) Wz(x, _mm256_set_epi64x( 0x5555555555555555, \
0x5555555555555555, 0x5555555555555555, 0x5555555555555555 ), 1 )
#define W1(x) Wz(x, _mm256_set_epi64x( 0x3333333333333333, \
0x3333333333333333, 0x3333333333333333, 0x3333333333333333 ), 2 )
#define W2(x) Wz(x, _mm256_set_epi64x( 0x0F0F0F0F0F0F0F0F, \
0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F ), 4 )
#define W3(x) Wz(x, _mm256_set_epi64x( 0x00FF00FF00FF00FF, \
0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF ), 8 )
#define W4(x) Wz(x, _mm256_set_epi64x( 0x0000FFFF0000FFFF, \
0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF ), 16 )
#define W5(x) Wz(x, _mm256_set_epi64x( 0x00000000FFFFFFFF, \
0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF ), 32 )
#define W0(x) Wz(x, m256_const1_64( 0x5555555555555555 ), 1 )
#define W1(x) Wz(x, m256_const1_64( 0x3333333333333333 ), 2 )
#define W2(x) Wz(x, m256_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
#define W3(x) Wz(x, m256_const1_64( 0x00FF00FF00FF00FF ), 8 )
#define W4(x) Wz(x, m256_const1_64( 0x0000FFFF0000FFFF ), 16 )
#define W5(x) Wz(x, m256_const1_64( 0x00000000FFFFFFFF ), 32 )
#define W6(x) \
do { \
__m256i t = x ## h; \
@@ -331,14 +325,14 @@ do { \
__m256i m2l = buf[5]; \
__m256i m3h = buf[6]; \
__m256i m3l = buf[7]; \
h0h = _mm256_xor_si256( h0h, m0h ); \
h0l = _mm256_xor_si256( h0l, m0l ); \
h1h = _mm256_xor_si256( h1h, m1h ); \
h1l = _mm256_xor_si256( h1l, m1l ); \
h2h = _mm256_xor_si256( h2h, m2h ); \
h2l = _mm256_xor_si256( h2l, m2l ); \
h3h = _mm256_xor_si256( h3h, m3h ); \
h3l = _mm256_xor_si256( h3l, m3l ); \
h0h = _mm256_xor_si256( h0h, m0h ); \
h0l = _mm256_xor_si256( h0l, m0l ); \
h1h = _mm256_xor_si256( h1h, m1h ); \
h1l = _mm256_xor_si256( h1l, m1l ); \
h2h = _mm256_xor_si256( h2h, m2h ); \
h2l = _mm256_xor_si256( h2l, m2l ); \
h3h = _mm256_xor_si256( h3h, m3h ); \
h3l = _mm256_xor_si256( h3l, m3l ); \
#define INPUT_BUF2 \
h4h = _mm256_xor_si256( h4h, m0h ); \
@@ -477,13 +471,48 @@ static const sph_u64 IV512[] = {
#endif
static void
jh_4way_init( jh_4way_context *sc, const void *iv )
void jh256_4way_init( jh_4way_context *sc )
{
uint64_t *v = (uint64_t*)iv;
for ( int i = 0; i < 16; i++ )
sc->H[i] = _mm256_set_epi64x( v[i], v[i], v[i], v[i] );
// bswapped IV256
sc->H[ 0] = m256_const1_64( 0xebd3202c41a398eb );
sc->H[ 1] = m256_const1_64( 0xc145b29c7bbecd92 );
sc->H[ 2] = m256_const1_64( 0xfac7d4609151931c );
sc->H[ 3] = m256_const1_64( 0x038a507ed6820026 );
sc->H[ 4] = m256_const1_64( 0x45b92677269e23a4 );
sc->H[ 5] = m256_const1_64( 0x77941ad4481afbe0 );
sc->H[ 6] = m256_const1_64( 0x7a176b0226abb5cd );
sc->H[ 7] = m256_const1_64( 0xa82fff0f4224f056 );
sc->H[ 8] = m256_const1_64( 0x754d2e7f8996a371 );
sc->H[ 9] = m256_const1_64( 0x62e27df70849141d );
sc->H[10] = m256_const1_64( 0x948f2476f7957627 );
sc->H[11] = m256_const1_64( 0x6c29804757b6d587 );
sc->H[12] = m256_const1_64( 0x6c0d8eac2d275e5c );
sc->H[13] = m256_const1_64( 0x0f7a0557c6508451 );
sc->H[14] = m256_const1_64( 0xea12247067d3e47b );
sc->H[15] = m256_const1_64( 0x69d71cd313abe389 );
sc->ptr = 0;
sc->block_count = 0;
}
void jh512_4way_init( jh_4way_context *sc )
{
// bswapped IV512
sc->H[ 0] = m256_const1_64( 0x17aa003e964bd16f );
sc->H[ 1] = m256_const1_64( 0x43d5157a052e6a63 );
sc->H[ 2] = m256_const1_64( 0x0bef970c8d5e228a );
sc->H[ 3] = m256_const1_64( 0x61c3b3f2591234e9 );
sc->H[ 4] = m256_const1_64( 0x1e806f53c1a01d89 );
sc->H[ 5] = m256_const1_64( 0x806d2bea6b05a92a );
sc->H[ 6] = m256_const1_64( 0xa6ba7520dbcc8e58 );
sc->H[ 7] = m256_const1_64( 0xf73bf8ba763a0fa9 );
sc->H[ 8] = m256_const1_64( 0x694ae34105e66901 );
sc->H[ 9] = m256_const1_64( 0x5ae66f2e8e8ab546 );
sc->H[10] = m256_const1_64( 0x243c84c1d0a74710 );
sc->H[11] = m256_const1_64( 0x99c15a2db1716e3b );
sc->H[12] = m256_const1_64( 0x56f8b19decf657cf );
sc->H[13] = m256_const1_64( 0x56b116577c8806a7 );
sc->H[14] = m256_const1_64( 0xfb1785e6dffcc2e3 );
sc->H[15] = m256_const1_64( 0x4bdd8ccc78465a54 );
sc->ptr = 0;
sc->block_count = 0;
}
@@ -542,7 +571,7 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
size_t numz, u;
sph_u64 l0, l1, l0e, l1e;
buf[0] = _mm256_set_epi64x( 0x80, 0x80, 0x80, 0x80 );
buf[0] = m256_const1_64( 0x80ULL );
if ( sc->ptr == 0 )
numz = 48;
@@ -555,8 +584,8 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
l1 = SPH_T64(sc->block_count >> 55);
sph_enc64be( &l0e, l0 );
sph_enc64be( &l1e, l1 );
*(buf + (numz>>3) ) = _mm256_set_epi64x( l1e, l1e, l1e, l1e );
*(buf + (numz>>3) + 1) = _mm256_set_epi64x( l0e, l0e, l0e, l0e );
*(buf + (numz>>3) ) = _mm256_set1_epi64x( l1e );
*(buf + (numz>>3) + 1) = _mm256_set1_epi64x( l0e );
jh_4way_core( sc, buf, numz + 16 );
@@ -566,11 +595,13 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
memcpy_256( dst256, buf, 8 );
}
/*
void
jh256_4way_init(void *cc)
{
jh_4way_init(cc, IV256);
jhs_4way_init(cc, IV256);
}
*/
void
jh256_4way(void *cc, const void *data, size_t len)
@@ -584,11 +615,13 @@ jh256_4way_close(void *cc, void *dst)
jh_4way_close(cc, 0, 0, dst, 8, IV256);
}
/*
void
jh512_4way_init(void *cc)
{
jh_4way_init(cc, IV512);
jhb_4way_init(cc, IV512);
}
*/
void
jh512_4way(void *cc, const void *data, size_t len)

View File

@@ -79,13 +79,13 @@ typedef jh_4way_context jh256_4way_context;
typedef jh_4way_context jh512_4way_context;
void jh256_4way_init(void *cc);
void jh256_4way_init( jh_4way_context *sc);
void jh256_4way(void *cc, const void *data, size_t len);
void jh256_4way_close(void *cc, void *dst);
void jh512_4way_init(void *cc);
void jh512_4way_init( jh_4way_context *sc );
void jh512_4way(void *cc, const void *data, size_t len);

View File

@@ -12,7 +12,7 @@ bool register_jha_algo( algo_gate_t* gate )
gate->hash = (void*)&jha_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->set_target = (void*)&scrypt_set_target;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -39,10 +39,10 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
keccakhash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
if ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );

View File

@@ -1,18 +1,13 @@
#include "keccak-gate.h"
void keccak_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (128.0 * opt_diff_factor) );
}
int64_t keccak_get_max64() { return 0x7ffffLL; }
bool register_keccak_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->set_target = (void*)&keccak_set_target;
gate->get_max64 = (void*)&keccak_get_max64;
opt_target_factor = 128.0;
#if defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
@@ -23,17 +18,12 @@ bool register_keccak_algo( algo_gate_t* gate )
return true;
};
void keccakc_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_keccakc_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT;
gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
gate->set_target = (void*)&keccakc_set_target;
gate->get_max64 = (void*)&keccak_get_max64;
opt_target_factor = 256.0;
#if defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;

View File

@@ -370,18 +370,23 @@ static const sph_u64 RC[] = {
static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
{
int i;
for (i = 0; i < 25; i ++)
kc->w[i] = _mm256_setzero_si256();
__m256i zero = m256_zero;
__m256i neg1 = m256_neg1;
// Initialization for the "lane complement".
kc->w[ 1] = m256_neg1;
kc->w[ 2] = m256_neg1;
kc->w[ 8] = m256_neg1;
kc->w[12] = m256_neg1;
kc->w[17] = m256_neg1;
kc->w[20] = m256_neg1;
kc->ptr = 0;
kc->w[ 0] = zero; kc->w[ 1] = neg1;
kc->w[ 2] = neg1; kc->w[ 3] = zero;
kc->w[ 4] = zero; kc->w[ 5] = zero;
kc->w[ 6] = zero; kc->w[ 7] = zero;
kc->w[ 8] = neg1; kc->w[ 9] = zero;
kc->w[10] = zero; kc->w[11] = zero;
kc->w[12] = neg1; kc->w[13] = zero;
kc->w[14] = zero; kc->w[15] = zero;
kc->w[16] = zero; kc->w[17] = neg1;
kc->w[18] = zero; kc->w[19] = zero;
kc->w[20] = neg1; kc->w[21] = zero;
kc->w[22] = zero; kc->w[23] = zero;
kc->w[24] = zero; kc->ptr = 0;
kc->lim = 200 - (out_size >> 2);
}
@@ -441,8 +446,8 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
eb = 0x100 >> 8;
if ( kc->ptr == (lim - 8) )
{
uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm256_set_epi64x( t, t, t, t );
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = m256_const1_64( t );
j = 8;
}
else
@@ -450,8 +455,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
j = lim - kc->ptr;
u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb );
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm256_set_epi64x( 0x8000000000000000,
0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
u.tmp[ (j>>3) - 1] = m256_const1_64( 0x8000000000000000 );
}
keccak64_core( kc, u.tmp, j, lim );
/* Finalize the "lane complement" */
@@ -461,9 +465,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
NOT64( kc->w[12], kc->w[12] );
NOT64( kc->w[17], kc->w[17] );
NOT64( kc->w[20], kc->w[20] );
for ( j = 0; j < m256_len; j++ )
u.tmp[j] = kc->w[j];
memcpy_256( dst, u.tmp, m256_len );
memcpy_256( dst, kc->w, m256_len );
}
void keccak256_4way_init( void *kc )

View File

@@ -71,7 +71,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
#endif
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
gate->set_target = (void*)&alt_set_target;
opt_target_factor = 256.0;
return true;
};
@@ -105,7 +105,7 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->set_target = (void*)&alt_set_target;
opt_target_factor = 256.0;
return true;
};
@@ -128,7 +128,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&alt_set_target;
opt_target_factor = 256.0;
return true;
};
@@ -148,7 +148,7 @@ bool register_lyra2h_algo( algo_gate_t* gate )
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&alt_set_target;
opt_target_factor = 256.0;
return true;
};
@@ -168,8 +168,8 @@ bool register_allium_algo( algo_gate_t* gate )
gate->hash = (void*)&allium_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->set_target = (void*)&alt_set_target;
gate->get_max64 = (void*)&allium_get_max64_0xFFFFLL;
opt_target_factor = 256.0;
return true;
};
@@ -182,6 +182,7 @@ int phi2_get_work_data_size() { return phi2_use_roots ? 144 : 128; }
void phi2_decode_extra_data( struct work *work )
{
phi2_use_roots = false;
if ( work->data[0] & ( 1<<30 ) ) phi2_use_roots = true;
else for ( int i = 20; i < 36; i++ )
{
@@ -213,8 +214,8 @@ bool register_phi2_algo( algo_gate_t* gate )
gate->get_work_data_size = (void*)&phi2_get_work_data_size;
gate->decode_extra_data = (void*)&phi2_decode_extra_data;
gate->build_extraheader = (void*)&phi2_build_extraheader;
gate->set_target = (void*)&alt_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
opt_target_factor = 256.0;
#if defined(PHI2_4WAY)
gate->scanhash = (void*)&scanhash_phi2_4way;
#else

View File

@@ -60,7 +60,7 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
int64_t i; //auxiliary iteration counter
// int64_t i; //auxiliary iteration counter
int64_t v64; // 64bit var for memcpy
//====================================================================/
@@ -128,17 +128,22 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
//================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
initState( state );
// initState( state );
//========================= Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
/*
for (i = 0; i < nBlocksInput; i++)
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
*/
//Initializes M[0] and M[1]
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
@@ -227,7 +232,7 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
int64_t i; //auxiliary iteration counter
// int64_t i; //auxiliary iteration counter
int64_t v64; // 64bit var for memcpy
uint64_t instance = 0;
//====================================================================/
@@ -302,17 +307,21 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
//================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
initState( state );
// initState( state );
//========================= Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
/*
for (i = 0; i < nBlocksInput; i++)
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
*/
//Initializes M[0] and M[1]
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
@@ -405,7 +414,7 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
int64_t i; //auxiliary iteration counter
// int64_t i; //auxiliary iteration counter
//=======================================================================/
//======= Initializing the Memory Matrix and pointers to it =============//
@@ -459,17 +468,21 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
// if (state == NULL) {
// return -1;
// }
initState( state );
// initState( state );
//============================== Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
uint64_t *ptrWord = wholeMatrix;
uint64_t *ptrWord = wholeMatrix;
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput,
BLOCK_LEN_BLAKE2_SAFE_INT64 );
/*
for ( i = 0; i < nBlocksInput; i++ )
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
}
*/
//Initializes M[0] and M[1]
reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
@@ -623,17 +636,21 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
//================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
initState( state );
// initState( state );
//========================= Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
/*
for (i = 0; i < nBlocksInput; i++)
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
*/
//Initializes M[0] and M[1]
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here

View File

@@ -118,11 +118,6 @@ int64_t lyra2re_get_max64 ()
return 0xffffLL;
}
void lyra2re_set_target ( struct work* work, double job_diff )
{
work_set_target(work, job_diff / (128.0 * opt_diff_factor) );
}
bool register_lyra2re_algo( algo_gate_t* gate )
{
init_lyra2re_ctx();
@@ -130,7 +125,7 @@ bool register_lyra2re_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2re;
gate->hash = (void*)&lyra2re_hash;
gate->get_max64 = (void*)&lyra2re_get_max64;
gate->set_target = (void*)&lyra2re_set_target;
opt_target_factor = 128.0;
return true;
};

View File

@@ -86,7 +86,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )
}
int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
@@ -94,12 +94,12 @@ int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
uint32_t *hash7 = &(hash[7<<3]);
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
const int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -186,7 +186,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
bmw256_4way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce,
int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
@@ -194,12 +194,12 @@ int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce,
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
__m128i *noncev = (__m128i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
const int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;

View File

@@ -32,27 +32,27 @@ void l2v3_blake256_midstate( const void* input )
void lyra2rev3_hash( void *state, const void *input )
{
lyra2v3_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &lyra2v3_ctx, sizeof(lyra2v3_ctx) );
uint8_t hash[128] __attribute__ ((aligned (64)));
#define hashA hash
#define hashB hash+64
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
lyra2v3_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &lyra2v3_ctx, sizeof(lyra2v3_ctx) );
uint8_t hash[128] __attribute__ ((aligned (64)));
#define hashA hash
#define hashB hash+64
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
memcpy( &ctx.blake, &l2v3_blake_mid, sizeof l2v3_blake_mid );
sph_blake256( &ctx.blake, (uint8_t*)input + midlen, tail );
sph_blake256_close( &ctx.blake, hash );
memcpy( &ctx.blake, &l2v3_blake_mid, sizeof l2v3_blake_mid );
sph_blake256( &ctx.blake, (uint8_t*)input + midlen, tail );
sph_blake256_close( &ctx.blake, hash );
LYRA2REV3( l2v3_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 4, 4 );
cubehashUpdateDigest( &ctx.cube, (byte*) hashA,
(const byte*) hash, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hashA,
(const byte*) hash, 32 );
LYRA2REV3( l2v3_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 4, 4 );
sph_bmw256( &ctx.bmw, hash, 32 );
sph_bmw256_close( &ctx.bmw, hash );
sph_bmw256( &ctx.bmw, hash, 32 );
sph_bmw256_close( &ctx.bmw, hash );
memcpy( state, hash, 32 );
}

View File

@@ -53,11 +53,6 @@ int scanhash_lyra2z330( struct work *work, uint32_t max_nonce,
return 0;
}
void lyra2z330_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool lyra2z330_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 256; // nCols
@@ -76,7 +71,7 @@ bool register_lyra2z330_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2z330;
gate->hash = (void*)&lyra2z330_hash;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2z330_set_target;
opt_target_factor = 256.0;
return true;
};

View File

@@ -40,29 +40,32 @@
*/
inline void initState( uint64_t State[/*16*/] )
{
/*
#if defined (__AVX2__)
__m256i* state = (__m256i*)State;
state[0] = _mm256_setzero_si256();
state[1] = _mm256_setzero_si256();
state[2] = _mm256_set_epi64x( blake2b_IV[3], blake2b_IV[2],
blake2b_IV[1], blake2b_IV[0] );
state[3] = _mm256_set_epi64x( blake2b_IV[7], blake2b_IV[6],
blake2b_IV[5], blake2b_IV[4] );
const __m256i zero = m256_zero;
state[0] = zero;
state[1] = zero;
state[2] = m256_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state[3] = m256_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
const __m128i zero = m128_zero;
state[0] = _mm_setzero_si128();
state[1] = _mm_setzero_si128();
state[2] = _mm_setzero_si128();
state[3] = _mm_setzero_si128();
state[4] = _mm_set_epi64x( blake2b_IV[1], blake2b_IV[0] );
state[5] = _mm_set_epi64x( blake2b_IV[3], blake2b_IV[2] );
state[6] = _mm_set_epi64x( blake2b_IV[5], blake2b_IV[4] );
state[7] = _mm_set_epi64x( blake2b_IV[7], blake2b_IV[6] );
state[0] = zero;
state[1] = zero;
state[2] = zero;
state[3] = zero;
state[4] = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state[5] = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
state[6] = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
state[7] = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
#else
//First 512 bis are zeros
@@ -77,6 +80,8 @@ inline void initState( uint64_t State[/*16*/] )
State[14] = blake2b_IV[6];
State[15] = blake2b_IV[7];
#endif
*/
}
/**
@@ -250,43 +255,74 @@ inline void absorbBlock( uint64_t *State, const uint64_t *In )
* @param state The current state of the sponge
* @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words)
*/
inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In,
const uint64_t nBlocks, const uint64_t block_len )
{
//XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
// XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with
// the IV.
#if defined (__AVX2__)
register __m256i state0, state1, state2, state3;
register __m256i state0, state1, state2, state3;
state0 =
state1 = m256_zero;
state2 = m256_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state3 = m256_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
for ( int i = 0; i < nBlocks; i++ )
{
__m256i *in = (__m256i*)In;
state0 = _mm256_load_si256( (__m256i*)State );
state1 = _mm256_load_si256( (__m256i*)State + 1 );
state2 = _mm256_load_si256( (__m256i*)State + 2 );
state3 = _mm256_load_si256( (__m256i*)State + 3 );
state0 = _mm256_xor_si256( state0, in[0] );
state1 = _mm256_xor_si256( state1, in[1] );
LYRA_12_ROUNDS_AVX2( state0, state1, state2, state3 );
In += block_len;
}
_mm256_store_si256( (__m256i*)State, state0 );
_mm256_store_si256( (__m256i*)State + 1, state1 );
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
_mm256_store_si256( (__m256i*)State, state0 );
_mm256_store_si256( (__m256i*)State + 1, state1 );
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i state0, state1, state2, state3, state4, state5, state6, state7;
state0 =
state1 =
state2 =
state3 = m128_zero;
state4 = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state5 = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
state6 = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
state7 = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
for ( int i = 0; i < nBlocks; i++ )
{
__m128i* in = (__m128i*)In;
state[0] = _mm_xor_si128( state[0], in[0] );
state[1] = _mm_xor_si128( state[1], in[1] );
state[2] = _mm_xor_si128( state[2], in[2] );
state[3] = _mm_xor_si128( state[3], in[3] );
state0 = _mm_xor_si128( state0, in[0] );
state1 = _mm_xor_si128( state1, in[1] );
state2 = _mm_xor_si128( state2, in[2] );
state3 = _mm_xor_si128( state3, in[3] );
//Applies the transformation f to the sponge's state
LYRA_12_ROUNDS_AVX( state[0], state[1], state[2], state[3],
state[4], state[5], state[6], state[7] );
LYRA_12_ROUNDS_AVX( state0, state1, state2, state3,
state4, state5, state6, state7 );
In += block_len;
}
_mm_store_si128( (__m128i*)State, state0 );
_mm_store_si128( (__m128i*)State + 1, state1 );
_mm_store_si128( (__m128i*)State + 2, state2 );
_mm_store_si128( (__m128i*)State + 3, state3 );
_mm_store_si128( (__m128i*)State + 4, state4 );
_mm_store_si128( (__m128i*)State + 5, state5 );
_mm_store_si128( (__m128i*)State + 6, state6 );
_mm_store_si128( (__m128i*)State + 7, state7 );
#else
State[0] ^= In[0];

View File

@@ -170,7 +170,8 @@ void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols);
//---- Absorbs
void absorbBlock(uint64_t *state, const uint64_t *in);
void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);
void absorbBlockBlake2Safe( uint64_t *state, const uint64_t *in,
const uint64_t nBlocks, const uint64_t block_len );
//---- Duplexes
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols);

View File

@@ -19,100 +19,89 @@
#define EPS1 DBL_EPSILON
#define EPS2 3.0e-11
inline double exp_n(double xt)
inline double exp_n( double xt )
{
if(xt < -700.0)
if ( xt < -700.0 )
return 0;
else if(xt > 700.0)
else if ( xt > 700.0 )
return 1e200;
else if(xt > -0.8e-8 && xt < 0.8e-8)
return (1.0 + xt);
else if ( xt > -0.8e-8 && xt < 0.8e-8 )
return ( 1.0 + xt );
else
return exp(xt);
return exp( xt );
}
inline double exp_n2(double x1, double x2)
inline double exp_n2( double x1, double x2 )
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.;
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
p5 = 37., p6 = 700.;
double xt = x1 - x2;
if (xt < p1+1.e-200)
if ( xt < p1+1.e-200 )
return 1.;
else if (xt > p1 && xt < p2 + 1.e-200)
else if ( xt > p1 && xt < p2 + 1.e-200 )
return ( 1. - exp(xt) );
else if (xt > p2 && xt < p3 + 1.e-200)
return ( 1. / (1. + exp(xt)) );
else if (xt > p3 && xt < p4)
else if ( xt > p2 && xt < p3 + 1.e-200 )
return ( 1. / ( 1. + exp(xt) ) );
else if ( xt > p3 && xt < p4 )
return ( 1. / (2. + xt) );
else if (xt > p4 - 1.e-200 && xt < p5)
return ( exp(-xt) / (1. + exp(-xt)) );
else if (xt > p5 - 1.e-200 && xt < p6)
else if ( xt > p4 - 1.e-200 && xt < p5 )
return ( exp(-xt) / ( 1. + exp(-xt) ) );
else if ( xt > p5 - 1.e-200 && xt < p6 )
return ( exp(-xt) );
else if (xt > p6 - 1.e-200)
else if ( xt > p6 - 1.e-200 )
return 0.;
}
double swit2_(double wvnmb)
double swit2_( double wvnmb )
{
return pow( (5.55243*(exp_n(-0.3*wvnmb/15.762) - exp_n(-0.6*wvnmb/15.762)))*wvnmb, 0.5)
/ 1034.66 * pow(sin(wvnmb/65.), 2.);
return pow( ( 5.55243 * ( exp_n( -0.3 * wvnmb / 15.762 )
- exp_n( -0.6 * wvnmb / 15.762 ) ) ) * wvnmb, 0.5 )
/ 1034.66 * pow( sin( wvnmb / 65. ), 2. );
}
double GaussianQuad_N2(const double x1, const double x2)
double GaussianQuad_N2( const double x1, const double x2 )
{
double s=0.0;
double s = 0.0;
double x[6], w[6];
//gauleg(a2, b2, x, w);
double z1, z, xm, xl, pp, p3, p2, p1;
xm=0.5*(x2+x1);
xl=0.5*(x2-x1);
for(int i=1;i<=3;i++)
xm = 0.5 * ( x2 + x1 );
xl = 0.5 * ( x2 - x1 );
for( int i = 1; i <= 3; i++ )
{
z = (i == 1) ? 0.909632 : -0.0;
z = (i == 2) ? 0.540641 : z;
do
z = (i == 2) ? 0.540641 : ( (i == 1) ? 0.909632 : -0.0 );
do
{
p1 = z;
p2 = 1;
p3 = 0;
p3=1;
p2=z;
p1=((3.0 * z * z) - 1) / 2;
p3=p2;
p2=p1;
p1=((5.0 * z * p2) - (2.0 * z)) / 3;
p3=p2;
p2=p1;
p1=((7.0 * z * p2) - (3.0 * p3)) / 4;
p3=p2;
p2=p1;
p1=((9.0 * z * p2) - (4.0 * p3)) / 5;
pp=5*(z*p1-p2)/(z*z-1.0);
z1=z;
z=z1-p1/pp;
} while (fabs(z-z1) > 3.0e-11);
p1 = ( ( 3.0 * z * z ) - 1 ) / 2;
p2 = p1;
p1 = ( ( 5.0 * z * p2 ) - ( 2.0 * z ) ) / 3;
p3 = p2;
p2 = p1;
p1 = ( ( 7.0 * z * p2 ) - ( 3.0 * p3 ) ) / 4;
p3 = p2;
p2 = p1;
p1 = ( ( 9.0 * z * p2 ) - ( 4.0 * p3 ) ) / 5;
pp = 5 * ( z * p1 - p2 ) / ( z * z - 1.0 );
z1 = z;
z = z1 - p1 / pp;
} while ( fabs( z - z1 ) > 3.0e-11 );
x[i]=xm-xl*z;
x[5+1-i]=xm+xl*z;
w[i]=2.0*xl/((1.0-z*z)*pp*pp);
w[5+1-i]=w[i];
x[i] = xm - xl * z;
x[ 5+1-i ] = xm + xl * z;
w[i] = 2.0 * xl / ( ( 1.0 - z * z ) * pp * pp );
w[ 5+1-i ] = w [i];
}
for(int j=1; j<=5; j++) s += w[j]*swit2_(x[j]);
for( int j = 1; j <= 5; j++ ) s += w[j] * swit2_( x[j] );
return s;
}
uint32_t sw2_(int nnounce)
uint32_t sw2_( int nnounce )
{
double wmax = ((sqrt((double)(nnounce))*(1.+EPSa))/450+100);
return ((uint32_t)(GaussianQuad_N2(0., wmax)*(1.+EPSa)*1.e6));
double wmax = ( ( sqrt( (double)(nnounce) ) * ( 1.+EPSa ) ) / 450+100 );
return ( (uint32_t)( GaussianQuad_N2( 0., wmax ) * ( 1.+EPSa ) * 1.e6 ) );
}
typedef struct {
@@ -323,7 +312,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
mpz_clears(magipi, magisw, product, bns0, bns1, NULL);
*hashes_done = n - first_nonce + 1;
return rc;
return 0;
}
bool register_m7m_algo( algo_gate_t *gate )
@@ -334,9 +323,9 @@ bool register_m7m_algo( algo_gate_t *gate )
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->set_target = (void*)&scrypt_set_target;
gate->get_max64 = (void*)&get_max64_0x1ffff;
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
opt_target_factor = 65536.0;
return true;
}

View File

@@ -49,7 +49,7 @@ void anime_4way_hash( void *state, const void *input )
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
const uint32_t mask = 8;
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
const __m256i bit3_mask = m256_const1_64( 8 );
const __m256i zero = _mm256_setzero_si256();
anime_4way_ctx_holder ctx;
memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );

View File

@@ -21,7 +21,7 @@
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
union _hmq1725_4way_context_overlay
{
@@ -57,7 +57,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
uint32_t vhashB[32<<2] __attribute__ ((aligned (64)));
hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
__m256i vh_mask;
const __m256i vmask = _mm256_set1_epi64x( 24 );
const __m256i vmask = m256_const1_64( 24 );
const uint32_t mask = 24;
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;

View File

@@ -10,8 +10,8 @@ bool register_hmq1725_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
#endif
gate->set_target = (void*)&scrypt_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -409,14 +409,3 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
/*
bool register_hmq1725_algo( algo_gate_t* gate )
{
init_hmq1725_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->set_target = (void*)&scrypt_set_target;
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
return true;
};
*/

View File

@@ -49,7 +49,7 @@ void quark_4way_hash( void *state, const void *input )
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
quark_4way_ctx_holder ctx;
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
const __m256i bit3_mask = m256_const1_64( 8 );
const uint32_t mask = 8;
const __m256i zero = _mm256_setzero_si256();

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/sha/sha2-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "ripemd-hash-4way.h"
#define LBRY_INPUT_SIZE 112

View File

@@ -41,6 +41,7 @@ void lbry_le_build_stratum_request( char *req, struct work *work,
free(xnonce2str);
}
/*
void lbry_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_root,
uint32_t ntime, uint32_t nbits )
@@ -63,6 +64,7 @@ void lbry_build_block_header( struct work* g_work, uint32_t version,
g_work->data[ LBRY_NBITS_INDEX ] = nbits;
g_work->data[28] = 0x80000000;
}
*/
void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
@@ -92,11 +94,6 @@ void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
g_work->data[28] = 0x80000000;
}
void lbry_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
int64_t lbry_get_max64() { return 0x1ffffLL; }
int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
@@ -119,11 +116,11 @@ bool register_lbry_algo( algo_gate_t* gate )
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
// gate->build_block_header = (void*)&build_block_header;
gate->build_extraheader = (void*)&lbry_build_extraheader;
gate->set_target = (void*)&lbry_set_target;
gate->ntime_index = LBRY_NTIME_INDEX;
gate->nbits_index = LBRY_NBITS_INDEX;
gate->nonce_index = LBRY_NONCE_INDEX;
gate->get_work_data_size = (void*)&lbry_get_work_data_size;
opt_target_factor = 256.0;
return true;
}

View File

@@ -5,23 +5,26 @@
#include <stddef.h>
#include <string.h>
/*
static const uint32_t IV[5] =
{ 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
*/
/*
* Round constants for RIPEMD-160.
*/
#define K11 0x00000000
#define K12 0x5A827999
#define K13 0x6ED9EBA1
#define K14 0x8F1BBCDC
#define K15 0xA953FD4E
#define K21 0x50A28BE6
#define K22 0x5C4DD124
#define K23 0x6D703EF3
#define K24 0x7A6D76E9
#define K25 0x00000000
#define K11 0x0000000000000000
#define K12 0x5A8279995A827999
#define K13 0x6ED9EBA16ED9EBA1
#define K14 0x8F1BBCDC8F1BBCDC
#define K15 0xA953FD4EA953FD4E
#define K21 0x50A28BE650A28BE6
#define K22 0x5C4DD1245C4DD124
#define K23 0x6D703EF36D703EF3
#define K24 0x7A6D76E97A6D76E9
#define K25 0x0000000000000000
// RIPEMD-160 4 way
@@ -44,7 +47,7 @@ static const uint32_t IV[5] =
do{ \
a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( a, f( b ,c, d ) ), r ), \
_mm_set1_epi32( k ) ), s ), e ); \
m128_const1_64( k ) ), s ), e ); \
c = mm128_rol_32( c, 10 );\
} while (0)
@@ -248,11 +251,11 @@ static void ripemd160_4way_round( ripemd160_4way_context *sc )
void ripemd160_4way_init( ripemd160_4way_context *sc )
{
sc->val[0] = _mm_set1_epi32( IV[0] );
sc->val[1] = _mm_set1_epi32( IV[1] );
sc->val[2] = _mm_set1_epi32( IV[2] );
sc->val[3] = _mm_set1_epi32( IV[3] );
sc->val[4] = _mm_set1_epi32( IV[4] );
sc->val[0] = m128_const1_64( 0x6745230167452301 );
sc->val[1] = m128_const1_64( 0xEFCDAB89EFCDAB89 );
sc->val[2] = m128_const1_64( 0x98BADCFE98BADCFE );
sc->val[3] = m128_const1_64( 0x1032547610325476 );
sc->val[4] = m128_const1_64( 0xC3D2E1F0C3D2E1F0 );
sc->count_high = sc->count_low = 0;
}
@@ -343,7 +346,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
do{ \
a = _mm256_add_epi32( mm256_rol_32( _mm256_add_epi32( _mm256_add_epi32( \
_mm256_add_epi32( a, f( b ,c, d ) ), r ), \
_mm256_set1_epi32( k ) ), s ), e ); \
m256_const1_64( k ) ), s ), e ); \
c = mm256_rol_32( c, 10 );\
} while (0)
@@ -548,11 +551,11 @@ static void ripemd160_8way_round( ripemd160_8way_context *sc )
void ripemd160_8way_init( ripemd160_8way_context *sc )
{
sc->val[0] = _mm256_set1_epi32( IV[0] );
sc->val[1] = _mm256_set1_epi32( IV[1] );
sc->val[2] = _mm256_set1_epi32( IV[2] );
sc->val[3] = _mm256_set1_epi32( IV[3] );
sc->val[4] = _mm256_set1_epi32( IV[4] );
sc->val[0] = m256_const1_64( 0x6745230167452301 );
sc->val[1] = m256_const1_64( 0xEFCDAB89EFCDAB89 );
sc->val[2] = m256_const1_64( 0x98BADCFE98BADCFE );
sc->val[3] = m256_const1_64( 0x1032547610325476 );
sc->val[4] = m256_const1_64( 0xC3D2E1F0C3D2E1F0 );
sc->count_high = sc->count_low = 0;
}

View File

@@ -1089,13 +1089,13 @@ bool register_neoscrypt_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_neoscrypt;
gate->hash = (void*)&neoscrypt;
gate->get_max64 = (void*)&get_neoscrypt_max64;
gate->set_target = (void*)&scrypt_set_target;
gate->wait_for_diff = (void*)&neoscrypt_wait_for_diff;
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
gate->get_work_data_size = (void*)&neoscrypt_get_work_data_size;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -503,8 +503,8 @@ bool register_pluck_algo( algo_gate_t* gate )
gate->miner_thread_init = (void*)&pluck_miner_thread_init;
gate->scanhash = (void*)&scanhash_pluck;
gate->hash = (void*)&pluck_hash;
gate->set_target = (void*)&scrypt_set_target;
gate->get_max64 = (void*)&pluck_get_max64;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -698,8 +698,8 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input,
extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
uint32_t midstate[8];
uint32_t n = pdata[19] - 1;
@@ -783,13 +783,18 @@ bool register_scrypt_algo( algo_gate_t* gate )
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt;
// gate->hash = (void*)&scrypt_1024_1_1_256_24way;
gate->set_target = (void*)&scrypt_set_target;
gate->get_max64 = (void*)&scrypt_get_max64;
opt_target_factor = 65536.0;
if ( !opt_scrypt_n )
if ( !opt_param_n )
{
opt_param_n = 1024;
scratchbuf_size = 1024;
}
else
scratchbuf_size = opt_scrypt_n;
scratchbuf_size = opt_param_n;
applog(LOG_INFO,"Scrypt paramaters: N= %d, R= 1.", opt_param_n );
return true;
};

View File

@@ -55,6 +55,7 @@ typedef uint32_t scrypt_mix_word_t;
#include "scrypt-jane-romix-template.h"
#endif
/* cpu agnostic */
#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
#define SCRYPT_MIX_FN chacha_core_basic

View File

@@ -1,9 +1,11 @@
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX)
/*
#if defined(SCRYPT_CHOOSE_COMPILETIME)
#undef SCRYPT_ROMIX_FN
#define SCRYPT_ROMIX_FN scrypt_ROMix
#endif
*/
#undef SCRYPT_HAVE_ROMIX
#define SCRYPT_HAVE_ROMIX

View File

@@ -240,24 +240,24 @@ bool register_scryptjane_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_scryptjane;
gate->hash = (void*)&scryptjanehash;
gate->set_target = (void*)&scrypt_set_target;
gate->get_max64 = (void*)&get_max64_0x40LL;
opt_target_factor = 65536.0;
// figure out if arg in N or Nfactor
if ( !opt_scrypt_n )
if ( !opt_param_n )
{
applog( LOG_ERR, "The N factor must be specified in the form algo:nf");
return false;
}
else if ( opt_scrypt_n < 32 )
else if ( opt_param_n < 32 )
{
// arg is Nfactor, calculate N
sj_N = 1 << ( opt_scrypt_n + 1 );
sj_N = 1 << ( opt_param_n + 1 );
}
else
{
// arg is N
sj_N = opt_scrypt_n;
sj_N = opt_param_n;
}
return true;
}

View File

@@ -55,32 +55,13 @@ typedef struct {
__m128i buf[64>>2];
__m128i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_4way_context;
void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
/*
// SHA-256 7 way hybrid
// Combines SSE, MMX and scalar data to do 8 + 2 + 1 parallel.
typedef struct {
__m128i bufx[64>>2];
__m128i valx[8];
__m64 bufy[64>>2];
__m64 valy[8];
uint32_t bufz[64>>2];
uint32_t valz[8];
uint32_t count_high, count_low;
} sha256_7way_context;
void sha256_7way_init( sha256_7way_context *ctx );
void sha256_7way( sha256_7way_context *ctx, const void *datax,
void *datay, void *dataz, size_t len );
void sha256_7way_close( sha256_7way_context *ctx, void *dstx, void *dstyx,
void *dstz );
*/
#if defined (__AVX2__)
// SHA-256 8 way
@@ -89,6 +70,7 @@ typedef struct {
__m256i buf[64>>2];
__m256i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_8way_context;
void sha256_8way_init( sha256_8way_context *sc );
@@ -103,6 +85,7 @@ typedef struct {
__m256i buf[128>>3];
__m256i val[8];
uint64_t count;
bool initialized;
} sha512_4way_context;
void sha512_4way_init( sha512_4way_context *sc);

View File

@@ -12,6 +12,7 @@
#include <string.h>
#include <inttypes.h>
#include <openssl/sha.h>
#if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__)
#define EXTERN_SHA256
@@ -197,7 +198,17 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
{
uint32_t S[16], T[16];
#if defined(__SHA__)
SHA256_CTX ctx;
SHA256_Init( &ctx );
SHA256_Update( &ctx, data, len );
SHA256_Final( (unsigned char*)hash, &ctx );
SHA256_Init( &ctx );
SHA256_Update( &ctx, hash, 32 );
SHA256_Final( (unsigned char*)hash, &ctx );
#else
uint32_t S[16], T[16];
int i, r;
sha256_init(S);
@@ -218,6 +229,7 @@ extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
sha256_transform(T, S, 0);
for (i = 0; i < 8; i++)
be32enc((uint32_t *)hash + i, T[i]);
#endif
}
static inline void sha256d_preextend(uint32_t *W)
@@ -635,9 +647,46 @@ int scanhash_sha256d( struct work *work,
return 0;
}
int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(64) data[20];
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
memcpy( data, pdata, 80 );
do {
data[19] = ++n;
sha256d( (unsigned char*)hash, (const unsigned char*)data, 80 );
if ( unlikely( swab32( hash[7] ) <= Htarg ) )
{
pdata[19] = n;
sha256d_80_swap(hash, pdata);
if ( fulltest( hash, ptarget ) && !opt_benchmark )
submit_solution( work, hash, mythr );
}
} while ( likely( n < max_nonce && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_sha256d;
#if defined(__SHA__)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_SHA256d;
#else
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_sha256d;
#endif
gate->hash = (void*)&sha256d;
return true;
};

View File

@@ -34,19 +34,18 @@
#include <stddef.h>
#include <string.h>
#include "sha2-hash-4way.h"
#include <stdio.h>
#include "sha-hash-4way.h"
// SHA-256 32 bit
/*
static const sph_u32 H256[8] = {
SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
};
*/
static const sph_u32 K256[64] = {
SPH_C32(0x428A2F98), SPH_C32(0x71374491),
@@ -113,16 +112,17 @@ static const sph_u32 K256[64] = {
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
register __m128i T1, T2; \
__m128i T1, T2; \
__m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \
T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
_mm_set1_epi32( K256[( (j)+(i) )] ), W[i] ) ); \
K, W[i] ) ); \
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
D = _mm_add_epi32( D, T1 ); \
H = _mm_add_epi32( T1, T2 ); \
} while (0)
static void
sha256_4way_round( __m128i *in, __m128i r[8] )
sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] )
{
register __m128i A, B, C, D, E, F, G, H;
__m128i W[16];
@@ -130,14 +130,28 @@ sha256_4way_round( __m128i *in, __m128i r[8] )
mm128_block_bswap_32( W, in );
mm128_block_bswap_32( W+8, in+8 );
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
if ( ctx->initialized )
{
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
}
else
{
A = m128_const1_64( 0x6A09E6676A09E667 );
B = m128_const1_64( 0xBB67AE85BB67AE85 );
C = m128_const1_64( 0x3C6EF3723C6EF372 );
D = m128_const1_64( 0xA54FF53AA54FF53A );
E = m128_const1_64( 0x510E527F510E527F );
F = m128_const1_64( 0x9B05688C9B05688C );
G = m128_const1_64( 0x1F83D9AB1F83D9AB );
H = m128_const1_64( 0x5BE0CD195BE0CD19 );
}
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
@@ -193,19 +207,36 @@ sha256_4way_round( __m128i *in, __m128i r[8] )
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
r[0] = _mm_add_epi32( r[0], A );
r[1] = _mm_add_epi32( r[1], B );
r[2] = _mm_add_epi32( r[2], C );
r[3] = _mm_add_epi32( r[3], D );
r[4] = _mm_add_epi32( r[4], E );
r[5] = _mm_add_epi32( r[5], F );
r[6] = _mm_add_epi32( r[6], G );
r[7] = _mm_add_epi32( r[7], H );
if ( ctx->initialized )
{
r[0] = _mm_add_epi32( r[0], A );
r[1] = _mm_add_epi32( r[1], B );
r[2] = _mm_add_epi32( r[2], C );
r[3] = _mm_add_epi32( r[3], D );
r[4] = _mm_add_epi32( r[4], E );
r[5] = _mm_add_epi32( r[5], F );
r[6] = _mm_add_epi32( r[6], G );
r[7] = _mm_add_epi32( r[7], H );
}
else
{
ctx->initialized = true;
r[0] = _mm_add_epi32( A, m128_const1_64( 0x6A09E6676A09E667 ) );
r[1] = _mm_add_epi32( B, m128_const1_64( 0xBB67AE85BB67AE85 ) );
r[2] = _mm_add_epi32( C, m128_const1_64( 0x3C6EF3723C6EF372 ) );
r[3] = _mm_add_epi32( D, m128_const1_64( 0xA54FF53AA54FF53A ) );
r[4] = _mm_add_epi32( E, m128_const1_64( 0x510E527F510E527F ) );
r[5] = _mm_add_epi32( F, m128_const1_64( 0x9B05688C9B05688C ) );
r[6] = _mm_add_epi32( G, m128_const1_64( 0x1F83D9AB1F83D9AB ) );
r[7] = _mm_add_epi32( H, m128_const1_64( 0x5BE0CD195BE0CD19 ) );
}
}
void sha256_4way_init( sha256_4way_context *sc )
{
sc->initialized = false;
sc->count_high = sc->count_low = 0;
/*
sc->val[0] = _mm_set1_epi32( H256[0] );
sc->val[1] = _mm_set1_epi32( H256[1] );
sc->val[2] = _mm_set1_epi32( H256[2] );
@@ -214,6 +245,7 @@ void sha256_4way_init( sha256_4way_context *sc )
sc->val[5] = _mm_set1_epi32( H256[5] );
sc->val[6] = _mm_set1_epi32( H256[6] );
sc->val[7] = _mm_set1_epi32( H256[7] );
*/
}
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )
@@ -237,7 +269,7 @@ void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )
len -= clen;
if ( ptr == buf_size )
{
sha256_4way_round( sc->buf, sc->val );
sha256_4way_round( sc, sc->buf, sc->val );
ptr = 0;
}
clow = sc->count_low;
@@ -256,13 +288,13 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );
sc->buf[ ptr>>2 ] = m128_const1_64( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
{
memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_4way_round( sc->buf, sc->val );
sha256_4way_round( sc, sc->buf, sc->val );
memset_zero_128( sc->buf, pad >> 2 );
}
else
@@ -276,7 +308,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
mm128_bswap_32( _mm_set1_epi32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] =
mm128_bswap_32( _mm_set1_epi32( low ) );
sha256_4way_round( sc->buf, sc->val );
sha256_4way_round( sc, sc->buf, sc->val );
mm128_block_bswap_32( dst, sc->val );
}
@@ -313,16 +345,17 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
#define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
register __m256i T1, T2; \
T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \
_mm256_set1_epi32( K256[( (j)+(i) )] ), W[i] ) ); \
__m256i T1, T2; \
__m256i K = _mm256_set1_epi32( K256[( (j)+(i) )] ); \
T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \
K, W[i] ) ); \
T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
} while (0)
static void
sha256_8way_round( __m256i *in, __m256i r[8] )
sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] )
{
register __m256i A, B, C, D, E, F, G, H;
__m256i W[16];
@@ -330,14 +363,28 @@ sha256_8way_round( __m256i *in, __m256i r[8] )
mm256_block_bswap_32( W , in );
mm256_block_bswap_32( W+8, in+8 );
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
if ( ctx->initialized )
{
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
}
else
{
A = m256_const1_64( 0x6A09E6676A09E667 );
B = m256_const1_64( 0xBB67AE85BB67AE85 );
C = m256_const1_64( 0x3C6EF3723C6EF372 );
D = m256_const1_64( 0xA54FF53AA54FF53A );
E = m256_const1_64( 0x510E527F510E527F );
F = m256_const1_64( 0x9B05688C9B05688C );
G = m256_const1_64( 0x1F83D9AB1F83D9AB );
H = m256_const1_64( 0x5BE0CD195BE0CD19 );
}
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
@@ -393,20 +440,36 @@ sha256_8way_round( __m256i *in, __m256i r[8] )
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
r[0] = _mm256_add_epi32( r[0], A );
r[1] = _mm256_add_epi32( r[1], B );
r[2] = _mm256_add_epi32( r[2], C );
r[3] = _mm256_add_epi32( r[3], D );
r[4] = _mm256_add_epi32( r[4], E );
r[5] = _mm256_add_epi32( r[5], F );
r[6] = _mm256_add_epi32( r[6], G );
r[7] = _mm256_add_epi32( r[7], H );
if ( ctx->initialized )
{
r[0] = _mm256_add_epi32( r[0], A );
r[1] = _mm256_add_epi32( r[1], B );
r[2] = _mm256_add_epi32( r[2], C );
r[3] = _mm256_add_epi32( r[3], D );
r[4] = _mm256_add_epi32( r[4], E );
r[5] = _mm256_add_epi32( r[5], F );
r[6] = _mm256_add_epi32( r[6], G );
r[7] = _mm256_add_epi32( r[7], H );
}
else
{
ctx->initialized = true;
r[0] = _mm256_add_epi32( A, m256_const1_64( 0x6A09E6676A09E667 ) );
r[1] = _mm256_add_epi32( B, m256_const1_64( 0xBB67AE85BB67AE85 ) );
r[2] = _mm256_add_epi32( C, m256_const1_64( 0x3C6EF3723C6EF372 ) );
r[3] = _mm256_add_epi32( D, m256_const1_64( 0xA54FF53AA54FF53A ) );
r[4] = _mm256_add_epi32( E, m256_const1_64( 0x510E527F510E527F ) );
r[5] = _mm256_add_epi32( F, m256_const1_64( 0x9B05688C9B05688C ) );
r[6] = _mm256_add_epi32( G, m256_const1_64( 0x1F83D9AB1F83D9AB ) );
r[7] = _mm256_add_epi32( H, m256_const1_64( 0x5BE0CD195BE0CD19 ) );
}
}
void sha256_8way_init( sha256_8way_context *sc )
{
sc->initialized = false;
sc->count_high = sc->count_low = 0;
/*
sc->val[0] = _mm256_set1_epi32( H256[0] );
sc->val[1] = _mm256_set1_epi32( H256[1] );
sc->val[2] = _mm256_set1_epi32( H256[2] );
@@ -415,6 +478,7 @@ void sha256_8way_init( sha256_8way_context *sc )
sc->val[5] = _mm256_set1_epi32( H256[5] );
sc->val[6] = _mm256_set1_epi32( H256[6] );
sc->val[7] = _mm256_set1_epi32( H256[7] );
*/
}
void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )
@@ -438,7 +502,7 @@ void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )
len -= clen;
if ( ptr == buf_size )
{
sha256_8way_round( sc->buf, sc->val );
sha256_8way_round( sc, sc->buf, sc->val );
ptr = 0;
}
clow = sc->count_low;
@@ -457,13 +521,13 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
sc->buf[ ptr>>2 ] = m256_const1_64( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
{
memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_8way_round( sc->buf, sc->val );
sha256_8way_round( sc, sc->buf, sc->val );
memset_zero_256( sc->buf, pad >> 2 );
}
else
@@ -478,207 +542,10 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
sc->buf[ ( pad+4 ) >> 2 ] =
mm256_bswap_32( _mm256_set1_epi32( low ) );
sha256_8way_round( sc->buf, sc->val );
sha256_8way_round( sc, sc->buf, sc->val );
mm256_block_bswap_32( dst, sc->val );
}
// SHA-512 4 way 64 bit
static const sph_u64 H512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
static const sph_u64 K512[80] = {
SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
};
#define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
#define MAJ(X, Y, Z) \
_mm256_or_si256( _mm256_and_si256( X, Y ), \
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
#define BSG5_0(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) )
#define BSG5_1(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) )
#define SSG5_0(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 1), mm256_ror_64(x, 8) ), _mm256_srli_epi64(x, 7) )
#define SSG5_1(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 19), mm256_ror_64(x, 61) ), _mm256_srli_epi64(x, 6) )
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
register __m256i T1, T2; \
T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \
_mm256_set1_epi64x( K512[i] ), W[i] ) ); \
T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
D = _mm256_add_epi64( D, T1 ); \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
static void
sha512_4way_round( __m256i *in, __m256i r[8] )
{
int i;
register __m256i A, B, C, D, E, F, G, H;
__m256i W[80];
mm256_block_bswap_64( W , in );
mm256_block_bswap_64( W+8, in+8 );
for ( i = 16; i < 80; i++ )
W[i] = mm256_add4_64( SSG5_1( W[ i- 2 ] ), W[ i- 7 ],
SSG5_0( W[ i-15 ] ), W[ i-16 ] );
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
for ( i = 0; i < 80; i += 8 )
{
SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
SHA3_4WAY_STEP( H, A, B, C, D, E, F, G, i + 1 );
SHA3_4WAY_STEP( G, H, A, B, C, D, E, F, i + 2 );
SHA3_4WAY_STEP( F, G, H, A, B, C, D, E, i + 3 );
SHA3_4WAY_STEP( E, F, G, H, A, B, C, D, i + 4 );
SHA3_4WAY_STEP( D, E, F, G, H, A, B, C, i + 5 );
SHA3_4WAY_STEP( C, D, E, F, G, H, A, B, i + 6 );
SHA3_4WAY_STEP( B, C, D, E, F, G, H, A, i + 7 );
}
r[0] = _mm256_add_epi64( r[0], A );
r[1] = _mm256_add_epi64( r[1], B );
r[2] = _mm256_add_epi64( r[2], C );
r[3] = _mm256_add_epi64( r[3], D );
r[4] = _mm256_add_epi64( r[4], E );
r[5] = _mm256_add_epi64( r[5], F );
r[6] = _mm256_add_epi64( r[6], G );
r[7] = _mm256_add_epi64( r[7], H );
}
void sha512_4way_init( sha512_4way_context *sc )
{
sc->count = 0;
sc->val[0] = _mm256_set1_epi64x( H512[0] );
sc->val[1] = _mm256_set1_epi64x( H512[1] );
sc->val[2] = _mm256_set1_epi64x( H512[2] );
sc->val[3] = _mm256_set1_epi64x( H512[3] );
sc->val[4] = _mm256_set1_epi64x( H512[4] );
sc->val[5] = _mm256_set1_epi64x( H512[5] );
sc->val[6] = _mm256_set1_epi64x( H512[6] );
sc->val[7] = _mm256_set1_epi64x( H512[7] );
}
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
size_t ptr;
const int buf_size = 128;
ptr = (unsigned)sc->count & (buf_size - 1U);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
vdata = vdata + (clen>>3);
ptr += clen;
len -= clen;
if ( ptr == buf_size )
{
sha512_4way_round( sc->buf, sc->val );
ptr = 0;
}
sc->count += clen;
}
}
void sha512_4way_close( sha512_4way_context *sc, void *dst )
{
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
ptr += 8;
if ( ptr > pad )
{
memset_zero_256( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );
sha512_4way_round( sc->buf, sc->val );
memset_zero_256( sc->buf, pad >> 3 );
}
else
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
sha512_4way_round( sc->buf, sc->val );
mm256_block_bswap_64( dst, sc->val );
}
#endif // __AVX2__
#endif // __SSE2__

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha2-hash-4way.h"
#include "sha-hash-4way.h"
#if defined(SHA256T_8WAY)

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha2-hash-4way.h"
#include "sha-hash-4way.h"
#if defined(SHA256T_11WAY)
@@ -158,7 +158,7 @@ void sha256t_8way_hash( void* output, const void* input )
sha256_8way_close( &ctx, output );
}
int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
@@ -166,12 +166,12 @@ int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
const int thr_id = mythr->id;
const uint64_t htmax[] = { 0,
0xF,
@@ -194,7 +194,7 @@ int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
const uint32_t mask = masks[m];
do
{
*noncev = mm256_bswap_32( _mm256_set_epi32(
@@ -244,7 +244,7 @@ void sha256t_4way_hash( void* output, const void* input )
sha256_4way_close( &ctx, output );
}
int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -252,12 +252,12 @@ int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m128i *noncev = (__m128i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
const int thr_id = mythr->id;
const uint64_t htmax[] = { 0,
0xF,
@@ -278,7 +278,7 @@ int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
const uint32_t mask = masks[m];
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
pdata[19] = n;

320
algo/sha/sha512-hash-4way.c Normal file
View File

@@ -0,0 +1,320 @@
/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */
/*
* SHA-384 / SHA-512 implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#if defined(__AVX2__)
#include <stddef.h>
#include <string.h>
#include "sha-hash-4way.h"
// SHA-512 4 way 64 bit
/*
static const sph_u64 H512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
*/
static const sph_u64 K512[80] = {
SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
};
#define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
#define MAJ(X, Y, Z) \
_mm256_or_si256( _mm256_and_si256( X, Y ), \
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
#define BSG5_0(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) )
#define BSG5_1(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) )
#define SSG5_0(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 1), mm256_ror_64(x, 8) ), _mm256_srli_epi64(x, 7) )
#define SSG5_1(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 19), mm256_ror_64(x, 61) ), _mm256_srli_epi64(x, 6) )
// Interleave SSG0 & SSG1 for better throughput.
// return ssg0(w0) + ssg1(w1)
static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
{
__m256i w0a, w1a, w0b, w1b;
w0a = mm256_ror_64( w0, 1 );
w1a = mm256_ror_64( w1,19 );
w0b = mm256_ror_64( w0, 8 );
w1b = mm256_ror_64( w1,61 );
w0a = _mm256_xor_si256( w0a, w0b );
w1a = _mm256_xor_si256( w1a, w1b );
w0b = _mm256_srli_epi64( w0, 7 );
w1b = _mm256_srli_epi64( w1, 6 );
w0a = _mm256_xor_si256( w0a, w0b );
w1a = _mm256_xor_si256( w1a, w1b );
return _mm256_add_epi64( w0a, w1a );
}
#define SSG512x2_0( w0, w1, i ) do \
{ \
__m256i X0a, X1a, X0b, X1b; \
X0a = mm256_ror_64( W[i-15], 1 ); \
X1a = mm256_ror_64( W[i-14], 1 ); \
X0b = mm256_ror_64( W[i-15], 8 ); \
X1b = mm256_ror_64( W[i-14], 8 ); \
X0a = _mm256_xor_si256( X0a, X0b ); \
X1a = _mm256_xor_si256( X1a, X1b ); \
X0b = _mm256_srli_epi64( W[i-15], 7 ); \
X1b = _mm256_srli_epi64( W[i-14], 7 ); \
w0 = _mm256_xor_si256( X0a, X0b ); \
w1 = _mm256_xor_si256( X1a, X1b ); \
} while(0)
#define SSG512x2_1( w0, w1, i ) do \
{ \
__m256i X0a, X1a, X0b, X1b; \
X0a = mm256_ror_64( W[i-2],19 ); \
X1a = mm256_ror_64( W[i-1],19 ); \
X0b = mm256_ror_64( W[i-2],61 ); \
X1b = mm256_ror_64( W[i-1],61 ); \
X0a = _mm256_xor_si256( X0a, X0b ); \
X1a = _mm256_xor_si256( X1a, X1b ); \
X0b = _mm256_srli_epi64( W[i-2], 6 ); \
X1b = _mm256_srli_epi64( W[i-1], 6 ); \
w0 = _mm256_xor_si256( X0a, X0b ); \
w1 = _mm256_xor_si256( X1a, X1b ); \
} while(0)
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m256i T1, T2; \
__m256i K = _mm256_set1_epi64x( K512[ i ] ); \
T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \
K, W[i] ) ); \
T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
D = _mm256_add_epi64( D, T1 ); \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
static void
sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
{
int i;
register __m256i A, B, C, D, E, F, G, H;
__m256i W[80];
mm256_block_bswap_64( W , in );
mm256_block_bswap_64( W+8, in+8 );
for ( i = 16; i < 80; i++ )
W[i] = _mm256_add_epi64( ssg512_add( W[i-15], W[i-2] ),
_mm256_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
if ( ctx->initialized )
{
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
}
else
{
A = m256_const1_64( 0x6A09E667F3BCC908 );
B = m256_const1_64( 0xBB67AE8584CAA73B );
C = m256_const1_64( 0x3C6EF372FE94F82B );
D = m256_const1_64( 0xA54FF53A5F1D36F1 );
E = m256_const1_64( 0x510E527FADE682D1 );
F = m256_const1_64( 0x9B05688C2B3E6C1F );
G = m256_const1_64( 0x1F83D9ABFB41BD6B );
H = m256_const1_64( 0x5BE0CD19137E2179 );
}
for ( i = 0; i < 80; i += 8 )
{
SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
SHA3_4WAY_STEP( H, A, B, C, D, E, F, G, i + 1 );
SHA3_4WAY_STEP( G, H, A, B, C, D, E, F, i + 2 );
SHA3_4WAY_STEP( F, G, H, A, B, C, D, E, i + 3 );
SHA3_4WAY_STEP( E, F, G, H, A, B, C, D, i + 4 );
SHA3_4WAY_STEP( D, E, F, G, H, A, B, C, i + 5 );
SHA3_4WAY_STEP( C, D, E, F, G, H, A, B, i + 6 );
SHA3_4WAY_STEP( B, C, D, E, F, G, H, A, i + 7 );
}
if ( ctx->initialized )
{
r[0] = _mm256_add_epi64( r[0], A );
r[1] = _mm256_add_epi64( r[1], B );
r[2] = _mm256_add_epi64( r[2], C );
r[3] = _mm256_add_epi64( r[3], D );
r[4] = _mm256_add_epi64( r[4], E );
r[5] = _mm256_add_epi64( r[5], F );
r[6] = _mm256_add_epi64( r[6], G );
r[7] = _mm256_add_epi64( r[7], H );
}
else
{
ctx->initialized = true;
r[0] = _mm256_add_epi64( A, m256_const1_64( 0x6A09E667F3BCC908 ) );
r[1] = _mm256_add_epi64( B, m256_const1_64( 0xBB67AE8584CAA73B ) );
r[2] = _mm256_add_epi64( C, m256_const1_64( 0x3C6EF372FE94F82B ) );
r[3] = _mm256_add_epi64( D, m256_const1_64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm256_add_epi64( E, m256_const1_64( 0x510E527FADE682D1 ) );
r[5] = _mm256_add_epi64( F, m256_const1_64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm256_add_epi64( G, m256_const1_64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm256_add_epi64( H, m256_const1_64( 0x5BE0CD19137E2179 ) );
}
}
void sha512_4way_init( sha512_4way_context *sc )
{
sc->initialized = false;
sc->count = 0;
/*
sc->val[0] = _mm256_set1_epi64x( H512[0] );
sc->val[1] = _mm256_set1_epi64x( H512[1] );
sc->val[2] = _mm256_set1_epi64x( H512[2] );
sc->val[3] = _mm256_set1_epi64x( H512[3] );
sc->val[4] = _mm256_set1_epi64x( H512[4] );
sc->val[5] = _mm256_set1_epi64x( H512[5] );
sc->val[6] = _mm256_set1_epi64x( H512[6] );
sc->val[7] = _mm256_set1_epi64x( H512[7] );
*/
}
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
size_t ptr;
const int buf_size = 128;
ptr = (unsigned)sc->count & (buf_size - 1U);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
vdata = vdata + (clen>>3);
ptr += clen;
len -= clen;
if ( ptr == buf_size )
{
sha512_4way_round( sc, sc->buf, sc->val );
ptr = 0;
}
sc->count += clen;
}
}
void sha512_4way_close( sha512_4way_context *sc, void *dst )
{
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
ptr += 8;
if ( ptr > pad )
{
memset_zero_256( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );
sha512_4way_round( sc, sc->buf, sc->val );
memset_zero_256( sc->buf, pad >> 3 );
}
else
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] =
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
sha512_4way_round( sc, sc->buf, sc->val );
mm256_block_bswap_64( dst, sc->val );
}
#endif // __AVX2__

View File

@@ -63,7 +63,6 @@ extern "C"{
* that it can optimize them at will.
*/
/* BEGIN -- automatically generated code. */
#define DECL_STATE \
__m128i A00, A01, A02, A03, A04, A05, A06, A07, \
@@ -76,8 +75,11 @@ extern "C"{
M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u32 Wlow, Whigh;
#define READ_STATE(state) do { \
A00 = (state)->A[0]; \
#define READ_STATE(state) do \
{ \
if ( (state)->state_loaded ) \
{ \
A00 = (state)->A[0]; \
A01 = (state)->A[1]; \
A02 = (state)->A[2]; \
A03 = (state)->A[3]; \
@@ -121,9 +123,58 @@ extern "C"{
CD = (state)->C[13]; \
CE = (state)->C[14]; \
CF = (state)->C[15]; \
Wlow = (state)->Wlow; \
Whigh = (state)->Whigh; \
} while (0)
} \
else \
{ \
(state)->state_loaded = true; \
A00 = m128_const1_64( 0x20728DFD20728DFD ); \
A01 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
A02 = m128_const1_64( 0xE782B699E782B699 ); \
A03 = m128_const1_64( 0x5530463255304632 ); \
A04 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
A05 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
A06 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
A07 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
A08 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
A09 = m128_const1_64( 0x8BD144108BD14410 ); \
A0A = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
A0B = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m128_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m128_const1_64( 0x07B385F307B385F3 ); \
B2 = m128_const1_64( 0xE7442C26E7442C26 ); \
B3 = m128_const1_64( 0xCC8AD640CC8AD640 ); \
B4 = m128_const1_64( 0xEB6F56C7EB6F56C7 ); \
B5 = m128_const1_64( 0x1EA81AA91EA81AA9 ); \
B6 = m128_const1_64( 0x73B9D31473B9D314 ); \
B7 = m128_const1_64( 0x1DE85D081DE85D08 ); \
B8 = m128_const1_64( 0x48910A5A48910A5A ); \
B9 = m128_const1_64( 0x893B22DB893B22DB ); \
BA = m128_const1_64( 0xC5A0DF44C5A0DF44 ); \
BB = m128_const1_64( 0xBBC4324EBBC4324E ); \
BC = m128_const1_64( 0x72D2F24072D2F240 ); \
BD = m128_const1_64( 0x75941D9975941D99 ); \
BE = m128_const1_64( 0x6D8BDE826D8BDE82 ); \
BF = m128_const1_64( 0xA1A7502BA1A7502B ); \
C0 = m128_const1_64( 0xD9BF68D1D9BF68D1 ); \
C1 = m128_const1_64( 0x58BAD75058BAD750 ); \
C2 = m128_const1_64( 0x56028CB256028CB2 ); \
C3 = m128_const1_64( 0x8134F3598134F359 ); \
C4 = m128_const1_64( 0xB5D469D8B5D469D8 ); \
C5 = m128_const1_64( 0x941A8CC2941A8CC2 ); \
C6 = m128_const1_64( 0x418B2A6E418B2A6E ); \
C7 = m128_const1_64( 0x0405278004052780 ); \
C8 = m128_const1_64( 0x7F07D7877F07D787 ); \
C9 = m128_const1_64( 0x5194358F5194358F ); \
CA = m128_const1_64( 0x3C60D6653C60D665 ); \
CB = m128_const1_64( 0xBE97D79ABE97D79A ); \
CC = m128_const1_64( 0x950C3434950C3434 ); \
CD = m128_const1_64( 0xAED9A06DAED9A06D ); \
CE = m128_const1_64( 0x2537DC8D2537DC8D ); \
CF = m128_const1_64( 0x7CDB59697CDB5969 ); \
} \
Wlow = (state)->Wlow; \
Whigh = (state)->Whigh; \
} while (0)
#define WRITE_STATE(state) do { \
(state)->A[0] = A00; \
@@ -397,6 +448,7 @@ do { \
Whigh = T32(Whigh + 1); \
} while (0)
/*
static const sph_u32 A_init_256[] = {
C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191),
C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C),
@@ -436,33 +488,115 @@ static const sph_u32 C_init_512[] = {
C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
};
*/
static void
shabal_4way_init( void *cc, unsigned size )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
int i;
if ( size == 512 )
{
for ( i = 0; i < 12; i++ )
sc->A[i] = _mm_set1_epi32( A_init_512[i] );
for ( i = 0; i < 16; i++ )
{
sc->B[i] = _mm_set1_epi32( B_init_512[i] );
sc->C[i] = _mm_set1_epi32( C_init_512[i] );
}
{ // copy immediate constants directly to working registers later.
sc->state_loaded = false;
/*
sc->A[ 0] = m128_const1_64( 0x20728DFD20728DFD );
sc->A[ 1] = m128_const1_64( 0x46C0BD5346C0BD53 );
sc->A[ 2] = m128_const1_64( 0xE782B699E782B699 );
sc->A[ 3] = m128_const1_64( 0x5530463255304632 );
sc->A[ 4] = m128_const1_64( 0x71B4EF9071B4EF90 );
sc->A[ 5] = m128_const1_64( 0x0EA9E82C0EA9E82C );
sc->A[ 6] = m128_const1_64( 0xDBB930F1DBB930F1 );
sc->A[ 7] = m128_const1_64( 0xFAD06B8BFAD06B8B );
sc->A[ 8] = m128_const1_64( 0xBE0CAE40BE0CAE40 );
sc->A[ 9] = m128_const1_64( 0x8BD144108BD14410 );
sc->A[10] = m128_const1_64( 0x76D2ADAC76D2ADAC );
sc->A[11] = m128_const1_64( 0x28ACAB7F28ACAB7F );
sc->B[ 0] = m128_const1_64( 0xC1099CB7C1099CB7 );
sc->B[ 1] = m128_const1_64( 0x07B385F307B385F3 );
sc->B[ 2] = m128_const1_64( 0xE7442C26E7442C26 );
sc->B[ 3] = m128_const1_64( 0xCC8AD640CC8AD640 );
sc->B[ 4] = m128_const1_64( 0xEB6F56C7EB6F56C7 );
sc->B[ 5] = m128_const1_64( 0x1EA81AA91EA81AA9 );
sc->B[ 6] = m128_const1_64( 0x73B9D31473B9D314 );
sc->B[ 7] = m128_const1_64( 0x1DE85D081DE85D08 );
sc->B[ 8] = m128_const1_64( 0x48910A5A48910A5A );
sc->B[ 9] = m128_const1_64( 0x893B22DB893B22DB );
sc->B[10] = m128_const1_64( 0xC5A0DF44C5A0DF44 );
sc->B[11] = m128_const1_64( 0xBBC4324EBBC4324E );
sc->B[12] = m128_const1_64( 0x72D2F24072D2F240 );
sc->B[13] = m128_const1_64( 0x75941D9975941D99 );
sc->B[14] = m128_const1_64( 0x6D8BDE826D8BDE82 );
sc->B[15] = m128_const1_64( 0xA1A7502BA1A7502B );
sc->C[ 0] = m128_const1_64( 0xD9BF68D1D9BF68D1 );
sc->C[ 1] = m128_const1_64( 0x58BAD75058BAD750 );
sc->C[ 2] = m128_const1_64( 0x56028CB256028CB2 );
sc->C[ 3] = m128_const1_64( 0x8134F3598134F359 );
sc->C[ 4] = m128_const1_64( 0xB5D469D8B5D469D8 );
sc->C[ 5] = m128_const1_64( 0x941A8CC2941A8CC2 );
sc->C[ 6] = m128_const1_64( 0x418B2A6E418B2A6E );
sc->C[ 7] = m128_const1_64( 0x0405278004052780 );
sc->C[ 8] = m128_const1_64( 0x7F07D7877F07D787 );
sc->C[ 9] = m128_const1_64( 0x5194358F5194358F );
sc->C[10] = m128_const1_64( 0x3C60D6653C60D665 );
sc->C[11] = m128_const1_64( 0xBE97D79ABE97D79A );
sc->C[12] = m128_const1_64( 0x950C3434950C3434 );
sc->C[13] = m128_const1_64( 0xAED9A06DAED9A06D );
sc->C[14] = m128_const1_64( 0x2537DC8D2537DC8D );
sc->C[15] = m128_const1_64( 0x7CDB59697CDB5969 );
*/
}
else
{
for ( i = 0; i < 12; i++ )
sc->A[i] = _mm_set1_epi32( A_init_256[i] );
for ( i = 0; i < 16; i++ )
{
sc->B[i] = _mm_set1_epi32( B_init_256[i] );
sc->C[i] = _mm_set1_epi32( C_init_256[i] );
}
}
{ // No users
sc->state_loaded = true;
sc->A[ 0] = m128_const1_64( 0x52F8455252F84552 );
sc->A[ 1] = m128_const1_64( 0xE54B7999E54B7999 );
sc->A[ 2] = m128_const1_64( 0x2D8EE3EC2D8EE3EC );
sc->A[ 3] = m128_const1_64( 0xB9645191B9645191 );
sc->A[ 4] = m128_const1_64( 0xE0078B86E0078B86 );
sc->A[ 5] = m128_const1_64( 0xBB7C44C9BB7C44C9 );
sc->A[ 6] = m128_const1_64( 0xD2B5C1CAD2B5C1CA );
sc->A[ 7] = m128_const1_64( 0xB0D2EB8CB0D2EB8C );
sc->A[ 8] = m128_const1_64( 0x14CE5A4514CE5A45 );
sc->A[ 9] = m128_const1_64( 0x22AF50DC22AF50DC );
sc->A[10] = m128_const1_64( 0xEFFDBC6BEFFDBC6B );
sc->A[11] = m128_const1_64( 0xEB21B74AEB21B74A );
sc->B[ 0] = m128_const1_64( 0xB555C6EEB555C6EE );
sc->B[ 1] = m128_const1_64( 0x3E7105963E710596 );
sc->B[ 2] = m128_const1_64( 0xA72A652FA72A652F );
sc->B[ 3] = m128_const1_64( 0x9301515F9301515F );
sc->B[ 4] = m128_const1_64( 0xDA28C1FADA28C1FA );
sc->B[ 5] = m128_const1_64( 0x696FD868696FD868 );
sc->B[ 6] = m128_const1_64( 0x9CB6BF729CB6BF72 );
sc->B[ 7] = m128_const1_64( 0x0AFE40020AFE4002 );
sc->B[ 8] = m128_const1_64( 0xA6E03615A6E03615 );
sc->B[ 9] = m128_const1_64( 0x5138C1D45138C1D4 );
sc->B[10] = m128_const1_64( 0xBE216306BE216306 );
sc->B[11] = m128_const1_64( 0xB38B8890B38B8890 );
sc->B[12] = m128_const1_64( 0x3EA8B96B3EA8B96B );
sc->B[13] = m128_const1_64( 0x3299ACE43299ACE4 );
sc->B[14] = m128_const1_64( 0x30924DD430924DD4 );
sc->B[15] = m128_const1_64( 0x55CB34A555CB34A5 );
sc->C[ 0] = m128_const1_64( 0xB405F031B405F031 );
sc->C[ 1] = m128_const1_64( 0xC4233EBAC4233EBA );
sc->C[ 2] = m128_const1_64( 0xB3733979B3733979 );
sc->C[ 3] = m128_const1_64( 0xC0DD9D55C0DD9D55 );
sc->C[ 4] = m128_const1_64( 0xC51C28AEC51C28AE );
sc->C[ 5] = m128_const1_64( 0xA327B8E1A327B8E1 );
sc->C[ 6] = m128_const1_64( 0x56C5616756C56167 );
sc->C[ 7] = m128_const1_64( 0xED614433ED614433 );
sc->C[ 8] = m128_const1_64( 0x88B59D6088B59D60 );
sc->C[ 9] = m128_const1_64( 0x60E2CEBA60E2CEBA );
sc->C[10] = m128_const1_64( 0x758B4B8B758B4B8B );
sc->C[11] = m128_const1_64( 0x83E82A7F83E82A7F );
sc->C[12] = m128_const1_64( 0xBC968828BC968828 );
sc->C[13] = m128_const1_64( 0xE6E00BF7E6E00BF7 );
sc->C[14] = m128_const1_64( 0xBA839E55BA839E55 );
sc->C[15] = m128_const1_64( 0x9B491C609B491C60 );
}
sc->Wlow = 1;
sc->Whigh = 0;
sc->ptr = 0;
@@ -488,6 +622,8 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len )
sc->ptr = ptr;
return;
}
READ_STATE(sc);
while ( len > 0 )

View File

@@ -54,7 +54,8 @@ typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i A[12], B[16], C[16];
sph_u32 Whigh, Wlow;
size_t ptr;
size_t ptr;
bool state_loaded;
} shabal_4way_context;
typedef shabal_4way_context shabal256_4way_context;

View File

@@ -5,7 +5,7 @@
#if defined(__SHA__)
#include <openssl/sha.h>
#else
#include "algo/sha/sha2-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#endif
#if defined (SKEIN_4WAY)

View File

@@ -415,18 +415,46 @@ do { \
sc->bcount = bcount; \
} while (0)
/*
static const sph_u64 IV256[] = {
SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
};
static void
skein_big_init_4way( skein512_4way_context *sc, const sph_u64 *iv )
static const sph_u64 IV512[] = {
SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
};
*/
void skein256_4way_init( skein256_4way_context *sc )
{
sc->h0 = _mm256_set_epi64x( iv[0], iv[0],iv[0],iv[0] );
sc->h1 = _mm256_set_epi64x( iv[1], iv[1],iv[1],iv[1] );
sc->h2 = _mm256_set_epi64x( iv[2], iv[2],iv[2],iv[2] );
sc->h3 = _mm256_set_epi64x( iv[3], iv[3],iv[3],iv[3] );
sc->h4 = _mm256_set_epi64x( iv[4], iv[4],iv[4],iv[4] );
sc->h5 = _mm256_set_epi64x( iv[5], iv[5],iv[5],iv[5] );
sc->h6 = _mm256_set_epi64x( iv[6], iv[6],iv[6],iv[6] );
sc->h7 = _mm256_set_epi64x( iv[7], iv[7],iv[7],iv[7] );
sc->h0 = m256_const1_64( 0xCCD044A12FDB3E13 );
sc->h1 = m256_const1_64( 0xE83590301A79A9EB );
sc->h2 = m256_const1_64( 0x55AEA0614F816E6F );
sc->h3 = m256_const1_64( 0x2A2767A4AE9B94DB );
sc->h4 = m256_const1_64( 0xEC06025E74DD7683 );
sc->h5 = m256_const1_64( 0xE7A436CDC4746251 );
sc->h6 = m256_const1_64( 0xC36FBAF9393AD185 );
sc->h7 = m256_const1_64( 0x3EEDBA1833EDFC13 );
sc->bcount = 0;
sc->ptr = 0;
}
void skein512_4way_init( skein512_4way_context *sc )
{
sc->h0 = m256_const1_64( 0x4903ADFF749C51CE );
sc->h1 = m256_const1_64( 0x0D95DE399746DF03 );
sc->h2 = m256_const1_64( 0x8FD1934127C79BCE );
sc->h3 = m256_const1_64( 0x9A255629FF352CB1 );
sc->h4 = m256_const1_64( 0x5DB62599DF6CA7B0 );
sc->h5 = m256_const1_64( 0xEABE394CA9D5C3F4 );
sc->h6 = m256_const1_64( 0x991112C71A75B523 );
sc->h7 = m256_const1_64( 0xAE18A40B660FCC33 );
sc->bcount = 0;
sc->ptr = 0;
}
@@ -524,6 +552,7 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
memcpy_256( dst, buf, out_len >> 3 );
}
/*
static const sph_u64 IV256[] = {
SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
@@ -537,13 +566,14 @@ static const sph_u64 IV512[] = {
SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
};
*/
/*
void
skein256_4way_init(void *cc)
{
skein_big_init_4way(cc, IV256);
}
*/
void
skein256_4way(void *cc, const void *data, size_t len)
@@ -557,11 +587,13 @@ skein256_4way_close(void *cc, void *dst)
skein_big_close_4way(cc, 0, 0, dst, 32);
}
/*
void
skein512_4way_init(void *cc)
{
skein_big_init_4way(cc, IV512);
}
*/
void
skein512_4way(void *cc, const void *data, size_t len)

View File

@@ -55,25 +55,26 @@ extern "C"{
#define SPH_SIZE_skein256 256
#define SPH_SIZE_skein512 512
typedef struct {
__m256i buf[8] __attribute__ ((aligned (32)));
__m256i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
typedef struct
{
__m256i buf[8] __attribute__ ((aligned (64)));
__m256i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
sph_u64 bcount;
} sph_skein_4way_big_context;
typedef sph_skein_4way_big_context skein512_4way_context;
typedef sph_skein_4way_big_context skein256_4way_context;
void skein512_4way_init(void *cc);
void skein512_4way(void *cc, const void *data, size_t len);
void skein512_4way_close(void *cc, void *dst);
void skein512_4way_init( skein512_4way_context *sc );
void skein512_4way( void *cc, const void *data, size_t len );
void skein512_4way_close( void *cc, void *dst );
//void sph_skein512_addbits_and_close(
// void *cc, unsigned ub, unsigned n, void *dst);
void skein256_4way_init(void *cc);
void skein256_4way(void *cc, const void *data, size_t len);
void skein256_4way_close(void *cc, void *dst);
void skein256_4way_init( skein256_4way_context *sc );
void skein256_4way( void *cc, const void *data, size_t len );
void skein256_4way_close( void *cc, void *dst );
//void sph_skein256_addbits_and_close(
// void *cc, unsigned ub, unsigned n, void *dst);

View File

@@ -200,6 +200,7 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
T = _mm_set1_epi32( 0x7A879D8AUL );
for( j =16; j < 64; j++ )
{
// AVX512 _mm_rol_epi32 doesn't like using a variable for the second arg.
SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
mm128_rol_32( T, j&31 ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );

View File

@@ -120,19 +120,13 @@ int scanhash_fresh( struct work *work,
return 0;
}
void fresh_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_fresh_algo( algo_gate_t* gate )
{
algo_not_tested();
gate->scanhash = (void*)&scanhash_fresh;
gate->hash = (void*)&freshhash;
gate->set_target = (void*)&fresh_set_target;
gate->get_max64 = (void*)&get_max64_0x3ffff;
opt_target_factor = 256.0;
return true;
};

View File

@@ -1,10 +1,5 @@
#include "timetravel-gate.h"
void tt8_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel_algo( algo_gate_t* gate )
{
#ifdef TIMETRAVEL_4WAY
@@ -16,9 +11,9 @@ bool register_timetravel_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_timetravel;
gate->hash = (void*)&timetravel_hash;
#endif
gate->set_target = (void*)&tt8_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
opt_target_factor = 256.0;
return true;
};

View File

@@ -1,10 +1,5 @@
#include "timetravel10-gate.h"
void tt10_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel10_algo( algo_gate_t* gate )
{
#ifdef TIMETRAVEL10_4WAY
@@ -16,9 +11,9 @@ bool register_timetravel10_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_timetravel10;
gate->hash = (void*)&timetravel10_hash;
#endif
gate->set_target = (void*)&tt10_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
opt_target_factor = 256.0;
return true;
};

View File

@@ -249,7 +249,6 @@ bool register_drop_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_drop;
gate->hash = (void*)&droplp_hash_pok;
gate->get_new_work = (void*)&drop_get_new_work;
gate->set_target = (void*)&scrypt_set_target;
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
@@ -257,6 +256,7 @@ bool register_drop_algo( algo_gate_t* gate )
gate->decode_extra_data = (void*)&drop_display_pok;
gate->get_work_data_size = (void*)&drop_get_work_data_size;
gate->work_cmp_size = 72;
opt_target_factor = 65536.0;
return true;
};

283
algo/x13/x13bcd-4way.c Normal file
View File

@@ -0,0 +1,283 @@
#include "x13sm3-gate.h"
#if defined(X13SM3_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
//#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/sm3/sm3-hash-4way.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
// luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd_2way_context simd;
hashState_echo echo;
sm3_4way_ctx_t sm3;
hamsi512_4way_context hamsi;
sph_fugue512_context fugue;
} x13bcd_4way_ctx_holder;
x13bcd_4way_ctx_holder x13bcd_4way_ctx __attribute__ ((aligned (64)));
static __thread blake512_4way_context x13bcd_ctx_mid;
void init_x13bcd_4way_ctx()
{
blake512_4way_init( &x13bcd_4way_ctx.blake );
bmw512_4way_init( &x13bcd_4way_ctx.bmw );
init_groestl( &x13bcd_4way_ctx.groestl, 64 );
skein512_4way_init( &x13bcd_4way_ctx.skein );
jh512_4way_init( &x13bcd_4way_ctx.jh );
keccak512_4way_init( &x13bcd_4way_ctx.keccak );
// luffa_2way_init( &x13bcd_4way_ctx.luffa, 512 );
cubehashInit( &x13bcd_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13bcd_4way_ctx.shavite );
simd_2way_init( &x13bcd_4way_ctx.simd, 512 );
init_echo( &x13bcd_4way_ctx.echo, 512 );
sm3_4way_init( &x13bcd_4way_ctx.sm3 );
hamsi512_4way_init( &x13bcd_4way_ctx.hamsi );
sph_fugue512_init( &x13bcd_4way_ctx.fugue );
};
void x13bcd_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x13bcd_4way_ctx_holder ctx;
memcpy( &ctx, &x13bcd_4way_ctx, sizeof(x13bcd_4way_ctx) );
// Blake
memcpy( &ctx.blake, &x13bcd_ctx_mid, sizeof(x13bcd_ctx_mid) );
blake512_4way( &ctx.blake, input + (64<<2), 16 );
// blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
// SM3 parallel 32 bit
uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
memset( sm3_vhash, 0, sizeof sm3_vhash );
uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
memset( sm3_hash0, 0, sizeof sm3_hash0 );
uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
memset( sm3_hash1, 0, sizeof sm3_hash1 );
uint32_t sm3_hash2[32] __attribute__ ((aligned (32)));
memset( sm3_hash2, 0, sizeof sm3_hash2 );
uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
memset( sm3_hash3, 0, sizeof sm3_hash3 );
sm3_4way( &ctx.sm3, vhash, 64 );
sm3_4way_close( &ctx.sm3, sm3_vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
/*
// Luffa
intrlv_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
intrlv_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
*/
// Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x13bcd_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x13bcd_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x13bcd_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x13bcd_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x13bcd_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x13bcd_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// Simd
intrlv_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
intrlv_2x128( vhash, hash2, hash3, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
// Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x13bcd_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x13bcd_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x13bcd_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
/*
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
// SM3 parallel 32 bit
uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
memset( sm3_vhash, 0, sizeof sm3_vhash );
uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
memset( sm3_hash0, 0, sizeof sm3_hash0 );
uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
memset( sm3_hash1, 0, sizeof sm3_hash1 );
uint32_t sm3_hash2[32] __attribute__ ((aligned (32)));
memset( sm3_hash2, 0, sizeof sm3_hash2 );
uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
memset( sm3_hash3, 0, sizeof sm3_hash3 );
sm3_4way( &ctx.sm3, vhash, 64 );
sm3_4way_close( &ctx.sm3, sm3_vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
*/
// Hamsi parallel 4x32x2
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
mm256_bswap32_intrlv80_4x64( vdata, pdata );
blake512_4way_init( &x13bcd_ctx_mid );
blake512_4way( &x13bcd_ctx_mid, vdata, 64 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x13bcd_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( ( ( (hash+(i<<3))[7] & mask ) == 0 ) )
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

258
algo/x13/x13bcd.c Normal file
View File

@@ -0,0 +1,258 @@
#include "x13sm3-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/groestl/sph_groestl.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/sm3/sph_sm3.h"
//#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c"
#include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h"
#ifndef NO_AES_NI
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#endif
typedef struct {
#ifdef NO_AES_NI
sph_groestl512_context groestl;
sph_echo512_context echo;
#else
hashState_echo echo;
hashState_groestl groestl;
#endif
// hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
sm3_ctx_t sm3;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
} x13bcd_ctx_holder;
x13bcd_ctx_holder x13bcd_ctx;
void init_x13bcd_ctx()
{
#ifdef NO_AES_NI
sph_groestl512_init(&x13bcd_ctx.groestl);
sph_echo512_init(&x13bcd_ctx.echo);
#else
init_echo(&x13bcd_ctx.echo, 512);
init_groestl(&x13bcd_ctx.groestl, 64 );
#endif
// init_luffa(&x13bcd_ctx.luffa,512);
cubehashInit(&x13bcd_ctx.cube,512,16,32);
sph_shavite512_init(&x13bcd_ctx.shavite);
init_sd(&x13bcd_ctx.simd,512);
sm3_init( &x13bcd_ctx.sm3 );
sph_hamsi512_init(&x13bcd_ctx.hamsi);
sph_fugue512_init(&x13bcd_ctx.fugue);
};
void x13bcd_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
x13bcd_ctx_holder ctx;
memcpy(&ctx, &x13bcd_ctx, sizeof(x13bcd_ctx));
unsigned char hashbuf[128];
size_t hashptr;
sph_u64 hashctA;
sph_u64 hashctB;
//---blake1---
DECL_BLK;
BLK_I;
BLK_W;
BLK_C;
//---bmw2---
DECL_BMW;
BMW_I;
BMW_U;
#define M(x) sph_dec64le_aligned(data + 8 * (x))
#define H(x) (h[x])
#define dH(x) (dh[x])
BMW_C;
#undef M
#undef H
#undef dH
//---groestl----
#ifdef NO_AES_NI
sph_groestl512 (&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#else
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#endif
//---skein4---
DECL_SKN;
SKN_I;
SKN_U;
SKN_C;
//---jh5------
DECL_JH;
JH_H;
//---keccak6---
DECL_KEC;
KEC_I;
KEC_U;
KEC_C;
uint32_t sm3_hash[32] __attribute__ ((aligned (32)));
memset(sm3_hash, 0, sizeof sm3_hash);
sph_sm3(&ctx.sm3, hash, 64);
sph_sm3_close(&ctx.sm3, sm3_hash);
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)sm3_hash, 64 );
/*
//--- luffa7
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
// 8 Cube
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)hash, 64 );
*/
// 9 Shavite
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
//11---echo---
#ifdef NO_AES_NI
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#else
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
/*
uint32_t sm3_hash[32] __attribute__ ((aligned (32)));
memset(sm3_hash, 0, sizeof sm3_hash);
sph_sm3(&ctx.sm3, hash, 64);
sph_sm3_close(&ctx.sm3, sm3_hash);
sph_hamsi512(&ctx.hamsi, sm3_hash, 64);
*/
sph_hamsi512(&ctx.hamsi, hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hash);
asm volatile ("emms");
memcpy(output, hash, 32);
}
int scanhash_x13bcd( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = {
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
// we need bigendian data...
swab32_array( endiandata, pdata, 20 );
#ifdef DEBUG_ALGO
if (Htarg != 0)
printf("[%d] Htarg=%X\n", thr_id, Htarg);
#endif
for (int m=0; m < 6; m++) {
if (Htarg <= htmax[m]) {
uint32_t mask = masks[m];
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
x13bcd_hash(hash64, endiandata);
#ifndef DEBUG_ALGO
if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
#else
if (!(n % 0x1000) && !thr_id) printf(".");
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}
}
#endif
} while (n < max_nonce && !work_restart[thr_id].restart);
// see blake.c if else to understand the loop on htmax => mask
break;
}
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}

View File

@@ -16,3 +16,19 @@ bool register_x13sm3_algo( algo_gate_t* gate )
return true;
};
bool register_x13bcd_algo( algo_gate_t* gate )
{
#if defined (X13SM3_4WAY)
init_x13bcd_4way_ctx();
gate->scanhash = (void*)&scanhash_x13bcd_4way;
gate->hash = (void*)&x13bcd_4way_hash;
#else
init_x13bcd_ctx();
gate->scanhash = (void*)&scanhash_x13bcd;
gate->hash = (void*)&x13bcd_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -10,23 +10,31 @@
bool register_x13sm3_algo( algo_gate_t* gate );
bool register_x13bcd_algo( algo_gate_t* gate );
#if defined(X13SM3_4WAY)
void x13sm3_4way_hash( void *state, const void *input );
int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x13sm3_4way_ctx();
void x13bcd_4way_hash( void *state, const void *input );
int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x13bcd_4way_ctx();
#endif
void x13sm3_hash( void *state, const void *input );
int scanhash_x13sm3( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x13sm3_ctx();
void x13bcd_hash( void *state, const void *input );
int scanhash_x13bcd( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x13bcd_ctx();
#endif

247
algo/x16/hex.c Normal file
View File

@@ -0,0 +1,247 @@
/**
* x16r algo implementation
*
* Implementation by tpruvot@github Jan 2018
* Optimized by JayDDee@github Jan 2018
*/
#include "x16r-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include <openssl/sha.h>
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#endif
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
static void hex_getAlgoString(const uint32_t* prevblock, char *output)
{
char *sptr = output;
uint8_t* data = (uint8_t*)prevblock;
for (uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++) {
uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed
uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4;
if (algoDigit >= 10)
sprintf(sptr, "%c", 'A' + (algoDigit - 10));
else
sprintf(sptr, "%u", (uint32_t) algoDigit);
sptr++;
}
*sptr = '\0';
}
union _hex_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
sph_blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
};
typedef union _hex_context_overlay hex_context_overlay;
void hex_hash( void* output, const void* input )
{
uint32_t _ALIGN(128) hash[16];
hex_context_overlay ctx;
void *in = (void*) input;
int size = 80;
/*
if ( s_ntime == UINT32_MAX )
{
const uint8_t* in8 = (uint8_t*) input;
x16_r_s_getAlgoString( &in8[4], hashOrder );
}
*/
char elem = hashOrder[0];
uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
for ( int i = 0; i < 16; i++ )
{
switch ( algo )
{
case BLAKE:
sph_blake512_init( &ctx.blake );
sph_blake512( &ctx.blake, in, size );
sph_blake512_close( &ctx.blake, hash );
break;
case BMW:
sph_bmw512_init( &ctx.bmw );
sph_bmw512(&ctx.bmw, in, size);
sph_bmw512_close(&ctx.bmw, hash);
break;
case GROESTL:
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
sph_skein512_init( &ctx.skein );
sph_skein512( &ctx.skein, in, size );
sph_skein512_close( &ctx.skein, hash );
break;
case JH:
sph_jh512_init( &ctx.jh );
sph_jh512(&ctx.jh, in, size );
sph_jh512_close(&ctx.jh, hash );
break;
case KECCAK:
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, in, size );
sph_keccak512_close( &ctx.keccak, hash );
break;
case LUFFA:
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)in, size );
break;
case CUBEHASH:
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)in, size );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in, size );
sph_shavite512_close( &ctx.shavite, hash );
break;
case SIMD:
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
break;
case ECHO:
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, in, size );
sph_hamsi512_close( &ctx.hamsi, hash );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in, size );
sph_fugue512_close( &ctx.fugue, hash );
break;
case SHABAL:
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, in, size );
sph_shabal512_close( &ctx.shabal, hash );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in, size );
sph_whirlpool_close( &ctx.whirlpool, hash );
break;
case SHA_512:
SHA512_Init( &ctx.sha512 );
SHA512_Update( &ctx.sha512, in, size );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
break;
}
algo = (uint8_t)hash[0] % X16R_HASH_FUNC_COUNT;
in = (void*) hash;
size = 64;
}
memcpy(output, hash, 32);
}
int scanhash_hex( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint32_t ntime = swab32(pdata[17]);
if ( s_ntime != ntime )
{
hex_getAlgoString( (const uint32_t*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
do
{
be32enc( &endiandata[19], nonce );
hex_hash( hash32, endiandata );
if ( hash32[7] <= Htarg )
if (fulltest( hash32, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash32, mythr );
}
nonce++;
} while ( nonce < max_nonce && !(*restart) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

View File

@@ -27,7 +27,7 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sha2-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
@@ -68,21 +68,7 @@ void x16r_4way_hash( void* output, const void* input )
int size = 80;
dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
if ( s_ntime == UINT32_MAX )
{
const uint8_t* tmp = (uint8_t*) in0;
x16_r_s_getAlgoString( &tmp[4], hashOrder );
}
// Input data is both 64 bit interleaved (input)
// and deinterleaved in inp0-3.
// If First function uses 64 bit data it is not required to interleave inp
// first. It may use the inerleaved data dmost convenient, ie 4way 64 bit.
// All other functions assume data is deinterleaved in hash0-3
// All functions must exit with data deinterleaved in hash0-3.
// Alias in0-3 points to either inp0-3 or hash0-3 according to
// its hashOrder position. Size is also set accordingly.
for ( int i = 0; i < 16; i++ )
{
const char elem = hashOrder[i];

View File

@@ -42,8 +42,23 @@ bool register_x16r_algo( algo_gate_t* gate )
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->set_target = (void*)&alt_set_target;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
};
bool register_x16rv2_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16rv2_4way;
gate->hash = (void*)&x16rv2_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16rv2;
gate->hash = (void*)&x16rv2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
};
@@ -57,8 +72,189 @@ bool register_x16s_algo( algo_gate_t* gate )
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->set_target = (void*)&alt_set_target;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;
};
////////////////
//
// X16RT
void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash )
{
int32_t maskedTime = timeStamp & 0xffffff80;
sha256d( (unsigned char*)timeHash, (const unsigned char*)( &maskedTime ),
sizeof( maskedTime ) );
}
void x16rt_getAlgoString( const uint32_t *timeHash, char *output)
{
char *sptr = output;
uint8_t* data = (uint8_t*)timeHash;
for (uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++) {
uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed
uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4;
if (algoDigit >= 10)
sprintf(sptr, "%c", 'A' + (algoDigit - 10));
else
sprintf(sptr, "%u", (uint32_t) algoDigit);
sptr++;
}
*sptr = '\0';
}
void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
uchar merkle_tree[64] = { 0 };
size_t t;
algo_gate.gen_merkle_root( merkle_tree, sctx );
// Increment extranonce2
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
// Assemble block header
// algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
// (uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree,
// le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits) );
int i;
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
if ( have_stratum )
for ( i = 0; i < 8; i++ )
g_work->data[ 1+i ] = le32dec( (uint32_t*)sctx->job.prevhash + i );
else
for (i = 0; i < 8; i++)
g_work->data[ 8-i ] = le32dec( (uint32_t*)sctx->job.prevhash + i );
g_work->data[ algo_gate.ntime_index ] = le32dec( sctx->job.ntime );
g_work->data[ algo_gate.nbits_index ] = le32dec( sctx->job.nbits );
g_work->data[20] = 0x80000000;
g_work->data[31] = 0x00000280;
for ( i = 0; i < 8; i++ )
g_work->merkleroothash[7 - i] = be32dec((uint32_t *)merkle_tree + i);
for ( i = 0; i < 8; i++ )
g_work->witmerkleroothash[7 - i] = be32dec((uint32_t *)merkle_tree + i);
for ( i = 0; i < 8; i++ )
g_work->denom10[i] = le32dec((uint32_t *)sctx->job.denom10 + i);
for ( i = 0; i < 8; i++ )
g_work->denom100[i] = le32dec((uint32_t *)sctx->job.denom100 + i);
for ( i = 0; i < 8; i++ )
g_work->denom1000[i] = le32dec((uint32_t *)sctx->job.denom1000 + i);
for ( i = 0; i < 8; i++ )
g_work->denom10000[i] = le32dec((uint32_t *)sctx->job.denom10000 + i);
uint32_t pofnhash[8];
memset(pofnhash, 0x00, 32);
char denom10_str [ 2 * sizeof( g_work->denom10 ) + 1 ];
char denom100_str [ 2 * sizeof( g_work->denom100 ) + 1 ];
char denom1000_str [ 2 * sizeof( g_work->denom1000 ) + 1 ];
char denom10000_str [ 2 * sizeof( g_work->denom10000 ) + 1 ];
char merkleroot_str [ 2 * sizeof( g_work->merkleroothash ) + 1 ];
char witmerkleroot_str[ 2 * sizeof( g_work->witmerkleroothash ) + 1 ];
char pofn_str [ 2 * sizeof( pofnhash ) + 1 ];
cbin2hex( denom10_str, (char*) g_work->denom10, 32 );
cbin2hex( denom100_str, (char*) g_work->denom100, 32 );
cbin2hex( denom1000_str, (char*) g_work->denom1000, 32 );
cbin2hex( denom10000_str, (char*) g_work->denom10000, 32 );
cbin2hex( merkleroot_str, (char*) g_work->merkleroothash, 32 );
cbin2hex( witmerkleroot_str, (char*) g_work->witmerkleroothash, 32 );
cbin2hex( pofn_str, (char*) pofnhash, 32 );
if ( true )
{
char* data;
data = (char*)malloc( 2 + strlen( denom10_str ) * 4 + 16 * 4
+ strlen( merkleroot_str ) * 3 );
// Build the block header veildatahash in hex
sprintf( data, "%s%s%s%s%s%s%s%s%s%s%s%s",
merkleroot_str, witmerkleroot_str, "04",
"0a00000000000000", denom10_str,
"6400000000000000", denom100_str,
"e803000000000000", denom1000_str,
"1027000000000000", denom10000_str, pofn_str );
// Covert the hex to binary
uint32_t test[100];
hex2bin( (unsigned char*)(&test), data, 257);
// Compute the sha256d of the binary
uint32_t _ALIGN(64) hash[8];
sha256d( (unsigned char*)hash, (unsigned char*)&(test), 257);
// assign the veildatahash in the blockheader
for ( i = 0; i < 8; i++ )
g_work->data[16 - i] = le32dec(hash + i);
free(data);
}
}
bool register_x16rt_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16rt_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16rt_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
opt_target_factor = 256.0;
return true;
};
bool register_x16rt_veil_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16rt_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16rt_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->build_extraheader = (void*)&veil_build_extraheader;
opt_target_factor = 256.0;
return true;
};
////////////////////
//
// HEX
bool register_hex_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_hex;
gate->hash = (void*)&hex_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
opt_target_factor = 128.0;
return true;
};
///////////////////////////////
//
// X21S
bool register_x21s_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x21s_4way;
gate->hash = (void*)&x21s_4way_hash;
gate->miner_thread_init = (void*)&x21s_4way_thread_init;
#else
gate->scanhash = (void*)&scanhash_x21s;
gate->hash = (void*)&x21s_hash;
gate->miner_thread_init = (void*)&x21s_thread_init;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;
};

View File

@@ -4,6 +4,7 @@
#include "algo-gate-api.h"
#include "simd-utils.h"
#include <stdint.h>
#include <unistd.h>
#if defined(__AVX2__) && defined(__AES__)
#define X16R_4WAY
@@ -30,11 +31,18 @@ enum x16r_Algo {
};
void (*x16_r_s_getAlgoString) ( const uint8_t*, char* );
void x16r_getAlgoString( const uint8_t* prevblock, char *output );
void x16s_getAlgoString( const uint8_t* prevblock, char *output );
void x16r_getAlgoString( const uint8_t *prevblock, char *output );
void x16s_getAlgoString( const uint8_t *prevblock, char *output );
void x16rt_getAlgoString( const uint32_t *timeHash, char *output );
void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash );
bool register_x16r_algo( algo_gate_t* gate );
bool register_x16rv2_algo( algo_gate_t* gate );
bool register_x16s_algo( algo_gate_t* gate );
bool register_x16rt_algo( algo_gate_t* gate );
bool register_hex__algo( algo_gate_t* gate );
bool register_x21s__algo( algo_gate_t* gate );
#if defined(X16R_4WAY)
@@ -42,11 +50,41 @@ void x16r_4way_hash( void *state, const void *input );
int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x16rv2_4way_hash( void *state, const void *input );
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x21s_4way_hash( void *state, const void *input );
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_4way_thread_init();
#endif
void x16r_hash( void *state, const void *input );
int scanhash_x16r( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x16rv2_hash( void *state, const void *input );
int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x16rt_hash( void *state, const void *input );
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void hex_hash( void *state, const void *input );
int scanhash_hex( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x21s_hash( void *state, const void *input );
int scanhash_x21s( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_thread_init();
#endif

View File

@@ -65,13 +65,13 @@ void x16r_hash( void* output, const void* input )
x16r_context_overlay ctx;
void *in = (void*) input;
int size = 80;
/*
if ( s_ntime == UINT32_MAX )
{
const uint8_t* in8 = (uint8_t*) input;
x16_r_s_getAlgoString( &in8[4], hashOrder );
}
*/
for ( int i = 0; i < 16; i++ )
{
const char elem = hashOrder[i];

353
algo/x16/x16rt-4way.c Normal file
View File

@@ -0,0 +1,353 @@
#include "x16r-gate.h"
#if defined (X16R_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sha-hash-4way.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread bool s_implemented = false;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
union _x16rt_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_echo echo;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd_2way_context simd;
hamsi512_4way_context hamsi;
sph_fugue512_context fugue;
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
};
typedef union _x16rt_4way_context_overlay x16rt_4way_context_overlay;
void x16rt_4way_hash( void* output, const void* input )
{
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t vhash[24*4] __attribute__ ((aligned (64)));
x16rt_4way_context_overlay ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
int size = 80;
dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
/*
void *in = (void*) input;
uint32_t *in32 = (uint32_t*) hash0;
uint32_t ntime = in32[17];
if ( s_ntime == UINT32_MAX )
{
uint32_t _ALIGN(64) timeHash[8];
x16rt_getTimeHash(ntime, &timeHash);
x16rt_getAlgoString(&timeHash[0], hashOrder);
}
*/
// Input data is both 64 bit interleaved (input)
// and deinterleaved in inp0-3.
// If First function uses 64 bit data it is not required to interleave inp
// first. It may use the inerleaved data dmost convenient, ie 4way 64 bit.
// All other functions assume data is deinterleaved in hash0-3
// All functions must exit with data deinterleaved in hash0-3.
// Alias in0-3 points to either inp0-3 or hash0-3 according to
// its hashOrder position. Size is also set accordingly.
for ( int i = 0; i < 16; i++ )
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
blake512_4way_init( &ctx.blake );
if ( i == 0 )
blake512_4way( &ctx.blake, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
blake512_4way( &ctx.blake, vhash, size );
}
blake512_4way_close( &ctx.blake, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case BMW:
bmw512_4way_init( &ctx.bmw );
if ( i == 0 )
bmw512_4way( &ctx.bmw, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
bmw512_4way( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(const char*)in0, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(const char*)in1, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(const char*)in2, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(const char*)in3, size<<3 );
break;
case SKEIN:
skein512_4way_init( &ctx.skein );
if ( i == 0 )
skein512_4way( &ctx.skein, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way( &ctx.skein, vhash, size );
}
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case JH:
jh512_4way_init( &ctx.jh );
if ( i == 0 )
jh512_4way( &ctx.jh, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
jh512_4way( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case KECCAK:
keccak512_4way_init( &ctx.keccak );
if ( i == 0 )
keccak512_4way( &ctx.keccak, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
keccak512_4way( &ctx.keccak, vhash, size );
}
keccak512_4way_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case LUFFA:
intrlv_2x128( vhash, in0, in1, size<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
dintrlv_2x128( hash0, hash1, vhash, 512 );
intrlv_2x128( vhash, in2, in3, size<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
dintrlv_2x128( hash2, hash3, vhash, 512 );
break;
case CUBEHASH:
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
(const byte*)in0, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
(const byte*)in1, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
(const byte*)in2, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
(const byte*)in3, size );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in0, size );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in1, size );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in2, size );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in3, size );
sph_shavite512_close( &ctx.shavite, hash3 );
break;
case SIMD:
intrlv_2x128( vhash, in0, in1, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
intrlv_2x128( vhash, in2, in3, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence*)in0, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash1,
(const BitSequence*)in1, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash2,
(const BitSequence*)in2, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash3,
(const BitSequence*)in3, size<<3 );
break;
case HAMSI:
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, size );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in0, size );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in1, size );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in2, size );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in3, size );
sph_fugue512_close( &ctx.fugue, hash3 );
break;
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in0, size );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in1, size );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in2, size );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in3, size );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
break;
case SHA_512:
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, size );
sha512_4way_close( &ctx.sha512, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
}
size = 64;
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t _ALIGN(64) timeHash[4*8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
__m256i *noncev = (__m256i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint32_t ntime = swab32( pdata[17] );
if ( s_ntime != ntime )
{
x16rt_getTimeHash( ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], hashOrder );
s_ntime = ntime;
s_implemented = true;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
hashOrder, ntime, timeHash );
}
if ( !s_implemented )
{
applog( LOG_WARNING, "s not implemented");
sleep(1);
return 0;
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
uint64_t *edata = (uint64_t*)endiandata;
intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x16rt_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg )
if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( n < max_nonce ) && !(*restart) );
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

239
algo/x16/x16rt.c Normal file
View File

@@ -0,0 +1,239 @@
#include "x16r-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include <openssl/sha.h>
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#endif
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread bool s_implemented = false;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
union _x16rt_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
sph_blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
};
typedef union _x16rt_context_overlay x16rt_context_overlay;
void x16rt_hash( void* output, const void* input )
{
uint32_t _ALIGN(128) hash[16];
x16rt_context_overlay ctx;
int size = 80;
void *in = (void*) input;
/*
void *in = (void*) input;
uint32_t *in32 = (uint32_t*) in;
uint32_t ntime = in32[17];
if ( s_ntime == UINT32_MAX )
{
uint32_t _ALIGN(64) timeHash[8];
x16rt_getTimeHash(ntime, &timeHash);
x16rt_getAlgoString(&timeHash[0], hashOrder);
}
*/
for ( int i = 0; i < 16; i++ )
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
sph_blake512_init( &ctx.blake );
sph_blake512( &ctx.blake, in, size );
sph_blake512_close( &ctx.blake, hash );
break;
case BMW:
sph_bmw512_init( &ctx.bmw );
sph_bmw512(&ctx.bmw, in, size);
sph_bmw512_close(&ctx.bmw, hash);
break;
case GROESTL:
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
sph_skein512_init( &ctx.skein );
sph_skein512( &ctx.skein, in, size );
sph_skein512_close( &ctx.skein, hash );
break;
case JH:
sph_jh512_init( &ctx.jh );
sph_jh512(&ctx.jh, in, size );
sph_jh512_close(&ctx.jh, hash );
break;
case KECCAK:
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, in, size );
sph_keccak512_close( &ctx.keccak, hash );
break;
case LUFFA:
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)in, size );
break;
case CUBEHASH:
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)in, size );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in, size );
sph_shavite512_close( &ctx.shavite, hash );
break;
case SIMD:
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
break;
case ECHO:
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, in, size );
sph_hamsi512_close( &ctx.hamsi, hash );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in, size );
sph_fugue512_close( &ctx.fugue, hash );
break;
case SHABAL:
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, in, size );
sph_shabal512_close( &ctx.shabal, hash );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in, size );
sph_whirlpool_close( &ctx.whirlpool, hash );
break;
case SHA_512:
SHA512_Init( &ctx.sha512 );
SHA512_Update( &ctx.sha512, in, size );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
break;
}
in = (void*) hash;
size = 64;
}
memcpy(output, hash, 32);
}
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t _ALIGN(64) timeHash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint32_t ntime = swab32( pdata[17] );
if ( s_ntime != ntime )
{
x16rt_getTimeHash( ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], hashOrder );
s_ntime = ntime;
s_implemented = true;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
hashOrder, ntime, timeHash );
}
if ( !s_implemented )
{
applog( LOG_WARNING, "s not implemented");
sleep(1);
return 0;
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
do
{
be32enc( &endiandata[19], nonce );
x16rt_hash( hash32, endiandata );
if ( hash32[7] <= Htarg )
if (fulltest( hash32, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash32, mythr );
}
nonce++;
} while ( nonce < max_nonce && !(*restart) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

384
algo/x16/x16rv2-4way.c Normal file
View File

@@ -0,0 +1,384 @@
/**
* x16r algo implementation
*
* Implementation by tpruvot@github Jan 2018
* Optimized by JayDDee@github Jan 2018
*/
#include "x16r-gate.h"
#if defined (X16R_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/tiger/sph_tiger.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
union _x16rv2_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_echo echo;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd_2way_context simd;
hamsi512_4way_context hamsi;
sph_fugue512_context fugue;
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
sph_tiger_context tiger;
};
typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
// Pad the 24 bytes tiger hash to 64 bytes
inline void padtiger512( uint32_t* hash )
{
for ( int i = 6; i < 16; i++ ) hash[i] = 0;
}
void x16rv2_4way_hash( void* output, const void* input )
{
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t vhash[24*4] __attribute__ ((aligned (64)));
x16rv2_4way_context_overlay ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
int size = 80;
dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
for ( int i = 0; i < 16; i++ )
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
blake512_4way_init( &ctx.blake );
if ( i == 0 )
blake512_4way( &ctx.blake, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
blake512_4way( &ctx.blake, vhash, size );
}
blake512_4way_close( &ctx.blake, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case BMW:
bmw512_4way_init( &ctx.bmw );
if ( i == 0 )
bmw512_4way( &ctx.bmw, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
bmw512_4way( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(const char*)in0, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(const char*)in1, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(const char*)in2, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(const char*)in3, size<<3 );
break;
case SKEIN:
skein512_4way_init( &ctx.skein );
if ( i == 0 )
skein512_4way( &ctx.skein, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way( &ctx.skein, vhash, size );
}
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case JH:
jh512_4way_init( &ctx.jh );
if ( i == 0 )
jh512_4way( &ctx.jh, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
jh512_4way( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case KECCAK:
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in2, size );
sph_tiger_close( &ctx.tiger, hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in3, size );
sph_tiger_close( &ctx.tiger, hash3 );
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0;
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case LUFFA:
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = 0;
intrlv_2x128( vhash, hash0, hash1, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in2, size );
sph_tiger_close( &ctx.tiger, hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in3, size );
sph_tiger_close( &ctx.tiger, hash3 );
for ( int i = (24/4); i < (64/4); i++ )
hash2[i] = hash3[i] = 0;
intrlv_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
break;
case CUBEHASH:
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
(const byte*)in0, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
(const byte*)in1, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
(const byte*)in2, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
(const byte*)in3, size );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in0, size );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in1, size );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in2, size );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in3, size );
sph_shavite512_close( &ctx.shavite, hash3 );
break;
case SIMD:
intrlv_2x128( vhash, in0, in1, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
intrlv_2x128( vhash, in2, in3, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence*)in0, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash1,
(const BitSequence*)in1, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash2,
(const BitSequence*)in2, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash3,
(const BitSequence*)in3, size<<3 );
break;
case HAMSI:
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, size );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in0, size );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in1, size );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in2, size );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in3, size );
sph_fugue512_close( &ctx.fugue, hash3 );
break;
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in0, size );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in1, size );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in2, size );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in3, size );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
break;
case SHA_512:
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in2, size );
sph_tiger_close( &ctx.tiger, hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in3, size );
sph_tiger_close( &ctx.tiger, hash3 );
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0;
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
}
size = 64;
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
__m256i *noncev = (__m256i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
if ( s_ntime != endiandata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
uint64_t *edata = (uint64_t*)endiandata;
intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x16rv2_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg )
if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( n < max_nonce ) && !(*restart) );
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

247
algo/x16/x16rv2.c Normal file
View File

@@ -0,0 +1,247 @@
/**
* x16r algo implementation
*
* Implementation by tpruvot@github Jan 2018
* Optimized by JayDDee@github Jan 2018
*/
#include "x16r-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include <openssl/sha.h>
#include "algo/tiger/sph_tiger.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#endif
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
union _x16rv2_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
sph_blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
sph_tiger_context tiger;
};
typedef union _x16rv2_context_overlay x16rv2_context_overlay;
// Pad the 24 bytes tiger hash to 64 bytes
inline void padtiger512(uint32_t* hash) {
for (int i = (24/4); i < (64/4); i++) hash[i] = 0;
}
void x16rv2_hash( void* output, const void* input )
{
uint32_t _ALIGN(128) hash[16];
x16rv2_context_overlay ctx;
void *in = (void*) input;
int size = 80;
/*
if ( s_ntime == UINT32_MAX )
{
const uint8_t* in8 = (uint8_t*) input;
x16_r_s_getAlgoString( &in8[4], hashOrder );
}
*/
for ( int i = 0; i < 16; i++ )
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
sph_blake512_init( &ctx.blake );
sph_blake512( &ctx.blake, in, size );
sph_blake512_close( &ctx.blake, hash );
break;
case BMW:
sph_bmw512_init( &ctx.bmw );
sph_bmw512(&ctx.bmw, in, size);
sph_bmw512_close(&ctx.bmw, hash);
break;
case GROESTL:
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
sph_skein512_init( &ctx.skein );
sph_skein512( &ctx.skein, in, size );
sph_skein512_close( &ctx.skein, hash );
break;
case JH:
sph_jh512_init( &ctx.jh );
sph_jh512(&ctx.jh, in, size );
sph_jh512_close(&ctx.jh, hash );
break;
case KECCAK:
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in, size );
sph_tiger_close( &ctx.tiger, hash );
padtiger512( hash );
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hash, 64 );
sph_keccak512_close( &ctx.keccak, hash );
break;
case LUFFA:
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in, size );
sph_tiger_close( &ctx.tiger, hash );
padtiger512( hash );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
break;
case CUBEHASH:
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)in, size );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in, size );
sph_shavite512_close( &ctx.shavite, hash );
break;
case SIMD:
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
break;
case ECHO:
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, in, size );
sph_hamsi512_close( &ctx.hamsi, hash );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in, size );
sph_fugue512_close( &ctx.fugue, hash );
break;
case SHABAL:
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, in, size );
sph_shabal512_close( &ctx.shabal, hash );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in, size );
sph_whirlpool_close( &ctx.whirlpool, hash );
break;
case SHA_512:
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in, size );
sph_tiger_close( &ctx.tiger, hash );
padtiger512( hash );
SHA512_Init( &ctx.sha512 );
SHA512_Update( &ctx.sha512, hash, 64 );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
break;
}
in = (void*) hash;
size = 64;
}
memcpy(output, hash, 32);
}
int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
do
{
be32enc( &endiandata[19], nonce );
x16rv2_hash( hash32, endiandata );
if ( hash32[7] <= Htarg )
if (fulltest( hash32, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash32, mythr );
}
nonce++;
} while ( nonce < max_nonce && !(*restart) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

431
algo/x16/x21s-4way.c Normal file
View File

@@ -0,0 +1,431 @@
/**
* x16r algo implementation
*
* Implementation by tpruvot@github Jan 2018
* Optimized by JayDDee@github Jan 2018
*/
#include "x16r-gate.h"
#if defined (X16R_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/tiger/sph_tiger.h"
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
#if defined(__SHA__)
#include <openssl/sha.h>
#endif
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
static __thread uint64_t* x21s_4way_matrix;
union _x21s_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_echo echo;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd_2way_context simd;
hamsi512_4way_context hamsi;
sph_fugue512_context fugue;
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
#if defined(__SHA__)
SHA256_CTX sha256;
#else
sha256_4way_context sha256;
#endif
};
typedef union _x21s_4way_context_overlay x21s_4way_context_overlay;
void x21s_4way_hash( void* output, const void* input )
{
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t vhash[24*4] __attribute__ ((aligned (64)));
x21s_4way_context_overlay ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
int size = 80;
dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
// Input data is both 64 bit interleaved (input)
// and deinterleaved in inp0-3.
// If First function uses 64 bit data it is not required to interleave inp
// first. It may use the inerleaved data dmost convenient, ie 4way 64 bit.
// All other functions assume data is deinterleaved in hash0-3
// All functions must exit with data deinterleaved in hash0-3.
// Alias in0-3 points to either inp0-3 or hash0-3 according to
// its hashOrder position. Size is also set accordingly.
for ( int i = 0; i < 16; i++ )
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
blake512_4way_init( &ctx.blake );
if ( i == 0 )
blake512_4way( &ctx.blake, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
blake512_4way( &ctx.blake, vhash, size );
}
blake512_4way_close( &ctx.blake, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case BMW:
bmw512_4way_init( &ctx.bmw );
if ( i == 0 )
bmw512_4way( &ctx.bmw, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
bmw512_4way( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(const char*)in0, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(const char*)in1, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(const char*)in2, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(const char*)in3, size<<3 );
break;
case SKEIN:
skein512_4way_init( &ctx.skein );
if ( i == 0 )
skein512_4way( &ctx.skein, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way( &ctx.skein, vhash, size );
}
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case JH:
jh512_4way_init( &ctx.jh );
if ( i == 0 )
jh512_4way( &ctx.jh, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
jh512_4way( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case KECCAK:
keccak512_4way_init( &ctx.keccak );
if ( i == 0 )
keccak512_4way( &ctx.keccak, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
keccak512_4way( &ctx.keccak, vhash, size );
}
keccak512_4way_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case LUFFA:
intrlv_2x128( vhash, in0, in1, size<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
dintrlv_2x128( hash0, hash1, vhash, 512 );
intrlv_2x128( vhash, in2, in3, size<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
dintrlv_2x128( hash2, hash3, vhash, 512 );
break;
case CUBEHASH:
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
(const byte*)in0, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
(const byte*)in1, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
(const byte*)in2, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
(const byte*)in3, size );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in0, size );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in1, size );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in2, size );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in3, size );
sph_shavite512_close( &ctx.shavite, hash3 );
break;
case SIMD:
intrlv_2x128( vhash, in0, in1, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
intrlv_2x128( vhash, in2, in3, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence*)in0, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash1,
(const BitSequence*)in1, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash2,
(const BitSequence*)in2, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash3,
(const BitSequence*)in3, size<<3 );
break;
case HAMSI:
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, size );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in0, size );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in1, size );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in2, size );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in3, size );
sph_fugue512_close( &ctx.fugue, hash3 );
break;
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in0, size );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in1, size );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in2, size );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in3, size );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
break;
case SHA_512:
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, size );
sha512_4way_close( &ctx.sha512, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
}
size = 64;
}
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhash, 64 );
haval256_5_4way_close( &ctx.haval, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash0, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash1, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash2, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash3, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash3 );
LYRA2REV2( x21s_4way_matrix, (void*) hash0, 32, (const void*) hash0, 32,
(const void*) hash0, 32, 1, 4, 4 );
LYRA2REV2( x21s_4way_matrix, (void*) hash1, 32, (const void*) hash1, 32,
(const void*) hash1, 32, 1, 4, 4 );
LYRA2REV2( x21s_4way_matrix, (void*) hash2, 32, (const void*) hash2, 32,
(const void*) hash2, 32, 1, 4, 4 );
LYRA2REV2( x21s_4way_matrix, (void*) hash3, 32, (const void*) hash3, 32,
(const void*) hash3, 32, 1, 4, 4 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
sph_gost512_close( &ctx.gost, (void*) hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
sph_gost512_close( &ctx.gost, (void*) hash1 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash2, 64 );
sph_gost512_close( &ctx.gost, (void*) hash2 );
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash3, 64 );
sph_gost512_close( &ctx.gost, (void*) hash3 );
#if defined(__SHA__)
SHA256_Init( &ctx.sha256 );
SHA256_Update( &ctx.sha256, hash0, 64 );
SHA256_Final( (unsigned char*)hash0, &ctx.sha256 );
SHA256_Init( &ctx.sha256 );
SHA256_Update( &ctx.sha256, hash1, 64 );
SHA256_Final( (unsigned char*)hash1, &ctx.sha256 );
SHA256_Init( &ctx.sha256 );
SHA256_Update( &ctx.sha256, hash2, 64 );
SHA256_Final( (unsigned char*)hash2, &ctx.sha256 );
SHA256_Init( &ctx.sha256 );
SHA256_Update( &ctx.sha256, hash3, 64 );
SHA256_Final( (unsigned char*)hash3, &ctx.sha256 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
#else
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way_init( &ctx.sha256 );
sha256_4way( &ctx.sha256, vhash, 64 );
sha256_4way_close( &ctx.sha256, vhash );
dintrlv_4x32( output, output+32, output+64,output+96, vhash, 256 );
#endif
}
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
if ( s_ntime != endiandata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
uint64_t *edata = (uint64_t*)endiandata;
intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x21s_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg )
if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( n < max_nonce ) && !(*restart) );
*hashes_done = n - first_nonce + 1;
return 0;
}
bool x21s_4way_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
const int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
x21s_4way_matrix = _mm_malloc( size, 64 );
return x21s_4way_matrix;
}
#endif

263
algo/x16/x21s.c Normal file
View File

@@ -0,0 +1,263 @@
/**
* x16r algo implementation
*
* Implementation by tpruvot@github Jan 2018
* Optimized by JayDDee@github Jan 2018
*/
#include "x16r-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include <openssl/sha.h>
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#endif
#include "algo/haval/sph-haval.h"
#include "algo/tiger/sph_tiger.h"
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
static __thread uint64_t* x21s_matrix;
union _x21s_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
sph_blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
sph_haval256_5_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
SHA256_CTX sha256;
};
typedef union _x21s_context_overlay x21s_context_overlay;
void x21s_hash( void* output, const void* input )
{
uint32_t _ALIGN(128) hash[16];
x21s_context_overlay ctx;
void *in = (void*) input;
int size = 80;
for ( int i = 0; i < 16; i++ )
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
sph_blake512_init( &ctx.blake );
sph_blake512( &ctx.blake, in, size );
sph_blake512_close( &ctx.blake, hash );
break;
case BMW:
sph_bmw512_init( &ctx.bmw );
sph_bmw512(&ctx.bmw, in, size);
sph_bmw512_close(&ctx.bmw, hash);
break;
case GROESTL:
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
sph_skein512_init( &ctx.skein );
sph_skein512( &ctx.skein, in, size );
sph_skein512_close( &ctx.skein, hash );
break;
case JH:
sph_jh512_init( &ctx.jh );
sph_jh512(&ctx.jh, in, size );
sph_jh512_close(&ctx.jh, hash );
break;
case KECCAK:
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, in, size );
sph_keccak512_close( &ctx.keccak, hash );
break;
case LUFFA:
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)in, size );
break;
case CUBEHASH:
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)in, size );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in, size );
sph_shavite512_close( &ctx.shavite, hash );
break;
case SIMD:
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
break;
case ECHO:
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, in, size );
sph_hamsi512_close( &ctx.hamsi, hash );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in, size );
sph_fugue512_close( &ctx.fugue, hash );
break;
case SHABAL:
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, in, size );
sph_shabal512_close( &ctx.shabal, hash );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in, size );
sph_whirlpool_close( &ctx.whirlpool, hash );
break;
case SHA_512:
SHA512_Init( &ctx.sha512 );
SHA512_Update( &ctx.sha512, in, size );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
break;
}
in = (void*) hash;
size = 64;
}
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, (const void*) hash, 64) ;
sph_haval256_5_close( &ctx.haval, hash );
sph_tiger_init( &ctx.tiger );
sph_tiger ( &ctx.tiger, (const void*) hash, 64 );
sph_tiger_close( &ctx.tiger, (void*) hash );
LYRA2REV2( x21s_matrix, (void*) hash, 32, (const void*) hash, 32,
(const void*) hash, 32, 1, 4, 4);
sph_gost512_init( &ctx.gost );
sph_gost512 ( &ctx.gost, (const void*) hash, 64 );
sph_gost512_close( &ctx.gost, (void*) hash );
SHA256_Init( &ctx.sha256 );
SHA256_Update( &ctx.sha256, hash, 64 );
SHA256_Final( (unsigned char*)hash, &ctx.sha256 );
memcpy( output, hash, 32 );
}
int scanhash_x21s( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
do
{
be32enc( &endiandata[19], nonce );
x21s_hash( hash32, endiandata );
if ( hash32[7] <= Htarg )
if (fulltest( hash32, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash32, mythr );
}
nonce++;
} while ( nonce < max_nonce && !(*restart) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
bool x21s_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
const int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
x21s_matrix = _mm_malloc( size, 64 );
return x21s_matrix;
}

View File

@@ -23,7 +23,7 @@
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
union _sonoa_4way_context_overlay
{
@@ -69,7 +69,7 @@ void sonoa_4way_hash( void *state, const void *input )
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -80,7 +80,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -134,13 +134,13 @@ void sonoa_4way_hash( void *state, const void *input )
// 2
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -151,7 +151,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -203,7 +203,7 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
@@ -215,7 +215,7 @@ void sonoa_4way_hash( void *state, const void *input )
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -226,7 +226,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -278,13 +278,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -300,13 +300,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 4
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -317,7 +317,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -369,13 +369,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -390,7 +390,7 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
@@ -402,7 +402,7 @@ void sonoa_4way_hash( void *state, const void *input )
hamsi512_4way( &ctx.hamsi, vhashB, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -438,7 +438,7 @@ void sonoa_4way_hash( void *state, const void *input )
shabal512_4way( &ctx.shabal, vhashB, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -449,7 +449,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -501,13 +501,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -522,13 +522,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -545,13 +545,13 @@ void sonoa_4way_hash( void *state, const void *input )
// 6
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -562,7 +562,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -614,13 +614,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -635,13 +635,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -656,13 +656,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -679,13 +679,13 @@ void sonoa_4way_hash( void *state, const void *input )
// 7
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -696,7 +696,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -748,13 +748,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -769,13 +769,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -790,7 +790,7 @@ void sonoa_4way_hash( void *state, const void *input )
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
@@ -806,7 +806,7 @@ void sonoa_4way_hash( void *state, const void *input )
int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
@@ -816,7 +816,7 @@ int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id;
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,

Some files were not shown because too many files have changed in this diff Show More