Compare commits

...

4 Commits

Author SHA1 Message Date
Jay D Dee
eaa4bd8152 v3.8.8 2018-04-23 12:47:43 -04:00
Jay D Dee
9edc650042 v3.8.7.2 2018-04-11 13:44:26 -04:00
Jay D Dee
218cef337a v3.8.7.1 2018-04-10 21:49:06 -04:00
Jay D Dee
9ffce7bdb7 v3.8.7 2018-04-09 19:14:38 -04:00
25 changed files with 1995 additions and 1145 deletions

View File

@@ -29,3 +29,5 @@ Wolf0
Optiminer
Jay D Dee
xcouiz@gmail.com

142
README.md
View File

@@ -45,82 +45,84 @@ MacOS, OSx and Android are not supported.
Supported Algorithms
--------------------
allium Garlicoin
anime Animecoin
argon2 Argon2 coin (AR2)
argon2d-crds Credits (CRDS)
argon2d-dyn Dynamic (DYN)
axiom Shabal-256 MemoHash
allium Garlicoin
anime Animecoin
argon2 Argon2 coin (AR2)
argon2d250 argon2d-crds, Credits (CRDS)
argon2d500 argon2d-dyn, Dynamic (DYN)
argon2d4096 argon2d-uis, Unitus (UIS)
axiom Shabal-256 MemoHash
bastion
blake Blake-256 (SFR)
blakecoin blake256r8
blake2s Blake-2 S
bmw BMW 256
c11 Chaincoin
cryptolight Cryptonight-light
cryptonight cryptonote, Monero (XMR)
blake Blake-256 (SFR)
blakecoin blake256r8
blake2s Blake-2 S
bmw BMW 256
c11 Chaincoin
cryptolight Cryptonight-light
cryptonight
cryptonightv7 Monero (XMR)
decred
deep Deepcoin (DCN)
dmd-gr Diamond-Groestl
drop Dropcoin
fresh Fresh
groestl Groestl coin
heavy Heavy
hmq1725 Espers
hodl Hodlcoin
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
lbry LBC, LBRY Credits
luffa Luffa
lyra2h Hppcoin
lyra2re lyra2
lyra2rev2 lyra2v2, Vertcoin
lyra2z Zcoin (XZC)
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
m7m Magi (XMG)
myr-gr Myriad-Groestl
neoscrypt NeoScrypt(128, 2, 1)
nist5 Nist5
pentablake Pentablake
phi1612 phi, LUX coin
pluck Pluck:128 (Supcoin)
polytimos Ninja
quark Quark
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)
scrypt:N scrypt(N, 1, 1)
deep Deepcoin (DCN)
dmd-gr Diamond-Groestl
drop Dropcoin
fresh Fresh
groestl Groestl coin
heavy Heavy
hmq1725 Espers
hodl Hodlcoin
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
lbry LBC, LBRY Credits
luffa Luffa
lyra2h Hppcoin
lyra2re lyra2
lyra2rev2 lyra2v2, Vertcoin
lyra2z Zcoin (XZC)
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
m7m Magi (XMG)
myr-gr Myriad-Groestl
neoscrypt NeoScrypt(128, 2, 1)
nist5 Nist5
pentablake Pentablake
phi1612 phi, LUX coin
pluck Pluck:128 (Supcoin)
polytimos Ninja
quark Quark
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)
scrypt:N scrypt(N, 1, 1)
scryptjane:nf
sha256d Double SHA-256
sha256t Triple SHA-256, Onecoin (OC)
shavite3 Shavite3
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
skunk Signatum (SIGT)
timetravel Machinecoin (MAC)
timetravel10 Bitcore
tribus Denarius (DNR)
vanilla blake256r8vnl (VCash)
veltor (VLT)
sha256d Double SHA-256
sha256t Triple SHA-256, Onecoin (OC)
shavite3 Shavite3
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
skunk Signatum (SIGT)
timetravel Machinecoin (MAC)
timetravel10 Bitcore
tribus Denarius (DNR)
vanilla blake256r8vnl (VCash)
veltor (VLT)
whirlpool
whirlpoolx
x11 Dash
x11evo Revolvercoin
x11gost sib (SibCoin)
x12 Galaxie Cash (GCH)
x13 X13
x13sm3 hsr (Hshare)
x14 X14
x15 X15
x16r Ravencoin (RVN)
x16s pigeoncoin (PGN)
x11 Dash
x11evo Revolvercoin
x11gost sib (SibCoin)
x12 Galaxie Cash (GCH)
x13 X13
x13sm3 hsr (Hshare)
x14 X14
x15 X15
x16r Ravencoin (RVN)
x16s pigeoncoin (PGN)
x17
xevan Bitsend (BSD)
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)
yescryptr16 Yenten (YTN)
yescryptr32 WAVI
zr5 Ziftr
xevan Bitsend (BSD)
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)
yescryptr16 Yenten (YTN)
yescryptr32 WAVI
zr5 Ziftr
Errata
------

View File

@@ -21,14 +21,16 @@ AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
Users are advised to use an unoptimized miner such as cpuminer-multi.
Changes in v3.8.4 may have improved compatibility with some of these CPUs.
Exe name Compile flags Arch name
cpuminer-sse2.exe "-msse2" Core2, Nehalem
cpuminer-aes-sse42.exe "-maes -msse4.2" Westmere
cpuminer-aes-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell...
cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen
Exe name Compile flags Arch name
cpuminer-sse2.exe "-msse2" Core2, Nehalem
cpuminer-aes-sse42.exe "-march=westmere" Westmere, Sandy-Ivybridge
cpuminer-avx.exe "-march=corei7-avx" Sandy-Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell, Sky-Kaby-Coffeelake
cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen
If you like this software feel free to donate:

View File

@@ -81,7 +81,7 @@ cd cpuminer-opt-x.y.z
Run ./build.sh to build on Linux or execute the following commands.
./autogen.sh
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
make
Additional optional compile flags, add the following to CFLAGS to activate:
@@ -160,6 +160,29 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log
----------
v3.8.8
Added cryptonightv7 for Monero.
v3.8.7.2
Fixed argon2d-dyn regression in v3.8.7.1.
Changed compile options for aes-sse42 Windows build to -march=westmere
v3.8.7.1
Fixed argon2d-uis low difficulty rejects.
Fixed argon2d aliases.
v3.8.7
Added argon2d4096 (alias argon2d-uis) for Unitus (UIS).
argon2d-crds and argon2d-dyn renamed to argon2d250 and argon2d500 respectively.
The old names are recognized as aliases.
AVX512 is now supported for argon2d algos, Linux only.
AVX is no longer a reported feature and an AVX Windows binary is no longer
provided. Use the AES-SSE42 build instead.
v3.8.6.1
Faster argon2d* AVX2.

View File

@@ -157,81 +157,83 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
switch (algo)
{
case ALGO_ALLIUM: register_allium_algo ( gate ); break;
case ALGO_ANIME: register_anime_algo ( gate ); break;
case ALGO_ARGON2: register_argon2_algo ( gate ); break;
case ALGO_ARGON2DCRDS: register_argon2d_crds_algo( gate ); break;
case ALGO_ARGON2DDYN: register_argon2d_dyn_algo ( gate ); break;
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
case ALGO_BASTION: register_bastion_algo ( gate ); break;
case ALGO_BLAKE: register_blake_algo ( gate ); break;
case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
case ALGO_ALLIUM: register_allium_algo ( gate ); break;
case ALGO_ANIME: register_anime_algo ( gate ); break;
case ALGO_ARGON2: register_argon2_algo ( gate ); break;
case ALGO_ARGON2D250: register_argon2d_crds_algo ( gate ); break;
case ALGO_ARGON2D500: register_argon2d_dyn_algo ( gate ); break;
case ALGO_ARGON2D4096: register_argon2d4096_algo ( gate ); break;
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
case ALGO_BASTION: register_bastion_algo ( gate ); break;
case ALGO_BLAKE: register_blake_algo ( gate ); break;
case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
// case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break;
case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break;
case ALGO_C11: register_c11_algo ( gate ); break;
case ALGO_CRYPTOLIGHT: register_cryptolight_algo ( gate ); break;
case ALGO_CRYPTONIGHT: register_cryptonight_algo ( gate ); break;
case ALGO_DECRED: register_decred_algo ( gate ); break;
case ALGO_DEEP: register_deep_algo ( gate ); break;
case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break;
case ALGO_DROP: register_drop_algo ( gate ); break;
case ALGO_FRESH: register_fresh_algo ( gate ); break;
case ALGO_GROESTL: register_groestl_algo ( gate ); break;
case ALGO_HEAVY: register_heavy_algo ( gate ); break;
case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break;
case ALGO_HODL: register_hodl_algo ( gate ); break;
case ALGO_JHA: register_jha_algo ( gate ); break;
case ALGO_KECCAK: register_keccak_algo ( gate ); break;
case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
case ALGO_LBRY: register_lbry_algo ( gate ); break;
case ALGO_LUFFA: register_luffa_algo ( gate ); break;
case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break;
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break;
case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break;
case ALGO_M7M: register_m7m_algo ( gate ); break;
case ALGO_MYR_GR: register_myriad_algo ( gate ); break;
case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break;
case ALGO_NIST5: register_nist5_algo ( gate ); break;
case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break;
case ALGO_PHI1612: register_phi1612_algo ( gate ); break;
case ALGO_PLUCK: register_pluck_algo ( gate ); break;
case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break;
case ALGO_QUARK: register_quark_algo ( gate ); break;
case ALGO_QUBIT: register_qubit_algo ( gate ); break;
case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break;
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
case ALGO_SKEIN: register_skein_algo ( gate ); break;
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
case ALGO_SKUNK: register_skunk_algo ( gate ); break;
case ALGO_TIMETRAVEL: register_timetravel_algo ( gate ); break;
case ALGO_TIMETRAVEL10: register_timetravel10_algo( gate ); break;
case ALGO_TRIBUS: register_tribus_algo ( gate ); break;
case ALGO_VANILLA: register_vanilla_algo ( gate ); break;
case ALGO_VELTOR: register_veltor_algo ( gate ); break;
case ALGO_WHIRLPOOL: register_whirlpool_algo ( gate ); break;
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
case ALGO_X11: register_x11_algo ( gate ); break;
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
case ALGO_X12: register_x12_algo ( gate ); break;
case ALGO_X13: register_x13_algo ( gate ); break;
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
case ALGO_X14: register_x14_algo ( gate ); break;
case ALGO_X15: register_x15_algo ( gate ); break;
case ALGO_X16R: register_x16r_algo ( gate ); break;
case ALGO_X16S: register_x16s_algo ( gate ); break;
case ALGO_X17: register_x17_algo ( gate ); break;
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
case ALGO_ZR5: register_zr5_algo ( gate ); break;
case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break;
case ALGO_C11: register_c11_algo ( gate ); break;
case ALGO_CRYPTOLIGHT: register_cryptolight_algo ( gate ); break;
case ALGO_CRYPTONIGHT: register_cryptonight_algo ( gate ); break;
case ALGO_CRYPTONIGHTV7:register_cryptonightv7_algo( gate ); break;
case ALGO_DECRED: register_decred_algo ( gate ); break;
case ALGO_DEEP: register_deep_algo ( gate ); break;
case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break;
case ALGO_DROP: register_drop_algo ( gate ); break;
case ALGO_FRESH: register_fresh_algo ( gate ); break;
case ALGO_GROESTL: register_groestl_algo ( gate ); break;
case ALGO_HEAVY: register_heavy_algo ( gate ); break;
case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break;
case ALGO_HODL: register_hodl_algo ( gate ); break;
case ALGO_JHA: register_jha_algo ( gate ); break;
case ALGO_KECCAK: register_keccak_algo ( gate ); break;
case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
case ALGO_LBRY: register_lbry_algo ( gate ); break;
case ALGO_LUFFA: register_luffa_algo ( gate ); break;
case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break;
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break;
case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break;
case ALGO_M7M: register_m7m_algo ( gate ); break;
case ALGO_MYR_GR: register_myriad_algo ( gate ); break;
case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break;
case ALGO_NIST5: register_nist5_algo ( gate ); break;
case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break;
case ALGO_PHI1612: register_phi1612_algo ( gate ); break;
case ALGO_PLUCK: register_pluck_algo ( gate ); break;
case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break;
case ALGO_QUARK: register_quark_algo ( gate ); break;
case ALGO_QUBIT: register_qubit_algo ( gate ); break;
case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break;
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
case ALGO_SKEIN: register_skein_algo ( gate ); break;
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
case ALGO_SKUNK: register_skunk_algo ( gate ); break;
case ALGO_TIMETRAVEL: register_timetravel_algo ( gate ); break;
case ALGO_TIMETRAVEL10: register_timetravel10_algo ( gate ); break;
case ALGO_TRIBUS: register_tribus_algo ( gate ); break;
case ALGO_VANILLA: register_vanilla_algo ( gate ); break;
case ALGO_VELTOR: register_veltor_algo ( gate ); break;
case ALGO_WHIRLPOOL: register_whirlpool_algo ( gate ); break;
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
case ALGO_X11: register_x11_algo ( gate ); break;
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
case ALGO_X12: register_x12_algo ( gate ); break;
case ALGO_X13: register_x13_algo ( gate ); break;
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
case ALGO_X14: register_x14_algo ( gate ); break;
case ALGO_X15: register_x15_algo ( gate ); break;
case ALGO_X16R: register_x16r_algo ( gate ); break;
case ALGO_X16S: register_x16s_algo ( gate ); break;
case ALGO_X17: register_x17_algo ( gate ); break;
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
case ALGO_ZR5: register_zr5_algo ( gate ); break;
default:
applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
return false;
@@ -288,6 +290,9 @@ void exec_hash_function( int algo, void *output, const void *pdata )
const char* const algo_alias_map[][2] =
{
// alias proper
{ "argon2d-crds", "argon2d250" },
{ "argon2d-dyn", "argon2d500" },
{ "argon2d-uis", "argon2d4096" },
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },
{ "blake256r8", "blakecoin" },

View File

@@ -2,6 +2,8 @@
#include <stdbool.h>
#include <stdint.h>
#include "miner.h"
#include "avxdefs.h"
#include "interleave.h"
/////////////////////////////
////
@@ -91,6 +93,7 @@ typedef uint32_t set_t;
#define AVX_OPT 8
#define AVX2_OPT 0x10
#define SHA_OPT 0x20
#define AVX512_OPT 0x40
// Return the union of two feature sets: every bit set in either a or b.
// Declared static because a plain C99 `inline` definition provides no
// external definition, so a call the compiler chooses not to inline (for
// example at -O0) would otherwise fail to link.
static inline set_t set_union ( set_t a, set_t b ) { return a | b; }

View File

@@ -28,6 +28,7 @@ void argon2d_crds_hash( void *output, const void *input )
context.lanes = 4; // Degree of Parallelism
context.threads = 1; // Threads
context.t_cost = 1; // Iterations
context.version = ARGON2_VERSION_10;
argon2_ctx( &context, Argon2_d );
}
@@ -70,7 +71,8 @@ bool register_argon2d_crds_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_argon2d_crds;
gate->hash = (void*)&argon2d_crds_hash;
gate->set_target = (void*)&scrypt_set_target;
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
return true;
}
// Dynamic
@@ -96,6 +98,7 @@ void argon2d_dyn_hash( void *output, const void *input )
context.lanes = 8; // Degree of Parallelism
context.threads = 1; // Threads
context.t_cost = 2; // Iterations
context.version = ARGON2_VERSION_10;
argon2_ctx( &context, Argon2_d );
}
@@ -138,6 +141,58 @@ bool register_argon2d_dyn_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_argon2d_dyn;
gate->hash = (void*)&argon2d_dyn_hash;
gate->set_target = (void*)&scrypt_set_target;
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
return true;
}
// Unitus
// Nonce scanner for argon2d4096.
//
// Byte-swaps the first 19 words of work->data into a local 80-byte buffer,
// then, for each nonce starting at pdata[19], stores the nonce in word 19 and
// hashes the buffer (used as both password and salt) with argon2d_hash_raw
// using ARGON2_VERSION_13. Scanning stops when a hash passes fulltest(), when
// the nonce reaches max_nonce, or when work_restart[thr_id].restart is set.
//
// Returns 1 with pdata[19] set to the winning nonce when a share is found,
// otherwise 0 with pdata[19] set to the next untested nonce. *hashes_done is
// written on both paths.
int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done )
{
   uint32_t _ALIGN(64) vhash[8];
   uint32_t _ALIGN(64) endiandata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;

   const uint32_t t_cost = 1;       // 1 iteration
   const uint32_t m_cost = 4096;    // use 4MB
   // NOTE(review): this comment previously read "1 thread, 2 lanes", but a
   // single parallelism value of 1 is all that is passed below - confirm.
   const uint32_t parallelism = 1;

   for ( int i = 0; i < 19; i++ )
      be32enc( &endiandata[i], pdata[i] );

   do {
      be32enc( &endiandata[19], n );
      const int rc = argon2d_hash_raw( t_cost, m_cost, parallelism,
                                       (char*) endiandata, 80,
                                       (char*) endiandata, 80,
                                       (char*) vhash, 32, ARGON2_VERSION_13 );
      // A non-zero return is an Argon2 error code; vhash may not have been
      // written in that case, so it must not be tested against the target.
      if ( rc == 0 && vhash[7] < Htarg && fulltest( vhash, ptarget ) )
      {
         *hashes_done = n - first_nonce + 1;
         pdata[19] = n;
         return 1;
      }
      n++;
   } while ( n < max_nonce && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
   pdata[19] = n;
   return 0;
}
// Returns the constant 0x1ff. Installed as the get_max64 gate callback by
// register_argon2d4096_algo. Declared with an explicit (void) parameter list
// so the compiler can reject calls that pass arguments.
int64_t get_max64_0x1ff( void ) { return 0x1ff; }
// Install the argon2d4096 callbacks and feature flags into the supplied gate.
// No hash callback is assigned here; only scanhash, set_target and get_max64
// are overridden. Always returns true.
bool register_argon2d4096_algo( algo_gate_t* gate )
{
   // Feature flags reported for this algo.
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;

   // Callback overrides.
   gate->get_max64  = (void*)&get_max64_0x1ff;
   gate->set_target = (void*)&scrypt_set_target;
   gate->scanhash   = (void*)&scanhash_argon2d4096;

   return true;
}

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
// Credits
// Credits: version = 0x10, m_cost = 250.
bool register_argon2d_crds_algo( algo_gate_t* gate );
void argon2d_crds_hash( void *state, const void *input );
@@ -12,7 +12,7 @@ void argon2d_crds_hash( void *state, const void *input );
int scanhash_argon2d_crds( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
// Dynamic
// Dynamic: version = 0x10, m_cost = 500.
bool register_argon2d_dyn_algo( algo_gate_t* gate );
void argon2d_dyn_hash( void *state, const void *input );
@@ -21,5 +21,11 @@ int scanhash_argon2d_dyn( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
// Unitus: version = 0x13, m_cost = 4096.
bool register_argon2d4096_algo( algo_gate_t* gate );
int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -180,60 +180,65 @@ int argon2i_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, const size_t hashlen,
char *encoded, const size_t encodedlen) {
char *encoded, const size_t encodedlen,
const uint32_t version) {
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
NULL, hashlen, encoded, encodedlen, Argon2_i,
ARGON2_VERSION_NUMBER);
version );
}
int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, void *hash, const size_t hashlen) {
const size_t saltlen, void *hash, const size_t hashlen,
const uint32_t version ) {
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
hash, hashlen, NULL, 0, Argon2_i, ARGON2_VERSION_NUMBER);
hash, hashlen, NULL, 0, Argon2_i, version );
}
int argon2d_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, const size_t hashlen,
char *encoded, const size_t encodedlen) {
char *encoded, const size_t encodedlen,
const uint32_t version ) {
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
NULL, hashlen, encoded, encodedlen, Argon2_d,
ARGON2_VERSION_NUMBER);
version );
}
int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, void *hash, const size_t hashlen) {
const size_t saltlen, void *hash, const size_t hashlen,
const uint32_t version ) {
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
hash, hashlen, NULL, 0, Argon2_d, ARGON2_VERSION_NUMBER);
hash, hashlen, NULL, 0, Argon2_d, version );
}
int argon2id_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, const size_t hashlen,
char *encoded, const size_t encodedlen) {
char *encoded, const size_t encodedlen,
const uint32_t version ) {
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
NULL, hashlen, encoded, encodedlen, Argon2_id,
ARGON2_VERSION_NUMBER);
version);
}
int argon2id_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, void *hash, const size_t hashlen) {
const size_t saltlen, void *hash, const size_t hashlen,
const uint32_t version ) {
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
hash, hashlen, NULL, 0, Argon2_id,
ARGON2_VERSION_NUMBER);
hash, hashlen, NULL, 0, Argon2_id, version );
}
static int argon2_compare(const uint8_t *b1, const uint8_t *b2, size_t len) {
@@ -443,10 +448,11 @@ const char *argon2_error_message(int error_code) {
return "Unknown error code";
}
}
/*
size_t argon2_encodedlen(uint32_t t_cost, uint32_t m_cost, uint32_t parallelism,
uint32_t saltlen, uint32_t hashlen, argon2_type type) {
return strlen("$$v=$m=,t=,p=$$") + strlen(argon2_type2string(type, 0)) +
numlen(t_cost) + numlen(m_cost) + numlen(parallelism) +
b64len(saltlen) + b64len(hashlen) + numlen(ARGON2_VERSION_NUMBER) + 1;
}
*/

View File

@@ -225,11 +225,8 @@ typedef enum Argon2_type {
} argon2_type;
/* Version of the algorithm */
typedef enum Argon2_version {
ARGON2_VERSION_10 = 0x10,
ARGON2_VERSION_13 = 0x13,
ARGON2_VERSION_NUMBER = ARGON2_VERSION_10
} argon2_version;
#define ARGON2_VERSION_10 0x10
#define ARGON2_VERSION_13 0x13
/*
* Function that gives the string representation of an argon2_type.
@@ -267,7 +264,8 @@ ARGON2_PUBLIC int argon2i_hash_encoded(const uint32_t t_cost,
const void *pwd, const size_t pwdlen,
const void *salt, const size_t saltlen,
const size_t hashlen, char *encoded,
const size_t encodedlen);
const size_t encodedlen,
const uint32_t version );
/**
* Hashes a password with Argon2i, producing a raw hash at @hash
@@ -287,7 +285,8 @@ ARGON2_PUBLIC int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, void *hash,
const size_t hashlen);
const size_t hashlen,
const uint32_t version );
ARGON2_PUBLIC int argon2d_hash_encoded(const uint32_t t_cost,
const uint32_t m_cost,
@@ -295,13 +294,15 @@ ARGON2_PUBLIC int argon2d_hash_encoded(const uint32_t t_cost,
const void *pwd, const size_t pwdlen,
const void *salt, const size_t saltlen,
const size_t hashlen, char *encoded,
const size_t encodedlen);
const size_t encodedlen,
const uint32_t version );
ARGON2_PUBLIC int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, void *hash,
const size_t hashlen);
const size_t hashlen,
const uint32_t version );
ARGON2_PUBLIC int argon2id_hash_encoded(const uint32_t t_cost,
const uint32_t m_cost,
@@ -309,14 +310,16 @@ ARGON2_PUBLIC int argon2id_hash_encoded(const uint32_t t_cost,
const void *pwd, const size_t pwdlen,
const void *salt, const size_t saltlen,
const size_t hashlen, char *encoded,
const size_t encodedlen);
const size_t encodedlen,
const uint32_t version );
ARGON2_PUBLIC int argon2id_hash_raw(const uint32_t t_cost,
const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, void *hash,
const size_t hashlen);
const size_t hashlen,
const uint32_t version );
/* generic function underlying the above ones */
ARGON2_PUBLIC int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
@@ -325,7 +328,7 @@ ARGON2_PUBLIC int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
const size_t saltlen, void *hash,
const size_t hashlen, char *encoded,
const size_t encodedlen, argon2_type type,
const uint32_t version);
const uint32_t version );
/**
* Verifies a password against an encoded string

View File

@@ -544,7 +544,8 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
store32(&value, context->t_cost);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
store32(&value, ARGON2_VERSION_NUMBER);
// store32(&value, ARGON2_VERSION_NUMBER);
store32(&value, context->version);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
store32(&value, (uint32_t)type);

View File

@@ -345,15 +345,15 @@ void fill_segment(const argon2_instance_t *instance,
ref_block =
instance->memory + instance->lane_length * ref_lane + ref_index;
curr_block = instance->memory + curr_offset;
// if (ARGON2_VERSION_10 == instance->version) {
// /* version 1.2.1 and earlier: overwrite, not XOR */
// fill_block(state, ref_block, curr_block, 0);
// } else {
// if(0 == position.pass) {
if (ARGON2_VERSION_10 == instance->version) {
/* version 1.2.1 and earlier: overwrite, not XOR */
fill_block(state, ref_block, curr_block, 0);
} else {
if(0 == position.pass) {
fill_block(state, ref_block, curr_block, 0);
// } else {
// fill_block(state, ref_block, curr_block, 1);
// }
// }
} else {
fill_block(state, ref_block, curr_block, 1);
}
}
}
}

View File

@@ -325,7 +325,7 @@ int scanhash_cryptolight(int thr_id, struct work *work,
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
#ifndef NO_AES_NI
#if defined(__AES__)
do {
*nonceptr = ++n;
cryptolight_hash_ctx_aes_ni(hash, pdata, 76, ctx);

View File

@@ -1,14 +1,11 @@
#if defined(__AES__)
#include <x86intrin.h>
#include <memory.h>
#include "cryptonight.h"
#include "miner.h"
#include "crypto/c_keccak.h"
#include <immintrin.h>
//#include "avxdefs.h"
void aesni_parallel_noxor(uint8_t *long_state, uint8_t *text, uint8_t *ExpandedKey);
void aesni_parallel_xor(uint8_t *text, uint8_t *ExpandedKey, uint8_t *long_state);
void that_fucking_loop(uint8_t a[16], uint8_t b[16], uint8_t *long_state);
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
{
@@ -25,7 +22,6 @@ static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
{
#ifndef NO_AES_NI
__m128i tmp2, tmp4;
tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
@@ -37,14 +33,12 @@ static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
*tmp3 = _mm_xor_si128(*tmp3, tmp2);
#endif
}
// Special thanks to Intel for helping me
// with ExpandAESKey256() and its subroutines
static inline void ExpandAESKey256(char *keybuf)
{
#ifndef NO_AES_NI
__m128i tmp1, tmp2, tmp3, *keys;
keys = (__m128i *)keybuf;
@@ -91,7 +85,6 @@ static inline void ExpandAESKey256(char *keybuf)
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[14] = tmp1;
#endif
}
// align to 64 byte cache line
@@ -109,13 +102,19 @@ static __thread cryptonight_ctx ctx;
void cryptonight_hash_aes( void *restrict output, const void *input, int len )
{
#ifndef NO_AES_NI
uint8_t ExpandedKey[256] __attribute__((aligned(64)));
__m128i *longoutput, *expkey, *xmminput;
size_t i, j;
keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
if ( cryptonightV7 && len < 43 )
return;
const uint64_t tweak = cryptonightV7
? *((const uint64_t*) (((const uint8_t*)input) + 35))
^ ctx.state.hs.w[24] : 0;
memcpy( ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE );
ExpandAESKey256( ExpandedKey );
memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
@@ -214,7 +213,15 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
_mm_store_si128( (__m128i*)c, c_x );
b_x = _mm_xor_si128( b_x, c_x );
nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
_mm_store_si128( lsa, b_x );
_mm_store_si128( lsa, b_x );
if ( cryptonightV7 )
{
const uint8_t tmp = ( (const uint8_t*)(lsa) )[11];
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
((uint8_t*)(lsa))[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
}
b[0] = nextblock[0];
b[1] = nextblock[1];
@@ -227,10 +234,14 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
: "cc" );
b_x = c_x;
nextblock[0] = a[0] + hi;
nextblock[1] = a[1] + lo;
a[0] = b[0] ^ nextblock[0];
a[1] = b[1] ^ nextblock[1];
a[0] += hi;
a[1] += lo;
nextblock[0] = a[0];
nextblock[1] = cryptonightV7 ? a[1] ^ tweak : a[1];
a[0] ^= b[0];
a[1] ^= b[1];
lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
a_x = _mm_load_si128( (__m128i*)a );
c_x = _mm_load_si128( lsa );
@@ -241,6 +252,14 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
b_x = _mm_xor_si128( b_x, c_x );
nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
_mm_store_si128( lsa, b_x );
if ( cryptonightV7 )
{
const uint8_t tmp = ( (const uint8_t*)(lsa) )[11];
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
((uint8_t*)(lsa))[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
}
b[0] = nextblock[0];
b[1] = nextblock[1];
@@ -251,8 +270,12 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
"rm" ( b[0] )
: "cc" );
nextblock[0] = a[0] + hi;
nextblock[1] = a[1] + lo;
a[0] += hi;
a[1] += lo;
nextblock[0] = a[0];
nextblock[1] = cryptonightV7 ? a[1] ^ tweak : a[1];
a[0] ^= b[0];
a[1] ^= b[1];
memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE );
ExpandAESKey256( ExpandedKey );
@@ -330,5 +353,5 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
#endif
}
#endif

View File

@@ -7,11 +7,11 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#ifndef NO_AES_NI
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl256.h"
#endif
#else
#include "crypto/c_groestl.h"
#endif
#include "crypto/c_blake256.h"
#include "crypto/c_jh.h"
#include "crypto/c_skein.h"
@@ -30,12 +30,12 @@ void do_blake_hash(const void* input, size_t len, char* output) {
}
void do_groestl_hash(const void* input, size_t len, char* output) {
#ifdef NO_AES_NI
groestl(input, len * 8, (uint8_t*)output);
#else
#if defined(__AES__)
hashState_groestl256 ctx;
init_groestl256( &ctx, 32 );
update_and_final_groestl256( &ctx, output, input, len * 8 );
#else
groestl(input, len * 8, (uint8_t*)output);
#endif
}
@@ -52,23 +52,24 @@ void (* const extra_hashes[4])( const void *, size_t, char *) =
void cryptonight_hash( void *restrict output, const void *input, int len )
{
#ifdef NO_AES_NI
cryptonight_hash_ctx ( output, input, len );
#else
#if defined(__AES__)
cryptonight_hash_aes( output, input, len );
#else
cryptonight_hash_ctx ( output, input, len );
#endif
}
void cryptonight_hash_suw( void *restrict output, const void *input )
{
#ifdef NO_AES_NI
cryptonight_hash_ctx ( output, input, 76 );
#else
#if defined(__AES__)
cryptonight_hash_aes( output, input, 76 );
#else
cryptonight_hash_ctx ( output, input, 76 );
#endif
}
bool cryptonightV7 = false;
int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
@@ -80,6 +81,11 @@ int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = n + 1;
const uint32_t Htarg = ptarget[7];
uint32_t hash[32 / 4] __attribute__((aligned(32)));
if ( ( cryptonightV7 && ( *(uint8_t*)pdata < 7 ) )
|| ( !cryptonightV7 && ( *(uint8_t*)pdata == 7 ) ) )
applog(LOG_WARNING,"Cryptonight variant mismatch, shares may be rejected.");
do
{
*nonceptr = ++n;
@@ -97,6 +103,7 @@ int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
bool register_cryptonight_algo( algo_gate_t* gate )
{
cryptonightV7 = false;
register_json_rpc2( gate );
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_cryptonight;
@@ -106,3 +113,15 @@ bool register_cryptonight_algo( algo_gate_t* gate )
return true;
};
// Register the CryptonightV7 variant with the algo gate.
// Sets the global cryptonightV7 flag, applies the JSON-RPC2 registration and
// then points the gate at the cryptonight scan/hash entry points.
// Always returns true.
bool register_cryptonightv7_algo( algo_gate_t* gate )
{
   // The flag is read by the hash code to select the V7 tweaks.
   cryptonightV7 = true;
   register_json_rpc2( gate );

   // Gate overrides are applied after the JSON-RPC2 registration.
   gate->get_max64     = (void*)&get_max64_0x40LL;
   gate->hash_suw      = (void*)&cryptonight_hash_suw;
   gate->hash          = (void*)&cryptonight_hash;
   gate->scanhash      = (void*)&scanhash_cryptonight;
   gate->optimizations = SSE2_OPT | AES_OPT;

   return true;
};

View File

@@ -20,8 +20,8 @@
#include "crypto/c_jh.h"
#include "crypto/c_skein.h"
#include "crypto/int-util.h"
#include "crypto/hash-ops.h"
//#include "cryptonight.h"
//#include "crypto/hash-ops.h"
#include "cryptonight.h"
#if USE_INT128
@@ -51,6 +51,7 @@ typedef __uint128_t uint128_t;
#define INIT_SIZE_BLK 8
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
/*
#pragma pack(push, 1)
union cn_slow_hash_state {
union hash_state hs;
@@ -78,6 +79,7 @@ static void do_skein_hash(const void* input, size_t len, char* output) {
int r = skein_hash(8 * HASH_SIZE, input, 8 * len, (uint8_t*)output);
assert(likely(SKEIN_SUCCESS == r));
}
*/
extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
extern int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
@@ -120,9 +122,11 @@ static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* pro
extern uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi);
#endif
/*
static void (* const extra_hashes[4])(const void *, size_t, char *) = {
do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
};
*/
static inline size_t e2i(const uint8_t* a) {
#if !LITE
@@ -132,14 +136,16 @@ static inline size_t e2i(const uint8_t* a) {
#endif
}
static inline void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
static inline void mul_sum_xor_dst( const uint8_t* a, uint8_t* c, uint8_t* dst,
const uint64_t tweak )
{
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0];
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
((uint64_t*) dst)[0] = hi;
((uint64_t*) dst)[1] = lo;
((uint64_t*) dst)[1] = cryptonightV7 ? lo ^ tweak : lo;
}
static inline void xor_blocks(uint8_t* a, const uint8_t* b) {
@@ -174,8 +180,16 @@ static __thread cryptonight_ctx ctx;
void cryptonight_hash_ctx(void* output, const void* input, int len)
{
hash_process(&ctx.state.hs, (const uint8_t*) input, len);
ctx.aes_ctx = (oaes_ctx*) oaes_alloc();
// hash_process(&ctx.state.hs, (const uint8_t*) input, len);
keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
if ( cryptonightV7 && len < 43 )
return;
const uint64_t tweak = cryptonightV7
? *((const uint64_t*) (((const uint8_t*)input) + 35))
^ ctx.state.hs.w[24] : 0;
ctx.aes_ctx = (oaes_ctx*) oaes_alloc();
__builtin_prefetch( ctx.text, 0, 3 );
__builtin_prefetch( ctx.text + 64, 0, 3 );
@@ -211,23 +225,44 @@ void cryptonight_hash_ctx(void* output, const void* input, int len)
xor_blocks_dst(&ctx.state.k[0], &ctx.state.k[32], ctx.a);
xor_blocks_dst(&ctx.state.k[16], &ctx.state.k[48], ctx.b);
for (i = 0; likely(i < ITER / 4); ++i) {
/* Dependency chain: address -> read value ------+
* written value <-+ hard function (AES or MUL) <+
* next address <-+
*/
/* Iteration 1 */
j = e2i(ctx.a);
aesb_single_round(&ctx.long_state[j], ctx.c, ctx.a);
xor_blocks_dst(ctx.c, ctx.b, &ctx.long_state[j]);
/* Iteration 2 */
mul_sum_xor_dst(ctx.c, ctx.a, &ctx.long_state[e2i(ctx.c)]);
/* Iteration 3 */
j = e2i(ctx.a);
aesb_single_round(&ctx.long_state[j], ctx.b, ctx.a);
xor_blocks_dst(ctx.b, ctx.c, &ctx.long_state[j]);
/* Iteration 4 */
mul_sum_xor_dst(ctx.b, ctx.a, &ctx.long_state[e2i(ctx.b)]);
for (i = 0; likely(i < ITER / 4); ++i)
{
/* Dependency chain: address -> read value ------+
* written value <-+ hard function (AES or MUL) <+
* next address <-+
*/
/* Iteration 1 */
j = e2i(ctx.a);
aesb_single_round(&ctx.long_state[j], ctx.c, ctx.a);
xor_blocks_dst(ctx.c, ctx.b, &ctx.long_state[j]);
if ( cryptonightV7 )
{
uint8_t *lsa = (uint8_t*)&ctx.long_state[((uint64_t *)(ctx.a))[0] & 0x1FFFF0];
const uint8_t tmp = lsa[11];
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
lsa[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
}
/* Iteration 2 */
mul_sum_xor_dst(ctx.c, ctx.a, &ctx.long_state[e2i(ctx.c)], tweak );
/* Iteration 3 */
j = e2i(ctx.a);
aesb_single_round(&ctx.long_state[j], ctx.b, ctx.a);
xor_blocks_dst(ctx.b, ctx.c, &ctx.long_state[j]);
if ( cryptonightV7 )
{
uint8_t *lsa = (uint8_t*)&ctx.long_state[((uint64_t *)(ctx.a))[0] & 0x1FFFF0];
const uint8_t tmp = lsa[11];
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
lsa[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
}
/* Iteration 4 */
mul_sum_xor_dst(ctx.b, ctx.a, &ctx.long_state[e2i(ctx.b)], tweak );
}
__builtin_prefetch( ctx.text, 0, 3 );
@@ -266,7 +301,8 @@ void cryptonight_hash_ctx(void* output, const void* input, int len)
aesb_pseudo_round_mut(&ctx.text[7 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
}
memcpy(ctx.state.init, ctx.text, INIT_SIZE_BYTE);
hash_permutation(&ctx.state.hs);
// hash_permutation(&ctx.state.hs);
keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
/*memcpy(hash, &state, 32);*/
extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
oaes_free((OAES_CTX **) &ctx.aes_ctx);

View File

@@ -45,5 +45,7 @@ int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
void cryptonight_hash_aes( void *restrict output, const void *input, int len );
extern bool cryptonightV7;
#endif

766
avxdefs.h
View File

@@ -1,5 +1,5 @@
#ifndef AVXDEFS_H__
#define AVXDEFS_H__
#define AVXDEFS_H__ 1
// Some tools to help using SIMD vectors.
//
@@ -1034,6 +1034,11 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
//
// Pseudo constants.
// _mm512_setzero_si512 uses xor instruction. If needed frequently
// in a function it's better to define a register variable (const?)
// initialized to zero.
// It isn't clear to me yet how set or set1 work.
#define m512_zero _mm512_setzero_si512()
#define m512_one_512 _mm512_set_epi64x( 0ULL, 0ULL, 0ULL, 0ULL, \
0ULL, 0ULL, 0ULL, 1ULL )
@@ -1058,6 +1063,21 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
//
// Pointer casting
// p = any aligned pointer
// i = scaled array index
// o = scaled address offset
// returns p as pointer to vector
#define castp_m512i(p) ((__m512i*)(p))
// returns *p as vector value
#define cast_m512i(p) (*((__m512i*)(p)))
// returns p[i] as vector value
#define casti_m512i(p,i) (((__m512i*)(p))[(i)])
// returns p+o as pointer to vector
#define casto_m512i(p,o) (((__m512i*)(p))+(o))
//
// Memory functions
@@ -1237,746 +1257,4 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
#endif // AVX512F
// Paired functions for interleaving and deinterleaving data for vector
// processing.
// Size is specified in bits regardless of vector size to avoid pointer
// arithmetic confusion with different size vectors and be consistent with
// the function's name.
//
// Each function has 2 implementations, an optimized version that uses
// vector indexing and a slower version that uses pointers. The optimized
// version can only be used with 64 bit elements and only supports sizes
// of 256, 512 or 640 bits, 32, 64, and 80 bytes respectively.
//
// NOTE: Contrary to GCC documentation, accessing vector elements using array
// indexes only works with 64 bit elements.
// Interleaving and deinterleaving of vectors of 32 bit elements
// must use the slower implementations that don't use vector indexing.
//
// All data must be aligned to 256 bits for AVX2, or 128 bits for AVX.
// Interleave source args and deinterleave destination args are not required
// to be contiguous in memory but it's more efficient if they are.
// Interleave source args may be the same actual arg repeated.
// 640 bit deinterleaving 4x64 using 256 bit AVX2 requires the
// destination buffers be defined with padding up to 768 bits for overrun
// space. Although overrun space use is non destructive it should not overlay
// useful data and should be ignored by the caller.
// SSE2 AVX
// interleave 4 arrays of 32 bit elements for 128 bit processing
// bit_len must be 256, 512, 640 or 1024 bits.
// Interleave four streams of 32 bit words into dst, one 128 bit vector per
// word index: dst word 4*i+k is word i of stream k.
// The number of word indexes processed is selected by bit_len:
// 8 (<= 256), 16 (<= 512), 20 (<= 640), otherwise 32 (1024).
// dst is written through a __m128i pointer.
static inline void mm_interleave_4x32( void *dst, const void *src0,
     const void *src1, const void *src2, const void *src3, int bit_len )
{
   const uint32_t *in0 = (const uint32_t*)src0;
   const uint32_t *in1 = (const uint32_t*)src1;
   const uint32_t *in2 = (const uint32_t*)src2;
   const uint32_t *in3 = (const uint32_t*)src3;
   __m128i *out = (__m128i*)dst;

   int rows;
   if      ( bit_len <= 256 ) rows = 8;
   else if ( bit_len <= 512 ) rows = 16;
   else if ( bit_len <= 640 ) rows = 20;
   else                       rows = 32;   // bit_len == 1024

   // _mm_set_epi32 takes the highest element first, so stream 0 lands in
   // the lowest 32 bits of each vector.
   for ( int r = 0; r < rows; r++ )
      out[r] = _mm_set_epi32( in3[r], in2[r], in1[r], in0[r] );
}
// bit_len must be multiple of 32
// Scalar interleave of four streams of 32 bit words.
// For every word index i below bit_len/32, dst words 4*i .. 4*i+3 receive
// word i of src0, src1, src2 and src3 respectively.
static inline void mm_interleave_4x32x( void *dst, void *src0, void *src1,
                                        void *src2, void *src3, int bit_len )
{
   uint32_t *out = (uint32_t*)dst;
   const uint32_t *in0 = (const uint32_t*)src0;
   const uint32_t *in1 = (const uint32_t*)src1;
   const uint32_t *in2 = (const uint32_t*)src2;
   const uint32_t *in3 = (const uint32_t*)src3;
   const int words = bit_len >> 5;   // 32 bit words per stream

   for ( int w = 0; w < words; w++ )
   {
      uint32_t *group = out + 4 * w;
      group[0] = in0[w];
      group[1] = in1[w];
      group[2] = in2[w];
      group[3] = in3[w];
   }
}
// Split a 4-way interleaved buffer of 32 bit words into four destination
// buffers, four words (one 128 bit vector) per destination at a time:
// word i of destination k is src word 4*i+k.
// Vectors written per destination by bit_len:
// 2 (<= 256), 4 (<= 512), 5 (<= 640), otherwise 8 (1024).
// Destinations are written through __m128i pointers.
static inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
                              void *dst3, const void *src, int bit_len )
{
   const uint32_t *in = (const uint32_t*)src;
   __m128i *lane[4] = { (__m128i*)dst0, (__m128i*)dst1,
                        (__m128i*)dst2, (__m128i*)dst3 };

   int rows;
   if      ( bit_len <= 256 ) rows = 2;
   else if ( bit_len <= 512 ) rows = 4;
   else if ( bit_len <= 640 ) rows = 5;
   else                       rows = 8;   // bit_len == 1024

   for ( int r = 0; r < rows; r++ )
   {
      // Each output vector consumes 16 consecutive source words.
      const uint32_t *p = in + 16 * r;
      for ( int k = 0; k < 4; k++ )
         lane[k][r] = _mm_set_epi32( p[12 + k], p[8 + k], p[4 + k], p[k] );
   }
}
// deinterleave 4 arrays into individual buffers for scalar processing
// bit_len must be multiple of 32
// Scalar deinterleave of a 4-way interleaved buffer of 32 bit words.
// For every word index i below bit_len/32, word i of dst0..dst3 receives
// src words 4*i .. 4*i+3 respectively.
static inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2,
                               void *dst3, const void *src, int bit_len )
{
   const uint32_t *in = (const uint32_t*)src;
   uint32_t *out0 = (uint32_t*)dst0;
   uint32_t *out1 = (uint32_t*)dst1;
   uint32_t *out2 = (uint32_t*)dst2;
   uint32_t *out3 = (uint32_t*)dst3;
   const int words = bit_len >> 5;   // 32 bit words per destination

   for ( int w = 0; w < words; w++ )
   {
      const uint32_t *group = in + 4 * w;
      out0[w] = group[0];
      out1[w] = group[1];
      out2[w] = group[2];
      out3[w] = group[3];
   }
}
#if defined (__AVX2__)
// Interleave 4 source buffers containing 64 bit data into the destination
// buffer. Only bit_len 256, 512, 640 & 1024 are supported.
// Interleave four streams of 64 bit words into dst, one 256 bit vector per
// word index: dst qword 4*i+k is qword i of stream k.
// Word indexes processed by bit_len:
// 4 (<= 256), 8 (<= 512), 10 (<= 640), otherwise 16 (1024).
// dst is written through a __m256i pointer.
static inline void mm256_interleave_4x64( void *dst, const void *src0,
      const void *src1, const void *src2, const void *src3, int bit_len )
{
   __m256i *out = (__m256i*)dst;
   const uint64_t *in0 = (const uint64_t*)src0;
   const uint64_t *in1 = (const uint64_t*)src1;
   const uint64_t *in2 = (const uint64_t*)src2;
   const uint64_t *in3 = (const uint64_t*)src3;

   int rows;
   if      ( bit_len <= 256 ) rows = 4;
   else if ( bit_len <= 512 ) rows = 8;
   else if ( bit_len <= 640 ) rows = 10;
   else                       rows = 16;   // bit_len == 1024

   // Highest element is passed first, so stream 0 occupies the low qword.
   for ( int r = 0; r < rows; r++ )
      out[r] = _mm256_set_epi64x( in3[r], in2[r], in1[r], in0[r] );
}
// Slower version
// bit_len must be multiple of 64
// Scalar interleave of four streams of 64 bit words.
// For every word index i below bit_len/64, dst qwords 4*i .. 4*i+3 receive
// qword i of src0, src1, src2 and src3 respectively.
static inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1,
                                    void *src2, void *src3, int bit_len )
{
   uint64_t *out = (uint64_t*)dst;
   const uint64_t *in0 = (const uint64_t*)src0;
   const uint64_t *in1 = (const uint64_t*)src1;
   const uint64_t *in2 = (const uint64_t*)src2;
   const uint64_t *in3 = (const uint64_t*)src3;
   const int qwords = bit_len >> 6;   // 64 bit words per stream

   for ( int q = 0; q < qwords; q++ )
   {
      uint64_t *group = out + 4 * q;
      group[0] = in0[q];
      group[1] = in1[q];
      group[2] = in2[q];
      group[3] = in3[q];
   }
}
// Deinterleave 4 buffers of 64 bit data from the source buffer.
// bit_len must be 256, 512, 640 or 1024 bits.
// Requires overrun padding for 640 bit len.
// Split a 4-way interleaved buffer of 64 bit words into four destination
// buffers, four qwords (one 256 bit vector) per destination at a time:
// qword i of destination k is src qword 4*i+k.
// bit_len <= 256 writes 1 vector per destination, <= 512 writes 2,
// otherwise 4. For 512 < bit_len <= 640 a third vector is written whose low
// two qwords come from src and whose high two qwords are re-read from the
// destination itself, so the destination needs room for that vector.
static inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2,
                                    void *dst3, const void *src, int bit_len )
{
   __m256i *lane[4] = { (__m256i*)dst0, (__m256i*)dst1,
                        (__m256i*)dst2, (__m256i*)dst3 };
   const uint64_t *in = (const uint64_t*)src;
   const int tail640 = ( bit_len > 512 ) && ( bit_len <= 640 );

   int rows;
   if      ( bit_len <= 256 ) rows = 1;
   else if ( bit_len <= 640 ) rows = 2;
   else                       rows = 4;   // bit_len == 1024

   for ( int r = 0; r < rows; r++ )
   {
      // Each output vector consumes 16 consecutive source qwords.
      const uint64_t *p = in + 16 * r;
      for ( int k = 0; k < 4; k++ )
         lane[k][r] = _mm256_set_epi64x( p[12 + k], p[8 + k], p[4 + k], p[k] );
   }

   if ( tail640 )
   {
      // Half vector: keep the destination's existing upper two qwords
      // (null change to overrun area) and fill the lower two from src.
      for ( int k = 0; k < 4; k++ )
         lane[k][2] = _mm256_set_epi64x( lane[k][2][3], lane[k][2][2],
                                         in[36 + k], in[32 + k] );
   }
}
// Slower version
// bit_len must be a multiple of 64
// Scalar deinterleave of a 4-way interleaved buffer of 64 bit words.
// For every word index i below bit_len/64, qword i of dst0..dst3 receives
// src qwords 4*i .. 4*i+3 respectively.
static inline void mm256_deinterleave_4x64x( void *dst0, void *dst1,
                          void *dst2, void *dst3, void *src, int bit_len )
{
   const uint64_t *in = (const uint64_t*)src;
   uint64_t *out0 = (uint64_t*)dst0;
   uint64_t *out1 = (uint64_t*)dst1;
   uint64_t *out2 = (uint64_t*)dst2;
   uint64_t *out3 = (uint64_t*)dst3;
   const int qwords = bit_len >> 6;   // 64 bit words per destination

   for ( int q = 0; q < qwords; q++ )
   {
      const uint64_t *group = in + 4 * q;
      out0[q] = group[0];
      out1[q] = group[1];
      out2[q] = group[2];
      out3[q] = group[3];
   }
}
// Interleave 8 source buffers containing 32 bit data into the destination
// vector
// Interleave eight streams of 32 bit words into dst, one 256 bit vector per
// word index: dst word 8*i+k is word i of stream k.
// Word indexes processed by bit_len:
// 8 (<= 256), 16 (<= 512), 20 (<= 640), 24 (<= 768), otherwise 32 (1024).
// dst is written through a __m256i pointer.
static inline void mm256_interleave_8x32( void *dst, const void *src0,
     const void *src1, const void *src2, const void *src3, const void *src4,
     const void *src5, const void *src6, const void *src7, int bit_len )
{
   const uint32_t *in0 = (const uint32_t*)src0;
   const uint32_t *in1 = (const uint32_t*)src1;
   const uint32_t *in2 = (const uint32_t*)src2;
   const uint32_t *in3 = (const uint32_t*)src3;
   const uint32_t *in4 = (const uint32_t*)src4;
   const uint32_t *in5 = (const uint32_t*)src5;
   const uint32_t *in6 = (const uint32_t*)src6;
   const uint32_t *in7 = (const uint32_t*)src7;
   __m256i *out = (__m256i*)dst;

   int rows;
   if      ( bit_len <= 256 ) rows = 8;
   else if ( bit_len <= 512 ) rows = 16;
   else if ( bit_len <= 640 ) rows = 20;
   else if ( bit_len <= 768 ) rows = 24;
   else                       rows = 32;   // bit_len == 1024

   // Highest element is passed first, so stream 0 occupies the low word.
   for ( int r = 0; r < rows; r++ )
      out[r] = _mm256_set_epi32( in7[r], in6[r], in5[r], in4[r],
                                 in3[r], in2[r], in1[r], in0[r] );
}
// Slower but it works with 32 bit data
// bit_len must be multiple of 32
// Scalar interleave of eight streams of 32 bit words.
// For every word index i below bit_len/32, dst words 8*i .. 8*i+7 receive
// word i of src0 .. src7 respectively.
static inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0,
      uint32_t *src1, uint32_t *src2, uint32_t *src3, uint32_t *src4,
      uint32_t *src5, uint32_t *src6, uint32_t *src7, int bit_len )
{
   const int words = bit_len >> 5;   // 32 bit words per stream

   for ( int w = 0; w < words; w++ )
   {
      uint32_t *group = dst + 8 * w;
      group[0] = src0[w];
      group[1] = src1[w];
      group[2] = src2[w];
      group[3] = src3[w];
      group[4] = src4[w];
      group[5] = src5[w];
      group[6] = src6[w];
      group[7] = src7[w];
   }
}
// Deinterleave 8 buffers of 32 bit data from the source buffer.
// Split an 8-way interleaved buffer of 32 bit words into eight destination
// buffers, eight words (one 256 bit vector) per destination at a time:
// word i of destination k is src word 8*i+k.
// Vectors written per destination by bit_len:
// 1 (<= 256), 2 (<= 512), 3 (<= 768), otherwise 4 (1024).
// For 512 < bit_len <= 640 only 4 words of the third vector carry data; the
// upper 4 words are the overrun area and are rewritten with their own
// current contents (a null change), so the destination needs room for the
// full third vector.
//
// Fix: the overrun words were previously re-read from word offset 8 (the
// second vector) instead of offset 16 (the third vector), so words 12..15
// were copied over the overrun area rather than leaving it unchanged.
static inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2,
                        void *dst3, void *dst4, void *dst5, void *dst6, void *dst7,
                        const void *src, int bit_len )
{
   const uint32_t *s = (const uint32_t*)src;
   __m256i *lane[8] = { (__m256i*)dst0, (__m256i*)dst1, (__m256i*)dst2,
                        (__m256i*)dst3, (__m256i*)dst4, (__m256i*)dst5,
                        (__m256i*)dst6, (__m256i*)dst7 };
   const int tail640 = ( bit_len > 512 ) && ( bit_len <= 640 );

   // Number of complete vectors per destination.
   int rows;
   if      ( bit_len <= 256 ) rows = 1;
   else if ( bit_len <= 640 ) rows = 2;
   else if ( bit_len <= 768 ) rows = 3;
   else                       rows = 4;   // bit_len == 1024

   for ( int r = 0; r < rows; r++ )
   {
      // Each output vector consumes 64 consecutive source words.
      const uint32_t *p = s + 64 * r;
      for ( int k = 0; k < 8; k++ )
         lane[k][r] = _mm256_set_epi32( p[56 + k], p[48 + k], p[40 + k],
                                        p[32 + k], p[24 + k], p[16 + k],
                                        p[ 8 + k], p[k] );
   }

   if ( tail640 )
   {
      // Vector indexing doesn't work for 32 bit data, so the overrun words
      // are read back through a scalar pointer to the third vector.
      for ( int k = 0; k < 8; k++ )
      {
         const uint32_t *keep = ( (const uint32_t*)lane[k] ) + 16;
         lane[k][2] = _mm256_set_epi32( keep[7], keep[6], keep[5], keep[4],
                                        s[152 + k], s[144 + k],
                                        s[136 + k], s[128 + k] );
      }
   }
}
// Deinterleave 8 arrays into individual buffers for scalar processing
// bit_len must be multiple of 32
// Scalar deinterleave of an 8-way interleaved buffer of 32 bit words.
// For every word index i below bit_len/32, word i of dst0..dst7 receives
// src words 8*i .. 8*i+7 respectively.
static inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1,
                uint32_t *dst2,uint32_t *dst3, uint32_t *dst4, uint32_t *dst5,
                uint32_t *dst6,uint32_t *dst7,uint32_t *src, int bit_len )
{
   const int words = bit_len >> 5;   // 32 bit words per destination

   for ( int w = 0; w < words; w++ )
   {
      const uint32_t *group = src + 8 * w;
      dst0[w] = group[0];
      dst1[w] = group[1];
      dst2[w] = group[2];
      dst3[w] = group[3];
      dst4[w] = group[4];
      dst5[w] = group[5];
      dst6[w] = group[6];
      dst7[w] = group[7];
   }
}
// Convert from 4x32 AVX interleaving to 4x64 AVX2.
// Can't do it in place
// Repack a 4x32 interleaved buffer as 4x64 interleaved.
// Every 256 bit output vector is built from 8 consecutive source words,
// reordered as 0,4,1,5,2,6,3,7 (lowest word first).
// Output vectors written by bit_len:
// 4 (<= 256), 8 (<= 512), 10 (<= 640), otherwise 16 (1024).
// dst is written through a __m256i pointer.
static inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len )
{
   __m256i *out = (__m256i*)dst;
   const uint32_t *in = (const uint32_t*)src;

   int rows;
   if      ( bit_len <= 256 ) rows = 4;
   else if ( bit_len <= 512 ) rows = 8;
   else if ( bit_len <= 640 ) rows = 10;
   else                       rows = 16;   // bit_len == 1024

   for ( int r = 0; r < rows; r++ )
   {
      const uint32_t *p = in + 8 * r;
      out[r] = _mm256_set_epi32( p[7], p[3], p[6], p[2],
                                 p[5], p[1], p[4], p[0] );
   }
}
// likely of no use.
// convert 4x32 byte (128 bit) vectors to 4x64 (256 bit) vectors for AVX2
// bit_len must be multiple of 64
// broken
static inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src,
int bit_len )
{
uint32_t *d = (uint32_t*)dst;
uint32_t *s = (uint32_t*)src;
for ( int i = 0; i < bit_len >> 5; i += 8 )
{
*( d + i ) = *( s + i ); // 0 <- 0 8 <- 8
*( d + i + 1 ) = *( s + i + 4 ); // 1 <- 4 9 <- 12
*( d + i + 2 ) = *( s + i + 1 ); // 2 <- 1 10 <- 9
*( d + i + 3 ) = *( s + i + 5 ); // 3 <- 5 11 <- 13
*( d + i + 4 ) = *( s + i + 2 ); // 4 <- 2 12 <- 10
*( d + i + 5 ) = *( s + i + 6 ); // 5 <- 6 13 <- 14
*( d + i + 6 ) = *( s + i + 3 ); // 6 <- 3 14 <- 11
*( d + i + 7 ) = *( s + i + 7 ); // 7 <- 7 15 <- 15
}
}
// Convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX
// bit_len must be multiple of 64
// Repack a 4x64 interleaved buffer as 4x32 interleaved.
// Every 256 bit output vector is built from 8 consecutive source words,
// reordered as 0,2,4,6,1,3,5,7 (lowest word first).
// Output vectors written by bit_len:
// 4 (<= 256), 8 (<= 512), 10 (<= 640), otherwise 16 (1024).
// dst is written through a __m256i pointer.
static inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len )
{
   __m256i *out = (__m256i*)dst;
   const uint32_t *in = (const uint32_t*)src;

   int rows;
   if      ( bit_len <= 256 ) rows = 4;
   else if ( bit_len <= 512 ) rows = 8;
   else if ( bit_len <= 640 ) rows = 10;
   else                       rows = 16;   // bit_len == 1024

   for ( int r = 0; r < rows; r++ )
   {
      const uint32_t *p = in + 8 * r;
      out[r] = _mm256_set_epi32( p[7], p[5], p[3], p[1],
                                 p[6], p[4], p[2], p[0] );
   }
}
// Interleave two streams in 128 bit units: each 256 bit output vector holds
// qwords 2*i, 2*i+1 of src0 in its low half and the same qwords of src1 in
// its high half.
// Output vectors written by bit_len:
// 2 (<= 256), 4 (<= 512), 5 (<= 640), otherwise 8 (1024).
// dst is written through a __m256i pointer.
static inline void mm256_interleave_2x128( void *dst, void *src0, void *src1,
                                           int bit_len )
{
   __m256i *out = (__m256i*)dst;
   const uint64_t *lo = (const uint64_t*)src0;
   const uint64_t *hi = (const uint64_t*)src1;

   int rows;
   if      ( bit_len <= 256 ) rows = 2;
   else if ( bit_len <= 512 ) rows = 4;
   else if ( bit_len <= 640 ) rows = 5;
   else                       rows = 8;   // bit_len == 1024

   for ( int r = 0; r < rows; r++ )
      out[r] = _mm256_set_epi64x( hi[2 * r + 1], hi[2 * r],
                                  lo[2 * r + 1], lo[2 * r] );
}
// Split a buffer that was interleaved in 128 bit units back into two
// separate streams. In src the 64 bit words alternate in pairs: two words
// for dst0, two words for dst1, and so on.
//
// bit_len is the length of each output stream:
//    bit_len <= 256   one 256 bit vector per stream
//    bit_len <= 512   two vectors per stream
//    bit_len <= 640   two vectors plus one 128 bit half per stream
//    otherwise        four vectors per stream (bit_len == 1024)
//
// Full vectors are stored through __m256i pointers. The trailing 128 bits
// of the 640 bit case are stored as two plain 64 bit words so that nothing
// beyond the 640th bit of either destination is read or written.
static inline void mm256_deinterleave_2x128( void *dst0, void *dst1, void *src,
                                             int bit_len )
{
   uint64_t *s = (uint64_t*)src;
   __m256i *d0 = (__m256i*)dst0;
   __m256i *d1 = (__m256i*)dst1;

   d0[0] = _mm256_set_epi64x( s[ 5], s[ 4], s[ 1], s[ 0] );
   d1[0] = _mm256_set_epi64x( s[ 7], s[ 6], s[ 3], s[ 2] );
   if ( bit_len <= 256 ) return;

   d0[1] = _mm256_set_epi64x( s[13], s[12], s[ 9], s[ 8] );
   d1[1] = _mm256_set_epi64x( s[15], s[14], s[11], s[10] );
   if ( bit_len <= 512 ) return;

   if ( bit_len <= 640 )
   {
      // Only 128 bits remain per stream. Rewriting a whole 256 bit vector
      // here would touch 16 bytes past the end of an 80 byte destination,
      // so store just the two remaining words.
      uint64_t *tail0 = (uint64_t*)( d0 + 2 );
      uint64_t *tail1 = (uint64_t*)( d1 + 2 );
      tail0[0] = s[16];
      tail0[1] = s[17];
      tail1[0] = s[18];
      tail1[1] = s[19];
      return;
   }

   d0[2] = _mm256_set_epi64x( s[21], s[20], s[17], s[16] );
   d1[2] = _mm256_set_epi64x( s[23], s[22], s[19], s[18] );
   d0[3] = _mm256_set_epi64x( s[29], s[28], s[25], s[24] );
   d1[3] = _mm256_set_epi64x( s[31], s[30], s[27], s[26] );
   // bit_len == 1024
}
// not used
// Copies 32 bit words from src to dst, permuting them inside each group of
// eight: the four even-indexed words of a group come first, followed by the
// four odd-indexed ones. Groups are processed while their starting index is
// below bit_len >> 5.
// NOTE(review): which interleave layouts this converts between is not
// visible here - confirm before putting it to use.
static inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len )
{
   // Source word, relative to the group start, for each destination word.
   static const int pick[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
   uint32_t *out = (uint32_t*)dst;
   const uint32_t *in = (const uint32_t*)src;
   const int word_limit = bit_len >> 5;

   for ( int base = 0; base < word_limit; base += 8 )
      for ( int k = 0; k < 8; k++ )
         out[ base + k ] = in[ base + pick[k] ];
}
#endif // __AVX2__
#endif // AVXDEFS_H__
#endif // AVXDEFS_H__

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.6.1.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.8.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.8.6.1'
PACKAGE_STRING='cpuminer-opt 3.8.6.1'
PACKAGE_VERSION='3.8.8'
PACKAGE_STRING='cpuminer-opt 3.8.8'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.8.6.1 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.8.8 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.8.6.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.8.8:";;
esac
cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.8.6.1
cpuminer-opt configure 3.8.8
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.8.6.1, which was
It was created by cpuminer-opt $as_me 3.8.8, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.8.6.1'
VERSION='3.8.8'
cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.8.6.1, which was
This file was extended by cpuminer-opt $as_me 3.8.8, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.8.6.1
cpuminer-opt config.status 3.8.8
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.8.6.1])
AC_INIT([cpuminer-opt], [3.8.8])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -2999,45 +2999,37 @@ static void show_credits()
bool check_cpu_capability ()
{
char cpu_brand[0x40];
// there is no CPU related feature specific to 4way, just AVX2 and AES
bool cpu_has_sse2 = has_sse2();
bool cpu_has_aes = has_aes_ni();
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx1();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_sha = has_sha();
// no need to check if sw has sse2,
// the code won't compile without it.
// bool sw_has_sse2 = false;
bool sw_has_aes = false;
bool sw_has_sse42 = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_sha = false;
// bool sw_has_4way = false;
bool cpu_has_sse2 = has_sse2();
bool cpu_has_aes = has_aes_ni();
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx1();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_sha = has_sha();
bool cpu_has_avx512 = has_avx512f();
bool sw_has_aes = false;
bool sw_has_sse42 = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_avx512 = false;
bool sw_has_sha = false;
set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
// bool algo_has_4way = set_incl( FOUR_WAY_OPT, algo_features );
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
bool use_aes;
bool use_sse2;
bool use_sse42;
bool use_avx;
bool use_avx2;
bool use_avx512;
bool use_sha;
// bool use_4way;
bool use_none;
#ifdef __AES__
sw_has_aes = true;
#endif
// #ifdef __SSE2__
// sw_has_sse2 = true;
// #endif
#ifdef __SSE4_2__
sw_has_sse42 = true;
#endif
@@ -3047,12 +3039,12 @@ bool check_cpu_capability ()
#ifdef __AVX2__
sw_has_avx2 = true;
#endif
#ifdef __AVX512F__
sw_has_avx512 = true;
#endif
#ifdef __SHA__
sw_has_sha = true;
#endif
// #ifdef HASH_4WAY
// sw_has_4way = true;
// #endif
#if !((__AES__) || (__SSE2__))
printf("Neither __AES__ nor __SSE2__ defined.\n");
@@ -3072,33 +3064,33 @@ bool check_cpu_capability ()
#endif
printf("CPU features:");
if ( cpu_has_sse2 ) printf( " SSE2" );
if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sse42 ) printf( " SSE4.2" );
if ( cpu_has_avx ) printf( " AVX" );
if ( cpu_has_avx2 ) printf( " AVX2" );
if ( cpu_has_sha ) printf( " SHA" );
if ( cpu_has_sse2 ) printf( " SSE2" );
if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sse42 ) printf( " SSE4.2" );
if ( cpu_has_avx ) printf( " AVX" );
if ( cpu_has_avx2 ) printf( " AVX2" );
if ( cpu_has_avx512 ) printf( " AVX512" );
if ( cpu_has_sha ) printf( " SHA" );
printf(".\nSW features: SSE2");
if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sse42 ) printf( " SSE4.2" );
if ( sw_has_avx ) printf( " AVX" );
if ( sw_has_avx2 ) printf( " AVX2" );
// if ( sw_has_4way ) printf( " 4WAY" );
if ( sw_has_sha ) printf( " SHA" );
if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sse42 ) printf( " SSE4.2" );
if ( sw_has_avx ) printf( " AVX" );
if ( sw_has_avx2 ) printf( " AVX2" );
if ( sw_has_avx512 ) printf( " AVX512" );
if ( sw_has_sha ) printf( " SHA" );
printf(".\nAlgo features:");
if ( algo_features == EMPTY_SET ) printf( " None" );
else
{
if ( algo_has_sse2 ) printf( " SSE2" );
if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sse42 ) printf( " SSE4.2" );
if ( algo_has_avx ) printf( " AVX" );
if ( algo_has_avx2 ) printf( " AVX2" );
// if ( algo_has_4way ) printf( " 4WAY" );
if ( algo_has_sha ) printf( " SHA" );
if ( algo_has_sse2 ) printf( " SSE2" );
if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sse42 ) printf( " SSE4.2" );
if ( algo_has_avx2 ) printf( " AVX2" );
if ( algo_has_avx512 ) printf( " AVX512" );
if ( algo_has_sha ) printf( " SHA" );
}
printf(".\n");
@@ -3118,11 +3110,6 @@ bool check_cpu_capability ()
printf( "The SW build requires a CPU with SSE4.2!\n" );
return false;
}
if ( sw_has_avx && !cpu_has_avx )
{
printf( "The SW build requires a CPU with AVX!\n" );
return false;
}
if ( sw_has_aes && !cpu_has_aes )
{
printf( "The SW build requires a CPU with AES!\n" );
@@ -3135,13 +3122,13 @@ bool check_cpu_capability ()
}
// Determine mining options
use_sse2 = cpu_has_sse2 && algo_has_sse2;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_sse2 = cpu_has_sse2 && algo_has_sse2;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
use_avx = cpu_has_avx && sw_has_avx && algo_has_avx;
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
use_none = !( use_sse2 || use_aes || use_sse42 || use_avx || use_avx2 ||
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 ||
use_sha );
// Display best options
@@ -3149,12 +3136,12 @@ bool check_cpu_capability ()
if ( use_none ) printf( " no optimizations" );
else
{
if ( use_aes ) printf( " AES" );
if ( use_avx2 ) printf( " AVX2" );
else if ( use_avx ) printf( " AVX" );
if ( use_aes ) printf( " AES" );
if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" );
else if ( use_sse42 ) printf( " SSE4.2" );
else if ( use_sse2 ) printf( " SSE2" );
if ( use_sha ) printf( " SHA" );
else if ( use_sse2 ) printf( " SSE2" );
if ( use_sha ) printf( " SHA" );
}
printf( ".\n\n" );

1372
interleave.h Normal file

File diff suppressed because it is too large Load Diff

155
miner.h
View File

@@ -333,6 +333,7 @@ bool has_sha();
bool has_aes_ni();
bool has_avx1();
bool has_avx2();
bool has_avx512f();
bool has_sse2();
bool has_xop();
bool has_fma3();
@@ -485,8 +486,9 @@ enum algos {
ALGO_ALLIUM,
ALGO_ANIME,
ALGO_ARGON2,
ALGO_ARGON2DCRDS,
ALGO_ARGON2DDYN,
ALGO_ARGON2D250,
ALGO_ARGON2D500,
ALGO_ARGON2D4096,
ALGO_AXIOM,
ALGO_BASTION,
ALGO_BLAKE,
@@ -496,7 +498,8 @@ enum algos {
ALGO_BMW,
ALGO_C11,
ALGO_CRYPTOLIGHT,
ALGO_CRYPTONIGHT,
ALGO_CRYPTONIGHT,
ALGO_CRYPTONIGHTV7,
ALGO_DECRED,
ALGO_DEEP,
ALGO_DMD_GR,
@@ -565,8 +568,9 @@ static const char* const algo_names[] = {
"allium",
"anime",
"argon2",
"argon2d-crds",
"argon2d-dyn",
"argon2d250",
"argon2d500",
"argon2d4096",
"axiom",
"bastion",
"blake",
@@ -577,6 +581,7 @@ static const char* const algo_names[] = {
"c11",
"cryptolight",
"cryptonight",
"cryptonightv7",
"decred",
"deep",
"dmd-gr",
@@ -701,82 +706,84 @@ static char const usage[] = "\
Usage: " PACKAGE_NAME " [OPTIONS]\n\
Options:\n\
-a, --algo=ALGO specify the algorithm to use\n\
allium Garlicoin (GRLC)\n\
anime Animecoin (ANI)\n\
argon2 Argon2 Coin (AR2)\n\
argon2d-crds Credits (CRDS)\n\
argon2d-dyn Dynamic (DYN)\n\
axiom Shabal-256 MemoHash\n\
allium Garlicoin (GRLC)\n\
anime Animecoin (ANI)\n\
argon2 Argon2 Coin (AR2)\n\
argon2d250 argon2d-crds, Credits (CRDS)\n\
argon2d500 argon2d-dyn, Dynamic (DYN)\n\
argon2d4096 argon2d-uis, Unitus (UIS)\n\
axiom Shabal-256 MemoHash\n\
bastion\n\
blake blake256r14 (SFR)\n\
blakecoin blake256r8\n\
blake2s Blake-2 S\n\
bmw BMW 256\n\
c11 Chaincoin\n\
cryptolight Cryptonight-light\n\
cryptonight cryptonote, Monero (XMR)\n\
decred Blake256r14dcr\n\
deep Deepcoin (DCN)\n\
dmd-gr Diamond\n\
drop Dropcoin\n\
fresh Fresh\n\
groestl Groestl coin\n\
heavy Heavy\n\
hmq1725 Espers\n\
hodl Hodlcoin\n\
jha jackppot (Jackpotcoin)\n\
keccak Maxcoin\n\
keccakc Creative Coin\n\
lbry LBC, LBRY Credits\n\
luffa Luffa\n\
lyra2h Hppcoin\n\
lyra2re lyra2\n\
lyra2rev2 lyrav2, Vertcoin\n\
lyra2z Zcoin (XZC)\n\
lyra2z330 Lyra2 330 rows, Zoin (ZOI)\n\
m7m Magi (XMG)\n\
myr-gr Myriad-Groestl\n\
neoscrypt NeoScrypt(128, 2, 1)\n\
nist5 Nist5\n\
pentablake 5 x blake512\n\
phi1612 phi, LUX coin\n\
pluck Pluck:128 (Supcoin)\n\
blake blake256r14 (SFR)\n\
blakecoin blake256r8\n\
blake2s Blake-2 S\n\
bmw BMW 256\n\
c11 Chaincoin\n\
cryptolight Cryptonight-light\n\
cryptonight Cryptonote legacy\n\
cryptonightv7 variant 7, Monero (XMR)\n\
decred Blake256r14dcr\n\
deep Deepcoin (DCN)\n\
dmd-gr Diamond\n\
drop Dropcoin\n\
fresh Fresh\n\
groestl Groestl coin\n\
heavy Heavy\n\
hmq1725 Espers\n\
hodl Hodlcoin\n\
jha jackppot (Jackpotcoin)\n\
keccak Maxcoin\n\
keccakc Creative Coin\n\
lbry LBC, LBRY Credits\n\
luffa Luffa\n\
lyra2h Hppcoin\n\
lyra2re lyra2\n\
lyra2rev2 lyrav2, Vertcoin\n\
lyra2z Zcoin (XZC)\n\
lyra2z330 Lyra2 330 rows, Zoin (ZOI)\n\
m7m Magi (XMG)\n\
myr-gr Myriad-Groestl\n\
neoscrypt NeoScrypt(128, 2, 1)\n\
nist5 Nist5\n\
pentablake 5 x blake512\n\
phi1612 phi, LUX coin\n\
pluck Pluck:128 (Supcoin)\n\
polytimos\n\
quark Quark\n\
qubit Qubit\n\
scrypt scrypt(1024, 1, 1) (default)\n\
scrypt:N scrypt(N, 1, 1)\n\
quark Quark\n\
qubit Qubit\n\
scrypt scrypt(1024, 1, 1) (default)\n\
scrypt:N scrypt(N, 1, 1)\n\
scryptjane:nf\n\
sha256d Double SHA-256\n\
sha256t Triple SHA-256, Onecoin (OC)\n\
shavite3 Shavite3\n\
skein Skein+Sha (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
skunk Signatum (SIGT)\n\
timetravel timeravel8, Machinecoin (MAC)\n\
timetravel10 Bitcore (BTX)\n\
tribus Denarius (DNR)\n\
vanilla blake256r8vnl (VCash)\n\
sha256d Double SHA-256\n\
sha256t Triple SHA-256, Onecoin (OC)\n\
shavite3 Shavite3\n\
skein Skein+Sha (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
skunk Signatum (SIGT)\n\
timetravel timeravel8, Machinecoin (MAC)\n\
timetravel10 Bitcore (BTX)\n\
tribus Denarius (DNR)\n\
vanilla blake256r8vnl (VCash)\n\
veltor\n\
whirlpool\n\
whirlpoolx\n\
x11 Dash\n\
x11evo Revolvercoin (XRE)\n\
x11gost sib (SibCoin)\n\
x12 Galaxie Cash (GCH)\n\
x13 X13\n\
x13sm3 hsr (Hshare)\n\
x14 X14\n\
x15 X15\n\
x16r Ravencoin (RVN)\n\
x16s Pigeoncoin (PGN)\n\
x11 Dash\n\
x11evo Revolvercoin (XRE)\n\
x11gost sib (SibCoin)\n\
x12 Galaxie Cash (GCH)\n\
x13 X13\n\
x13sm3 hsr (Hshare)\n\
x14 X14\n\
x15 X15\n\
x16r Ravencoin (RVN)\n\
x16s Pigeoncoin (PGN)\n\
x17\n\
xevan Bitsend (BSD)\n\
yescrypt Globlboost-Y (BSTY)\n\
yescryptr8 BitZeny (ZNY)\n\
yescryptr16 Yenten (YTN)\n\
yescryptr32 WAVI\n\
zr5 Ziftr\n\
xevan Bitsend (BSD)\n\
yescrypt Globlboost-Y (BSTY)\n\
yescryptr8 BitZeny (ZNY)\n\
yescryptr16 Yenten (YTN)\n\
yescryptr32 WAVI\n\
zr5 Ziftr\n\
-o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\
-u, --user=USERNAME username for mining server\n\

View File

@@ -274,6 +274,7 @@ void cpu_getmodelid(char *outbuf, size_t maxsz)
#define SSE2_Flag (1<<26)
#define AVX2_Flag (1<< 5) // ADV EBX
#define AVX512F_Flag (1<<16)
#define SHA_Flag (1<<29)
// Use this to detect presence of feature
@@ -350,6 +351,21 @@ static inline bool has_avx2_()
bool has_avx2() { return has_avx2_(); }
// Reports whether the AVX512F_Flag bit is set in the EBX word that cpuid
// returns for the EXTENDED_FEATURES query. Always false on __arm__ builds.
// NOTE(review): only the CPUID bit is examined in this function; whether
// the OS has enabled the wider register state is not checked here.
static inline bool has_avx512f_()
{
#ifndef __arm__
    int regs[4] = { 0 };
    cpuid( EXTENDED_FEATURES, regs );
    return ( regs[ EBX_Reg ] & AVX512F_Flag ) != 0;
#else
    return false;
#endif
}
// Externally visible entry point; forwards to the inline has_avx512f_().
bool has_avx512f()
{
    return has_avx512f_();
}
// AMD only
static inline bool has_xop_()
{
#ifdef __arm__

View File

@@ -51,11 +51,13 @@ rm -f config.status
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-avx.exe
mv cpuminer.exe release/cpuminer-avx.exe
# -march=westmere is supported in gcc5
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $F
CFLAGS="-O3 -march=westmere -Wall" ./configure $F
#CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-sse42.exe