mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
4 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
eaa4bd8152 | ||
![]() |
9edc650042 | ||
![]() |
218cef337a | ||
![]() |
9ffce7bdb7 |
142
README.md
142
README.md
@@ -45,82 +45,84 @@ MacOS, OSx and Android are not supported.
|
||||
Supported Algorithms
|
||||
--------------------
|
||||
|
||||
allium Garlicoin
|
||||
anime Animecoin
|
||||
argon2 Argon2 coin (AR2)
|
||||
argon2d-crds Credits (CRDS)
|
||||
argon2d-dyn Dynamic (DYN)
|
||||
axiom Shabal-256 MemoHash
|
||||
allium Garlicoin
|
||||
anime Animecoin
|
||||
argon2 Argon2 coin (AR2)
|
||||
argon2d250 argon2d-crds, Credits (CRDS)
|
||||
argon2d500 argon2d-dyn, Dynamic (DYN)
|
||||
argon2d4096 argon2d-uis, Unitus (UIS)
|
||||
axiom Shabal-256 MemoHash
|
||||
bastion
|
||||
blake Blake-256 (SFR)
|
||||
blakecoin blake256r8
|
||||
blake2s Blake-2 S
|
||||
bmw BMW 256
|
||||
c11 Chaincoin
|
||||
cryptolight Cryptonight-light
|
||||
cryptonight cryptonote, Monero (XMR)
|
||||
blake Blake-256 (SFR)
|
||||
blakecoin blake256r8
|
||||
blake2s Blake-2 S
|
||||
bmw BMW 256
|
||||
c11 Chaincoin
|
||||
cryptolight Cryptonight-light
|
||||
cryptonight
|
||||
cryptonightv7 Monero (XMR)
|
||||
decred
|
||||
deep Deepcoin (DCN)
|
||||
dmd-gr Diamond-Groestl
|
||||
drop Dropcoin
|
||||
fresh Fresh
|
||||
groestl Groestl coin
|
||||
heavy Heavy
|
||||
hmq1725 Espers
|
||||
hodl Hodlcoin
|
||||
jha Jackpotcoin
|
||||
keccak Maxcoin
|
||||
keccakc Creative coin
|
||||
lbry LBC, LBRY Credits
|
||||
luffa Luffa
|
||||
lyra2h Hppcoin
|
||||
lyra2re lyra2
|
||||
lyra2rev2 lyra2v2, Vertcoin
|
||||
lyra2z Zcoin (XZC)
|
||||
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
|
||||
m7m Magi (XMG)
|
||||
myr-gr Myriad-Groestl
|
||||
neoscrypt NeoScrypt(128, 2, 1)
|
||||
nist5 Nist5
|
||||
pentablake Pentablake
|
||||
phi1612 phi, LUX coin
|
||||
pluck Pluck:128 (Supcoin)
|
||||
polytimos Ninja
|
||||
quark Quark
|
||||
qubit Qubit
|
||||
scrypt scrypt(1024, 1, 1) (default)
|
||||
scrypt:N scrypt(N, 1, 1)
|
||||
deep Deepcoin (DCN)
|
||||
dmd-gr Diamond-Groestl
|
||||
drop Dropcoin
|
||||
fresh Fresh
|
||||
groestl Groestl coin
|
||||
heavy Heavy
|
||||
hmq1725 Espers
|
||||
hodl Hodlcoin
|
||||
jha Jackpotcoin
|
||||
keccak Maxcoin
|
||||
keccakc Creative coin
|
||||
lbry LBC, LBRY Credits
|
||||
luffa Luffa
|
||||
lyra2h Hppcoin
|
||||
lyra2re lyra2
|
||||
lyra2rev2 lyra2v2, Vertcoin
|
||||
lyra2z Zcoin (XZC)
|
||||
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
|
||||
m7m Magi (XMG)
|
||||
myr-gr Myriad-Groestl
|
||||
neoscrypt NeoScrypt(128, 2, 1)
|
||||
nist5 Nist5
|
||||
pentablake Pentablake
|
||||
phi1612 phi, LUX coin
|
||||
pluck Pluck:128 (Supcoin)
|
||||
polytimos Ninja
|
||||
quark Quark
|
||||
qubit Qubit
|
||||
scrypt scrypt(1024, 1, 1) (default)
|
||||
scrypt:N scrypt(N, 1, 1)
|
||||
scryptjane:nf
|
||||
sha256d Double SHA-256
|
||||
sha256t Triple SHA-256, Onecoin (OC)
|
||||
shavite3 Shavite3
|
||||
skein Skein+Sha (Skeincoin)
|
||||
skein2 Double Skein (Woodcoin)
|
||||
skunk Signatum (SIGT)
|
||||
timetravel Machinecoin (MAC)
|
||||
timetravel10 Bitcore
|
||||
tribus Denarius (DNR)
|
||||
vanilla blake256r8vnl (VCash)
|
||||
veltor (VLT)
|
||||
sha256d Double SHA-256
|
||||
sha256t Triple SHA-256, Onecoin (OC)
|
||||
shavite3 Shavite3
|
||||
skein Skein+Sha (Skeincoin)
|
||||
skein2 Double Skein (Woodcoin)
|
||||
skunk Signatum (SIGT)
|
||||
timetravel Machinecoin (MAC)
|
||||
timetravel10 Bitcore
|
||||
tribus Denarius (DNR)
|
||||
vanilla blake256r8vnl (VCash)
|
||||
veltor (VLT)
|
||||
whirlpool
|
||||
whirlpoolx
|
||||
x11 Dash
|
||||
x11evo Revolvercoin
|
||||
x11gost sib (SibCoin)
|
||||
x12 Galaxie Cash (GCH)
|
||||
x13 X13
|
||||
x13sm3 hsr (Hshare)
|
||||
x14 X14
|
||||
x15 X15
|
||||
x16r Ravencoin (RVN)
|
||||
x16s pigeoncoin (PGN)
|
||||
x11 Dash
|
||||
x11evo Revolvercoin
|
||||
x11gost sib (SibCoin)
|
||||
x12 Galaxie Cash (GCH)
|
||||
x13 X13
|
||||
x13sm3 hsr (Hshare)
|
||||
x14 X14
|
||||
x15 X15
|
||||
x16r Ravencoin (RVN)
|
||||
x16s pigeoncoin (PGN)
|
||||
x17
|
||||
xevan Bitsend (BSD)
|
||||
yescrypt Globalboost-Y (BSTY)
|
||||
yescryptr8 BitZeny (ZNY)
|
||||
yescryptr16 Yenten (YTN)
|
||||
yescryptr32 WAVI
|
||||
zr5 Ziftr
|
||||
xevan Bitsend (BSD)
|
||||
yescrypt Globalboost-Y (BSTY)
|
||||
yescryptr8 BitZeny (ZNY)
|
||||
yescryptr16 Yenten (YTN)
|
||||
yescryptr32 WAVI
|
||||
zr5 Ziftr
|
||||
|
||||
Errata
|
||||
------
|
||||
|
14
README.txt
14
README.txt
@@ -21,14 +21,16 @@ AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
|
||||
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
|
||||
these CPUs. Some algos may crash the miner with an invalid instruction.
|
||||
Users are recommended to use an unoptimized miner such as cpuminer-multi.
|
||||
Changes in v3.8.4 may have improved compatibility with some of these CPUs.
|
||||
|
||||
Exe name Compile flags Arch name
|
||||
|
||||
cpuminer-sse2.exe "-msse2" Core2, Nehalem
|
||||
cpuminer-aes-sse42.exe "-maes -msse4.2" Westmere
|
||||
cpuminer-aes-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
|
||||
cpuminer-avx2.exe "-march=core-avx2" Haswell...
|
||||
cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen
|
||||
Exe name Compile flags Arch name
|
||||
|
||||
cpuminer-sse2.exe "-msse2" Core2, Nehalem
|
||||
cpuminer-aes-sse42.exe "-march=westmere" Westmere, Sandy-Ivybridge
|
||||
cpuminer-avx.exe "-march=corei7-avx" Sandy-Ivybridge
|
||||
cpuminer-avx2.exe "-march=core-avx2" Haswell, Sky-Kaby-Coffeelake
|
||||
cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen
|
||||
|
||||
If you like this software feel free to donate:
|
||||
|
||||
|
@@ -81,7 +81,7 @@ cd cpuminer-opt-x.y.z
|
||||
Run ./build.sh to build on Linux or execute the following commands.
|
||||
|
||||
./autogen.sh
|
||||
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
|
||||
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
||||
make
|
||||
|
||||
Additional optional compile flags, add the following to CFLAGS to activate:
|
||||
@@ -160,6 +160,29 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.8.8
|
||||
|
||||
Added cryptonightv7 for Monero.
|
||||
|
||||
v3.8.7.2
|
||||
|
||||
Fixed argon2d-dyn regression in v3.8.7.1.
|
||||
Changed compile options for aes-sse42 Windows build to -march=westmere
|
||||
|
||||
v3.8.7.1
|
||||
|
||||
Fixed argon2d-uis low difficulty rejects.
|
||||
Fixed argon2d aliases.
|
||||
|
||||
v3.8.7
|
||||
|
||||
Added argon2d4096 (alias argon2d-uis) for Unitus (UIS).
|
||||
argon2d-crds and argon2d-dyn renamed to argon2d250 and argon2d500 respectively.
|
||||
The old names are recognized as aliases.
|
||||
AVX512 is now supported for argon2d algos, Linux only.
|
||||
AVX is no longer a reported feature and an AVX Windows binary is no longer
|
||||
provided. Use AES-SSE42 build instead.
|
||||
|
||||
v3.8.6.1
|
||||
|
||||
Faster argon2d* AVX2.
|
||||
|
153
algo-gate-api.c
153
algo-gate-api.c
@@ -157,81 +157,83 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
|
||||
switch (algo)
|
||||
{
|
||||
case ALGO_ALLIUM: register_allium_algo ( gate ); break;
|
||||
case ALGO_ANIME: register_anime_algo ( gate ); break;
|
||||
case ALGO_ARGON2: register_argon2_algo ( gate ); break;
|
||||
case ALGO_ARGON2DCRDS: register_argon2d_crds_algo( gate ); break;
|
||||
case ALGO_ARGON2DDYN: register_argon2d_dyn_algo ( gate ); break;
|
||||
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
|
||||
case ALGO_BASTION: register_bastion_algo ( gate ); break;
|
||||
case ALGO_BLAKE: register_blake_algo ( gate ); break;
|
||||
case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
|
||||
case ALGO_ALLIUM: register_allium_algo ( gate ); break;
|
||||
case ALGO_ANIME: register_anime_algo ( gate ); break;
|
||||
case ALGO_ARGON2: register_argon2_algo ( gate ); break;
|
||||
case ALGO_ARGON2D250: register_argon2d_crds_algo ( gate ); break;
|
||||
case ALGO_ARGON2D500: register_argon2d_dyn_algo ( gate ); break;
|
||||
case ALGO_ARGON2D4096: register_argon2d4096_algo ( gate ); break;
|
||||
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
|
||||
case ALGO_BASTION: register_bastion_algo ( gate ); break;
|
||||
case ALGO_BLAKE: register_blake_algo ( gate ); break;
|
||||
case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
|
||||
// case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break;
|
||||
case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break;
|
||||
case ALGO_C11: register_c11_algo ( gate ); break;
|
||||
case ALGO_CRYPTOLIGHT: register_cryptolight_algo ( gate ); break;
|
||||
case ALGO_CRYPTONIGHT: register_cryptonight_algo ( gate ); break;
|
||||
case ALGO_DECRED: register_decred_algo ( gate ); break;
|
||||
case ALGO_DEEP: register_deep_algo ( gate ); break;
|
||||
case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break;
|
||||
case ALGO_DROP: register_drop_algo ( gate ); break;
|
||||
case ALGO_FRESH: register_fresh_algo ( gate ); break;
|
||||
case ALGO_GROESTL: register_groestl_algo ( gate ); break;
|
||||
case ALGO_HEAVY: register_heavy_algo ( gate ); break;
|
||||
case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break;
|
||||
case ALGO_HODL: register_hodl_algo ( gate ); break;
|
||||
case ALGO_JHA: register_jha_algo ( gate ); break;
|
||||
case ALGO_KECCAK: register_keccak_algo ( gate ); break;
|
||||
case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
|
||||
case ALGO_LBRY: register_lbry_algo ( gate ); break;
|
||||
case ALGO_LUFFA: register_luffa_algo ( gate ); break;
|
||||
case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break;
|
||||
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break;
|
||||
case ALGO_M7M: register_m7m_algo ( gate ); break;
|
||||
case ALGO_MYR_GR: register_myriad_algo ( gate ); break;
|
||||
case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break;
|
||||
case ALGO_NIST5: register_nist5_algo ( gate ); break;
|
||||
case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break;
|
||||
case ALGO_PHI1612: register_phi1612_algo ( gate ); break;
|
||||
case ALGO_PLUCK: register_pluck_algo ( gate ); break;
|
||||
case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break;
|
||||
case ALGO_QUARK: register_quark_algo ( gate ); break;
|
||||
case ALGO_QUBIT: register_qubit_algo ( gate ); break;
|
||||
case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
|
||||
case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break;
|
||||
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
|
||||
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
|
||||
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
|
||||
case ALGO_SKEIN: register_skein_algo ( gate ); break;
|
||||
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
|
||||
case ALGO_SKUNK: register_skunk_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL: register_timetravel_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL10: register_timetravel10_algo( gate ); break;
|
||||
case ALGO_TRIBUS: register_tribus_algo ( gate ); break;
|
||||
case ALGO_VANILLA: register_vanilla_algo ( gate ); break;
|
||||
case ALGO_VELTOR: register_veltor_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOL: register_whirlpool_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
|
||||
case ALGO_X11: register_x11_algo ( gate ); break;
|
||||
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
|
||||
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
|
||||
case ALGO_X12: register_x12_algo ( gate ); break;
|
||||
case ALGO_X13: register_x13_algo ( gate ); break;
|
||||
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
|
||||
case ALGO_X14: register_x14_algo ( gate ); break;
|
||||
case ALGO_X15: register_x15_algo ( gate ); break;
|
||||
case ALGO_X16R: register_x16r_algo ( gate ); break;
|
||||
case ALGO_X16S: register_x16s_algo ( gate ); break;
|
||||
case ALGO_X17: register_x17_algo ( gate ); break;
|
||||
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
|
||||
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
|
||||
case ALGO_ZR5: register_zr5_algo ( gate ); break;
|
||||
case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break;
|
||||
case ALGO_C11: register_c11_algo ( gate ); break;
|
||||
case ALGO_CRYPTOLIGHT: register_cryptolight_algo ( gate ); break;
|
||||
case ALGO_CRYPTONIGHT: register_cryptonight_algo ( gate ); break;
|
||||
case ALGO_CRYPTONIGHTV7:register_cryptonightv7_algo( gate ); break;
|
||||
case ALGO_DECRED: register_decred_algo ( gate ); break;
|
||||
case ALGO_DEEP: register_deep_algo ( gate ); break;
|
||||
case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break;
|
||||
case ALGO_DROP: register_drop_algo ( gate ); break;
|
||||
case ALGO_FRESH: register_fresh_algo ( gate ); break;
|
||||
case ALGO_GROESTL: register_groestl_algo ( gate ); break;
|
||||
case ALGO_HEAVY: register_heavy_algo ( gate ); break;
|
||||
case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break;
|
||||
case ALGO_HODL: register_hodl_algo ( gate ); break;
|
||||
case ALGO_JHA: register_jha_algo ( gate ); break;
|
||||
case ALGO_KECCAK: register_keccak_algo ( gate ); break;
|
||||
case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
|
||||
case ALGO_LBRY: register_lbry_algo ( gate ); break;
|
||||
case ALGO_LUFFA: register_luffa_algo ( gate ); break;
|
||||
case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break;
|
||||
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break;
|
||||
case ALGO_M7M: register_m7m_algo ( gate ); break;
|
||||
case ALGO_MYR_GR: register_myriad_algo ( gate ); break;
|
||||
case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break;
|
||||
case ALGO_NIST5: register_nist5_algo ( gate ); break;
|
||||
case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break;
|
||||
case ALGO_PHI1612: register_phi1612_algo ( gate ); break;
|
||||
case ALGO_PLUCK: register_pluck_algo ( gate ); break;
|
||||
case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break;
|
||||
case ALGO_QUARK: register_quark_algo ( gate ); break;
|
||||
case ALGO_QUBIT: register_qubit_algo ( gate ); break;
|
||||
case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
|
||||
case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break;
|
||||
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
|
||||
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
|
||||
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
|
||||
case ALGO_SKEIN: register_skein_algo ( gate ); break;
|
||||
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
|
||||
case ALGO_SKUNK: register_skunk_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL: register_timetravel_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL10: register_timetravel10_algo ( gate ); break;
|
||||
case ALGO_TRIBUS: register_tribus_algo ( gate ); break;
|
||||
case ALGO_VANILLA: register_vanilla_algo ( gate ); break;
|
||||
case ALGO_VELTOR: register_veltor_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOL: register_whirlpool_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
|
||||
case ALGO_X11: register_x11_algo ( gate ); break;
|
||||
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
|
||||
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
|
||||
case ALGO_X12: register_x12_algo ( gate ); break;
|
||||
case ALGO_X13: register_x13_algo ( gate ); break;
|
||||
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
|
||||
case ALGO_X14: register_x14_algo ( gate ); break;
|
||||
case ALGO_X15: register_x15_algo ( gate ); break;
|
||||
case ALGO_X16R: register_x16r_algo ( gate ); break;
|
||||
case ALGO_X16S: register_x16s_algo ( gate ); break;
|
||||
case ALGO_X17: register_x17_algo ( gate ); break;
|
||||
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
|
||||
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
|
||||
case ALGO_ZR5: register_zr5_algo ( gate ); break;
|
||||
default:
|
||||
applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
|
||||
return false;
|
||||
@@ -288,6 +290,9 @@ void exec_hash_function( int algo, void *output, const void *pdata )
|
||||
const char* const algo_alias_map[][2] =
|
||||
{
|
||||
// alias proper
|
||||
{ "argon2d-crds", "argon2d250" },
|
||||
{ "argon2d-dyn", "argon2d500" },
|
||||
{ "argon2d-uis", "argon2d4096" },
|
||||
{ "bitcore", "timetravel10" },
|
||||
{ "bitzeny", "yescryptr8" },
|
||||
{ "blake256r8", "blakecoin" },
|
||||
|
@@ -2,6 +2,8 @@
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include "miner.h"
|
||||
#include "avxdefs.h"
|
||||
#include "interleave.h"
|
||||
|
||||
/////////////////////////////
|
||||
////
|
||||
@@ -91,6 +93,7 @@ typedef uint32_t set_t;
|
||||
#define AVX_OPT 8
|
||||
#define AVX2_OPT 0x10
|
||||
#define SHA_OPT 0x20
|
||||
#define AVX512_OPT 0x40
|
||||
|
||||
// return set containing all elements from sets a & b
|
||||
inline set_t set_union ( set_t a, set_t b ) { return a | b; }
|
||||
|
@@ -28,6 +28,7 @@ void argon2d_crds_hash( void *output, const void *input )
|
||||
context.lanes = 4; // Degree of Parallelism
|
||||
context.threads = 1; // Threads
|
||||
context.t_cost = 1; // Iterations
|
||||
context.version = ARGON2_VERSION_10;
|
||||
|
||||
argon2_ctx( &context, Argon2_d );
|
||||
}
|
||||
@@ -70,7 +71,8 @@ bool register_argon2d_crds_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_argon2d_crds;
|
||||
gate->hash = (void*)&argon2d_crds_hash;
|
||||
gate->set_target = (void*)&scrypt_set_target;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Dynamic
|
||||
@@ -96,6 +98,7 @@ void argon2d_dyn_hash( void *output, const void *input )
|
||||
context.lanes = 8; // Degree of Parallelism
|
||||
context.threads = 1; // Threads
|
||||
context.t_cost = 2; // Iterations
|
||||
context.version = ARGON2_VERSION_10;
|
||||
|
||||
argon2_ctx( &context, Argon2_d );
|
||||
}
|
||||
@@ -138,6 +141,58 @@ bool register_argon2d_dyn_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_argon2d_dyn;
|
||||
gate->hash = (void*)&argon2d_dyn_hash;
|
||||
gate->set_target = (void*)&scrypt_set_target;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Unitus
|
||||
|
||||
int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done)
|
||||
{
|
||||
uint32_t _ALIGN(64) vhash[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
uint32_t t_cost = 1; // 1 iteration
|
||||
uint32_t m_cost = 4096; // use 4MB
|
||||
uint32_t parallelism = 1; // 1 thread, 2 lanes
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
be32enc( &endiandata[i], pdata[i] );
|
||||
|
||||
do {
|
||||
be32enc( &endiandata[19], n );
|
||||
argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80,
|
||||
(char*) endiandata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
|
||||
if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) )
|
||||
{
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return true;
|
||||
}
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Returns the constant 0x1ff. Declared with an explicit (void) parameter
// list so the definition is a real prototype rather than an unspecified
// argument list.
int64_t get_max64_0x1ff(void) { return 0x1ff; }
|
||||
|
||||
// Fills in the algo gate for argon2d4096: advertises the SSE2, AVX2 and
// AVX512 optimization flags and installs the get_max64, set_target and
// scanhash callbacks. Always returns true.
bool register_argon2d4096_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
   gate->get_max64     = (void*)get_max64_0x1ff;
   gate->set_target    = (void*)scrypt_set_target;
   gate->scanhash      = (void*)scanhash_argon2d4096;
   return true;
}
|
||||
|
||||
|
@@ -4,7 +4,7 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
// Credits
|
||||
// Credits: version = 0x10, m_cost = 250.
|
||||
bool register_argon2d_crds_algo( algo_gate_t* gate );
|
||||
|
||||
void argon2d_crds_hash( void *state, const void *input );
|
||||
@@ -12,7 +12,7 @@ void argon2d_crds_hash( void *state, const void *input );
|
||||
int scanhash_argon2d_crds( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
// Dynamic
|
||||
// Dynamic: version = 0x10, m_cost = 500.
|
||||
bool register_argon2d_dyn_algo( algo_gate_t* gate );
|
||||
|
||||
void argon2d_dyn_hash( void *state, const void *input );
|
||||
@@ -21,5 +21,11 @@ int scanhash_argon2d_dyn( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
|
||||
// Unitus: version = 0x13, m_cost = 4096.
|
||||
bool register_argon2d4096_algo( algo_gate_t* gate );
|
||||
|
||||
int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -180,60 +180,65 @@ int argon2i_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
|
||||
const uint32_t parallelism, const void *pwd,
|
||||
const size_t pwdlen, const void *salt,
|
||||
const size_t saltlen, const size_t hashlen,
|
||||
char *encoded, const size_t encodedlen) {
|
||||
char *encoded, const size_t encodedlen,
|
||||
const uint32_t version) {
|
||||
|
||||
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
|
||||
NULL, hashlen, encoded, encodedlen, Argon2_i,
|
||||
ARGON2_VERSION_NUMBER);
|
||||
version );
|
||||
}
|
||||
|
||||
int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
|
||||
const uint32_t parallelism, const void *pwd,
|
||||
const size_t pwdlen, const void *salt,
|
||||
const size_t saltlen, void *hash, const size_t hashlen) {
|
||||
const size_t saltlen, void *hash, const size_t hashlen,
|
||||
const uint32_t version ) {
|
||||
|
||||
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
|
||||
hash, hashlen, NULL, 0, Argon2_i, ARGON2_VERSION_NUMBER);
|
||||
hash, hashlen, NULL, 0, Argon2_i, version );
|
||||
}
|
||||
|
||||
int argon2d_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
|
||||
const uint32_t parallelism, const void *pwd,
|
||||
const size_t pwdlen, const void *salt,
|
||||
const size_t saltlen, const size_t hashlen,
|
||||
char *encoded, const size_t encodedlen) {
|
||||
char *encoded, const size_t encodedlen,
|
||||
const uint32_t version ) {
|
||||
|
||||
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
|
||||
NULL, hashlen, encoded, encodedlen, Argon2_d,
|
||||
ARGON2_VERSION_NUMBER);
|
||||
version );
|
||||
}
|
||||
|
||||
int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
|
||||
const uint32_t parallelism, const void *pwd,
|
||||
const size_t pwdlen, const void *salt,
|
||||
const size_t saltlen, void *hash, const size_t hashlen) {
|
||||
const size_t saltlen, void *hash, const size_t hashlen,
|
||||
const uint32_t version ) {
|
||||
|
||||
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
|
||||
hash, hashlen, NULL, 0, Argon2_d, ARGON2_VERSION_NUMBER);
|
||||
hash, hashlen, NULL, 0, Argon2_d, version );
|
||||
}
|
||||
|
||||
int argon2id_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
|
||||
const uint32_t parallelism, const void *pwd,
|
||||
const size_t pwdlen, const void *salt,
|
||||
const size_t saltlen, const size_t hashlen,
|
||||
char *encoded, const size_t encodedlen) {
|
||||
char *encoded, const size_t encodedlen,
|
||||
const uint32_t version ) {
|
||||
|
||||
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
|
||||
NULL, hashlen, encoded, encodedlen, Argon2_id,
|
||||
ARGON2_VERSION_NUMBER);
|
||||
version);
|
||||
}
|
||||
|
||||
int argon2id_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
|
||||
const uint32_t parallelism, const void *pwd,
|
||||
const size_t pwdlen, const void *salt,
|
||||
const size_t saltlen, void *hash, const size_t hashlen) {
|
||||
const size_t saltlen, void *hash, const size_t hashlen,
|
||||
const uint32_t version ) {
|
||||
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
|
||||
hash, hashlen, NULL, 0, Argon2_id,
|
||||
ARGON2_VERSION_NUMBER);
|
||||
hash, hashlen, NULL, 0, Argon2_id, version );
|
||||
}
|
||||
|
||||
static int argon2_compare(const uint8_t *b1, const uint8_t *b2, size_t len) {
|
||||
@@ -443,10 +448,11 @@ const char *argon2_error_message(int error_code) {
|
||||
return "Unknown error code";
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
size_t argon2_encodedlen(uint32_t t_cost, uint32_t m_cost, uint32_t parallelism,
|
||||
uint32_t saltlen, uint32_t hashlen, argon2_type type) {
|
||||
return strlen("$$v=$m=,t=,p=$$") + strlen(argon2_type2string(type, 0)) +
|
||||
numlen(t_cost) + numlen(m_cost) + numlen(parallelism) +
|
||||
b64len(saltlen) + b64len(hashlen) + numlen(ARGON2_VERSION_NUMBER) + 1;
|
||||
}
|
||||
*/
|
||||
|
@@ -225,11 +225,8 @@ typedef enum Argon2_type {
|
||||
} argon2_type;
|
||||
|
||||
/* Version of the algorithm */
|
||||
typedef enum Argon2_version {
|
||||
ARGON2_VERSION_10 = 0x10,
|
||||
ARGON2_VERSION_13 = 0x13,
|
||||
ARGON2_VERSION_NUMBER = ARGON2_VERSION_10
|
||||
} argon2_version;
|
||||
#define ARGON2_VERSION_10 0x10
|
||||
#define ARGON2_VERSION_13 0x13
|
||||
|
||||
/*
|
||||
* Function that gives the string representation of an argon2_type.
|
||||
@@ -267,7 +264,8 @@ ARGON2_PUBLIC int argon2i_hash_encoded(const uint32_t t_cost,
|
||||
const void *pwd, const size_t pwdlen,
|
||||
const void *salt, const size_t saltlen,
|
||||
const size_t hashlen, char *encoded,
|
||||
const size_t encodedlen);
|
||||
const size_t encodedlen,
|
||||
const uint32_t version );
|
||||
|
||||
/**
|
||||
* Hashes a password with Argon2i, producing a raw hash at @hash
|
||||
@@ -287,7 +285,8 @@ ARGON2_PUBLIC int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
|
||||
const uint32_t parallelism, const void *pwd,
|
||||
const size_t pwdlen, const void *salt,
|
||||
const size_t saltlen, void *hash,
|
||||
const size_t hashlen);
|
||||
const size_t hashlen,
|
||||
const uint32_t version );
|
||||
|
||||
ARGON2_PUBLIC int argon2d_hash_encoded(const uint32_t t_cost,
|
||||
const uint32_t m_cost,
|
||||
@@ -295,13 +294,15 @@ ARGON2_PUBLIC int argon2d_hash_encoded(const uint32_t t_cost,
|
||||
const void *pwd, const size_t pwdlen,
|
||||
const void *salt, const size_t saltlen,
|
||||
const size_t hashlen, char *encoded,
|
||||
const size_t encodedlen);
|
||||
const size_t encodedlen,
|
||||
const uint32_t version );
|
||||
|
||||
ARGON2_PUBLIC int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
|
||||
const uint32_t parallelism, const void *pwd,
|
||||
const size_t pwdlen, const void *salt,
|
||||
const size_t saltlen, void *hash,
|
||||
const size_t hashlen);
|
||||
const size_t hashlen,
|
||||
const uint32_t version );
|
||||
|
||||
ARGON2_PUBLIC int argon2id_hash_encoded(const uint32_t t_cost,
|
||||
const uint32_t m_cost,
|
||||
@@ -309,14 +310,16 @@ ARGON2_PUBLIC int argon2id_hash_encoded(const uint32_t t_cost,
|
||||
const void *pwd, const size_t pwdlen,
|
||||
const void *salt, const size_t saltlen,
|
||||
const size_t hashlen, char *encoded,
|
||||
const size_t encodedlen);
|
||||
const size_t encodedlen,
|
||||
const uint32_t version );
|
||||
|
||||
ARGON2_PUBLIC int argon2id_hash_raw(const uint32_t t_cost,
|
||||
const uint32_t m_cost,
|
||||
const uint32_t parallelism, const void *pwd,
|
||||
const size_t pwdlen, const void *salt,
|
||||
const size_t saltlen, void *hash,
|
||||
const size_t hashlen);
|
||||
const size_t hashlen,
|
||||
const uint32_t version );
|
||||
|
||||
/* generic function underlying the above ones */
|
||||
ARGON2_PUBLIC int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
|
||||
@@ -325,7 +328,7 @@ ARGON2_PUBLIC int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
|
||||
const size_t saltlen, void *hash,
|
||||
const size_t hashlen, char *encoded,
|
||||
const size_t encodedlen, argon2_type type,
|
||||
const uint32_t version);
|
||||
const uint32_t version );
|
||||
|
||||
/**
|
||||
* Verifies a password against an encoded string
|
||||
|
@@ -544,7 +544,8 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
|
||||
store32(&value, context->t_cost);
|
||||
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
|
||||
|
||||
store32(&value, ARGON2_VERSION_NUMBER);
|
||||
// store32(&value, ARGON2_VERSION_NUMBER);
|
||||
store32(&value, context->version);
|
||||
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
|
||||
|
||||
store32(&value, (uint32_t)type);
|
||||
|
@@ -345,15 +345,15 @@ void fill_segment(const argon2_instance_t *instance,
|
||||
ref_block =
|
||||
instance->memory + instance->lane_length * ref_lane + ref_index;
|
||||
curr_block = instance->memory + curr_offset;
|
||||
// if (ARGON2_VERSION_10 == instance->version) {
|
||||
// /* version 1.2.1 and earlier: overwrite, not XOR */
|
||||
// fill_block(state, ref_block, curr_block, 0);
|
||||
// } else {
|
||||
// if(0 == position.pass) {
|
||||
if (ARGON2_VERSION_10 == instance->version) {
|
||||
/* version 1.2.1 and earlier: overwrite, not XOR */
|
||||
fill_block(state, ref_block, curr_block, 0);
|
||||
} else {
|
||||
if(0 == position.pass) {
|
||||
fill_block(state, ref_block, curr_block, 0);
|
||||
// } else {
|
||||
// fill_block(state, ref_block, curr_block, 1);
|
||||
// }
|
||||
// }
|
||||
} else {
|
||||
fill_block(state, ref_block, curr_block, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -325,7 +325,7 @@ int scanhash_cryptolight(int thr_id, struct work *work,
|
||||
|
||||
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#if defined(__AES__)
|
||||
do {
|
||||
*nonceptr = ++n;
|
||||
cryptolight_hash_ctx_aes_ni(hash, pdata, 76, ctx);
|
||||
|
@@ -1,14 +1,11 @@
|
||||
#if defined(__AES__)
|
||||
|
||||
#include <x86intrin.h>
|
||||
#include <memory.h>
|
||||
#include "cryptonight.h"
|
||||
#include "miner.h"
|
||||
#include "crypto/c_keccak.h"
|
||||
#include <immintrin.h>
|
||||
//#include "avxdefs.h"
|
||||
|
||||
void aesni_parallel_noxor(uint8_t *long_state, uint8_t *text, uint8_t *ExpandedKey);
|
||||
void aesni_parallel_xor(uint8_t *text, uint8_t *ExpandedKey, uint8_t *long_state);
|
||||
void that_fucking_loop(uint8_t a[16], uint8_t b[16], uint8_t *long_state);
|
||||
|
||||
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
|
||||
{
|
||||
@@ -25,7 +22,6 @@ static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
|
||||
|
||||
static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
|
||||
{
|
||||
#ifndef NO_AES_NI
|
||||
__m128i tmp2, tmp4;
|
||||
|
||||
tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
|
||||
@@ -37,14 +33,12 @@ static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
|
||||
tmp4 = _mm_slli_si128(tmp4, 0x04);
|
||||
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
|
||||
*tmp3 = _mm_xor_si128(*tmp3, tmp2);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Special thanks to Intel for helping me
|
||||
// with ExpandAESKey256() and its subroutines
|
||||
static inline void ExpandAESKey256(char *keybuf)
|
||||
{
|
||||
#ifndef NO_AES_NI
|
||||
__m128i tmp1, tmp2, tmp3, *keys;
|
||||
|
||||
keys = (__m128i *)keybuf;
|
||||
@@ -91,7 +85,6 @@ static inline void ExpandAESKey256(char *keybuf)
|
||||
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
|
||||
ExpandAESKey256_sub1(&tmp1, &tmp2);
|
||||
keys[14] = tmp1;
|
||||
#endif
|
||||
}
|
||||
|
||||
// align to 64 byte cache line
|
||||
@@ -109,13 +102,19 @@ static __thread cryptonight_ctx ctx;
|
||||
|
||||
void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
||||
{
|
||||
#ifndef NO_AES_NI
|
||||
|
||||
uint8_t ExpandedKey[256] __attribute__((aligned(64)));
|
||||
__m128i *longoutput, *expkey, *xmminput;
|
||||
size_t i, j;
|
||||
|
||||
keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
|
||||
|
||||
if ( cryptonightV7 && len < 43 )
|
||||
return;
|
||||
|
||||
const uint64_t tweak = cryptonightV7
|
||||
? *((const uint64_t*) (((const uint8_t*)input) + 35))
|
||||
^ ctx.state.hs.w[24] : 0;
|
||||
|
||||
memcpy( ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE );
|
||||
ExpandAESKey256( ExpandedKey );
|
||||
memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
|
||||
@@ -214,7 +213,15 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
||||
_mm_store_si128( (__m128i*)c, c_x );
|
||||
b_x = _mm_xor_si128( b_x, c_x );
|
||||
nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
|
||||
_mm_store_si128( lsa, b_x );
|
||||
_mm_store_si128( lsa, b_x );
|
||||
|
||||
if ( cryptonightV7 )
|
||||
{
|
||||
const uint8_t tmp = ( (const uint8_t*)(lsa) )[11];
|
||||
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
|
||||
((uint8_t*)(lsa))[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
|
||||
}
|
||||
|
||||
b[0] = nextblock[0];
|
||||
b[1] = nextblock[1];
|
||||
|
||||
@@ -227,10 +234,14 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
||||
: "cc" );
|
||||
|
||||
b_x = c_x;
|
||||
nextblock[0] = a[0] + hi;
|
||||
nextblock[1] = a[1] + lo;
|
||||
a[0] = b[0] ^ nextblock[0];
|
||||
a[1] = b[1] ^ nextblock[1];
|
||||
|
||||
a[0] += hi;
|
||||
a[1] += lo;
|
||||
nextblock[0] = a[0];
|
||||
nextblock[1] = cryptonightV7 ? a[1] ^ tweak : a[1];
|
||||
a[0] ^= b[0];
|
||||
a[1] ^= b[1];
|
||||
|
||||
lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
|
||||
a_x = _mm_load_si128( (__m128i*)a );
|
||||
c_x = _mm_load_si128( lsa );
|
||||
@@ -241,6 +252,14 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
||||
b_x = _mm_xor_si128( b_x, c_x );
|
||||
nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
|
||||
_mm_store_si128( lsa, b_x );
|
||||
|
||||
if ( cryptonightV7 )
|
||||
{
|
||||
const uint8_t tmp = ( (const uint8_t*)(lsa) )[11];
|
||||
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
|
||||
((uint8_t*)(lsa))[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
|
||||
}
|
||||
|
||||
b[0] = nextblock[0];
|
||||
b[1] = nextblock[1];
|
||||
|
||||
@@ -251,8 +270,12 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
||||
"rm" ( b[0] )
|
||||
: "cc" );
|
||||
|
||||
nextblock[0] = a[0] + hi;
|
||||
nextblock[1] = a[1] + lo;
|
||||
a[0] += hi;
|
||||
a[1] += lo;
|
||||
nextblock[0] = a[0];
|
||||
nextblock[1] = cryptonightV7 ? a[1] ^ tweak : a[1];
|
||||
a[0] ^= b[0];
|
||||
a[1] ^= b[1];
|
||||
|
||||
memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE );
|
||||
ExpandAESKey256( ExpandedKey );
|
||||
@@ -330,5 +353,5 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
|
||||
keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
|
||||
extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
|
||||
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
@@ -7,11 +7,11 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#if defined(__AES__)
|
||||
#include "algo/groestl/aes_ni/hash-groestl256.h"
|
||||
#endif
|
||||
|
||||
#else
|
||||
#include "crypto/c_groestl.h"
|
||||
#endif
|
||||
#include "crypto/c_blake256.h"
|
||||
#include "crypto/c_jh.h"
|
||||
#include "crypto/c_skein.h"
|
||||
@@ -30,12 +30,12 @@ void do_blake_hash(const void* input, size_t len, char* output) {
|
||||
}
|
||||
|
||||
void do_groestl_hash(const void* input, size_t len, char* output) {
|
||||
#ifdef NO_AES_NI
|
||||
groestl(input, len * 8, (uint8_t*)output);
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
hashState_groestl256 ctx;
|
||||
init_groestl256( &ctx, 32 );
|
||||
update_and_final_groestl256( &ctx, output, input, len * 8 );
|
||||
#else
|
||||
groestl(input, len * 8, (uint8_t*)output);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -52,23 +52,24 @@ void (* const extra_hashes[4])( const void *, size_t, char *) =
|
||||
|
||||
void cryptonight_hash( void *restrict output, const void *input, int len )
|
||||
{
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
cryptonight_hash_ctx ( output, input, len );
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
cryptonight_hash_aes( output, input, len );
|
||||
#else
|
||||
cryptonight_hash_ctx ( output, input, len );
|
||||
#endif
|
||||
}
|
||||
|
||||
void cryptonight_hash_suw( void *restrict output, const void *input )
|
||||
{
|
||||
#ifdef NO_AES_NI
|
||||
cryptonight_hash_ctx ( output, input, 76 );
|
||||
#else
|
||||
#if defined(__AES__)
|
||||
cryptonight_hash_aes( output, input, 76 );
|
||||
#else
|
||||
cryptonight_hash_ctx ( output, input, 76 );
|
||||
#endif
|
||||
}
|
||||
|
||||
bool cryptonightV7 = false;
|
||||
|
||||
int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
@@ -80,6 +81,11 @@ int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
const uint32_t first_nonce = n + 1;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t hash[32 / 4] __attribute__((aligned(32)));
|
||||
|
||||
if ( ( cryptonightV7 && ( *(uint8_t*)pdata < 7 ) )
|
||||
|| ( !cryptonightV7 && ( *(uint8_t*)pdata == 7 ) ) )
|
||||
applog(LOG_WARNING,"Cryptonight variant mismatch, shares may be rejected.");
|
||||
|
||||
do
|
||||
{
|
||||
*nonceptr = ++n;
|
||||
@@ -97,6 +103,7 @@ int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
|
||||
bool register_cryptonight_algo( algo_gate_t* gate )
|
||||
{
|
||||
cryptonightV7 = false;
|
||||
register_json_rpc2( gate );
|
||||
gate->optimizations = SSE2_OPT | AES_OPT;
|
||||
gate->scanhash = (void*)&scanhash_cryptonight;
|
||||
@@ -106,3 +113,15 @@ bool register_cryptonight_algo( algo_gate_t* gate )
|
||||
return true;
|
||||
};
|
||||
|
||||
// Gate registration for the cryptonightv7 algorithm.
// Sets the global cryptonightV7 flag to true, runs register_json_rpc2() on
// the gate, then installs the cryptonight scanhash, hash and hash_suw
// handlers, the get_max64_0x40LL limit function and the SSE2 | AES
// optimization mask. Always returns true.
bool register_cryptonightv7_algo( algo_gate_t* gate )
{
   // Select the V7 variant before any gate setup runs.
   cryptonightV7 = true;
   register_json_rpc2( gate );

   // Hashing entry points.
   gate->hash          = (void*)&cryptonight_hash;
   gate->hash_suw      = (void*)&cryptonight_hash_suw;
   gate->scanhash      = (void*)&scanhash_cryptonight;

   // Limits and advertised CPU features.
   gate->get_max64     = (void*)&get_max64_0x40LL;
   gate->optimizations = SSE2_OPT | AES_OPT;

   return true;
}
|
||||
|
||||
|
@@ -20,8 +20,8 @@
|
||||
#include "crypto/c_jh.h"
|
||||
#include "crypto/c_skein.h"
|
||||
#include "crypto/int-util.h"
|
||||
#include "crypto/hash-ops.h"
|
||||
//#include "cryptonight.h"
|
||||
//#include "crypto/hash-ops.h"
|
||||
#include "cryptonight.h"
|
||||
|
||||
#if USE_INT128
|
||||
|
||||
@@ -51,6 +51,7 @@ typedef __uint128_t uint128_t;
|
||||
#define INIT_SIZE_BLK 8
|
||||
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
|
||||
|
||||
/*
|
||||
#pragma pack(push, 1)
|
||||
union cn_slow_hash_state {
|
||||
union hash_state hs;
|
||||
@@ -78,6 +79,7 @@ static void do_skein_hash(const void* input, size_t len, char* output) {
|
||||
int r = skein_hash(8 * HASH_SIZE, input, 8 * len, (uint8_t*)output);
|
||||
assert(likely(SKEIN_SUCCESS == r));
|
||||
}
|
||||
*/
|
||||
|
||||
extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
|
||||
extern int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
|
||||
@@ -120,9 +122,11 @@ static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* pro
|
||||
extern uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi);
|
||||
#endif
|
||||
|
||||
/*
|
||||
static void (* const extra_hashes[4])(const void *, size_t, char *) = {
|
||||
do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
|
||||
};
|
||||
*/
|
||||
|
||||
static inline size_t e2i(const uint8_t* a) {
|
||||
#if !LITE
|
||||
@@ -132,14 +136,16 @@ static inline size_t e2i(const uint8_t* a) {
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
|
||||
static inline void mul_sum_xor_dst( const uint8_t* a, uint8_t* c, uint8_t* dst,
|
||||
const uint64_t tweak )
|
||||
{
|
||||
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
|
||||
hi += ((uint64_t*) c)[0];
|
||||
|
||||
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
|
||||
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
|
||||
((uint64_t*) dst)[0] = hi;
|
||||
((uint64_t*) dst)[1] = lo;
|
||||
((uint64_t*) dst)[1] = cryptonightV7 ? lo ^ tweak : lo;
|
||||
}
|
||||
|
||||
static inline void xor_blocks(uint8_t* a, const uint8_t* b) {
|
||||
@@ -174,8 +180,16 @@ static __thread cryptonight_ctx ctx;
|
||||
|
||||
void cryptonight_hash_ctx(void* output, const void* input, int len)
|
||||
{
|
||||
hash_process(&ctx.state.hs, (const uint8_t*) input, len);
|
||||
ctx.aes_ctx = (oaes_ctx*) oaes_alloc();
|
||||
// hash_process(&ctx.state.hs, (const uint8_t*) input, len);
|
||||
keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
|
||||
|
||||
if ( cryptonightV7 && len < 43 )
|
||||
return;
|
||||
const uint64_t tweak = cryptonightV7
|
||||
? *((const uint64_t*) (((const uint8_t*)input) + 35))
|
||||
^ ctx.state.hs.w[24] : 0;
|
||||
|
||||
ctx.aes_ctx = (oaes_ctx*) oaes_alloc();
|
||||
|
||||
__builtin_prefetch( ctx.text, 0, 3 );
|
||||
__builtin_prefetch( ctx.text + 64, 0, 3 );
|
||||
@@ -211,23 +225,44 @@ void cryptonight_hash_ctx(void* output, const void* input, int len)
|
||||
xor_blocks_dst(&ctx.state.k[0], &ctx.state.k[32], ctx.a);
|
||||
xor_blocks_dst(&ctx.state.k[16], &ctx.state.k[48], ctx.b);
|
||||
|
||||
for (i = 0; likely(i < ITER / 4); ++i) {
|
||||
/* Dependency chain: address -> read value ------+
|
||||
* written value <-+ hard function (AES or MUL) <+
|
||||
* next address <-+
|
||||
*/
|
||||
/* Iteration 1 */
|
||||
j = e2i(ctx.a);
|
||||
aesb_single_round(&ctx.long_state[j], ctx.c, ctx.a);
|
||||
xor_blocks_dst(ctx.c, ctx.b, &ctx.long_state[j]);
|
||||
/* Iteration 2 */
|
||||
mul_sum_xor_dst(ctx.c, ctx.a, &ctx.long_state[e2i(ctx.c)]);
|
||||
/* Iteration 3 */
|
||||
j = e2i(ctx.a);
|
||||
aesb_single_round(&ctx.long_state[j], ctx.b, ctx.a);
|
||||
xor_blocks_dst(ctx.b, ctx.c, &ctx.long_state[j]);
|
||||
/* Iteration 4 */
|
||||
mul_sum_xor_dst(ctx.b, ctx.a, &ctx.long_state[e2i(ctx.b)]);
|
||||
for (i = 0; likely(i < ITER / 4); ++i)
|
||||
{
|
||||
/* Dependency chain: address -> read value ------+
|
||||
* written value <-+ hard function (AES or MUL) <+
|
||||
* next address <-+
|
||||
*/
|
||||
/* Iteration 1 */
|
||||
j = e2i(ctx.a);
|
||||
aesb_single_round(&ctx.long_state[j], ctx.c, ctx.a);
|
||||
xor_blocks_dst(ctx.c, ctx.b, &ctx.long_state[j]);
|
||||
|
||||
if ( cryptonightV7 )
|
||||
{
|
||||
uint8_t *lsa = (uint8_t*)&ctx.long_state[((uint64_t *)(ctx.a))[0] & 0x1FFFF0];
|
||||
const uint8_t tmp = lsa[11];
|
||||
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
|
||||
lsa[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
|
||||
}
|
||||
|
||||
/* Iteration 2 */
|
||||
mul_sum_xor_dst(ctx.c, ctx.a, &ctx.long_state[e2i(ctx.c)], tweak );
|
||||
|
||||
/* Iteration 3 */
|
||||
j = e2i(ctx.a);
|
||||
aesb_single_round(&ctx.long_state[j], ctx.b, ctx.a);
|
||||
xor_blocks_dst(ctx.b, ctx.c, &ctx.long_state[j]);
|
||||
|
||||
if ( cryptonightV7 )
|
||||
{
|
||||
uint8_t *lsa = (uint8_t*)&ctx.long_state[((uint64_t *)(ctx.a))[0] & 0x1FFFF0];
|
||||
const uint8_t tmp = lsa[11];
|
||||
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
|
||||
lsa[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
|
||||
}
|
||||
|
||||
/* Iteration 4 */
|
||||
mul_sum_xor_dst(ctx.b, ctx.a, &ctx.long_state[e2i(ctx.b)], tweak );
|
||||
|
||||
}
|
||||
|
||||
__builtin_prefetch( ctx.text, 0, 3 );
|
||||
@@ -266,7 +301,8 @@ void cryptonight_hash_ctx(void* output, const void* input, int len)
|
||||
aesb_pseudo_round_mut(&ctx.text[7 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
|
||||
}
|
||||
memcpy(ctx.state.init, ctx.text, INIT_SIZE_BYTE);
|
||||
hash_permutation(&ctx.state.hs);
|
||||
// hash_permutation(&ctx.state.hs);
|
||||
keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
|
||||
/*memcpy(hash, &state, 32);*/
|
||||
extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
|
||||
oaes_free((OAES_CTX **) &ctx.aes_ctx);
|
||||
|
@@ -45,5 +45,7 @@ int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
|
||||
void cryptonight_hash_aes( void *restrict output, const void *input, int len );
|
||||
|
||||
extern bool cryptonightV7;
|
||||
|
||||
#endif
|
||||
|
||||
|
766
avxdefs.h
766
avxdefs.h
@@ -1,5 +1,5 @@
|
||||
#ifndef AVXDEFS_H__
|
||||
#define AVXDEFS_H__
|
||||
#define AVXDEFS_H__ 1
|
||||
|
||||
// Some tools to help using SIMD vectors.
|
||||
//
|
||||
@@ -1034,6 +1034,11 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
|
||||
//
|
||||
// Pseudo constants.
|
||||
|
||||
// _mm512_setzero_si512 uses xor instruction. If needed frequently
|
||||
// in a function it's better to define a register variable (const?)
|
||||
// initialized to zero.
|
||||
// It isn't clear to me yet how set or set1 work.
|
||||
|
||||
#define m512_zero _mm512_setzero_si512()
|
||||
#define m512_one_512 _mm512_set_epi64x( 0ULL, 0ULL, 0ULL, 0ULL, \
|
||||
0ULL, 0ULL, 0ULL, 1ULL )
|
||||
@@ -1058,6 +1063,21 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
|
||||
//
|
||||
// Pointer casting
|
||||
|
||||
// p = any aligned pointer
|
||||
// i = scaled array index
|
||||
// o = scaled address offset
|
||||
|
||||
// returns p as pointer to vector
|
||||
#define castp_m512i(p) ((__m512i*)(p))
|
||||
|
||||
// returns *p as vector value
|
||||
#define cast_m512i(p) (*((__m512i*)(p)))
|
||||
|
||||
// returns p[i] as vector value
|
||||
#define casti_m512i(p,i) (((__m512i*)(p))[(i)])
|
||||
|
||||
// returns p+o as pointer to vector
|
||||
#define casto_m512i(p,o) (((__m512i*)(p))+(o))
|
||||
|
||||
//
|
||||
// Memory functions
|
||||
@@ -1237,746 +1257,4 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
|
||||
|
||||
#endif // AVX512F
|
||||
|
||||
// Paired functions for interleaving and deinterleaving data for vector
|
||||
// processing.
|
||||
// Size is specified in bits regardless of vector size to avoid pointer
|
||||
// arithmetic confusion with different size vectors and be consistent with
|
||||
// the function's name.
|
||||
//
|
||||
// Each function has 2 implementations, an optimized version that uses
|
||||
// vector indexing and a slower version that uses pointers. The optimized
|
||||
// version can only be used with 64 bit elements and only supports sizes
|
||||
// of 256, 512 or 640 bits, 32, 64, and 80 bytes respectively.
|
||||
//
|
||||
// NOTE: Contrary to GCC documentation, accessing vector elements using array
|
||||
// indexes only works with 64 bit elements.
|
||||
// Interleaving and deinterleaving of vectors of 32 bit elements
|
||||
// must use the slower implementations that don't use vector indexing.
|
||||
//
|
||||
// All data must be aligned to 256 bits for AVX2, or 128 bits for AVX.
|
||||
// Interleave source args and deinterleave destination args are not required
|
||||
// to be contiguous in memory but it's more efficient if they are.
|
||||
// Interleave source args may be the same actual arg repeated.
|
||||
// 640 bit deinterleaving 4x64 using 256 bit AVX2 requires the
|
||||
// destination buffers be defined with padding up to 768 bits for overrun
|
||||
// space. Although overrun space use is non destructive it should not overlay
|
||||
// useful data and should be ignored by the caller.
|
||||
|
||||
// SSE2 AVX
|
||||
|
||||
// Interleave four streams of 32 bit words into dst as 128 bit vectors.
// Vector i of dst holds { src0[i], src1[i], src2[i], src3[i] }, lowest
// element first.
// bit_len is the length of each source stream: up to 256 bits produces 8
// vectors, up to 512 produces 16, up to 640 produces 20, and anything
// larger is treated as 1024 bits (32 vectors).
// dst is written through a __m128i pointer and is expected to be 128 bit
// aligned.
static inline void mm_interleave_4x32( void *dst, const void *src0,
           const void *src1, const void *src2, const void *src3, int bit_len )
{
   const uint32_t *lane0 = (const uint32_t*)src0;
   const uint32_t *lane1 = (const uint32_t*)src1;
   const uint32_t *lane2 = (const uint32_t*)src2;
   const uint32_t *lane3 = (const uint32_t*)src3;
   __m128i *out = (__m128i*)dst;

   // One output vector per 32 bit word of every source stream.
   int nvec;
   if      ( bit_len <= 256 ) nvec =  8;
   else if ( bit_len <= 512 ) nvec = 16;
   else if ( bit_len <= 640 ) nvec = 20;
   else                       nvec = 32;   // bit_len == 1024

   for ( int i = 0; i < nvec; i++ )
      out[i] = _mm_set_epi32( lane3[i], lane2[i], lane1[i], lane0[i] );
}
|
||||
|
||||
// Generic 4x32 interleave using plain word copies.
// Copies bit_len / 32 words from each source; output words 4*w .. 4*w+3
// receive src0[w], src1[w], src2[w], src3[w].
// bit_len must be a multiple of 32 (any remainder is dropped by the shift).
static inline void mm_interleave_4x32x( void *dst, void *src0, void  *src1,
                                    void *src2, void *src3, int bit_len )
{
   uint32_t *out = (uint32_t*)dst;
   const uint32_t *lane0 = (const uint32_t*)src0;
   const uint32_t *lane1 = (const uint32_t*)src1;
   const uint32_t *lane2 = (const uint32_t*)src2;
   const uint32_t *lane3 = (const uint32_t*)src3;
   const int words = bit_len >> 5;

   for ( int w = 0; w < words; w++ )
   {
      uint32_t *group = out + 4 * w;
      group[0] = lane0[w];
      group[1] = lane1[w];
      group[2] = lane2[w];
      group[3] = lane3[w];
   }
}
|
||||
|
||||
// Split a 4-way interleaved stream of 32 bit words back into four
// destination buffers, writing each destination as 128 bit vectors.
// Destination n receives source words n, n+4, n+8, ...
// bit_len is the length of each destination stream: up to 256 bits writes
// 2 vectors per destination, up to 512 writes 4, up to 640 writes 5, and
// anything larger is treated as 1024 bits (8 vectors).
// Destinations are written through __m128i pointers and are expected to be
// 128 bit aligned.
static inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
                                     void *dst3, const void *src, int bit_len )
{
   const uint32_t *in = (const uint32_t*)src;
   __m128i *out0 = (__m128i*)dst0;
   __m128i *out1 = (__m128i*)dst1;
   __m128i *out2 = (__m128i*)dst2;
   __m128i *out3 = (__m128i*)dst3;

   int nvec;
   if      ( bit_len <= 256 ) nvec = 2;
   else if ( bit_len <= 512 ) nvec = 4;
   else if ( bit_len <= 640 ) nvec = 5;
   else                       nvec = 8;   // bit_len == 1024

   for ( int k = 0; k < nvec; k++ )
   {
      // Each output vector consumes 16 consecutive interleaved words.
      const uint32_t *g = in + 16 * k;
      out0[k] = _mm_set_epi32( g[12], g[ 8], g[ 4], g[ 0] );
      out1[k] = _mm_set_epi32( g[13], g[ 9], g[ 5], g[ 1] );
      out2[k] = _mm_set_epi32( g[14], g[10], g[ 6], g[ 2] );
      out3[k] = _mm_set_epi32( g[15], g[11], g[ 7], g[ 3] );
   }
}
|
||||
|
||||
// Deinterleave a 4-way interleaved stream of 32 bit words into individual
// buffers for scalar processing, using plain word copies.
// Writes bit_len / 32 words to each destination; destination n receives
// source words n, n+4, n+8, ...
// bit_len must be a multiple of 32 (any remainder is dropped by the shift).
static inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2,
                                      void *dst3, const void *src, int bit_len )
{
   const uint32_t *in = (const uint32_t*)src;
   uint32_t *out0 = (uint32_t*)dst0;
   uint32_t *out1 = (uint32_t*)dst1;
   uint32_t *out2 = (uint32_t*)dst2;
   uint32_t *out3 = (uint32_t*)dst3;
   const int words = bit_len >> 5;

   for ( int w = 0; w < words; w++ )
   {
      const uint32_t *group = in + 4 * w;
      out0[w] = group[0];
      out1[w] = group[1];
      out2[w] = group[2];
      out3[w] = group[3];
   }
}
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
// Interleave four streams of 64 bit words into dst as 256 bit vectors.
// Vector i of dst holds { src0[i], src1[i], src2[i], src3[i] }, lowest
// element first.
// bit_len is the length of each source stream: up to 256 bits produces 4
// vectors, up to 512 produces 8, up to 640 produces 10, and anything
// larger is treated as 1024 bits (16 vectors).
// dst is written through a __m256i pointer and is expected to be 256 bit
// aligned.
static inline void mm256_interleave_4x64( void *dst, const void *src0,
           const void *src1, const void *src2, const void *src3, int bit_len )
{
   __m256i *out = (__m256i*)dst;
   const uint64_t *lane0 = (const uint64_t*)src0;
   const uint64_t *lane1 = (const uint64_t*)src1;
   const uint64_t *lane2 = (const uint64_t*)src2;
   const uint64_t *lane3 = (const uint64_t*)src3;

   // One output vector per 64 bit word of every source stream.
   int nvec;
   if      ( bit_len <= 256 ) nvec =  4;
   else if ( bit_len <= 512 ) nvec =  8;
   else if ( bit_len <= 640 ) nvec = 10;
   else                       nvec = 16;   // bit_len == 1024

   for ( int i = 0; i < nvec; i++ )
      out[i] = _mm256_set_epi64x( lane3[i], lane2[i], lane1[i], lane0[i] );
}
|
||||
|
||||
// Generic 4x64 interleave using plain word copies (slower version).
// Copies bit_len / 64 words from each source; output words 4*w .. 4*w+3
// receive src0[w], src1[w], src2[w], src3[w].
// bit_len must be a multiple of 64 (any remainder is dropped by the shift).
static inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1,
                                       void *src2, void *src3, int bit_len )
{
   uint64_t *out = (uint64_t*)dst;
   const uint64_t *lane0 = (const uint64_t*)src0;
   const uint64_t *lane1 = (const uint64_t*)src1;
   const uint64_t *lane2 = (const uint64_t*)src2;
   const uint64_t *lane3 = (const uint64_t*)src3;
   const int words = bit_len >> 6;

   for ( int w = 0; w < words; w++ )
   {
      uint64_t *group = out + 4 * w;
      group[0] = lane0[w];
      group[1] = lane1[w];
      group[2] = lane2[w];
      group[3] = lane3[w];
   }
}
|
||||
|
||||
// Split a 4-way interleaved stream of 64 bit words back into four
// destination buffers, writing each destination as 256 bit vectors.
// Destination n receives source words n, n+4, n+8, ...
// bit_len is the length of each destination stream: up to 256 bits writes
// 1 vector per destination, up to 512 writes 2, and anything above 640 is
// treated as 1024 bits (4 vectors).
// For 512 < bit_len <= 640 a third vector is written whose low two
// elements carry data; its high two elements are read back from the
// destination and stored unchanged, so every destination needs padding up
// to 768 bits.
static inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2,
                                     void *dst3, const void *src, int bit_len )
{
   __m256i *out0 = (__m256i*)dst0;
   __m256i *out1 = (__m256i*)dst1;
   __m256i *out2 = (__m256i*)dst2;
   __m256i *out3 = (__m256i*)dst3;
   const uint64_t *in = (const uint64_t*)src;

   // Number of completely filled vectors per destination.
   int full;
   if      ( bit_len <= 256 ) full = 1;
   else if ( bit_len <= 640 ) full = 2;
   else                       full = 4;   // bit_len == 1024

   for ( int k = 0; k < full; k++ )
   {
      // Each output vector consumes 16 consecutive interleaved words.
      const uint64_t *g = in + 16 * k;
      out0[k] = _mm256_set_epi64x( g[12], g[ 8], g[ 4], g[ 0] );
      out1[k] = _mm256_set_epi64x( g[13], g[ 9], g[ 5], g[ 1] );
      out2[k] = _mm256_set_epi64x( g[14], g[10], g[ 6], g[ 2] );
      out3[k] = _mm256_set_epi64x( g[15], g[11], g[ 7], g[ 3] );
   }

   if ( bit_len > 512 && bit_len <= 640 )
   {
      // 640 bits: half a vector of data, upper half rewritten with its
      // current contents (null change to the overrun area).
      out0[2] = _mm256_set_epi64x( out0[2][3], out0[2][2], in[36], in[32] );
      out1[2] = _mm256_set_epi64x( out1[2][3], out1[2][2], in[37], in[33] );
      out2[2] = _mm256_set_epi64x( out2[2][3], out2[2][2], in[38], in[34] );
      out3[2] = _mm256_set_epi64x( out3[2][3], out3[2][2], in[39], in[35] );
   }
}
|
||||
|
||||
// Generic 4x64 deinterleave using plain word copies (slower version).
// Writes bit_len / 64 words to each destination; destination n receives
// source words n, n+4, n+8, ...
// bit_len must be a multiple of 64 (any remainder is dropped by the shift).
static inline void mm256_deinterleave_4x64x( void *dst0, void *dst1,
                                void *dst2, void *dst3, void *src, int bit_len )
{
   const uint64_t *in = (const uint64_t*)src;
   uint64_t *out0 = (uint64_t*)dst0;
   uint64_t *out1 = (uint64_t*)dst1;
   uint64_t *out2 = (uint64_t*)dst2;
   uint64_t *out3 = (uint64_t*)dst3;
   const int words = bit_len >> 6;

   for ( int w = 0; w < words; w++ )
   {
      const uint64_t *group = in + 4 * w;
      out0[w] = group[0];
      out1[w] = group[1];
      out2[w] = group[2];
      out3[w] = group[3];
   }
}
|
||||
|
||||
// Interleave eight streams of 32 bit words into dst as 256 bit vectors.
// Vector i of dst holds { src0[i], src1[i], ..., src7[i] }, lowest element
// first.
// bit_len is the length of each source stream: up to 256 bits produces 8
// vectors, up to 512 produces 16, up to 640 produces 20, up to 768
// produces 24, and anything larger is treated as 1024 bits (32 vectors).
// dst is written through a __m256i pointer and is expected to be 256 bit
// aligned.
static inline void mm256_interleave_8x32( void *dst, const void *src0,
        const void *src1, const void *src2, const void *src3, const void *src4,
        const void *src5, const void *src6, const void *src7, int bit_len )
{
   const uint32_t *lane0 = (const uint32_t*)src0;
   const uint32_t *lane1 = (const uint32_t*)src1;
   const uint32_t *lane2 = (const uint32_t*)src2;
   const uint32_t *lane3 = (const uint32_t*)src3;
   const uint32_t *lane4 = (const uint32_t*)src4;
   const uint32_t *lane5 = (const uint32_t*)src5;
   const uint32_t *lane6 = (const uint32_t*)src6;
   const uint32_t *lane7 = (const uint32_t*)src7;
   __m256i *out = (__m256i*)dst;

   // One output vector per 32 bit word of every source stream.
   int nvec;
   if      ( bit_len <= 256 ) nvec =  8;
   else if ( bit_len <= 512 ) nvec = 16;
   else if ( bit_len <= 640 ) nvec = 20;
   else if ( bit_len <= 768 ) nvec = 24;
   else                       nvec = 32;   // bit_len == 1024

   for ( int i = 0; i < nvec; i++ )
      out[i] = _mm256_set_epi32( lane7[i], lane6[i], lane5[i], lane4[i],
                                 lane3[i], lane2[i], lane1[i], lane0[i] );
}
|
||||
|
||||
// Scalar 8-way interleave of 32 bit words: output word 8*w + k receives
// word w of source buffer k. Slower, but it works with 32 bit data.
// bit_len is the length in bits of each source buffer and must be a
// multiple of 32.
static inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0,
     uint32_t *src1, uint32_t *src2, uint32_t *src3, uint32_t *src4,
     uint32_t *src5, uint32_t *src6, uint32_t *src7, int bit_len )
{
   const int words = bit_len >> 5;   // 32 bit words per source buffer
   int w = 0;

   while ( w < words )
   {
      // One output row holds word w of each of the 8 sources, in order.
      uint32_t *row = dst + 8 * w;
      row[0] = src0[w];
      row[1] = src1[w];
      row[2] = src2[w];
      row[3] = src3[w];
      row[4] = src4[w];
      row[5] = src5[w];
      row[6] = src6[w];
      row[7] = src7[w];
      w++;
   }
}
|
||||
|
||||
// Deinterleave 8 buffers of 32 bit data from the source buffer.
//
// src holds 8x32 interleaved data: 32 bit word w of lane k is at word index
// 8*w + k. Word w of lane k is written to word w of dstk.
//
// bit_len is the length in bits of one lane. The data is processed in steps
// ending at 256, 512, 640, 768 and 1024 bits; any other value is handled as
// the next larger step.
//
// The destinations are written through __m256i pointers, so they need the
// alignment that type requires. Except for the 640 bit case, whole 256 bit
// vectors are stored, so a destination must have room for the rounded up
// length.
//
// 640 bit case: only 128 bits of the third vector belong to the lane, so
// just those 128 bits are stored and words 20..23 of each destination are
// left untouched. (The earlier read-modify-write of the whole vector took
// its upper half from the second vector instead of the third, which
// overwrote words 20..23 with words 12..15.)
static inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2,
                  void *dst3, void *dst4, void *dst5, void *dst6, void *dst7,
                  const void *src, int bit_len )
{
   const uint32_t *s = (const uint32_t*)src;
   __m256i *d[8] = { (__m256i*)dst0, (__m256i*)dst1, (__m256i*)dst2,
                     (__m256i*)dst3, (__m256i*)dst4, (__m256i*)dst5,
                     (__m256i*)dst6, (__m256i*)dst7 };

   // Lanes of 513..640 bits end with half a vector (4 words).
   const int half_tail = ( bit_len > 512 ) && ( bit_len <= 640 );

   // Number of complete 256 bit vectors to store per lane.
   int full;
   if      ( bit_len <= 256 ) full = 1;
   else if ( bit_len <= 640 ) full = 2;
   else if ( bit_len <= 768 ) full = 3;
   else                       full = 4;   // bit_len == 1024

   for ( int v = 0; v < full; v++ )
   {
      // 8 interleaved rows (64 words) feed one vector of every lane.
      const uint32_t *r = s + 64 * v;
      for ( int k = 0; k < 8; k++ )
         d[k][v] = _mm256_set_epi32( r[56+k], r[48+k], r[40+k], r[32+k],
                                     r[24+k], r[16+k], r[ 8+k], r[   k] );
   }

   if ( half_tail )
   {
      // Words 16..19 of every lane come from rows 16..19 of the source.
      const uint32_t *r = s + 128;
      for ( int k = 0; k < 8; k++ )
         _mm_store_si128( (__m128i*)( d[k] + 2 ),
                          _mm_set_epi32( r[24+k], r[16+k], r[ 8+k], r[k] ) );
   }
}
|
||||
|
||||
// Scalar 8-way deinterleave into individual buffers for scalar processing:
// word w of destination k is taken from source word 8*w + k.
// bit_len is the length in bits of each destination buffer and must be a
// multiple of 32.
static inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1,
                uint32_t *dst2,uint32_t *dst3, uint32_t *dst4, uint32_t *dst5,
                uint32_t *dst6,uint32_t *dst7,uint32_t *src, int bit_len )
{
   const int words = bit_len >> 5;   // 32 bit words per destination buffer

   for ( int w = 0; w < words; w++ )
   {
      // One source row holds word w of all 8 destinations, in order.
      const uint32_t *row = src + 8 * w;
      dst0[w] = row[0];
      dst1[w] = row[1];
      dst2[w] = row[2];
      dst3[w] = row[3];
      dst4[w] = row[4];
      dst5[w] = row[5];
      dst6[w] = row[6];
      dst7[w] = row[7];
   }
}
|
||||
|
||||
// Convert from 4x32 AVX interleaving to 4x64 AVX2.
// Can't do it in place.
//
// Every group of 8 source words (two 32 bit words of each of the 4 lanes)
// becomes one 256 bit vector holding one 64 bit word per lane.
// bit_len is the length in bits of one lane; it selects how many vectors are
// written: 4 up to 256 bits, 8 up to 512, 10 up to 640, otherwise 16 (1024).
static inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len )
{
   __m256i  *out = (__m256i*)dst;
   uint32_t *in  = (uint32_t*)src;
   int nvec;

   if      ( bit_len <= 256 ) nvec = 4;
   else if ( bit_len <= 512 ) nvec = 8;
   else if ( bit_len <= 640 ) nvec = 10;
   else                       nvec = 16;   // bit_len == 1024

   for ( int v = 0; v < nvec; v++, in += 8 )
   {
      // Pair word n of each lane (in[0..3]) with word n+1 (in[4..7]).
      out[v] = _mm256_set_epi32( in[7], in[3], in[6], in[2],
                                 in[5], in[1], in[4], in[0] );
   }
}
|
||||
|
||||
// Scalar conversion of 4x32 interleaved data (4 lanes of 32 bit words) to
// 4x64 interleaving (4 lanes of 64 bit words) for AVX2. Likely of no use.
//
// bit_len is the length in bits of ONE lane and must be a multiple of 64.
// The buffers hold 4 lanes, i.e. bit_len/8 32 bit words in total.
// dst and src must not overlap: words are written before later source words
// of the same group are read.
static inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src,
                                             int bit_len )
{
   uint32_t *d = (uint32_t*)dst;
   uint32_t *s = (uint32_t*)src;

   // The loop has to span all 4 lanes. Bounding it by bit_len/32 (one lane's
   // worth of words) converts only the first quarter of the data.
   const int total_words = ( bit_len >> 5 ) * 4;

   for ( int i = 0; i < total_words; i += 8 )
   {
      // Source group: word n of lanes 0..3, then word n+1 of lanes 0..3.
      // Destination group: (word n, word n+1) for lane 0, 1, 2, 3.
      *( d + i     ) = *( s + i     );     // 0 <- 0    8 <- 8
      *( d + i + 1 ) = *( s + i + 4 );     // 1 <- 4    9 <- 12
      *( d + i + 2 ) = *( s + i + 1 );     // 2 <- 1   10 <- 9
      *( d + i + 3 ) = *( s + i + 5 );     // 3 <- 5   11 <- 13
      *( d + i + 4 ) = *( s + i + 2 );     // 4 <- 2   12 <- 10
      *( d + i + 5 ) = *( s + i + 6 );     // 5 <- 6   13 <- 14
      *( d + i + 6 ) = *( s + i + 3 );     // 6 <- 3   14 <- 11
      *( d + i + 7 ) = *( s + i + 7 );     // 7 <- 7   15 <- 15
   }
}
|
||||
|
||||
// Convert 4x64 (256 bit) vectors to 4x32 (128 bit) vectors for AVX.
// bit_len must be multiple of 64.
//
// Every group of 8 source words (one 64 bit word of each of the 4 lanes)
// becomes one 256 bit vector holding the low words of the 4 lanes followed
// by their high words.
// bit_len is the length in bits of one lane; it selects how many vectors are
// written: 4 up to 256 bits, 8 up to 512, 10 up to 640, otherwise 16 (1024).
static inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len )
{
   __m256i  *out = (__m256i*)dst;
   uint32_t *in  = (uint32_t*)src;
   int nvec;

   if      ( bit_len <= 256 ) nvec = 4;
   else if ( bit_len <= 512 ) nvec = 8;
   else if ( bit_len <= 640 ) nvec = 10;
   else                       nvec = 16;   // bit_len == 1024

   for ( int v = 0; v < nvec; v++, in += 8 )
   {
      // Even source words go to the lower half, odd ones to the upper half.
      out[v] = _mm256_set_epi32( in[7], in[5], in[3], in[1],
                                 in[6], in[4], in[2], in[0] );
   }
}
|
||||
|
||||
// Interleave two buffers in 128 bit units: output vector v holds 128 bit
// chunk v of src0 in its lower half and chunk v of src1 in its upper half.
// bit_len is the length in bits of each source; it selects how many vectors
// are written: 2 up to 256 bits, 4 up to 512, 5 up to 640, otherwise 8 (1024).
static inline void mm256_interleave_2x128( void *dst, void *src0, void *src1,
                                           int bit_len )
{
   __m256i  *out = (__m256i*)dst;
   uint64_t *a   = (uint64_t*)src0;
   uint64_t *b   = (uint64_t*)src1;
   int nvec;

   if      ( bit_len <= 256 ) nvec = 2;
   else if ( bit_len <= 512 ) nvec = 4;
   else if ( bit_len <= 640 ) nvec = 5;
   else                       nvec = 8;   // bit_len == 1024

   for ( int v = 0; v < nvec; v++ )
   {
      const int q = 2 * v;   // first 64 bit word of the 128 bit chunk
      out[v] = _mm256_set_epi64x( b[q + 1], b[q], a[q + 1], a[q] );
   }
}
|
||||
|
||||
// Split a 2x128 interleaved buffer: the lower 128 bits of every source vector
// go to dst0, the upper 128 bits to dst1. Two source vectors are combined
// into one 256 bit store per destination.
// bit_len is the length in bits of each destination: 1 vector is written per
// destination up to 256 bits, 2 up to 512, 4 above 640 (1024).
// For 640 bits the third vector is rewritten with new data in its lower half
// and its own previous contents in the upper half, so the 128 bits past the
// payload keep their value but are still read and written.
static inline void mm256_deinterleave_2x128( void *dst0, void *dst1, void *src,
                                             int bit_len )
{
   uint64_t *in = (uint64_t*)src;
   __m256i  *lo = (__m256i*)dst0;
   __m256i  *hi = (__m256i*)dst1;

   // Number of complete 256 bit vectors stored to each destination.
   const int nfull = ( bit_len <= 256 ) ? 1 : ( bit_len <= 640 ) ? 2 : 4;

   for ( int v = 0; v < nfull; v++ )
   {
      uint64_t *p = in + 8 * v;   // two interleaved source vectors
      lo[v] = _mm256_set_epi64x( p[5], p[4], p[1], p[0] );
      hi[v] = _mm256_set_epi64x( p[7], p[6], p[3], p[2] );
   }

   if ( ( bit_len > 512 ) && ( bit_len <= 640 ) )
   {
      // Half a vector of payload left: keep the existing upper half.
      lo[2] = _mm256_set_epi64x( lo[2][3], lo[2][2], in[17], in[16] );
      hi[2] = _mm256_set_epi64x( hi[2][3], hi[2][2], in[19], in[18] );
   }
}
|
||||
|
||||
// Scalar conversion of 4x64 interleaved data (4 lanes of 64 bit words) to
// 4x32 interleaving (4 lanes of 32 bit words). Not used.
//
// bit_len is the length in bits of ONE lane and must be a multiple of 64.
// The buffers hold 4 lanes, i.e. bit_len/8 32 bit words in total.
// dst and src must not overlap: words are written before later source words
// of the same group are read.
static inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len )
{
   uint32_t *d = (uint32_t*)dst;
   uint32_t *s = (uint32_t*)src;

   // The loop has to span all 4 lanes. Bounding it by bit_len/32 (one lane's
   // worth of words) converts only the first quarter of the data.
   const int total_words = ( bit_len >> 5 ) * 4;

   for ( int i = 0; i < total_words; i += 8 )
   {
      // Source group: (low, high) word pairs of lanes 0..3.
      // Destination group: low words of lanes 0..3, then their high words.
      *( d + i     ) = *( s + i     );
      *( d + i + 1 ) = *( s + i + 2 );
      *( d + i + 2 ) = *( s + i + 4 );
      *( d + i + 3 ) = *( s + i + 6 );
      *( d + i + 4 ) = *( s + i + 1 );
      *( d + i + 5 ) = *( s + i + 3 );
      *( d + i + 6 ) = *( s + i + 5 );
      *( d + i + 7 ) = *( s + i + 7 );
   }
}
|
||||
|
||||
#endif // __AVX2__
|
||||
#endif // AVXDEFS_H__
|
||||
#endif // AVXDEFS_H__
|
||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.6.1.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.8.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.8.6.1'
|
||||
PACKAGE_STRING='cpuminer-opt 3.8.6.1'
|
||||
PACKAGE_VERSION='3.8.8'
|
||||
PACKAGE_STRING='cpuminer-opt 3.8.8'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.8.6.1 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.8.8 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1392,7 +1392,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.8.6.1:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.8.8:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1497,7 +1497,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.8.6.1
|
||||
cpuminer-opt configure 3.8.8
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.8.6.1, which was
|
||||
It was created by cpuminer-opt $as_me 3.8.8, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2981,7 +2981,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.8.6.1'
|
||||
VERSION='3.8.8'
|
||||
|
||||
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.8.6.1, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.8.8, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -6743,7 +6743,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.8.6.1
|
||||
cpuminer-opt config.status 3.8.8
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.8.6.1])
|
||||
AC_INIT([cpuminer-opt], [3.8.8])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
119
cpu-miner.c
119
cpu-miner.c
@@ -2999,45 +2999,37 @@ static void show_credits()
|
||||
bool check_cpu_capability ()
|
||||
{
|
||||
char cpu_brand[0x40];
|
||||
// there is no CPU related feature specific to 4way, just AVX2 and AES
|
||||
bool cpu_has_sse2 = has_sse2();
|
||||
bool cpu_has_aes = has_aes_ni();
|
||||
bool cpu_has_sse42 = has_sse42();
|
||||
bool cpu_has_avx = has_avx1();
|
||||
bool cpu_has_avx2 = has_avx2();
|
||||
bool cpu_has_sha = has_sha();
|
||||
// no need to check if sw has sse2,
|
||||
// the code won't compile without it.
|
||||
// bool sw_has_sse2 = false;
|
||||
bool sw_has_aes = false;
|
||||
bool sw_has_sse42 = false;
|
||||
bool sw_has_avx = false;
|
||||
bool sw_has_avx2 = false;
|
||||
bool sw_has_sha = false;
|
||||
// bool sw_has_4way = false;
|
||||
bool cpu_has_sse2 = has_sse2();
|
||||
bool cpu_has_aes = has_aes_ni();
|
||||
bool cpu_has_sse42 = has_sse42();
|
||||
bool cpu_has_avx = has_avx1();
|
||||
bool cpu_has_avx2 = has_avx2();
|
||||
bool cpu_has_sha = has_sha();
|
||||
bool cpu_has_avx512 = has_avx512f();
|
||||
bool sw_has_aes = false;
|
||||
bool sw_has_sse42 = false;
|
||||
bool sw_has_avx = false;
|
||||
bool sw_has_avx2 = false;
|
||||
bool sw_has_avx512 = false;
|
||||
bool sw_has_sha = false;
|
||||
set_t algo_features = algo_gate.optimizations;
|
||||
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
|
||||
bool algo_has_aes = set_incl( AES_OPT, algo_features );
|
||||
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
|
||||
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
|
||||
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
|
||||
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
|
||||
// bool algo_has_4way = set_incl( FOUR_WAY_OPT, algo_features );
|
||||
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
|
||||
bool algo_has_aes = set_incl( AES_OPT, algo_features );
|
||||
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
|
||||
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
|
||||
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
|
||||
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
|
||||
bool use_aes;
|
||||
bool use_sse2;
|
||||
bool use_sse42;
|
||||
bool use_avx;
|
||||
bool use_avx2;
|
||||
bool use_avx512;
|
||||
bool use_sha;
|
||||
// bool use_4way;
|
||||
bool use_none;
|
||||
|
||||
#ifdef __AES__
|
||||
sw_has_aes = true;
|
||||
#endif
|
||||
// #ifdef __SSE2__
|
||||
// sw_has_sse2 = true;
|
||||
// #endif
|
||||
#ifdef __SSE4_2__
|
||||
sw_has_sse42 = true;
|
||||
#endif
|
||||
@@ -3047,12 +3039,12 @@ bool check_cpu_capability ()
|
||||
#ifdef __AVX2__
|
||||
sw_has_avx2 = true;
|
||||
#endif
|
||||
#ifdef __AVX512F__
|
||||
sw_has_avx512 = true;
|
||||
#endif
|
||||
#ifdef __SHA__
|
||||
sw_has_sha = true;
|
||||
#endif
|
||||
// #ifdef HASH_4WAY
|
||||
// sw_has_4way = true;
|
||||
// #endif
|
||||
|
||||
#if !((__AES__) || (__SSE2__))
|
||||
printf("Neither __AES__ nor __SSE2__ defined.\n");
|
||||
@@ -3072,33 +3064,33 @@ bool check_cpu_capability ()
|
||||
#endif
|
||||
|
||||
printf("CPU features:");
|
||||
if ( cpu_has_sse2 ) printf( " SSE2" );
|
||||
if ( cpu_has_aes ) printf( " AES" );
|
||||
if ( cpu_has_sse42 ) printf( " SSE4.2" );
|
||||
if ( cpu_has_avx ) printf( " AVX" );
|
||||
if ( cpu_has_avx2 ) printf( " AVX2" );
|
||||
if ( cpu_has_sha ) printf( " SHA" );
|
||||
if ( cpu_has_sse2 ) printf( " SSE2" );
|
||||
if ( cpu_has_aes ) printf( " AES" );
|
||||
if ( cpu_has_sse42 ) printf( " SSE4.2" );
|
||||
if ( cpu_has_avx ) printf( " AVX" );
|
||||
if ( cpu_has_avx2 ) printf( " AVX2" );
|
||||
if ( cpu_has_avx512 ) printf( " AVX512" );
|
||||
if ( cpu_has_sha ) printf( " SHA" );
|
||||
|
||||
printf(".\nSW features: SSE2");
|
||||
if ( sw_has_aes ) printf( " AES" );
|
||||
if ( sw_has_sse42 ) printf( " SSE4.2" );
|
||||
if ( sw_has_avx ) printf( " AVX" );
|
||||
if ( sw_has_avx2 ) printf( " AVX2" );
|
||||
// if ( sw_has_4way ) printf( " 4WAY" );
|
||||
if ( sw_has_sha ) printf( " SHA" );
|
||||
if ( sw_has_aes ) printf( " AES" );
|
||||
if ( sw_has_sse42 ) printf( " SSE4.2" );
|
||||
if ( sw_has_avx ) printf( " AVX" );
|
||||
if ( sw_has_avx2 ) printf( " AVX2" );
|
||||
if ( sw_has_avx512 ) printf( " AVX512" );
|
||||
if ( sw_has_sha ) printf( " SHA" );
|
||||
|
||||
|
||||
printf(".\nAlgo features:");
|
||||
if ( algo_features == EMPTY_SET ) printf( " None" );
|
||||
else
|
||||
{
|
||||
if ( algo_has_sse2 ) printf( " SSE2" );
|
||||
if ( algo_has_aes ) printf( " AES" );
|
||||
if ( algo_has_sse42 ) printf( " SSE4.2" );
|
||||
if ( algo_has_avx ) printf( " AVX" );
|
||||
if ( algo_has_avx2 ) printf( " AVX2" );
|
||||
// if ( algo_has_4way ) printf( " 4WAY" );
|
||||
if ( algo_has_sha ) printf( " SHA" );
|
||||
if ( algo_has_sse2 ) printf( " SSE2" );
|
||||
if ( algo_has_aes ) printf( " AES" );
|
||||
if ( algo_has_sse42 ) printf( " SSE4.2" );
|
||||
if ( algo_has_avx2 ) printf( " AVX2" );
|
||||
if ( algo_has_avx512 ) printf( " AVX512" );
|
||||
if ( algo_has_sha ) printf( " SHA" );
|
||||
}
|
||||
printf(".\n");
|
||||
|
||||
@@ -3118,11 +3110,6 @@ bool check_cpu_capability ()
|
||||
printf( "The SW build requires a CPU with SSE4.2!\n" );
|
||||
return false;
|
||||
}
|
||||
if ( sw_has_avx && !cpu_has_avx )
|
||||
{
|
||||
printf( "The SW build requires a CPU with AVX!\n" );
|
||||
return false;
|
||||
}
|
||||
if ( sw_has_aes && !cpu_has_aes )
|
||||
{
|
||||
printf( "The SW build requires a CPU with AES!\n" );
|
||||
@@ -3135,13 +3122,13 @@ bool check_cpu_capability ()
|
||||
}
|
||||
|
||||
// Determine mining options
|
||||
use_sse2 = cpu_has_sse2 && algo_has_sse2;
|
||||
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
|
||||
use_sse2 = cpu_has_sse2 && algo_has_sse2;
|
||||
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
|
||||
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
|
||||
use_avx = cpu_has_avx && sw_has_avx && algo_has_avx;
|
||||
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
|
||||
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
|
||||
use_none = !( use_sse2 || use_aes || use_sse42 || use_avx || use_avx2 ||
|
||||
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
|
||||
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
|
||||
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
|
||||
use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 ||
|
||||
use_sha );
|
||||
|
||||
// Display best options
|
||||
@@ -3149,12 +3136,12 @@ bool check_cpu_capability ()
|
||||
if ( use_none ) printf( " no optimizations" );
|
||||
else
|
||||
{
|
||||
if ( use_aes ) printf( " AES" );
|
||||
if ( use_avx2 ) printf( " AVX2" );
|
||||
else if ( use_avx ) printf( " AVX" );
|
||||
if ( use_aes ) printf( " AES" );
|
||||
if ( use_avx512 ) printf( " AVX512" );
|
||||
else if ( use_avx2 ) printf( " AVX2" );
|
||||
else if ( use_sse42 ) printf( " SSE4.2" );
|
||||
else if ( use_sse2 ) printf( " SSE2" );
|
||||
if ( use_sha ) printf( " SHA" );
|
||||
else if ( use_sse2 ) printf( " SSE2" );
|
||||
if ( use_sha ) printf( " SHA" );
|
||||
}
|
||||
printf( ".\n\n" );
|
||||
|
||||
|
1372
interleave.h
Normal file
1372
interleave.h
Normal file
File diff suppressed because it is too large
Load Diff
155
miner.h
155
miner.h
@@ -333,6 +333,7 @@ bool has_sha();
|
||||
bool has_aes_ni();
|
||||
bool has_avx1();
|
||||
bool has_avx2();
|
||||
bool has_avx512f();
|
||||
bool has_sse2();
|
||||
bool has_xop();
|
||||
bool has_fma3();
|
||||
@@ -485,8 +486,9 @@ enum algos {
|
||||
ALGO_ALLIUM,
|
||||
ALGO_ANIME,
|
||||
ALGO_ARGON2,
|
||||
ALGO_ARGON2DCRDS,
|
||||
ALGO_ARGON2DDYN,
|
||||
ALGO_ARGON2D250,
|
||||
ALGO_ARGON2D500,
|
||||
ALGO_ARGON2D4096,
|
||||
ALGO_AXIOM,
|
||||
ALGO_BASTION,
|
||||
ALGO_BLAKE,
|
||||
@@ -496,7 +498,8 @@ enum algos {
|
||||
ALGO_BMW,
|
||||
ALGO_C11,
|
||||
ALGO_CRYPTOLIGHT,
|
||||
ALGO_CRYPTONIGHT,
|
||||
ALGO_CRYPTONIGHT,
|
||||
ALGO_CRYPTONIGHTV7,
|
||||
ALGO_DECRED,
|
||||
ALGO_DEEP,
|
||||
ALGO_DMD_GR,
|
||||
@@ -565,8 +568,9 @@ static const char* const algo_names[] = {
|
||||
"allium",
|
||||
"anime",
|
||||
"argon2",
|
||||
"argon2d-crds",
|
||||
"argon2d-dyn",
|
||||
"argon2d250",
|
||||
"argon2d500",
|
||||
"argon2d4096",
|
||||
"axiom",
|
||||
"bastion",
|
||||
"blake",
|
||||
@@ -577,6 +581,7 @@ static const char* const algo_names[] = {
|
||||
"c11",
|
||||
"cryptolight",
|
||||
"cryptonight",
|
||||
"cryptonightv7",
|
||||
"decred",
|
||||
"deep",
|
||||
"dmd-gr",
|
||||
@@ -701,82 +706,84 @@ static char const usage[] = "\
|
||||
Usage: " PACKAGE_NAME " [OPTIONS]\n\
|
||||
Options:\n\
|
||||
-a, --algo=ALGO specify the algorithm to use\n\
|
||||
allium Garlicoin (GRLC)\n\
|
||||
anime Animecoin (ANI)\n\
|
||||
argon2 Argon2 Coin (AR2)\n\
|
||||
argon2d-crds Credits (CRDS)\n\
|
||||
argon2d-dyn Dynamic (DYN)\n\
|
||||
axiom Shabal-256 MemoHash\n\
|
||||
allium Garlicoin (GRLC)\n\
|
||||
anime Animecoin (ANI)\n\
|
||||
argon2 Argon2 Coin (AR2)\n\
|
||||
argon2d250 argon2d-crds, Credits (CRDS)\n\
|
||||
argon2d500 argon2d-dyn, Dynamic (DYN)\n\
|
||||
argon2d4096 argon2d-uis, Unitus (UIS)\n\
|
||||
axiom Shabal-256 MemoHash\n\
|
||||
bastion\n\
|
||||
blake blake256r14 (SFR)\n\
|
||||
blakecoin blake256r8\n\
|
||||
blake2s Blake-2 S\n\
|
||||
bmw BMW 256\n\
|
||||
c11 Chaincoin\n\
|
||||
cryptolight Cryptonight-light\n\
|
||||
cryptonight cryptonote, Monero (XMR)\n\
|
||||
decred Blake256r14dcr\n\
|
||||
deep Deepcoin (DCN)\n\
|
||||
dmd-gr Diamond\n\
|
||||
drop Dropcoin\n\
|
||||
fresh Fresh\n\
|
||||
groestl Groestl coin\n\
|
||||
heavy Heavy\n\
|
||||
hmq1725 Espers\n\
|
||||
hodl Hodlcoin\n\
|
||||
jha jackppot (Jackpotcoin)\n\
|
||||
keccak Maxcoin\n\
|
||||
keccakc Creative Coin\n\
|
||||
lbry LBC, LBRY Credits\n\
|
||||
luffa Luffa\n\
|
||||
lyra2h Hppcoin\n\
|
||||
lyra2re lyra2\n\
|
||||
lyra2rev2 lyrav2, Vertcoin\n\
|
||||
lyra2z Zcoin (XZC)\n\
|
||||
lyra2z330 Lyra2 330 rows, Zoin (ZOI)\n\
|
||||
m7m Magi (XMG)\n\
|
||||
myr-gr Myriad-Groestl\n\
|
||||
neoscrypt NeoScrypt(128, 2, 1)\n\
|
||||
nist5 Nist5\n\
|
||||
pentablake 5 x blake512\n\
|
||||
phi1612 phi, LUX coin\n\
|
||||
pluck Pluck:128 (Supcoin)\n\
|
||||
blake blake256r14 (SFR)\n\
|
||||
blakecoin blake256r8\n\
|
||||
blake2s Blake-2 S\n\
|
||||
bmw BMW 256\n\
|
||||
c11 Chaincoin\n\
|
||||
cryptolight Cryptonight-light\n\
|
||||
cryptonight Cryptonote legacy\n\
|
||||
cryptonightv7 variant 7, Monero (XMR)\n\
|
||||
decred Blake256r14dcr\n\
|
||||
deep Deepcoin (DCN)\n\
|
||||
dmd-gr Diamond\n\
|
||||
drop Dropcoin\n\
|
||||
fresh Fresh\n\
|
||||
groestl Groestl coin\n\
|
||||
heavy Heavy\n\
|
||||
hmq1725 Espers\n\
|
||||
hodl Hodlcoin\n\
|
||||
jha jackppot (Jackpotcoin)\n\
|
||||
keccak Maxcoin\n\
|
||||
keccakc Creative Coin\n\
|
||||
lbry LBC, LBRY Credits\n\
|
||||
luffa Luffa\n\
|
||||
lyra2h Hppcoin\n\
|
||||
lyra2re lyra2\n\
|
||||
lyra2rev2 lyrav2, Vertcoin\n\
|
||||
lyra2z Zcoin (XZC)\n\
|
||||
lyra2z330 Lyra2 330 rows, Zoin (ZOI)\n\
|
||||
m7m Magi (XMG)\n\
|
||||
myr-gr Myriad-Groestl\n\
|
||||
neoscrypt NeoScrypt(128, 2, 1)\n\
|
||||
nist5 Nist5\n\
|
||||
pentablake 5 x blake512\n\
|
||||
phi1612 phi, LUX coin\n\
|
||||
pluck Pluck:128 (Supcoin)\n\
|
||||
polytimos\n\
|
||||
quark Quark\n\
|
||||
qubit Qubit\n\
|
||||
scrypt scrypt(1024, 1, 1) (default)\n\
|
||||
scrypt:N scrypt(N, 1, 1)\n\
|
||||
quark Quark\n\
|
||||
qubit Qubit\n\
|
||||
scrypt scrypt(1024, 1, 1) (default)\n\
|
||||
scrypt:N scrypt(N, 1, 1)\n\
|
||||
scryptjane:nf\n\
|
||||
sha256d Double SHA-256\n\
|
||||
sha256t Triple SHA-256, Onecoin (OC)\n\
|
||||
shavite3 Shavite3\n\
|
||||
skein Skein+Sha (Skeincoin)\n\
|
||||
skein2 Double Skein (Woodcoin)\n\
|
||||
skunk Signatum (SIGT)\n\
|
||||
timetravel timeravel8, Machinecoin (MAC)\n\
|
||||
timetravel10 Bitcore (BTX)\n\
|
||||
tribus Denarius (DNR)\n\
|
||||
vanilla blake256r8vnl (VCash)\n\
|
||||
sha256d Double SHA-256\n\
|
||||
sha256t Triple SHA-256, Onecoin (OC)\n\
|
||||
shavite3 Shavite3\n\
|
||||
skein Skein+Sha (Skeincoin)\n\
|
||||
skein2 Double Skein (Woodcoin)\n\
|
||||
skunk Signatum (SIGT)\n\
|
||||
timetravel timeravel8, Machinecoin (MAC)\n\
|
||||
timetravel10 Bitcore (BTX)\n\
|
||||
tribus Denarius (DNR)\n\
|
||||
vanilla blake256r8vnl (VCash)\n\
|
||||
veltor\n\
|
||||
whirlpool\n\
|
||||
whirlpoolx\n\
|
||||
x11 Dash\n\
|
||||
x11evo Revolvercoin (XRE)\n\
|
||||
x11gost sib (SibCoin)\n\
|
||||
x12 Galaxie Cash (GCH)\n\
|
||||
x13 X13\n\
|
||||
x13sm3 hsr (Hshare)\n\
|
||||
x14 X14\n\
|
||||
x15 X15\n\
|
||||
x16r Ravencoin (RVN)\n\
|
||||
x16s Pigeoncoin (PGN)\n\
|
||||
x11 Dash\n\
|
||||
x11evo Revolvercoin (XRE)\n\
|
||||
x11gost sib (SibCoin)\n\
|
||||
x12 Galaxie Cash (GCH)\n\
|
||||
x13 X13\n\
|
||||
x13sm3 hsr (Hshare)\n\
|
||||
x14 X14\n\
|
||||
x15 X15\n\
|
||||
x16r Ravencoin (RVN)\n\
|
||||
x16s Pigeoncoin (PGN)\n\
|
||||
x17\n\
|
||||
xevan Bitsend (BSD)\n\
|
||||
yescrypt Globlboost-Y (BSTY)\n\
|
||||
yescryptr8 BitZeny (ZNY)\n\
|
||||
yescryptr16 Yenten (YTN)\n\
|
||||
yescryptr32 WAVI\n\
|
||||
zr5 Ziftr\n\
|
||||
xevan Bitsend (BSD)\n\
|
||||
yescrypt Globlboost-Y (BSTY)\n\
|
||||
yescryptr8 BitZeny (ZNY)\n\
|
||||
yescryptr16 Yenten (YTN)\n\
|
||||
yescryptr32 WAVI\n\
|
||||
zr5 Ziftr\n\
|
||||
-o, --url=URL URL of mining server\n\
|
||||
-O, --userpass=U:P username:password pair for mining server\n\
|
||||
-u, --user=USERNAME username for mining server\n\
|
||||
|
16
sysinfos.c
16
sysinfos.c
@@ -274,6 +274,7 @@ void cpu_getmodelid(char *outbuf, size_t maxsz)
|
||||
#define SSE2_Flag (1<<26)
|
||||
|
||||
#define AVX2_Flag (1<< 5) // ADV EBX
|
||||
#define AVX512F_Flag (1<<16)
|
||||
#define SHA_Flag (1<<29)
|
||||
|
||||
// Use this to detect presence of feature
|
||||
@@ -350,6 +351,21 @@ static inline bool has_avx2_()
|
||||
|
||||
bool has_avx2() { return has_avx2_(); }
|
||||
|
||||
static inline bool has_avx512f_()
|
||||
{
|
||||
#ifdef __arm__
|
||||
return false;
|
||||
#else
|
||||
int cpu_info[4] = { 0 };
|
||||
cpuid( EXTENDED_FEATURES, cpu_info );
|
||||
return cpu_info[ EBX_Reg ] & AVX512F_Flag;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Externally visible wrapper: returns the result of has_avx512f_().
bool has_avx512f() { return has_avx512f_(); }
|
||||
|
||||
|
||||
// AMD only
|
||||
static inline bool has_xop_()
|
||||
{
|
||||
#ifdef __arm__
|
||||
|
@@ -51,11 +51,13 @@ rm -f config.status
|
||||
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $F
|
||||
make
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-aes-avx.exe
|
||||
mv cpuminer.exe release/cpuminer-avx.exe
|
||||
|
||||
# -march=westmere is supported in gcc5
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $F
|
||||
CFLAGS="-O3 -march=westmere -Wall" ./configure $F
|
||||
#CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $F
|
||||
make
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-aes-sse42.exe
|
||||
|
Reference in New Issue
Block a user