mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
8 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
9b905fccc8 | ||
![]() |
92b3733925 | ||
![]() |
19cc88d102 | ||
![]() |
a053690170 | ||
![]() |
3c5e8921b7 | ||
![]() |
f3333b0070 | ||
![]() |
902ec046dd | ||
![]() |
d0b4941321 |
@@ -129,7 +129,7 @@ cpuminer_SOURCES = \
|
||||
algo/lyra2/allium.c \
|
||||
algo/lyra2/phi2-4way.c \
|
||||
algo/lyra2/phi2.c \
|
||||
algo//m7m/m7m.c \
|
||||
algo/m7m/m7m.c \
|
||||
algo/m7m/magimath.cpp \
|
||||
algo/nist5/nist5-gate.c \
|
||||
algo/nist5/nist5-4way.c \
|
||||
@@ -163,6 +163,8 @@ cpuminer_SOURCES = \
|
||||
algo/sha/sph_sha2big.c \
|
||||
algo/sha/sha256-hash-4way.c \
|
||||
algo/sha/sha512-hash-4way.c \
|
||||
algo/sha/sha256-hash-opt.c \
|
||||
algo/sha/sha256-hash-2way-ni.c \
|
||||
algo/sha/hmac-sha256-hash.c \
|
||||
algo/sha/hmac-sha256-hash-4way.c \
|
||||
algo/sha/sha2.c \
|
||||
@@ -192,6 +194,11 @@ cpuminer_SOURCES = \
|
||||
algo/sm3/sm3-hash-4way.c \
|
||||
algo/swifftx/swifftx.c \
|
||||
algo/tiger/sph_tiger.c \
|
||||
algo/verthash/verthash-gate.c \
|
||||
algo/verthash/Verthash.c \
|
||||
algo/verthash/fopen_utf8.c \
|
||||
algo/verthash/tiny_sha3/sha3.c \
|
||||
algo/verthash/tiny_sha3/sha3-4way.c \
|
||||
algo/whirlpool/sph_whirlpool.c \
|
||||
algo/whirlpool/whirlpool-hash-4way.c \
|
||||
algo/whirlpool/whirlpool-gate.c \
|
||||
|
@@ -89,7 +89,7 @@ Supported Algorithms
|
||||
lyra2h Hppcoin
|
||||
lyra2re lyra2
|
||||
lyra2rev2 lyra2v2
|
||||
lyra2rev3 lyrav2v3, Vertcoin
|
||||
lyra2rev3 lyra2v3
|
||||
lyra2z
|
||||
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
|
||||
m7m Magi (XMG)
|
||||
@@ -122,6 +122,7 @@ Supported Algorithms
|
||||
tribus Denarius (DNR)
|
||||
vanilla blake256r8vnl (VCash)
|
||||
veltor (VLT)
|
||||
verthash Vertcoin
|
||||
whirlpool
|
||||
whirlpoolx
|
||||
x11 Dash
|
||||
@@ -134,7 +135,7 @@ Supported Algorithms
|
||||
x14 X14
|
||||
x15 X15
|
||||
x16r
|
||||
x16rv2 Ravencoin (RVN)
|
||||
x16rv2
|
||||
x16rt Gincoin (GIN)
|
||||
x16rt-veil Veil (VEIL)
|
||||
x16s Pigeoncoin (PGN)
|
||||
|
@@ -64,6 +64,11 @@ source code obtained from the author's official repository. The exact
|
||||
procedure is documented in the build instructions for Windows:
|
||||
https://github.com/JayDDee/cpuminer-opt/wiki/Compiling-from-source
|
||||
|
||||
Some DLL files may already be installed on the system by Windows or third
|
||||
party packages. They often will work and may be used instead of the included
|
||||
files. Without a compelling reason to do so, it's recommended to use the included
|
||||
files as they are packaged.
|
||||
|
||||
If you like this software feel free to donate:
|
||||
|
||||
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
|
||||
|
@@ -65,6 +65,61 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.17.1
|
||||
|
||||
Fixed Windows build for AES+SSE4.2 (Westmere), was missing AES.
|
||||
More ternary logic optimizations for AVX512, AVX512+VAES, and AVX512+AES.
|
||||
Fixed myr-gr algo for VAES.
|
||||
|
||||
v3.17.0
|
||||
|
||||
AVX512 optimized using ternary logic instructions.
|
||||
Faster sha256t on all CPU architectures: AVX512 +30%, SHA +30%, AVX2 +9%.
|
||||
Use SHA on supported CPUs to produce merkle hash.
|
||||
Fixed byte order in Extranonce2 log & replaced Block height with Job ID.
|
||||
|
||||
v3.16.5
|
||||
|
||||
#329: Fixed GBT incorrect target diff in stats, second attempt.
|
||||
Fixed formatting error in share result log when --no-color option is used.
|
||||
|
||||
v3.16.4
|
||||
|
||||
Faster sha512 and sha256 when not using SHA CPU extension.
|
||||
#329: Fixed GBT incorrect target diff in stats.
|
||||
|
||||
v3.16.3
|
||||
|
||||
#313 Fix compile error with GCC 11.
|
||||
Incremental improvements to verthash.
|
||||
|
||||
v3.16.2
|
||||
|
||||
Verthash: midstate prehash optimization for all architectures.
|
||||
Verthash: AVX2 optimization.
|
||||
GBT: added support for Bech32 addresses.
|
||||
Linux: added CPU frequency to benchmark log.
|
||||
Fixed integer overflow in time calculations.
|
||||
|
||||
v3.16.1
|
||||
|
||||
New options for verthash:
|
||||
--data-file to specify the name, and optionally the path, of the verthash
|
||||
data file, default is "verthash.dat" in the current directory.
|
||||
--verify to perform the data file integrity check at startup, default is
|
||||
not to verify data file integrity.
|
||||
Support for creation of default verthash data file if:
|
||||
1) --data-file option is not used,
|
||||
2) no default data file is found in the current directory, and,
|
||||
3) --verify option is used.
|
||||
More detailed logs related to verthash data file.
|
||||
Small verthash performance improvement.
|
||||
Fixed detection of corrupt stats caused by networking issues.
|
||||
|
||||
v3.16.0
|
||||
|
||||
Added verthash algo.
|
||||
|
||||
v3.15.7
|
||||
|
||||
Added accepted/stale/rejected percentage to summary log report.
|
||||
@@ -84,7 +139,6 @@ RPC getmininginfo method.
|
||||
v3.15.5
|
||||
|
||||
Fix stratum jobs lost if 2 jobs received in less than one second.
|
||||
|
||||
|
||||
v3.15.4
|
||||
|
||||
|
191
algo-gate-api.c
191
algo-gate-api.c
@@ -15,8 +15,6 @@
|
||||
#include <stdbool.h>
|
||||
#include <memory.h>
|
||||
#include <unistd.h>
|
||||
#include <openssl/sha.h>
|
||||
//#include "miner.h"
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
// Define null and standard functions.
|
||||
@@ -279,9 +277,11 @@ void init_algo_gate( algo_gate_t* gate )
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
|
||||
|
||||
// called by each thread that uses the gate
|
||||
// Called once by main
|
||||
bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
{
|
||||
bool rc = false;
|
||||
|
||||
if ( NULL == gate )
|
||||
{
|
||||
applog(LOG_ERR,"FAIL: algo_gate registration failed, NULL gate\n");
|
||||
@@ -290,108 +290,108 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
|
||||
init_algo_gate( gate );
|
||||
|
||||
switch (algo)
|
||||
switch ( algo )
|
||||
{
|
||||
case ALGO_ALLIUM: register_allium_algo ( gate ); break;
|
||||
case ALGO_ANIME: register_anime_algo ( gate ); break;
|
||||
case ALGO_ARGON2: register_argon2_algo ( gate ); break;
|
||||
case ALGO_ARGON2D250: register_argon2d_crds_algo ( gate ); break;
|
||||
case ALGO_ARGON2D500: register_argon2d_dyn_algo ( gate ); break;
|
||||
case ALGO_ARGON2D4096: register_argon2d4096_algo ( gate ); break;
|
||||
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
|
||||
case ALGO_BLAKE: register_blake_algo ( gate ); break;
|
||||
case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break;
|
||||
case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break;
|
||||
case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
|
||||
case ALGO_BMW512: register_bmw512_algo ( gate ); break;
|
||||
case ALGO_C11: register_c11_algo ( gate ); break;
|
||||
case ALGO_DECRED: register_decred_algo ( gate ); break;
|
||||
case ALGO_DEEP: register_deep_algo ( gate ); break;
|
||||
case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break;
|
||||
case ALGO_GROESTL: register_groestl_algo ( gate ); break;
|
||||
case ALGO_HEX: register_hex_algo ( gate ); break;
|
||||
case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break;
|
||||
case ALGO_HODL: register_hodl_algo ( gate ); break;
|
||||
case ALGO_JHA: register_jha_algo ( gate ); break;
|
||||
case ALGO_KECCAK: register_keccak_algo ( gate ); break;
|
||||
case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
|
||||
case ALGO_LBRY: register_lbry_algo ( gate ); break;
|
||||
case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break;
|
||||
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV3: register_lyra2rev3_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break;
|
||||
case ALGO_M7M: register_m7m_algo ( gate ); break;
|
||||
case ALGO_MINOTAUR: register_minotaur_algo ( gate ); break;
|
||||
case ALGO_MYR_GR: register_myriad_algo ( gate ); break;
|
||||
case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break;
|
||||
case ALGO_NIST5: register_nist5_algo ( gate ); break;
|
||||
case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break;
|
||||
case ALGO_PHI1612: register_phi1612_algo ( gate ); break;
|
||||
case ALGO_PHI2: register_phi2_algo ( gate ); break;
|
||||
case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break;
|
||||
case ALGO_POWER2B: register_power2b_algo ( gate ); break;
|
||||
case ALGO_QUARK: register_quark_algo ( gate ); break;
|
||||
case ALGO_QUBIT: register_qubit_algo ( gate ); break;
|
||||
case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
|
||||
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
|
||||
case ALGO_SHA256Q: register_sha256q_algo ( gate ); break;
|
||||
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
|
||||
case ALGO_SHA3D: register_sha3d_algo ( gate ); break;
|
||||
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
|
||||
case ALGO_SKEIN: register_skein_algo ( gate ); break;
|
||||
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
|
||||
case ALGO_SKUNK: register_skunk_algo ( gate ); break;
|
||||
case ALGO_SONOA: register_sonoa_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL: register_timetravel_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL10: register_timetravel10_algo ( gate ); break;
|
||||
case ALGO_TRIBUS: register_tribus_algo ( gate ); break;
|
||||
case ALGO_VANILLA: register_vanilla_algo ( gate ); break;
|
||||
case ALGO_VELTOR: register_veltor_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOL: register_whirlpool_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
|
||||
case ALGO_X11: register_x11_algo ( gate ); break;
|
||||
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
|
||||
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
|
||||
case ALGO_X12: register_x12_algo ( gate ); break;
|
||||
case ALGO_X13: register_x13_algo ( gate ); break;
|
||||
case ALGO_X13BCD: register_x13bcd_algo ( gate ); break;
|
||||
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
|
||||
case ALGO_X14: register_x14_algo ( gate ); break;
|
||||
case ALGO_X15: register_x15_algo ( gate ); break;
|
||||
case ALGO_X16R: register_x16r_algo ( gate ); break;
|
||||
case ALGO_X16RV2: register_x16rv2_algo ( gate ); break;
|
||||
case ALGO_X16RT: register_x16rt_algo ( gate ); break;
|
||||
case ALGO_X16RT_VEIL: register_x16rt_veil_algo ( gate ); break;
|
||||
case ALGO_X16S: register_x16s_algo ( gate ); break;
|
||||
case ALGO_X17: register_x17_algo ( gate ); break;
|
||||
case ALGO_X21S: register_x21s_algo ( gate ); break;
|
||||
case ALGO_X22I: register_x22i_algo ( gate ); break;
|
||||
case ALGO_X25X: register_x25x_algo ( gate ); break;
|
||||
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
|
||||
case ALGO_YESCRYPT: register_yescrypt_05_algo ( gate ); break;
|
||||
case ALGO_ALLIUM: rc = register_allium_algo ( gate ); break;
|
||||
case ALGO_ANIME: rc = register_anime_algo ( gate ); break;
|
||||
case ALGO_ARGON2: rc = register_argon2_algo ( gate ); break;
|
||||
case ALGO_ARGON2D250: rc = register_argon2d_crds_algo ( gate ); break;
|
||||
case ALGO_ARGON2D500: rc = register_argon2d_dyn_algo ( gate ); break;
|
||||
case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break;
|
||||
case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break;
|
||||
case ALGO_BLAKE: rc = register_blake_algo ( gate ); break;
|
||||
case ALGO_BLAKE2B: rc = register_blake2b_algo ( gate ); break;
|
||||
case ALGO_BLAKE2S: rc = register_blake2s_algo ( gate ); break;
|
||||
case ALGO_BLAKECOIN: rc = register_blakecoin_algo ( gate ); break;
|
||||
case ALGO_BMW512: rc = register_bmw512_algo ( gate ); break;
|
||||
case ALGO_C11: rc = register_c11_algo ( gate ); break;
|
||||
case ALGO_DECRED: rc = register_decred_algo ( gate ); break;
|
||||
case ALGO_DEEP: rc = register_deep_algo ( gate ); break;
|
||||
case ALGO_DMD_GR: rc = register_dmd_gr_algo ( gate ); break;
|
||||
case ALGO_GROESTL: rc = register_groestl_algo ( gate ); break;
|
||||
case ALGO_HEX: rc = register_hex_algo ( gate ); break;
|
||||
case ALGO_HMQ1725: rc = register_hmq1725_algo ( gate ); break;
|
||||
case ALGO_HODL: rc = register_hodl_algo ( gate ); break;
|
||||
case ALGO_JHA: rc = register_jha_algo ( gate ); break;
|
||||
case ALGO_KECCAK: rc = register_keccak_algo ( gate ); break;
|
||||
case ALGO_KECCAKC: rc = register_keccakc_algo ( gate ); break;
|
||||
case ALGO_LBRY: rc = register_lbry_algo ( gate ); break;
|
||||
case ALGO_LYRA2H: rc = register_lyra2h_algo ( gate ); break;
|
||||
case ALGO_LYRA2RE: rc = register_lyra2re_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV2: rc = register_lyra2rev2_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV3: rc = register_lyra2rev3_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z: rc = register_lyra2z_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z330: rc = register_lyra2z330_algo ( gate ); break;
|
||||
case ALGO_M7M: rc = register_m7m_algo ( gate ); break;
|
||||
case ALGO_MINOTAUR: rc = register_minotaur_algo ( gate ); break;
|
||||
case ALGO_MYR_GR: rc = register_myriad_algo ( gate ); break;
|
||||
case ALGO_NEOSCRYPT: rc = register_neoscrypt_algo ( gate ); break;
|
||||
case ALGO_NIST5: rc = register_nist5_algo ( gate ); break;
|
||||
case ALGO_PENTABLAKE: rc = register_pentablake_algo ( gate ); break;
|
||||
case ALGO_PHI1612: rc = register_phi1612_algo ( gate ); break;
|
||||
case ALGO_PHI2: rc = register_phi2_algo ( gate ); break;
|
||||
case ALGO_POLYTIMOS: rc = register_polytimos_algo ( gate ); break;
|
||||
case ALGO_POWER2B: rc = register_power2b_algo ( gate ); break;
|
||||
case ALGO_QUARK: rc = register_quark_algo ( gate ); break;
|
||||
case ALGO_QUBIT: rc = register_qubit_algo ( gate ); break;
|
||||
case ALGO_SCRYPT: rc = register_scrypt_algo ( gate ); break;
|
||||
case ALGO_SHA256D: rc = register_sha256d_algo ( gate ); break;
|
||||
case ALGO_SHA256Q: rc = register_sha256q_algo ( gate ); break;
|
||||
case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
|
||||
case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
|
||||
case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
|
||||
case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
|
||||
case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
|
||||
case ALGO_SKUNK: rc = register_skunk_algo ( gate ); break;
|
||||
case ALGO_SONOA: rc = register_sonoa_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL: rc = register_timetravel_algo ( gate ); break;
|
||||
case ALGO_TIMETRAVEL10: rc = register_timetravel10_algo ( gate ); break;
|
||||
case ALGO_TRIBUS: rc = register_tribus_algo ( gate ); break;
|
||||
case ALGO_VANILLA: rc = register_vanilla_algo ( gate ); break;
|
||||
case ALGO_VELTOR: rc = register_veltor_algo ( gate ); break;
|
||||
case ALGO_VERTHASH: rc = register_verthash_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOL: rc = register_whirlpool_algo ( gate ); break;
|
||||
case ALGO_WHIRLPOOLX: rc = register_whirlpoolx_algo ( gate ); break;
|
||||
case ALGO_X11: rc = register_x11_algo ( gate ); break;
|
||||
case ALGO_X11EVO: rc = register_x11evo_algo ( gate ); break;
|
||||
case ALGO_X11GOST: rc = register_x11gost_algo ( gate ); break;
|
||||
case ALGO_X12: rc = register_x12_algo ( gate ); break;
|
||||
case ALGO_X13: rc = register_x13_algo ( gate ); break;
|
||||
case ALGO_X13BCD: rc = register_x13bcd_algo ( gate ); break;
|
||||
case ALGO_X13SM3: rc = register_x13sm3_algo ( gate ); break;
|
||||
case ALGO_X14: rc = register_x14_algo ( gate ); break;
|
||||
case ALGO_X15: rc = register_x15_algo ( gate ); break;
|
||||
case ALGO_X16R: rc = register_x16r_algo ( gate ); break;
|
||||
case ALGO_X16RV2: rc = register_x16rv2_algo ( gate ); break;
|
||||
case ALGO_X16RT: rc = register_x16rt_algo ( gate ); break;
|
||||
case ALGO_X16RT_VEIL: rc = register_x16rt_veil_algo ( gate ); break;
|
||||
case ALGO_X16S: rc = register_x16s_algo ( gate ); break;
|
||||
case ALGO_X17: rc = register_x17_algo ( gate ); break;
|
||||
case ALGO_X21S: rc = register_x21s_algo ( gate ); break;
|
||||
case ALGO_X22I: rc = register_x22i_algo ( gate ); break;
|
||||
case ALGO_X25X: rc = register_x25x_algo ( gate ); break;
|
||||
case ALGO_XEVAN: rc = register_xevan_algo ( gate ); break;
|
||||
case ALGO_YESCRYPT: rc = register_yescrypt_05_algo ( gate ); break;
|
||||
// case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8: register_yescryptr8_05_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8: rc = register_yescryptr8_05_algo ( gate ); break;
|
||||
// case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8G: register_yescryptr8g_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR16: register_yescryptr16_05_algo( gate ); break;
|
||||
case ALGO_YESCRYPTR8G: rc = register_yescryptr8g_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR16: rc = register_yescryptr16_05_algo( gate ); break;
|
||||
// case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR32: register_yescryptr32_05_algo( gate ); break;
|
||||
case ALGO_YESCRYPTR32: rc = register_yescryptr32_05_algo( gate ); break;
|
||||
// case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
|
||||
case ALGO_YESPOWER: register_yespower_algo ( gate ); break;
|
||||
case ALGO_YESPOWERR16: register_yespowerr16_algo ( gate ); break;
|
||||
case ALGO_YESPOWER_B2B: register_yespower_b2b_algo ( gate ); break;
|
||||
case ALGO_ZR5: register_zr5_algo ( gate ); break;
|
||||
case ALGO_YESPOWER: rc = register_yespower_algo ( gate ); break;
|
||||
case ALGO_YESPOWERR16: rc = register_yespowerr16_algo ( gate ); break;
|
||||
case ALGO_YESPOWER_B2B: rc = register_yespower_b2b_algo ( gate ); break;
|
||||
case ALGO_ZR5: rc = register_zr5_algo ( gate ); break;
|
||||
default:
|
||||
applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
|
||||
applog(LOG_ERR,"BUG: unregistered algorithm %s.\n", algo_names[opt_algo] );
|
||||
return false;
|
||||
} // switch
|
||||
|
||||
// ensure required functions were defined.
|
||||
if ( gate->scanhash == (void*)&null_scanhash )
|
||||
if ( !rc )
|
||||
{
|
||||
applog(LOG_ERR, "FAIL: Required algo_gate functions undefined\n");
|
||||
applog(LOG_ERR, "FAIL: %s algorithm failed to initialize\n", algo_names[opt_algo] );
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@@ -433,7 +433,6 @@ const char* const algo_alias_map[][2] =
|
||||
{ "flax", "c11" },
|
||||
{ "hsr", "x13sm3" },
|
||||
{ "jackpot", "jha" },
|
||||
{ "jane", "scryptjane" },
|
||||
{ "lyra2", "lyra2re" },
|
||||
{ "lyra2v2", "lyra2rev2" },
|
||||
{ "lyra2v3", "lyra2rev3" },
|
||||
|
@@ -114,15 +114,15 @@ typedef struct
|
||||
// Mandatory functions, one of these is mandatory. If a generic scanhash
|
||||
// is used a custom target hash function must be registered, with a custom
|
||||
// scanhash the target hash function can be called directly and doesn't need
|
||||
// to be registered in the gate.
|
||||
// to be registered with the gate.
|
||||
int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );
|
||||
|
||||
int ( *hash ) ( void*, const void*, int );
|
||||
|
||||
//optional, safe to use default in most cases
|
||||
|
||||
// Allocate thread local buffers and other initialization specific to miner
|
||||
// threads.
|
||||
// Called once by each miner thread to allocate thread local buffers and
|
||||
// other initialization specific to miner threads.
|
||||
bool ( *miner_thread_init ) ( int );
|
||||
|
||||
// Get thread local copy of blockheader with unique nonce.
|
||||
@@ -150,7 +150,7 @@ void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
|
||||
|
||||
char* ( *malloc_txs_request ) ( struct work* );
|
||||
|
||||
// Big or little
|
||||
// Big endian or little endian
|
||||
void ( *set_work_data_endian ) ( struct work* );
|
||||
|
||||
double ( *calc_network_diff ) ( struct work* );
|
||||
@@ -260,7 +260,7 @@ int scanhash_8way_64in_32out( struct work *work, uint32_t max_nonce,
|
||||
#endif
|
||||
|
||||
// displays warning
|
||||
int null_hash ();
|
||||
int null_hash();
|
||||
|
||||
// optional safe targets, default listed first unless noted.
|
||||
|
||||
@@ -281,7 +281,7 @@ void std_be_build_stratum_request( char *req, struct work *work );
|
||||
|
||||
char* std_malloc_txs_request( struct work *work );
|
||||
|
||||
// Default is do_nothing (assumed LE)
|
||||
// Default is do_nothing, little endian is assumed
|
||||
void set_work_data_big_endian( struct work *work );
|
||||
|
||||
double std_calc_network_diff( struct work *work );
|
||||
|
@@ -180,6 +180,7 @@ void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_8way_close( void *cc, void *dst );
|
||||
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_hash_le80( void *hash, const void *data );
|
||||
|
||||
#endif // AVX512
|
||||
#endif // AVX2
|
||||
|
@@ -669,14 +669,14 @@ do { \
|
||||
ROUND_S_8WAY(2); \
|
||||
ROUND_S_8WAY(3); \
|
||||
} \
|
||||
H0 = _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), H0 ); \
|
||||
H1 = _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), H1 ); \
|
||||
H2 = _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), H2 ); \
|
||||
H3 = _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), H3 ); \
|
||||
H4 = _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), H4 ); \
|
||||
H5 = _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), H5 ); \
|
||||
H6 = _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), H6 ); \
|
||||
H7 = _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), H7 ); \
|
||||
H0 = mm256_xor3( V8, V0, H0 ); \
|
||||
H1 = mm256_xor3( V9, V1, H1 ); \
|
||||
H2 = mm256_xor3( VA, V2, H2 ); \
|
||||
H3 = mm256_xor3( VB, V3, H3 ); \
|
||||
H4 = mm256_xor3( VC, V4, H4 ); \
|
||||
H5 = mm256_xor3( VD, V5, H5 ); \
|
||||
H6 = mm256_xor3( VE, V6, H6 ); \
|
||||
H7 = mm256_xor3( VF, V7, H7 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
@@ -808,14 +808,14 @@ do { \
|
||||
ROUND_S_16WAY(2); \
|
||||
ROUND_S_16WAY(3); \
|
||||
} \
|
||||
H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \
|
||||
H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \
|
||||
H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \
|
||||
H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \
|
||||
H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \
|
||||
H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \
|
||||
H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \
|
||||
H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \
|
||||
H0 = mm512_xor3( V8, V0, H0 ); \
|
||||
H1 = mm512_xor3( V9, V1, H1 ); \
|
||||
H2 = mm512_xor3( VA, V2, H2 ); \
|
||||
H3 = mm512_xor3( VB, V3, H3 ); \
|
||||
H4 = mm512_xor3( VC, V4, H4 ); \
|
||||
H5 = mm512_xor3( VD, V5, H5 ); \
|
||||
H6 = mm512_xor3( VE, V6, H6 ); \
|
||||
H7 = mm512_xor3( VF, V7, H7 ); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
@@ -122,14 +122,14 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
|
||||
B2B8W_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
|
||||
}
|
||||
|
||||
ctx->h[0] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[0], v[0] ), v[ 8] );
|
||||
ctx->h[1] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[1], v[1] ), v[ 9] );
|
||||
ctx->h[2] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[2], v[2] ), v[10] );
|
||||
ctx->h[3] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[3], v[3] ), v[11] );
|
||||
ctx->h[4] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[4], v[4] ), v[12] );
|
||||
ctx->h[5] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[5], v[5] ), v[13] );
|
||||
ctx->h[6] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[6], v[6] ), v[14] );
|
||||
ctx->h[7] = _mm512_xor_si512( _mm512_xor_si512( ctx->h[7], v[7] ), v[15] );
|
||||
ctx->h[0] = mm512_xor3( ctx->h[0], v[0], v[ 8] );
|
||||
ctx->h[1] = mm512_xor3( ctx->h[1], v[1], v[ 9] );
|
||||
ctx->h[2] = mm512_xor3( ctx->h[2], v[2], v[10] );
|
||||
ctx->h[3] = mm512_xor3( ctx->h[3], v[3], v[11] );
|
||||
ctx->h[4] = mm512_xor3( ctx->h[4], v[4], v[12] );
|
||||
ctx->h[5] = mm512_xor3( ctx->h[5], v[5], v[13] );
|
||||
ctx->h[6] = mm512_xor3( ctx->h[6], v[6], v[14] );
|
||||
ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
|
||||
}
|
||||
|
||||
int blake2b_8way_init( blake2b_8way_ctx *ctx )
|
||||
|
@@ -17,7 +17,7 @@
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
ALIGN(128) typedef struct {
|
||||
typedef struct ALIGN( 64 ) {
|
||||
__m512i b[16]; // input buffer
|
||||
__m512i h[8]; // chained state
|
||||
uint64_t t[2]; // total number of bytes
|
||||
@@ -35,7 +35,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// state context
|
||||
ALIGN(128) typedef struct {
|
||||
typedef struct ALIGN( 64 ) {
|
||||
__m256i b[16]; // input buffer
|
||||
__m256i h[8]; // chained state
|
||||
uint64_t t[2]; // total number of bytes
|
||||
|
@@ -4,7 +4,6 @@
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
//#if defined(__SSE4_2__)
|
||||
#if defined(__SSE2__)
|
||||
#define BLAKE2S_4WAY
|
||||
#endif
|
||||
@@ -27,8 +26,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#elif defined (BLAKE2S_8WAY)
|
||||
|
||||
//#if defined(BLAKE2S_8WAY)
|
||||
|
||||
void blake2s_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
@@ -368,7 +368,7 @@ do { \
|
||||
ROUND8W( 9 );
|
||||
|
||||
for( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] );
|
||||
S->h[i] = mm256_xor3( S->h[i], v[i], v[i + 8] );
|
||||
|
||||
#undef G8W
|
||||
#undef ROUND8W
|
||||
@@ -566,7 +566,7 @@ do { \
|
||||
ROUND16W( 9 );
|
||||
|
||||
for( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm512_xor_si512( _mm512_xor_si512( S->h[i], v[i] ), v[i + 8] );
|
||||
S->h[i] = mm512_xor3( S->h[i], v[i], v[i + 8] );
|
||||
|
||||
#undef G16W
|
||||
#undef ROUND16W
|
||||
|
@@ -60,7 +60,7 @@ typedef struct __blake2s_nway_param
|
||||
} blake2s_nway_param;
|
||||
#pragma pack(pop)
|
||||
|
||||
ALIGN( 64 ) typedef struct __blake2s_4way_state
|
||||
typedef struct ALIGN( 64 ) __blake2s_4way_state
|
||||
{
|
||||
__m128i h[8];
|
||||
uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ];
|
||||
@@ -80,7 +80,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
ALIGN( 64 ) typedef struct __blake2s_8way_state
|
||||
typedef struct ALIGN( 64 ) __blake2s_8way_state
|
||||
{
|
||||
__m256i h[8];
|
||||
uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ];
|
||||
@@ -101,7 +101,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
ALIGN( 128 ) typedef struct __blake2s_16way_state
|
||||
typedef struct ALIGN( 64 ) __blake2s_16way_state
|
||||
{
|
||||
__m512i h[8];
|
||||
uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ];
|
||||
|
@@ -293,10 +293,6 @@ static const sph_u64 CB[16] = {
|
||||
H5 = (state)->H[5]; \
|
||||
H6 = (state)->H[6]; \
|
||||
H7 = (state)->H[7]; \
|
||||
S0 = (state)->S[0]; \
|
||||
S1 = (state)->S[1]; \
|
||||
S2 = (state)->S[2]; \
|
||||
S3 = (state)->S[3]; \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
@@ -310,10 +306,6 @@ static const sph_u64 CB[16] = {
|
||||
(state)->H[5] = H5; \
|
||||
(state)->H[6] = H6; \
|
||||
(state)->H[7] = H7; \
|
||||
(state)->S[0] = S0; \
|
||||
(state)->S[1] = S1; \
|
||||
(state)->S[2] = S2; \
|
||||
(state)->S[3] = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
@@ -348,7 +340,6 @@ static const sph_u64 CB[16] = {
|
||||
|
||||
#define DECL_STATE64_8WAY \
|
||||
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m512i S0, S1, S2, S3; \
|
||||
uint64_t T0, T1;
|
||||
|
||||
#define COMPRESS64_8WAY( buf ) do \
|
||||
@@ -366,10 +357,10 @@ static const sph_u64 CB[16] = {
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) ); \
|
||||
V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) ); \
|
||||
VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) ); \
|
||||
VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) ); \
|
||||
V8 = m512_const1_64( CB0 ); \
|
||||
V9 = m512_const1_64( CB1 ); \
|
||||
VA = m512_const1_64( CB2 ); \
|
||||
VB = m512_const1_64( CB3 ); \
|
||||
VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
|
||||
m512_const1_64( CB4 ) ); \
|
||||
VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
|
||||
@@ -414,14 +405,14 @@ static const sph_u64 CB[16] = {
|
||||
ROUND_B_8WAY(3); \
|
||||
ROUND_B_8WAY(4); \
|
||||
ROUND_B_8WAY(5); \
|
||||
H0 = mm512_xor4( V8, V0, S0, H0 ); \
|
||||
H1 = mm512_xor4( V9, V1, S1, H1 ); \
|
||||
H2 = mm512_xor4( VA, V2, S2, H2 ); \
|
||||
H3 = mm512_xor4( VB, V3, S3, H3 ); \
|
||||
H4 = mm512_xor4( VC, V4, S0, H4 ); \
|
||||
H5 = mm512_xor4( VD, V5, S1, H5 ); \
|
||||
H6 = mm512_xor4( VE, V6, S2, H6 ); \
|
||||
H7 = mm512_xor4( VF, V7, S3, H7 ); \
|
||||
H0 = mm512_xor3( V8, V0, H0 ); \
|
||||
H1 = mm512_xor3( V9, V1, H1 ); \
|
||||
H2 = mm512_xor3( VA, V2, H2 ); \
|
||||
H3 = mm512_xor3( VB, V3, H3 ); \
|
||||
H4 = mm512_xor3( VC, V4, H4 ); \
|
||||
H5 = mm512_xor3( VD, V5, H5 ); \
|
||||
H6 = mm512_xor3( VE, V6, H6 ); \
|
||||
H7 = mm512_xor3( VF, V7, H7 ); \
|
||||
} while (0)
|
||||
|
||||
void blake512_8way_compress( blake_8way_big_context *sc )
|
||||
@@ -440,10 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc )
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) );
|
||||
V9 = _mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) );
|
||||
VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) );
|
||||
VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) );
|
||||
V8 = m512_const1_64( CB0 );
|
||||
V9 = m512_const1_64( CB1 );
|
||||
VA = m512_const1_64( CB2 );
|
||||
VB = m512_const1_64( CB3 );
|
||||
VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
|
||||
m512_const1_64( CB4 ) );
|
||||
VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
|
||||
@@ -492,19 +483,18 @@ void blake512_8way_compress( blake_8way_big_context *sc )
|
||||
ROUND_B_8WAY(4);
|
||||
ROUND_B_8WAY(5);
|
||||
|
||||
sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] );
|
||||
sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] );
|
||||
sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] );
|
||||
sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] );
|
||||
sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] );
|
||||
sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] );
|
||||
sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] );
|
||||
sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] );
|
||||
sc->H[0] = mm512_xor3( V8, V0, sc->H[0] );
|
||||
sc->H[1] = mm512_xor3( V9, V1, sc->H[1] );
|
||||
sc->H[2] = mm512_xor3( VA, V2, sc->H[2] );
|
||||
sc->H[3] = mm512_xor3( VB, V3, sc->H[3] );
|
||||
sc->H[4] = mm512_xor3( VC, V4, sc->H[4] );
|
||||
sc->H[5] = mm512_xor3( VD, V5, sc->H[5] );
|
||||
sc->H[6] = mm512_xor3( VE, V6, sc->H[6] );
|
||||
sc->H[7] = mm512_xor3( VF, V7, sc->H[7] );
|
||||
}
|
||||
|
||||
void blake512_8way_init( blake_8way_big_context *sc )
|
||||
{
|
||||
__m512i zero = m512_zero;
|
||||
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
@@ -514,11 +504,6 @@ void blake512_8way_init( blake_8way_big_context *sc )
|
||||
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
casti_m512i( sc->S, 0 ) = zero;
|
||||
casti_m512i( sc->S, 1 ) = zero;
|
||||
casti_m512i( sc->S, 2 ) = zero;
|
||||
casti_m512i( sc->S, 3 ) = zero;
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
@@ -641,11 +626,6 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
casti_m512i( sc->S, 0 ) = m512_zero;
|
||||
casti_m512i( sc->S, 1 ) = m512_zero;
|
||||
casti_m512i( sc->S, 2 ) = m512_zero;
|
||||
casti_m512i( sc->S, 3 ) = m512_zero;
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
|
||||
@@ -740,7 +720,6 @@ blake512_8way_close(void *cc, void *dst)
|
||||
|
||||
#define DECL_STATE64_4WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m256i S0, S1, S2, S3; \
|
||||
uint64_t T0, T1;
|
||||
|
||||
#define COMPRESS64_4WAY do \
|
||||
@@ -758,10 +737,10 @@ blake512_8way_close(void *cc, void *dst)
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_xor_si256( S0, m256_const1_64( CB0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) ); \
|
||||
V8 = m256_const1_64( CB0 ); \
|
||||
V9 = m256_const1_64( CB1 ); \
|
||||
VA = m256_const1_64( CB2 ); \
|
||||
VB = m256_const1_64( CB3 ); \
|
||||
VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
|
||||
m256_const1_64( CB4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
|
||||
@@ -804,14 +783,14 @@ blake512_8way_close(void *cc, void *dst)
|
||||
ROUND_B_4WAY(3); \
|
||||
ROUND_B_4WAY(4); \
|
||||
ROUND_B_4WAY(5); \
|
||||
H0 = mm256_xor4( V8, V0, S0, H0 ); \
|
||||
H1 = mm256_xor4( V9, V1, S1, H1 ); \
|
||||
H2 = mm256_xor4( VA, V2, S2, H2 ); \
|
||||
H3 = mm256_xor4( VB, V3, S3, H3 ); \
|
||||
H4 = mm256_xor4( VC, V4, S0, H4 ); \
|
||||
H5 = mm256_xor4( VD, V5, S1, H5 ); \
|
||||
H6 = mm256_xor4( VE, V6, S2, H6 ); \
|
||||
H7 = mm256_xor4( VF, V7, S3, H7 ); \
|
||||
H0 = mm256_xor3( V8, V0, H0 ); \
|
||||
H1 = mm256_xor3( V9, V1, H1 ); \
|
||||
H2 = mm256_xor3( VA, V2, H2 ); \
|
||||
H3 = mm256_xor3( VB, V3, H3 ); \
|
||||
H4 = mm256_xor3( VC, V4, H4 ); \
|
||||
H5 = mm256_xor3( VD, V5, H5 ); \
|
||||
H6 = mm256_xor3( VE, V6, H6 ); \
|
||||
H7 = mm256_xor3( VF, V7, H7 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
@@ -831,10 +810,10 @@ void blake512_4way_compress( blake_4way_big_context *sc )
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) );
|
||||
V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) );
|
||||
VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) );
|
||||
VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) );
|
||||
V8 = m256_const1_64( CB0 );
|
||||
V9 = m256_const1_64( CB1 );
|
||||
VA = m256_const1_64( CB2 );
|
||||
VB = m256_const1_64( CB3 );
|
||||
VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
|
||||
m256_const1_64( CB4 ) );
|
||||
VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
|
||||
@@ -880,19 +859,18 @@ void blake512_4way_compress( blake_4way_big_context *sc )
|
||||
ROUND_B_4WAY(4);
|
||||
ROUND_B_4WAY(5);
|
||||
|
||||
sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] );
|
||||
sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] );
|
||||
sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] );
|
||||
sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] );
|
||||
sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] );
|
||||
sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] );
|
||||
sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] );
|
||||
sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] );
|
||||
sc->H[0] = mm256_xor3( V8, V0, sc->H[0] );
|
||||
sc->H[1] = mm256_xor3( V9, V1, sc->H[1] );
|
||||
sc->H[2] = mm256_xor3( VA, V2, sc->H[2] );
|
||||
sc->H[3] = mm256_xor3( VB, V3, sc->H[3] );
|
||||
sc->H[4] = mm256_xor3( VC, V4, sc->H[4] );
|
||||
sc->H[5] = mm256_xor3( VD, V5, sc->H[5] );
|
||||
sc->H[6] = mm256_xor3( VE, V6, sc->H[6] );
|
||||
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
|
||||
}
|
||||
|
||||
void blake512_4way_init( blake_4way_big_context *sc )
|
||||
{
|
||||
__m256i zero = m256_zero;
|
||||
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
|
||||
@@ -902,11 +880,6 @@ void blake512_4way_init( blake_4way_big_context *sc )
|
||||
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
casti_m256i( sc->S, 0 ) = zero;
|
||||
casti_m256i( sc->S, 1 ) = zero;
|
||||
casti_m256i( sc->S, 2 ) = zero;
|
||||
casti_m256i( sc->S, 3 ) = zero;
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
@@ -1026,11 +999,6 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
casti_m256i( sc->S, 0 ) = m256_zero;
|
||||
casti_m256i( sc->S, 1 ) = m256_zero;
|
||||
casti_m256i( sc->S, 2 ) = m256_zero;
|
||||
casti_m256i( sc->S, 3 ) = m256_zero;
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
|
||||
|
@@ -323,7 +323,7 @@ int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen )
|
||||
|
||||
int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
|
||||
{
|
||||
blake2s_state S[1];
|
||||
blake2s_state S;
|
||||
|
||||
/* Verify parameters */
|
||||
if ( NULL == in ) return -1;
|
||||
@@ -334,15 +334,15 @@ int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen
|
||||
|
||||
if( keylen > 0 )
|
||||
{
|
||||
if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
|
||||
if( blake2s_init_key( &S, outlen, key, keylen ) < 0 ) return -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if( blake2s_init( S, outlen ) < 0 ) return -1;
|
||||
if( blake2s_init( &S, outlen ) < 0 ) return -1;
|
||||
}
|
||||
|
||||
blake2s_update( S, ( uint8_t * )in, inlen );
|
||||
blake2s_final( S, out, outlen );
|
||||
blake2s_update( &S, ( uint8_t * )in, inlen );
|
||||
blake2s_final( &S, out, outlen );
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -116,7 +116,7 @@ extern "C" {
|
||||
uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
|
||||
} blake2s_param;
|
||||
|
||||
ALIGN( 64 ) typedef struct __blake2s_state
|
||||
typedef struct ALIGN( 64 ) __blake2s_state
|
||||
{
|
||||
uint32_t h[8];
|
||||
uint32_t t[2];
|
||||
|
@@ -18,7 +18,7 @@
|
||||
#endif
|
||||
|
||||
// state context
|
||||
ALIGN(64) typedef struct {
|
||||
typedef ALIGN(64) struct {
|
||||
uint8_t b[128]; // input buffer
|
||||
uint64_t h[8]; // chained state
|
||||
uint64_t t[2]; // total number of bytes
|
||||
|
@@ -867,40 +867,35 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
|
||||
qt[30] = expand2s8( qt, M, H, 30 );
|
||||
qt[31] = expand2s8( qt, M, H, 31 );
|
||||
|
||||
xl = _mm256_xor_si256(
|
||||
mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||
xh = _mm256_xor_si256( xl, _mm256_xor_si256(
|
||||
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
xl = mm256_xor3( mm256_xor3( qt[16], qt[17], qt[18] ),
|
||||
mm256_xor3( qt[19], qt[20], qt[21] ),
|
||||
_mm256_xor_si256( qt[22], qt[23] ) );
|
||||
|
||||
xh = mm256_xor3( mm256_xor3( xl, qt[24], qt[25] ),
|
||||
mm256_xor3( qt[26], qt[27], qt[28] ),
|
||||
mm256_xor3( qt[29], qt[30], qt[31] ) );
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_xor_si256( M[m], \
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
|
||||
_mm256_srli_epi32( qt[a], sr ) ) ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
|
||||
_mm256_add_epi32( mm256_xor3( M[m], _mm256_slli_epi32( xh, sl ), \
|
||||
_mm256_srli_epi32( qt[a], sr ) ), \
|
||||
mm256_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH1R( m, sl, sr, a, b, c ) \
|
||||
_mm256_add_epi32( \
|
||||
_mm256_xor_si256( M[m], \
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \
|
||||
_mm256_slli_epi32( qt[a], sr ) ) ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
|
||||
_mm256_add_epi32( mm256_xor3( M[m], _mm256_srli_epi32( xh, sl ), \
|
||||
_mm256_slli_epi32( qt[a], sr ) ), \
|
||||
mm256_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||
_mm256_add_epi32( _mm256_add_epi32( \
|
||||
mm256_rol_32( dH[h], rl ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
|
||||
_mm256_xor_si256( qt[b], qt[c] ) ) );
|
||||
mm256_rol_32( dH[h], rl ), \
|
||||
mm256_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm256_xor3( _mm256_slli_epi32( xl, sl ), qt[b], qt[c] ) )
|
||||
|
||||
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||
_mm256_add_epi32( _mm256_add_epi32( \
|
||||
mm256_rol_32( dH[h], rl ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
|
||||
_mm256_xor_si256( qt[b], qt[c] ) ) );
|
||||
mm256_rol_32( dH[h], rl ), \
|
||||
mm256_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm256_xor3( _mm256_srli_epi32( xl, sr ), qt[b], qt[c] ) )
|
||||
|
||||
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
|
||||
@@ -924,88 +919,6 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
|
||||
#undef DH2L
|
||||
#undef DH2R
|
||||
|
||||
/*
|
||||
dH[ 0] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[0],
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
|
||||
_mm256_srli_epi32( qt[16], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
|
||||
dH[ 1] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[1],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 7 ),
|
||||
_mm256_slli_epi32( qt[17], 8 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
|
||||
dH[ 2] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[2],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 5 ),
|
||||
_mm256_slli_epi32( qt[18], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
|
||||
dH[ 3] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[3],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 1 ),
|
||||
_mm256_slli_epi32( qt[19], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
|
||||
dH[ 4] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[4],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 3 ),
|
||||
_mm256_slli_epi32( qt[20], 0 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
|
||||
dH[ 5] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[5],
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xh, 6 ),
|
||||
_mm256_srli_epi32( qt[21], 6 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
|
||||
dH[ 6] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[6],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 4 ),
|
||||
_mm256_slli_epi32( qt[22], 6 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
|
||||
dH[ 7] = _mm256_add_epi32(
|
||||
_mm256_xor_si256( M[7],
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xh, 11 ),
|
||||
_mm256_slli_epi32( qt[23], 2 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
|
||||
dH[ 8] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[4], 9 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xl, 8 ),
|
||||
_mm256_xor_si256( qt[23], qt[ 8] ) ) );
|
||||
dH[ 9] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[5], 10 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, 6 ),
|
||||
_mm256_xor_si256( qt[16], qt[ 9] ) ) );
|
||||
dH[10] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[6], 11 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xl, 6 ),
|
||||
_mm256_xor_si256( qt[17], qt[10] ) ) );
|
||||
dH[11] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[7], 12 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi32( xl, 4 ),
|
||||
_mm256_xor_si256( qt[18], qt[11] ) ) );
|
||||
dH[12] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[0], 13 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, 3 ),
|
||||
_mm256_xor_si256( qt[19], qt[12] ) ) );
|
||||
dH[13] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[1], 14 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, 4 ),
|
||||
_mm256_xor_si256( qt[20], qt[13] ) ) );
|
||||
dH[14] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[2], 15 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, 7 ),
|
||||
_mm256_xor_si256( qt[21], qt[14] ) ) );
|
||||
dH[15] = _mm256_add_epi32( _mm256_add_epi32(
|
||||
mm256_rol_32( dH[3], 16 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
|
||||
_mm256_xor_si256( qt[22], qt[15] ) ) );
|
||||
*/
|
||||
}
|
||||
|
||||
static const __m256i final_s8[16] =
|
||||
@@ -1422,40 +1335,35 @@ void compress_small_16way( const __m512i *M, const __m512i H[16],
|
||||
qt[30] = expand2s16( qt, M, H, 30 );
|
||||
qt[31] = expand2s16( qt, M, H, 31 );
|
||||
|
||||
xl = _mm512_xor_si512(
|
||||
mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||
xh = _mm512_xor_si512( xl, _mm512_xor_si512(
|
||||
mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
|
||||
mm512_xor3( qt[19], qt[20], qt[21] ),
|
||||
_mm512_xor_si512( qt[22], qt[23] ) );
|
||||
|
||||
xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
|
||||
mm512_xor3( qt[26], qt[27], qt[28] ),
|
||||
mm512_xor3( qt[29], qt[30], qt[31] ) );
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_slli_epi32( xh, sl ), \
|
||||
_mm512_srli_epi32( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
_mm512_add_epi32( mm512_xor3( M[m], _mm512_slli_epi32( xh, sl ), \
|
||||
_mm512_srli_epi32( qt[a], sr ) ), \
|
||||
mm512_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH1R( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_srli_epi32( xh, sl ), \
|
||||
_mm512_slli_epi32( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
_mm512_add_epi32( mm512_xor3( M[m], _mm512_srli_epi32( xh, sl ), \
|
||||
_mm512_slli_epi32( qt[a], sr ) ), \
|
||||
mm512_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||
_mm512_add_epi32( _mm512_add_epi32( \
|
||||
mm512_rol_32( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
_mm512_xor_si512( _mm512_slli_epi32( xl, sl ), \
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
mm512_rol_32( dH[h], rl ), \
|
||||
mm512_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm512_xor3( _mm512_slli_epi32( xl, sl ), qt[b], qt[c] ) )
|
||||
|
||||
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||
_mm512_add_epi32( _mm512_add_epi32( \
|
||||
mm512_rol_32( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
_mm512_xor_si512( _mm512_srli_epi32( xl, sr ), \
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
mm512_rol_32( dH[h], rl ), \
|
||||
mm512_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm512_xor3( _mm512_srli_epi32( xl, sr ), qt[b], qt[c] ) )
|
||||
|
||||
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
|
||||
|
@@ -1285,40 +1285,35 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
qt[30] = expand2b8( qt, M, H, 30 );
|
||||
qt[31] = expand2b8( qt, M, H, 31 );
|
||||
|
||||
xl = _mm512_xor_si512(
|
||||
mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||
xh = _mm512_xor_si512( xl, _mm512_xor_si512(
|
||||
mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
xl = mm512_xor3( mm512_xor3( qt[16], qt[17], qt[18] ),
|
||||
mm512_xor3( qt[19], qt[20], qt[21] ),
|
||||
_mm512_xor_si512( qt[22], qt[23] ) );
|
||||
|
||||
xh = mm512_xor3( mm512_xor3( xl, qt[24], qt[25] ),
|
||||
mm512_xor3( qt[26], qt[27], qt[28] ),
|
||||
mm512_xor3( qt[29], qt[30], qt[31] ) );
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \
|
||||
_mm512_srli_epi64( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
_mm512_add_epi64( mm512_xor3( M[m], _mm512_slli_epi64( xh, sl ), \
|
||||
_mm512_srli_epi64( qt[a], sr ) ), \
|
||||
mm512_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH1R( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_srli_epi64( xh, sl ), \
|
||||
_mm512_slli_epi64( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
_mm512_add_epi64( mm512_xor3( M[m], _mm512_srli_epi64( xh, sl ), \
|
||||
_mm512_slli_epi64( qt[a], sr ) ), \
|
||||
mm512_xor3( xl, qt[b], qt[c] ) )
|
||||
|
||||
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||
_mm512_add_epi64( _mm512_add_epi64( \
|
||||
mm512_rol_64( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
_mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
|
||||
mm512_rol_64( dH[h], rl ), \
|
||||
mm512_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm512_xor3( _mm512_slli_epi64( xl, sl ), qt[b], qt[c] ) )
|
||||
|
||||
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||
_mm512_add_epi64( _mm512_add_epi64( \
|
||||
mm512_rol_64( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
_mm512_xor_si512( _mm512_srli_epi64( xl, sr ), \
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
mm512_rol_64( dH[h], rl ), \
|
||||
mm512_xor3( xh, qt[a], M[m] ) ), \
|
||||
mm512_xor3( _mm512_srli_epi64( xl, sr ), qt[b], qt[c] ) )
|
||||
|
||||
|
||||
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||
|
@@ -53,6 +53,20 @@ MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x000
|
||||
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
|
||||
|
||||
|
||||
// ECHO_SUBBYTES4(state, j): run two AES rounds over the four entries
// state[0][j] .. state[3][j].
//  - First pass: _mm_aesenc_si128 with k1 as the round key. M128(const1) is
//    added to k1 (32-bit lane add) after every entry, so k1 is advanced four
//    times per expansion.
//  - Second pass: _mm_aesenc_si128 with m128_zero as the round key
//    (presumably an all-zero vector -- confirm against its definition).
// k1 is not a parameter: it must be a modifiable variable in scope at the
// expansion site. The expansion is a bare sequence of statements with no
// trailing semicolon, so it is not safe as the body of an unbraced if/else.
#define ECHO_SUBBYTES4(state, j) \
|
||||
state[0][j] = _mm_aesenc_si128(state[0][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[1][j] = _mm_aesenc_si128(state[1][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[2][j] = _mm_aesenc_si128(state[2][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[3][j] = _mm_aesenc_si128(state[3][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[0][j] = _mm_aesenc_si128(state[0][j], m128_zero ); \
|
||||
state[1][j] = _mm_aesenc_si128(state[1][j], m128_zero ); \
|
||||
state[2][j] = _mm_aesenc_si128(state[2][j], m128_zero ); \
|
||||
state[3][j] = _mm_aesenc_si128(state[3][j], m128_zero )
|
||||
|
||||
#define ECHO_SUBBYTES(state, i, j) \
|
||||
state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
@@ -73,7 +87,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
|
||||
state2[0][j] = mm128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
|
||||
@@ -83,7 +97,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
|
||||
state2[1][j] = mm128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
|
||||
s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
|
||||
@@ -93,10 +107,29 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
|
||||
state2[2][j] = mm128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], s2)
|
||||
|
||||
|
||||
// ECHO_ROUND_UNROLL2: two back-to-back passes, each made of ECHO_SUBBYTES4
// for j = 0..3 followed by ECHO_MIXBYTES for j = 0..3.
// The first pass substitutes _state and mixes it into _state2; the second
// pass substitutes _state2 and mixes it back into _state.
// Takes no arguments: _state, _state2, t1, t2 and s2 (plus whatever the two
// helper macros reference, e.g. k1) must be in scope where it is expanded.
// No semicolon is emitted after the final ECHO_MIXBYTES.
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES4(_state, 0);\
|
||||
ECHO_SUBBYTES4(_state, 1);\
|
||||
ECHO_SUBBYTES4(_state, 2);\
|
||||
ECHO_SUBBYTES4(_state, 3);\
|
||||
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES4(_state2, 0);\
|
||||
ECHO_SUBBYTES4(_state2, 1);\
|
||||
ECHO_SUBBYTES4(_state2, 2);\
|
||||
ECHO_SUBBYTES4(_state2, 3);\
|
||||
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
/*
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES(_state, 0, 0);\
|
||||
ECHO_SUBBYTES(_state, 1, 0);\
|
||||
@@ -138,7 +171,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#define SAVESTATE(dst, src)\
|
||||
|
@@ -13,12 +13,19 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
|
||||
//#define mul2mask m512_const2_64( 0, 0x00001b00 )
|
||||
//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
|
||||
//_mm512_set4_epi32( 0x00001b00, 0, 0, 0 )
|
||||
|
||||
//#define lsbmask m512_const1_32( 0x01010101 )
|
||||
// ECHO_SUBBYTES4(state, j), 512-bit variant: run two AES rounds over the four
// entries state[0][j] .. state[3][j] using _mm512_aesenc_epi128.
//  - First pass: k1 is the round key, and `one` is added to k1 (32-bit lane
//    add) after every entry, so k1 is advanced four times per expansion.
//  - Second pass: m512_zero is the round key (presumably an all-zero
//    vector -- confirm against its definition).
// k1 and `one` are not parameters: both must be in scope at the expansion
// site, and k1 is modified. The expansion is a bare sequence of statements
// with no trailing semicolon.
#define ECHO_SUBBYTES4(state, j) \
|
||||
state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \
|
||||
k1 = _mm512_add_epi32( k1, one ); \
|
||||
state[1][j] = _mm512_aesenc_epi128( state[1][j], k1 ); \
|
||||
k1 = _mm512_add_epi32( k1, one ); \
|
||||
state[2][j] = _mm512_aesenc_epi128( state[2][j], k1 ); \
|
||||
k1 = _mm512_add_epi32( k1, one ); \
|
||||
state[3][j] = _mm512_aesenc_epi128( state[3][j], k1 ); \
|
||||
k1 = _mm512_add_epi32( k1, one ); \
|
||||
state[0][j] = _mm512_aesenc_epi128( state[0][j], m512_zero ); \
|
||||
state[1][j] = _mm512_aesenc_epi128( state[1][j], m512_zero ); \
|
||||
state[2][j] = _mm512_aesenc_epi128( state[2][j], m512_zero ); \
|
||||
state[3][j] = _mm512_aesenc_epi128( state[3][j], m512_zero )
|
||||
|
||||
#define ECHO_SUBBYTES( state, i, j ) \
|
||||
state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
|
||||
@@ -44,8 +51,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 );\
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
|
||||
state2[ 0 ][ j ] = mm512_xor3( state2[ 0 ][ j ], s2, state1[ 1 ][ j1 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
@@ -55,8 +61,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
|
||||
state2[ 1 ][ j ] = mm512_xor3( state2[ 1 ][ j ], s2, state1[ 2 ][ j2 ] ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
|
||||
s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
|
||||
@@ -66,11 +71,29 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \
|
||||
state2[ 2 ][ j ] = mm512_xor3( state2[ 2 ][ j ], s2, state1[ 3 ][ j3] ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
|
||||
} while(0)
|
||||
|
||||
// ECHO_ROUND_UNROLL2, 512-bit variant: two back-to-back passes, each made of
// ECHO_SUBBYTES4 for j = 0..3 followed by ECHO_MIXBYTES for j = 0..3.
// The first pass substitutes _state and mixes it into _state2; the second
// pass substitutes _state2 and mixes it back into _state.
// Takes no arguments: _state, _state2, t1, t2 and s2 (plus whatever the two
// helper macros reference, e.g. k1) must be in scope where it is expanded.
// No semicolon is emitted after the final ECHO_MIXBYTES.
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES4(_state, 0);\
|
||||
ECHO_SUBBYTES4(_state, 1);\
|
||||
ECHO_SUBBYTES4(_state, 2);\
|
||||
ECHO_SUBBYTES4(_state, 3);\
|
||||
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES4(_state2, 0);\
|
||||
ECHO_SUBBYTES4(_state2, 1);\
|
||||
ECHO_SUBBYTES4(_state2, 2);\
|
||||
ECHO_SUBBYTES4(_state2, 3);\
|
||||
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
/*
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES(_state, 0, 0);\
|
||||
ECHO_SUBBYTES(_state, 1, 0);\
|
||||
@@ -112,6 +135,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
*/
|
||||
|
||||
#define SAVESTATE(dst, src)\
|
||||
dst[0][0] = src[0][0];\
|
||||
@@ -405,6 +429,20 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
|
||||
#define lsbmask_2way m256_const1_32( 0x01010101 )
|
||||
|
||||
// ECHO_SUBBYTES4_2WAY(state, j): 256-bit variant. Runs two AES rounds over the
// four entries state[0][j] .. state[3][j] using _mm256_aesenc_epi128.
//  - First pass: k1 is the round key, and m256_one_128 is added to k1
//    (32-bit lane add) after every entry, so k1 is advanced four times per
//    expansion.
//  - Second pass: m256_zero is the round key (presumably an all-zero
//    vector -- confirm against its definition).
// k1 is not a parameter: it must be a modifiable variable in scope at the
// expansion site. The expansion is a bare sequence of statements with no
// trailing semicolon.
#define ECHO_SUBBYTES4_2WAY( state, j ) \
|
||||
state[0][j] = _mm256_aesenc_epi128( state[0][j], k1 ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||
state[1][j] = _mm256_aesenc_epi128( state[1][j], k1 ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||
state[2][j] = _mm256_aesenc_epi128( state[2][j], k1 ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||
state[3][j] = _mm256_aesenc_epi128( state[3][j], k1 ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||
state[0][j] = _mm256_aesenc_epi128( state[0][j], m256_zero ); \
|
||||
state[1][j] = _mm256_aesenc_epi128( state[1][j], m256_zero ); \
|
||||
state[2][j] = _mm256_aesenc_epi128( state[2][j], m256_zero ); \
|
||||
state[3][j] = _mm256_aesenc_epi128( state[3][j], m256_zero )
|
||||
|
||||
#define ECHO_SUBBYTES_2WAY( state, i, j ) \
|
||||
state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 ); \
|
||||
@@ -456,6 +494,25 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
|
||||
} while(0)
|
||||
|
||||
// ECHO_ROUND_UNROLL2_2WAY: two back-to-back passes, each made of
// ECHO_SUBBYTES4_2WAY for j = 0..3 followed by ECHO_MIXBYTES_2WAY for
// j = 0..3. The first pass substitutes _state and mixes it into _state2; the
// second pass substitutes _state2 and mixes it back into _state.
// Takes no arguments: _state, _state2, t1, t2 and s2 (plus whatever the two
// helper macros reference, e.g. k1) must be in scope where it is expanded.
// No semicolon is emitted after the final ECHO_MIXBYTES_2WAY.
#define ECHO_ROUND_UNROLL2_2WAY \
|
||||
ECHO_SUBBYTES4_2WAY(_state, 0);\
|
||||
ECHO_SUBBYTES4_2WAY(_state, 1);\
|
||||
ECHO_SUBBYTES4_2WAY(_state, 2);\
|
||||
ECHO_SUBBYTES4_2WAY(_state, 3);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES4_2WAY(_state2, 0);\
|
||||
ECHO_SUBBYTES4_2WAY(_state2, 1);\
|
||||
ECHO_SUBBYTES4_2WAY(_state2, 2);\
|
||||
ECHO_SUBBYTES4_2WAY(_state2, 3);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
/*
|
||||
#define ECHO_ROUND_UNROLL2_2WAY \
|
||||
ECHO_SUBBYTES_2WAY(_state, 0, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 1, 0);\
|
||||
@@ -497,6 +554,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
|
||||
*/
|
||||
|
||||
#define SAVESTATE_2WAY(dst, src)\
|
||||
dst[0][0] = src[0][0];\
|
||||
|
@@ -124,7 +124,16 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
|
||||
s7 = _mm_xor_si128(s7, t1)
|
||||
|
||||
// PRESUPERMIX(t0, t1, t2, t3, t4): derive the mixing inputs from t0.
// All five arguments are assigned to, so each must be a modifiable lvalue.
// After expansion:
//   t2 = original t0
//   t1 = idx = (t0 >> 6 within each 16-bit lane) & _lsbmask2
//   t3 = (t0 + t0, per byte) ^ shuffle(_mul2mask, idx)
//   t4 = 4 * original t0, per byte (intermediate, left in place)
//   t0 = t4 ^ shuffle(_mul4mask, idx)
// NOTE(review): this reads like a table-assisted GF(2^8) multiply by 2 and
// by 4 -- confirm against the mask tables before relying on that reading.
// No trailing semicolon is emitted.
#define PRESUPERMIX(t0, t1, t2, t3, t4)\
|
||||
t2 = t0;\
|
||||
t3 = _mm_add_epi8(t0, t0);\
|
||||
t4 = _mm_add_epi8(t3, t3);\
|
||||
t1 = _mm_srli_epi16(t0, 6);\
|
||||
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
|
||||
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
|
||||
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
|
||||
|
||||
/*
|
||||
#define PRESUPERMIX(x, t1, s1, s2, t2)\
|
||||
s1 = x;\
|
||||
s2 = _mm_add_epi8(x, x);\
|
||||
@@ -133,37 +142,59 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
|
||||
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
|
||||
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
|
||||
*/
|
||||
|
||||
#define SUBSTITUTE(r0, _t1, _t2, _t3, _t0)\
|
||||
#define SUBSTITUTE(r0, _t2 )\
|
||||
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
|
||||
_t2 = _mm_aesenclast_si128( _t2, m128_zero )
|
||||
|
||||
|
||||
#define SUPERMIX(t0, t1, t2, t3, t4)\
|
||||
t2 = t0;\
|
||||
t3 = _mm_add_epi8(t0, t0);\
|
||||
t4 = _mm_add_epi8(t3, t3);\
|
||||
t1 = _mm_srli_epi16(t0, 6);\
|
||||
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
|
||||
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
|
||||
t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
|
||||
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
|
||||
t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
|
||||
t2 = mm128_xor3(t2, t3, t0 );\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
|
||||
t4 = mm128_xor3( t4, t1, t2 ); \
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
|
||||
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
|
||||
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
|
||||
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
|
||||
t4 = mm128_xor3( t4, t2, t1 ); \
|
||||
t0 = _mm_xor_si128(t0, t3);\
|
||||
t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
|
||||
|
||||
/*
|
||||
#define SUPERMIX(t0, t1, t2, t3, t4)\
|
||||
PRESUPERMIX(t0, t1, t2, t3, t4);\
|
||||
POSTSUPERMIX(t0, t1, t2, t3, t4)
|
||||
|
||||
*/
|
||||
|
||||
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
|
||||
t1 = t2;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1b));\
|
||||
t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
|
||||
t4 = t1;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = t4;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1d));\
|
||||
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = t2;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix1a));\
|
||||
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t2 = _mm_xor_si128(t2, t3);\
|
||||
t2 = _mm_xor_si128(t2, t0);\
|
||||
t2 = mm128_xor3(t2, t3, t0 );\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
|
||||
t4 = _mm_xor_si128(t4, t2);\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
|
||||
t4 = _mm_xor_si128(t4, t2);\
|
||||
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
|
||||
t1 = t0;\
|
||||
t1 = _mm_shuffle_epi8(t1, M128(_supermix4a));\
|
||||
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
|
||||
t0 = _mm_xor_si128(t0, t3);\
|
||||
@@ -171,59 +202,55 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
|
||||
t4 = _mm_xor_si128(t4, t0)
|
||||
|
||||
|
||||
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
|
||||
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
|
||||
PACK_S0(r1c, r1a, _t0);\
|
||||
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE(r1c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
|
||||
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
|
||||
r2c = _mm_xor_si128(r2c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r2d = _mm_xor_si128(r2d, _t0);\
|
||||
UNPACK_S0(r1c, r1a, _t3);\
|
||||
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE(r2c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
|
||||
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
|
||||
r3c = _mm_xor_si128(r3c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r3d = _mm_xor_si128(r3d, _t0);\
|
||||
UNPACK_S0(r2c, r2a, _t3);\
|
||||
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE(r3c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
|
||||
UNPACK_S0(r3c, r3a, _t3)
|
||||
|
||||
|
||||
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
|
||||
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
|
||||
PACK_S0(r1c, r1a, _t0);\
|
||||
SUBSTITUTE(r1c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE( r1c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
|
||||
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
|
||||
r2c = _mm_xor_si128(r2c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r2d = _mm_xor_si128(r2d, _t0);\
|
||||
UNPACK_S0(r1c, r1a, _t3);\
|
||||
SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE(r2c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
|
||||
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
|
||||
r3c = _mm_xor_si128(r3c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r3d = _mm_xor_si128(r3d, _t0);\
|
||||
UNPACK_S0(r2c, r2a, _t3);\
|
||||
SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE( r3c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
|
||||
_t0 = _mm_shuffle_epi32(r3c, 0x39);\
|
||||
r4c = _mm_xor_si128(r4c, _t0);\
|
||||
_t0 = mm128_mask_32( _t0, 8 ); \
|
||||
r4d = _mm_xor_si128(r4d, _t0);\
|
||||
UNPACK_S0(r3c, r3a, _t3);\
|
||||
SUBSTITUTE(r4c, _t1, _t2, _t3, _t0);\
|
||||
SUBSTITUTE( r4c, _t2 );\
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
|
||||
UNPACK_S0(r4c, r4a, _t3)
|
||||
|
||||
|
||||
|
||||
#define LOADCOLUMN(x, s, a)\
|
||||
block[0] = col[(base + a + 0) % s];\
|
||||
block[1] = col[(base + a + 1) % s];\
|
||||
@@ -247,14 +274,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
|
||||
case 1:
|
||||
TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4],
|
||||
ctx->state[5], ctx->state[ 6], ctx->state[8],
|
||||
ctx->state[9], ctx->state[10], _t0, _t1, _t2 );
|
||||
ctx->state[9], ctx->state[10], _t0, _t1, _t2 );
|
||||
|
||||
SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7],
|
||||
SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7],
|
||||
ctx->state[1], ctx->state[7], ctx->state[8],
|
||||
ctx->state[6], ctx->state[0], ctx->state[6],
|
||||
ctx->state[7], ctx->state[5], ctx->state[11],
|
||||
ctx->state[5], ctx->state[6], ctx->state[4],
|
||||
ctx->state[10] );
|
||||
ctx->state[6], ctx->state[0], ctx->state[6],
|
||||
ctx->state[7], ctx->state[5], ctx->state[11],
|
||||
ctx->state[5], ctx->state[6], ctx->state[4],
|
||||
ctx->state[10] );
|
||||
ctx->base++;
|
||||
pmsg += 4;
|
||||
uBlockCount--;
|
||||
@@ -263,14 +290,14 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
|
||||
case 2:
|
||||
TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0],
|
||||
ctx->state[ 1], ctx->state[2], ctx->state[4],
|
||||
ctx->state[ 5], ctx->state[6], _t0, _t1, _t2);
|
||||
ctx->state[ 5], ctx->state[6], _t0, _t1, _t2);
|
||||
|
||||
SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3],
|
||||
ctx->state[9], ctx->state[3], ctx->state[4],
|
||||
ctx->state[2], ctx->state[8], ctx->state[2],
|
||||
ctx->state[3], ctx->state[1], ctx->state[7],
|
||||
ctx->state[1], ctx->state[2], ctx->state[0],
|
||||
ctx->state[6]);
|
||||
ctx->state[2], ctx->state[8], ctx->state[2],
|
||||
ctx->state[3], ctx->state[1], ctx->state[7],
|
||||
ctx->state[1], ctx->state[2], ctx->state[0],
|
||||
ctx->state[6]);
|
||||
|
||||
ctx->base = 0;
|
||||
pmsg += 4;
|
||||
@@ -278,44 +305,42 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
while( uBlockCount > 0 )
|
||||
{
|
||||
TIX512( pmsg, ctx->state[ 7], ctx->state[2], ctx->state[8], ctx->state[9],
|
||||
ctx->state[10], ctx->state[0], ctx->state[1], ctx->state[2],
|
||||
_t0, _t1, _t2 );
|
||||
SUBROUND512_4( ctx->state[0], ctx->state[1], ctx->state[11],
|
||||
ctx->state[5], ctx->state[11], ctx->state[0],
|
||||
ctx->state[10], ctx->state[4], ctx->state[10],
|
||||
ctx->state[11], ctx->state[9], ctx->state[3],
|
||||
ctx->state[9], ctx->state[10], ctx->state[8],
|
||||
ctx->state[2] );
|
||||
TIX512( pmsg, ctx->state[ 7],ctx->state[2],ctx->state[8],ctx->state[9],
|
||||
ctx->state[10],ctx->state[0],ctx->state[1],ctx->state[2],
|
||||
_t0, _t1, _t2 );
|
||||
SUBROUND512_4( ctx->state[0], ctx->state[1],ctx->state[11],ctx->state[5],
|
||||
ctx->state[11],ctx->state[0],ctx->state[10],ctx->state[4],
|
||||
ctx->state[10],ctx->state[11],ctx->state[9],ctx->state[3],
|
||||
ctx->state[9],ctx->state[10],ctx->state[8],ctx->state[2] );
|
||||
|
||||
ctx->base++;
|
||||
pmsg += 4;
|
||||
uBlockCount--;
|
||||
if( uBlockCount == 0 ) break;
|
||||
|
||||
TIX512( pmsg, ctx->state[3], ctx->state[10], ctx->state[4], ctx->state[5],
|
||||
ctx->state[6], ctx->state[8], ctx->state[9], ctx->state[10],
|
||||
_t0, _t1, _t2 );
|
||||
TIX512( pmsg, ctx->state[3],ctx->state[10],ctx->state[4],ctx->state[5],
|
||||
ctx->state[6],ctx->state[8], ctx->state[9],ctx->state[10],
|
||||
_t0, _t1, _t2 );
|
||||
|
||||
SUBROUND512_4( ctx->state[8], ctx->state[9], ctx->state[7], ctx->state[1], ctx->state[7], ctx->state[8], ctx->state[6], ctx->state[0],
|
||||
ctx->state[6], ctx->state[7], ctx->state[5], ctx->state[11],
|
||||
ctx->state[5], ctx->state[6, ctx->state[4], ctx->state[10]);
|
||||
SUBROUND512_4( ctx->state[8],ctx->state[9],ctx->state[7],ctx->state[1],
|
||||
ctx->state[7],ctx->state[8],ctx->state[6],ctx->state[0],
|
||||
ctx->state[6],ctx->state[7],ctx->state[5],ctx->state[11],
|
||||
ctx->state[5],ctx->state[6],ctx->state[4],ctx->state[10] );
|
||||
|
||||
ctx->base++;
|
||||
pmsg += 4;
|
||||
uBlockCount--;
|
||||
if( uBlockCount == 0 ) break;
|
||||
|
||||
TIX512( pmsg, ctx->state[11], ctx->state[6], ctx->state[0], ctx->state[1],
|
||||
ctx->state[2], ctx->state[4], ctx->state[5], ctx->state[6],
|
||||
_t0, _t1, _t2);
|
||||
SUBROUND512_4( ctx->state[4], ctx->state[5], ctx->state[3], ctx->state[9],
|
||||
ctx->state[3], ctx->state[4], ctx->state[2], ctx->state[8],
|
||||
ctx->state[2], ctx->state[3], ctx->state[1], ctx->state[7],
|
||||
ctx->state[1], ctx->state[2], ctx->state[0], ctx->state[6]);
|
||||
TIX512( pmsg, ctx->state[11],ctx->state[6],ctx->state[0],ctx->state[1],
|
||||
ctx->state[2], ctx->state[4],ctx->state[5],ctx->state[6],
|
||||
_t0, _t1, _t2);
|
||||
SUBROUND512_4( ctx->state[4],ctx->state[5],ctx->state[3],ctx->state[9],
|
||||
ctx->state[3],ctx->state[4],ctx->state[2],ctx->state[8],
|
||||
ctx->state[2],ctx->state[3],ctx->state[1],ctx->state[7],
|
||||
ctx->state[1],ctx->state[2],ctx->state[0],ctx->state[6]);
|
||||
|
||||
ctx->base = 0;
|
||||
pmsg += 4;
|
||||
@@ -326,8 +351,8 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
|
||||
|
||||
void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
{
|
||||
unsigned int block[4] __attribute__ ((aligned (32)));
|
||||
unsigned int col[36] __attribute__ ((aligned (16)));
|
||||
unsigned int block[4] __attribute__ ((aligned (32)));
|
||||
unsigned int col[36] __attribute__ ((aligned (16)));
|
||||
unsigned int i, base;
|
||||
__m128i r0, _t0, _t1, _t2, _t3;
|
||||
|
||||
@@ -357,7 +382,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUBSTITUTE(r0, _t2);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
}
|
||||
@@ -375,7 +400,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUBSTITUTE(r0, _t2);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
|
||||
@@ -390,7 +415,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUBSTITUTE(r0, _t2);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
|
||||
@@ -405,7 +430,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUBSTITUTE(r0, _t2);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
|
||||
@@ -420,7 +445,7 @@ void Final512(hashState_fugue *ctx, BitSequence *hashval)
|
||||
|
||||
// SMIX
|
||||
LOADCOLUMN(r0, 36, 0);
|
||||
SUBSTITUTE(r0, _t1, _t2, _t3, _t0);
|
||||
SUBSTITUTE(r0, _t2);
|
||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
||||
STORECOLUMN(r0, 36);
|
||||
}
|
||||
|
@@ -67,11 +67,9 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm_xor_si128(j, j);\
|
||||
j = _mm_cmpgt_epi8(j, i);\
|
||||
j = _mm_cmpgt_epi8( m128_zero, i);\
|
||||
i = _mm_add_epi8(i, i);\
|
||||
j = _mm_and_si128(j, k);\
|
||||
i = _mm_xor_si128(i, j);\
|
||||
i = mm128_xorand(i, j, k );\
|
||||
}
|
||||
|
||||
/**/
|
||||
@@ -93,6 +91,96 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
b1 = a3;\
|
||||
TEMP2 = _mm_xor_si128(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
TEMP0 = mm128_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP1 = mm128_xor3( b1, a5, a7 );\
|
||||
b2 = mm128_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b3 = mm128_xor3( b3, a7, a1 ); \
|
||||
b1 = a1;\
|
||||
b6 = mm128_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm128_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm128_xor3( b7, a5, a3 ); \
|
||||
b5 = mm128_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(TEMP2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#else
|
||||
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
@@ -189,6 +277,8 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* one round
|
||||
* a0-a7 = input rows
|
||||
|
@@ -58,11 +58,9 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm_xor_si128(j, j);\
|
||||
j = _mm_cmpgt_epi8(j, i);\
|
||||
j = _mm_cmpgt_epi8( m128_zero, i);\
|
||||
i = _mm_add_epi8(i, i);\
|
||||
j = _mm_and_si128(j, k);\
|
||||
i = _mm_xor_si128(i, j);\
|
||||
i = mm128_xorand(i, j, k );\
|
||||
}
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
@@ -82,6 +80,96 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
b1 = a3;\
|
||||
TEMP2 = _mm_xor_si128(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
TEMP0 = mm128_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP1 = mm128_xor3( b1, a5, a7 );\
|
||||
b2 = mm128_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b3 = mm128_xor3( b3, a7, a1 ); \
|
||||
b1 = a1;\
|
||||
b6 = mm128_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm128_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm128_xor3( b7, a5, a3 ); \
|
||||
b5 = mm128_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(TEMP2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#else
|
||||
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
@@ -178,6 +266,8 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#endif
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
|
@@ -96,11 +96,9 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm512_xor_si512(j, j);\
|
||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
|
||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
|
||||
i = _mm512_add_epi8(i, i);\
|
||||
j = _mm512_and_si512(j, k);\
|
||||
i = _mm512_xor_si512(i, j);\
|
||||
i = mm512_xorand( i, j, k );\
|
||||
}
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
@@ -120,6 +118,95 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
|
||||
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
|
||||
b0, b1, b2, b3, b4, b5, b6, b7) { \
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0; \
|
||||
b7 = a1; \
|
||||
a0 = _mm512_xor_si512( a0, a1 ); \
|
||||
b0 = a2; \
|
||||
a1 = _mm512_xor_si512( a1, a2 ); \
|
||||
b1 = a3; \
|
||||
TEMP2 = _mm512_xor_si512( a2, a3 ); \
|
||||
b2 = a4; \
|
||||
a3 = _mm512_xor_si512( a3, a4 ); \
|
||||
b3 = a5; \
|
||||
a4 = _mm512_xor_si512( a4, a5 );\
|
||||
b4 = a6; \
|
||||
a5 = _mm512_xor_si512( a5, a6 ); \
|
||||
b5 = a7; \
|
||||
a6 = _mm512_xor_si512( a6, a7 ); \
|
||||
a7 = _mm512_xor_si512( a7, b6 ); \
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
TEMP0 = mm512_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP1 = mm512_xor3( b1, a5, a7 ); \
|
||||
b2 = mm512_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0; \
|
||||
b3 = mm512_xor3( b3, a7, a1 ); \
|
||||
b1 = a1; \
|
||||
b6 = mm512_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm512_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm512_xor3( b7, a5, a3 ); \
|
||||
b5 = mm512_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm512_xor_si512( a0, a3 ); \
|
||||
a1 = _mm512_xor_si512( a1, a4 ); \
|
||||
a2 = _mm512_xor_si512( TEMP2, a5 ); \
|
||||
a3 = _mm512_xor_si512( a3, a6 ); \
|
||||
a4 = _mm512_xor_si512( a4, a7 ); \
|
||||
a5 = _mm512_xor_si512( a5, b0 ); \
|
||||
a6 = _mm512_xor_si512( a6, b1 ); \
|
||||
a7 = _mm512_xor_si512( a7, TEMP2 ); \
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
|
||||
MUL2( a0, b0, b1 ); \
|
||||
a0 = _mm512_xor_si512( a0, TEMP0 ); \
|
||||
MUL2( a1, b0, b1 ); \
|
||||
a1 = _mm512_xor_si512( a1, TEMP1 ); \
|
||||
MUL2( a2, b0, b1 ); \
|
||||
a2 = _mm512_xor_si512( a2, b2 ); \
|
||||
MUL2( a3, b0, b1 ); \
|
||||
a3 = _mm512_xor_si512( a3, b3 ); \
|
||||
MUL2( a4, b0, b1 ); \
|
||||
a4 = _mm512_xor_si512( a4, b4 ); \
|
||||
MUL2( a5, b0, b1 ); \
|
||||
a5 = _mm512_xor_si512( a5, b5 ); \
|
||||
MUL2( a6, b0, b1 ); \
|
||||
a6 = _mm512_xor_si512( a6, b6 ); \
|
||||
MUL2( a7, b0, b1 ); \
|
||||
a7 = _mm512_xor_si512( a7, b7 ); \
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2( a0, b0, b1 ); \
|
||||
b5 = _mm512_xor_si512( b5, a0 ); \
|
||||
MUL2( a1, b0, b1 ); \
|
||||
b6 = _mm512_xor_si512( b6, a1 ); \
|
||||
MUL2( a2, b0, b1 ); \
|
||||
b7 = _mm512_xor_si512( b7, a2 ); \
|
||||
MUL2( a5, b0, b1 ); \
|
||||
b2 = _mm512_xor_si512( b2, a5 ); \
|
||||
MUL2( a6, b0, b1 ); \
|
||||
b3 = _mm512_xor_si512( b3, a6 ); \
|
||||
MUL2( a7, b0, b1 ); \
|
||||
b4 = _mm512_xor_si512( b4, a7 ); \
|
||||
MUL2( a3, b0, b1 ); \
|
||||
MUL2( a4, b0, b1 ); \
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm512_xor_si512( b0, a3 ); \
|
||||
b1 = _mm512_xor_si512( b1, a4 ); \
|
||||
}/*MixBytes*/
|
||||
|
||||
|
||||
#if 0
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
@@ -215,7 +302,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
b0 = _mm512_xor_si512(b0, a3);\
|
||||
b1 = _mm512_xor_si512(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#endif
|
||||
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
|
@@ -104,11 +104,9 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm512_xor_si512(j, j);\
|
||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
|
||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
|
||||
i = _mm512_add_epi8(i, i);\
|
||||
j = _mm512_and_si512(j, k);\
|
||||
i = _mm512_xor_si512(i, j);\
|
||||
i = mm512_xorand( i, j, k );\
|
||||
}
|
||||
|
||||
/**/
|
||||
@@ -130,100 +128,90 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
|
||||
b0, b1, b2, b3, b4, b5, b6, b7) { \
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm512_xor_si512(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm512_xor_si512(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm512_xor_si512(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm512_xor_si512(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm512_xor_si512(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm512_xor_si512(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm512_xor_si512(a6, a7);\
|
||||
a7 = _mm512_xor_si512(a7, b6);\
|
||||
b6 = a0; \
|
||||
b7 = a1; \
|
||||
a0 = _mm512_xor_si512( a0, a1 ); \
|
||||
b0 = a2; \
|
||||
a1 = _mm512_xor_si512( a1, a2 ); \
|
||||
b1 = a3; \
|
||||
TEMP2 = _mm512_xor_si512( a2, a3 ); \
|
||||
b2 = a4; \
|
||||
a3 = _mm512_xor_si512( a3, a4 ); \
|
||||
b3 = a5; \
|
||||
a4 = _mm512_xor_si512( a4, a5 );\
|
||||
b4 = a6; \
|
||||
a5 = _mm512_xor_si512( a5, a6 ); \
|
||||
b5 = a7; \
|
||||
a6 = _mm512_xor_si512( a6, a7 ); \
|
||||
a7 = _mm512_xor_si512( a7, b6 ); \
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm512_xor_si512(b0, a4);\
|
||||
b6 = _mm512_xor_si512(b6, a4);\
|
||||
b1 = _mm512_xor_si512(b1, a5);\
|
||||
b7 = _mm512_xor_si512(b7, a5);\
|
||||
b2 = _mm512_xor_si512(b2, a6);\
|
||||
b0 = _mm512_xor_si512(b0, a6);\
|
||||
TEMP0 = mm512_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm512_xor_si512(b3, a7);\
|
||||
b1 = _mm512_xor_si512(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm512_xor_si512(b4, a0);\
|
||||
b2 = _mm512_xor_si512(b2, a0);\
|
||||
TEMP1 = mm512_xor3( b1, a5, a7 ); \
|
||||
b2 = mm512_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm512_xor_si512(b5, a1);\
|
||||
b3 = _mm512_xor_si512(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm512_xor_si512(b6, a2);\
|
||||
b4 = _mm512_xor_si512(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm512_xor_si512(b7, a3);\
|
||||
b5 = _mm512_xor_si512(b5, a3);\
|
||||
b0 = a0; \
|
||||
b3 = mm512_xor3( b3, a7, a1 ); \
|
||||
b1 = a1; \
|
||||
b6 = mm512_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm512_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm512_xor3( b7, a5, a3 ); \
|
||||
b5 = mm512_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm512_xor_si512(a0, a3);\
|
||||
a1 = _mm512_xor_si512(a1, a4);\
|
||||
a2 = _mm512_xor_si512(a2, a5);\
|
||||
a3 = _mm512_xor_si512(a3, a6);\
|
||||
a4 = _mm512_xor_si512(a4, a7);\
|
||||
a5 = _mm512_xor_si512(a5, b0);\
|
||||
a6 = _mm512_xor_si512(a6, b1);\
|
||||
a7 = _mm512_xor_si512(a7, TEMP2);\
|
||||
a0 = _mm512_xor_si512( a0, a3 ); \
|
||||
a1 = _mm512_xor_si512( a1, a4 ); \
|
||||
a2 = _mm512_xor_si512( TEMP2, a5 ); \
|
||||
a3 = _mm512_xor_si512( a3, a6 ); \
|
||||
a4 = _mm512_xor_si512( a4, a7 ); \
|
||||
a5 = _mm512_xor_si512( a5, b0 ); \
|
||||
a6 = _mm512_xor_si512( a6, b1 ); \
|
||||
a7 = _mm512_xor_si512( a7, TEMP2 ); \
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm512_xor_si512(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm512_xor_si512(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm512_xor_si512(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm512_xor_si512(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm512_xor_si512(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm512_xor_si512(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm512_xor_si512(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm512_xor_si512(a7, b7);\
|
||||
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
|
||||
MUL2( a0, b0, b1 ); \
|
||||
a0 = _mm512_xor_si512( a0, TEMP0 ); \
|
||||
MUL2( a1, b0, b1 ); \
|
||||
a1 = _mm512_xor_si512( a1, TEMP1 ); \
|
||||
MUL2( a2, b0, b1 ); \
|
||||
a2 = _mm512_xor_si512( a2, b2 ); \
|
||||
MUL2( a3, b0, b1 ); \
|
||||
a3 = _mm512_xor_si512( a3, b3 ); \
|
||||
MUL2( a4, b0, b1 ); \
|
||||
a4 = _mm512_xor_si512( a4, b4 ); \
|
||||
MUL2( a5, b0, b1 ); \
|
||||
a5 = _mm512_xor_si512( a5, b5 ); \
|
||||
MUL2( a6, b0, b1 ); \
|
||||
a6 = _mm512_xor_si512( a6, b6 ); \
|
||||
MUL2( a7, b0, b1 ); \
|
||||
a7 = _mm512_xor_si512( a7, b7 ); \
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm512_xor_si512(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm512_xor_si512(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm512_xor_si512(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm512_xor_si512(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm512_xor_si512(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm512_xor_si512(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
MUL2( a0, b0, b1 ); \
|
||||
b5 = _mm512_xor_si512( b5, a0 ); \
|
||||
MUL2( a1, b0, b1 ); \
|
||||
b6 = _mm512_xor_si512( b6, a1 ); \
|
||||
MUL2( a2, b0, b1 ); \
|
||||
b7 = _mm512_xor_si512( b7, a2 ); \
|
||||
MUL2( a5, b0, b1 ); \
|
||||
b2 = _mm512_xor_si512( b2, a5 ); \
|
||||
MUL2( a6, b0, b1 ); \
|
||||
b3 = _mm512_xor_si512( b3, a6 ); \
|
||||
MUL2( a7, b0, b1 ); \
|
||||
b4 = _mm512_xor_si512( b4, a7 ); \
|
||||
MUL2( a3, b0, b1 ); \
|
||||
MUL2( a4, b0, b1 ); \
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm512_xor_si512(b0, a3);\
|
||||
b1 = _mm512_xor_si512(b1, a4);\
|
||||
b0 = _mm512_xor_si512( b0, a3 ); \
|
||||
b1 = _mm512_xor_si512( b1, a4 ); \
|
||||
}/*MixBytes*/
|
||||
|
||||
/* one round
|
||||
@@ -709,11 +697,9 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2_2WAY(i, j, k){\
|
||||
j = _mm256_xor_si256(j, j);\
|
||||
j = _mm256_cmpgt_epi8(j, i );\
|
||||
j = _mm256_cmpgt_epi8( m256_zero, i );\
|
||||
i = _mm256_add_epi8(i, i);\
|
||||
j = _mm256_and_si256(j, k);\
|
||||
i = _mm256_xor_si256(i, j);\
|
||||
i = mm256_xorand( i, j, k );\
|
||||
}
|
||||
|
||||
#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
|
@@ -44,6 +44,7 @@ void myriad_8way_hash( void *output, const void *input )
|
||||
|
||||
rintrlv_8x64_4x128( vhashA, vhashB, input, 640 );
|
||||
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(groestl512_4way_context) );
|
||||
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 );
|
||||
|
||||
uint32_t hash0[20] __attribute__ ((aligned (64)));
|
||||
@@ -58,8 +59,6 @@ void myriad_8way_hash( void *output, const void *input )
|
||||
// rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
|
||||
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
|
||||
hash6, hash7 );
|
||||
|
||||
#else
|
||||
|
||||
@@ -76,27 +75,27 @@ void myriad_8way_hash( void *output, const void *input )
|
||||
hash4, hash5, hash6, hash7, input, 640 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 );
|
||||
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
|
||||
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, 512 );
|
||||
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
|
||||
#endif
|
||||
|
||||
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
|
||||
hash6, hash7 );
|
||||
|
||||
sha256_8way_update( &ctx.sha, vhash, 64 );
|
||||
sha256_8way_close( &ctx.sha, output );
|
||||
}
|
||||
|
@@ -560,22 +560,14 @@ do { \
|
||||
__m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
|
||||
dm = mm512_negate_32( _mm512_or_si512( dm, \
|
||||
_mm512_slli_epi64( dm, 32 ) ) ); \
|
||||
m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[0] ) ) ); \
|
||||
m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[1] ) ) ); \
|
||||
m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[2] ) ) ); \
|
||||
m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[3] ) ) ); \
|
||||
m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[4] ) ) ); \
|
||||
m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[5] ) ) ); \
|
||||
m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[6] ) ) ); \
|
||||
m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \
|
||||
m512_const1_64( tp[7] ) ) ); \
|
||||
m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \
|
||||
m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \
|
||||
m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \
|
||||
m3 = mm512_xorand( m3, dm, m512_const1_64( tp[3] ) ); \
|
||||
m4 = mm512_xorand( m4, dm, m512_const1_64( tp[4] ) ); \
|
||||
m5 = mm512_xorand( m5, dm, m512_const1_64( tp[5] ) ); \
|
||||
m6 = mm512_xorand( m6, dm, m512_const1_64( tp[6] ) ); \
|
||||
m7 = mm512_xorand( m7, dm, m512_const1_64( tp[7] ) ); \
|
||||
tp += 8; \
|
||||
db = _mm512_srli_epi64( db, 1 ); \
|
||||
} \
|
||||
@@ -585,20 +577,13 @@ do { \
|
||||
do { \
|
||||
__m512i t; \
|
||||
t = a; \
|
||||
a = _mm512_and_si512( a, c ); \
|
||||
a = _mm512_xor_si512( a, d ); \
|
||||
c = _mm512_xor_si512( c, b ); \
|
||||
c = _mm512_xor_si512( c, a ); \
|
||||
d = _mm512_or_si512( d, t ); \
|
||||
d = _mm512_xor_si512( d, b ); \
|
||||
a = mm512_xorand( d, a, c ); \
|
||||
c = mm512_xor3( a, b, c ); \
|
||||
b = mm512_xoror( b, d, t ); \
|
||||
t = _mm512_xor_si512( t, c ); \
|
||||
b = d; \
|
||||
d = _mm512_or_si512( d, t ); \
|
||||
d = _mm512_xor_si512( d, a ); \
|
||||
a = _mm512_and_si512( a, b ); \
|
||||
t = _mm512_xor_si512( t, a ); \
|
||||
b = _mm512_xor_si512( b, d ); \
|
||||
b = _mm512_xor_si512( b, t ); \
|
||||
d = mm512_xoror( a, b, t ); \
|
||||
t = mm512_xorand( t, a, b ); \
|
||||
b = mm512_xor3( b, d, t ); \
|
||||
a = c; \
|
||||
c = b; \
|
||||
b = d; \
|
||||
@@ -609,14 +594,12 @@ do { \
|
||||
do { \
|
||||
a = mm512_rol_32( a, 13 ); \
|
||||
c = mm512_rol_32( c, 3 ); \
|
||||
b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \
|
||||
d = _mm512_xor_si512( d, _mm512_xor_si512( c, \
|
||||
_mm512_slli_epi32( a, 3 ) ) ); \
|
||||
b = mm512_xor3( a, b, c ); \
|
||||
d = mm512_xor3( d, c, _mm512_slli_epi32( a, 3 ) ); \
|
||||
b = mm512_rol_32( b, 1 ); \
|
||||
d = mm512_rol_32( d, 7 ); \
|
||||
a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \
|
||||
c = _mm512_xor_si512( c, _mm512_xor_si512( d, \
|
||||
_mm512_slli_epi32( b, 7 ) ) ); \
|
||||
a = mm512_xor3( a, b, d ); \
|
||||
c = mm512_xor3( c, d, _mm512_slli_epi32( b, 7 ) ); \
|
||||
a = mm512_rol_32( a, 5 ); \
|
||||
c = mm512_rol_32( c, 22 ); \
|
||||
} while (0)
|
||||
|
@@ -522,50 +522,53 @@ do { \
|
||||
|
||||
// Haval-256 8 way 32 bit avx2
|
||||
|
||||
#if defined (__AVX512VL__)
|
||||
|
||||
// ( ~( a ^ b ) ) & c
|
||||
#define mm256_andnotxor( a, b, c ) \
|
||||
_mm256_ternarylogic_epi32( a, b, c, 0x82 )
|
||||
|
||||
#else
|
||||
|
||||
#define mm256_andnotxor( a, b, c ) \
|
||||
_mm256_andnot_si256( _mm256_xor_si256( a, b ), c )
|
||||
|
||||
#endif
|
||||
|
||||
#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm256_xor_si256( x0, \
|
||||
_mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \
|
||||
_mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
|
||||
_mm256_and_si256( x3, x6 ) ) ) ) \
|
||||
mm256_xor3( x0, mm256_andxor( x1, x0, x4 ), \
|
||||
_mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
|
||||
_mm256_and_si256( x3, x6 ) ) ) \
|
||||
|
||||
#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_and_si256( x2, \
|
||||
_mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \
|
||||
_mm256_xor_si256( _mm256_and_si256( x4, x5 ), \
|
||||
_mm256_xor_si256( x6, x0 ) ) ) ), \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \
|
||||
_mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \
|
||||
mm256_xor3( mm256_andxor( x2, _mm256_andnot_si256( x3, x1 ), \
|
||||
mm256_xor3( _mm256_and_si256( x4, x5 ), x6, x0 ) ), \
|
||||
mm256_andxor( x4, x1, x5 ), \
|
||||
mm256_xorand( x0, x3, x5 ) ) \
|
||||
|
||||
#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_and_si256( x3, \
|
||||
_mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
|
||||
_mm256_xor_si256( x6, x0 ) ) ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \
|
||||
_mm256_and_si256( x2, x5 ) ), x0 ) )
|
||||
mm256_xor3( x0, \
|
||||
_mm256_and_si256( x3, \
|
||||
mm256_xor3( _mm256_and_si256( x1, x2 ), x6, x0 ) ), \
|
||||
_mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
|
||||
_mm256_and_si256( x2, x5 ) ) )
|
||||
|
||||
#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_and_si256( x3, \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
|
||||
_mm256_or_si256( x4, x6 ) ), x5 ) ), \
|
||||
_mm256_and_si256( x4, \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \
|
||||
_mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \
|
||||
_mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) )
|
||||
|
||||
mm256_xor3( \
|
||||
mm256_andxor( x3, x5, \
|
||||
_mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
|
||||
_mm256_or_si256( x4, x6 ) ) ), \
|
||||
_mm256_and_si256( x4, \
|
||||
mm256_xor3( x0, _mm256_andnot_si256( x2, x5 ), \
|
||||
_mm256_xor_si256( x1, x6 ) ) ), \
|
||||
mm256_xorand( x0, x2, x6 ) )
|
||||
|
||||
#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_and_si256( x0, \
|
||||
mm256_not( _mm256_xor_si256( \
|
||||
_mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
|
||||
_mm256_and_si256( x2, x5 ) ), \
|
||||
_mm256_and_si256( x3, x6 ) ) )
|
||||
mm256_andnotxor( mm256_and3( x1, x2, x3 ), x5, x0 ), \
|
||||
mm256_xor3( _mm256_and_si256( x1, x4 ), \
|
||||
_mm256_and_si256( x2, x5 ), \
|
||||
_mm256_and_si256( x3, x6 ) ) )
|
||||
|
||||
#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F1_8W(x1, x0, x3, x5, x6, x2, x4)
|
||||
|
@@ -51,15 +51,15 @@ extern "C"{
|
||||
do { \
|
||||
__m512i cc = _mm512_set1_epi64( c ); \
|
||||
x3 = mm512_not( x3 ); \
|
||||
x0 = _mm512_xor_si512( x0, _mm512_andnot_si512( x2, cc ) ); \
|
||||
tmp = _mm512_xor_si512( cc, _mm512_and_si512( x0, x1 ) ); \
|
||||
x0 = _mm512_xor_si512( x0, _mm512_and_si512( x2, x3 ) ); \
|
||||
x3 = _mm512_xor_si512( x3, _mm512_andnot_si512( x1, x2 ) ); \
|
||||
x1 = _mm512_xor_si512( x1, _mm512_and_si512( x0, x2 ) ); \
|
||||
x2 = _mm512_xor_si512( x2, _mm512_andnot_si512( x3, x0 ) ); \
|
||||
x0 = _mm512_xor_si512( x0, _mm512_or_si512( x1, x3 ) ); \
|
||||
x3 = _mm512_xor_si512( x3, _mm512_and_si512( x1, x2 ) ); \
|
||||
x1 = _mm512_xor_si512( x1, _mm512_and_si512( tmp, x0 ) ); \
|
||||
x0 = mm512_xorandnot( x0, x2, cc ); \
|
||||
tmp = mm512_xorand( cc, x0, x1 ); \
|
||||
x0 = mm512_xorand( x0, x2, x3 ); \
|
||||
x3 = mm512_xorandnot( x3, x1, x2 ); \
|
||||
x1 = mm512_xorand( x1, x0, x2 ); \
|
||||
x2 = mm512_xorandnot( x2, x3, x0 ); \
|
||||
x0 = mm512_xoror( x0, x1, x3 ); \
|
||||
x3 = mm512_xorand( x3, x1, x2 ); \
|
||||
x1 = mm512_xorand( x1, tmp, x0 ); \
|
||||
x2 = _mm512_xor_si512( x2, tmp ); \
|
||||
} while (0)
|
||||
|
||||
@@ -67,11 +67,11 @@ do { \
|
||||
do { \
|
||||
x4 = _mm512_xor_si512( x4, x1 ); \
|
||||
x5 = _mm512_xor_si512( x5, x2 ); \
|
||||
x6 = _mm512_xor_si512( x6, _mm512_xor_si512( x3, x0 ) ); \
|
||||
x6 = mm512_xor3( x6, x3, x0 ); \
|
||||
x7 = _mm512_xor_si512( x7, x0 ); \
|
||||
x0 = _mm512_xor_si512( x0, x5 ); \
|
||||
x1 = _mm512_xor_si512( x1, x6 ); \
|
||||
x2 = _mm512_xor_si512( x2, _mm512_xor_si512( x7, x4 ) ); \
|
||||
x2 = mm512_xor3( x2, x7, x4 ); \
|
||||
x3 = _mm512_xor_si512( x3, x4 ); \
|
||||
} while (0)
|
||||
|
||||
@@ -318,12 +318,12 @@ static const sph_u64 C[] = {
|
||||
#define Wz_8W(x, c, n) \
|
||||
do { \
|
||||
__m512i t = _mm512_slli_epi64( _mm512_and_si512(x ## h, (c)), (n) ); \
|
||||
x ## h = _mm512_or_si512( _mm512_and_si512( \
|
||||
_mm512_srli_epi64(x ## h, (n)), (c)), t ); \
|
||||
x ## h = mm512_orand( t, _mm512_srli_epi64( x ## h, (n) ), (c) ); \
|
||||
t = _mm512_slli_epi64( _mm512_and_si512(x ## l, (c)), (n) ); \
|
||||
x ## l = _mm512_or_si512( _mm512_and_si512((x ## l >> (n)), (c)), t ); \
|
||||
x ## l = mm512_orand( t, (x ## l >> (n)), (c) ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define W80(x) Wz_8W(x, m512_const1_64( 0x5555555555555555 ), 1 )
|
||||
#define W81(x) Wz_8W(x, m512_const1_64( 0x3333333333333333 ), 2 )
|
||||
#define W82(x) Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
|
||||
|
@@ -76,6 +76,9 @@ static const uint64_t RC[] = {
|
||||
#define OR64(d, a, b) (d = _mm512_or_si512(a,b))
|
||||
#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1))
|
||||
#define ROL64(d, v, n) (d = mm512_rol_64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c))
|
||||
#define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c))
|
||||
|
||||
|
||||
#include "keccak-macros.c"
|
||||
|
||||
@@ -238,6 +241,8 @@ keccak512_8way_close(void *cc, void *dst)
|
||||
#undef NOT64
|
||||
#undef ROL64
|
||||
#undef KECCAK_F_1600
|
||||
#undef XOROR
|
||||
#undef XORAND
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
@@ -255,6 +260,8 @@ keccak512_8way_close(void *cc, void *dst)
|
||||
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
|
||||
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
|
||||
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
|
||||
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
|
||||
|
||||
#include "keccak-macros.c"
|
||||
|
||||
@@ -419,5 +426,7 @@ keccak512_4way_close(void *cc, void *dst)
|
||||
#undef NOT64
|
||||
#undef ROL64
|
||||
#undef KECCAK_F_1600
|
||||
#undef XOROR
|
||||
#undef XORAND
|
||||
|
||||
#endif // AVX2
|
||||
|
@@ -110,20 +110,34 @@
|
||||
#ifdef KHI_XO
|
||||
#undef KHI_XO
|
||||
#endif
|
||||
|
||||
#define KHI_XO(d, a, b, c) do { \
|
||||
XOROR(d, a, b, c); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
#define KHI_XO(d, a, b, c) do { \
|
||||
DECL64(kt); \
|
||||
OR64(kt, b, c); \
|
||||
XOR64(d, a, kt); \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
#ifdef KHI_XA
|
||||
#undef KHI_XA
|
||||
#endif
|
||||
|
||||
#define KHI_XA(d, a, b, c) do { \
|
||||
XORAND(d, a, b, c); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
#define KHI_XA(d, a, b, c) do { \
|
||||
DECL64(kt); \
|
||||
AND64(kt, b, c); \
|
||||
XOR64(d, a, kt); \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
#ifdef KHI
|
||||
#undef KHI
|
||||
|
@@ -97,6 +97,21 @@ do { \
|
||||
MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\
|
||||
ADD_CONSTANT4W(*x, *(x+4), c0, c1);
|
||||
|
||||
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
|
||||
t = a0;\
|
||||
a0 = mm512_xoror( a3, a0, a1 ); \
|
||||
a2 = _mm512_xor_si512(a2,a3);\
|
||||
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
|
||||
a3 = mm512_xorand( a2, a3, t ); \
|
||||
a2 = mm512_xorand( a1, a2, a0);\
|
||||
a1 = _mm512_or_si512(a1,a3);\
|
||||
a3 = _mm512_xor_si512(a3,a2);\
|
||||
t = _mm512_xor_si512(t,a1);\
|
||||
a2 = _mm512_and_si512(a2,a1);\
|
||||
a1 = mm512_xnor(a1,a0);\
|
||||
a0 = t;
|
||||
|
||||
/*
|
||||
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
|
||||
t = _mm512_load_si512(&a0);\
|
||||
a0 = _mm512_or_si512(a0,a1);\
|
||||
@@ -115,7 +130,25 @@ do { \
|
||||
a2 = _mm512_and_si512(a2,a1);\
|
||||
a1 = _mm512_xor_si512(a1,a0);\
|
||||
a0 = _mm512_load_si512(&t);
|
||||
*/
|
||||
|
||||
#define MIXWORD4W(a,b,t1,t2)\
|
||||
b = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(a,2);\
|
||||
t2 = _mm512_srli_epi32(a,30);\
|
||||
a = mm512_xoror( b, t1, t2 ); \
|
||||
t1 = _mm512_slli_epi32(b,14);\
|
||||
t2 = _mm512_srli_epi32(b,18);\
|
||||
b = _mm512_or_si512(t1,t2);\
|
||||
b = mm512_xoror( a, t1, t2 ); \
|
||||
t1 = _mm512_slli_epi32(a,10);\
|
||||
t2 = _mm512_srli_epi32(a,22);\
|
||||
a = mm512_xoror( b, t1, t2 ); \
|
||||
t1 = _mm512_slli_epi32(b,1);\
|
||||
t2 = _mm512_srli_epi32(b,31);\
|
||||
b = _mm512_or_si512(t1,t2);
|
||||
|
||||
/*
|
||||
#define MIXWORD4W(a,b,t1,t2)\
|
||||
b = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(a,2);\
|
||||
@@ -133,6 +166,7 @@ do { \
|
||||
t1 = _mm512_slli_epi32(b,1);\
|
||||
t2 = _mm512_srli_epi32(b,31);\
|
||||
b = _mm512_or_si512(t1,t2);
|
||||
*/
|
||||
|
||||
#define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
|
||||
a1 = _mm512_shuffle_epi32(a1,147);\
|
||||
@@ -248,17 +282,10 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
||||
__m512i tmp[2];
|
||||
__m512i x[8];
|
||||
|
||||
t0 = chainv[0];
|
||||
t1 = chainv[1];
|
||||
|
||||
t0 = _mm512_xor_si512( t0, chainv[2] );
|
||||
t1 = _mm512_xor_si512( t1, chainv[3] );
|
||||
t0 = _mm512_xor_si512( t0, chainv[4] );
|
||||
t1 = _mm512_xor_si512( t1, chainv[5] );
|
||||
t0 = _mm512_xor_si512( t0, chainv[6] );
|
||||
t1 = _mm512_xor_si512( t1, chainv[7] );
|
||||
t0 = _mm512_xor_si512( t0, chainv[8] );
|
||||
t1 = _mm512_xor_si512( t1, chainv[9] );
|
||||
t0 = mm512_xor3( chainv[0], chainv[2], chainv[4] );
|
||||
t1 = mm512_xor3( chainv[1], chainv[3], chainv[5] );
|
||||
t0 = mm512_xor3( t0, chainv[6], chainv[8] );
|
||||
t1 = mm512_xor3( t1, chainv[7], chainv[9] );
|
||||
|
||||
MULT24W( t0, t1 );
|
||||
|
||||
@@ -319,8 +346,8 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
||||
chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );
|
||||
|
||||
MULT24W( chainv[0], chainv[1] );
|
||||
chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 );
|
||||
chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 );
|
||||
chainv[0] = mm512_xor3( chainv[0], t0, msg0 );
|
||||
chainv[1] = mm512_xor3( chainv[1], t1, msg1 );
|
||||
|
||||
MULT24W( msg0, msg1 );
|
||||
chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
|
||||
@@ -398,19 +425,11 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
||||
|
||||
/*---- blank round with m=0 ----*/
|
||||
rnd512_4way( state, zero );
|
||||
|
||||
t[0] = chainv[0];
|
||||
t[1] = chainv[1];
|
||||
|
||||
t[0] = _mm512_xor_si512( t[0], chainv[2] );
|
||||
t[1] = _mm512_xor_si512( t[1], chainv[3] );
|
||||
t[0] = _mm512_xor_si512( t[0], chainv[4] );
|
||||
t[1] = _mm512_xor_si512( t[1], chainv[5] );
|
||||
t[0] = _mm512_xor_si512( t[0], chainv[6] );
|
||||
t[1] = _mm512_xor_si512( t[1], chainv[7] );
|
||||
t[0] = _mm512_xor_si512( t[0], chainv[8] );
|
||||
t[1] = _mm512_xor_si512( t[1], chainv[9] );
|
||||
|
||||
|
||||
t[0] = mm512_xor3( chainv[0], chainv[2], chainv[4] );
|
||||
t[1] = mm512_xor3( chainv[1], chainv[3], chainv[5] );
|
||||
t[0] = mm512_xor3( t[0], chainv[6], chainv[8] );
|
||||
t[1] = mm512_xor3( t[1], chainv[7], chainv[9] );
|
||||
t[0] = _mm512_shuffle_epi32( t[0], 27 );
|
||||
t[1] = _mm512_shuffle_epi32( t[1], 27 );
|
||||
|
||||
@@ -676,8 +695,6 @@ do { \
|
||||
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
|
||||
} while(0)
|
||||
|
||||
// confirm pointer arithmetic
|
||||
// ok but use array indexes
|
||||
#define STEP_PART(x,c0,c1,t)\
|
||||
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
|
||||
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
|
||||
@@ -688,23 +705,23 @@ do { \
|
||||
ADD_CONSTANT(*x, *(x+4), c0, c1);
|
||||
|
||||
#define SUBCRUMB(a0,a1,a2,a3,t)\
|
||||
t = _mm256_load_si256(&a0);\
|
||||
t = a0;\
|
||||
a0 = _mm256_or_si256(a0,a1);\
|
||||
a2 = _mm256_xor_si256(a2,a3);\
|
||||
a1 = _mm256_andnot_si256(a1, m256_neg1 );\
|
||||
a1 = mm256_not( a1 );\
|
||||
a0 = _mm256_xor_si256(a0,a3);\
|
||||
a3 = _mm256_and_si256(a3,t);\
|
||||
a1 = _mm256_xor_si256(a1,a3);\
|
||||
a3 = _mm256_xor_si256(a3,a2);\
|
||||
a2 = _mm256_and_si256(a2,a0);\
|
||||
a0 = _mm256_andnot_si256(a0, m256_neg1 );\
|
||||
a0 = mm256_not( a0 );\
|
||||
a2 = _mm256_xor_si256(a2,a1);\
|
||||
a1 = _mm256_or_si256(a1,a3);\
|
||||
t = _mm256_xor_si256(t,a1);\
|
||||
a3 = _mm256_xor_si256(a3,a2);\
|
||||
a2 = _mm256_and_si256(a2,a1);\
|
||||
a1 = _mm256_xor_si256(a1,a0);\
|
||||
a0 = _mm256_load_si256(&t);\
|
||||
a0 = t;\
|
||||
|
||||
#define MIXWORD(a,b,t1,t2)\
|
||||
b = _mm256_xor_si256(a,b);\
|
||||
|
@@ -312,10 +312,26 @@ do { \
|
||||
BUPDATE1_8W( 7, 1 ); \
|
||||
} while (0)
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define GAMMA_8W(n0, n1, n2, n4) \
|
||||
( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )
|
||||
|
||||
#define THETA_8W(n0, n1, n2, n4) \
|
||||
( g ## n0 = mm256_xor3( a ## n0, a ## n1, a ## n4 ) )
|
||||
|
||||
#else
|
||||
|
||||
#define GAMMA_8W(n0, n1, n2, n4) \
|
||||
(g ## n0 = _mm256_xor_si256( a ## n0, \
|
||||
_mm256_or_si256( a ## n1, mm256_not( a ## n2 ) ) ) )
|
||||
|
||||
#define THETA_8W(n0, n1, n2, n4) \
|
||||
( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \
|
||||
a ## n4 ) ) )
|
||||
|
||||
#endif
|
||||
|
||||
#define PI_ALL_8W do { \
|
||||
a0 = g0; \
|
||||
a1 = mm256_rol_32( g7, 1 ); \
|
||||
@@ -336,9 +352,6 @@ do { \
|
||||
a16 = mm256_rol_32( g10, 8 ); \
|
||||
} while (0)
|
||||
|
||||
#define THETA_8W(n0, n1, n2, n4) \
|
||||
( g ## n0 = _mm256_xor_si256( a ## n0, _mm256_xor_si256( a ## n1, \
|
||||
a ## n4 ) ) )
|
||||
|
||||
#define SIGMA_ALL_8W do { \
|
||||
a0 = _mm256_xor_si256( g0, m256_one_32 ); \
|
||||
|
@@ -127,10 +127,8 @@ void quark_8way_hash( void *state, const void *input )
|
||||
|
||||
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
if ( ( vh_mask & 0x0f ) != 0x0f )
|
||||
groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
if ( ( vh_mask & 0xf0 ) != 0xf0 )
|
||||
groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
@@ -139,22 +137,14 @@ void quark_8way_hash( void *state, const void *input )
|
||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash, 512 );
|
||||
|
||||
if ( hash0[0] & 8 )
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
if ( hash1[0] & 8 )
|
||||
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
if ( hash2[0] & 8)
|
||||
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
if ( hash3[0] & 8 )
|
||||
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
if ( hash4[0] & 8 )
|
||||
groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||
if ( hash5[0] & 8 )
|
||||
groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||
if ( hash6[0] & 8 )
|
||||
groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||
if ( hash7[0] & 8 )
|
||||
groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||
|
||||
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
512 );
|
||||
|
@@ -39,17 +39,10 @@
|
||||
void
|
||||
SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
|
||||
{
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256_context ctx;
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, in, len );
|
||||
sph_sha256_close( &ctx, digest );
|
||||
#else
|
||||
SHA256_CTX ctx;
|
||||
SHA256_Init( &ctx );
|
||||
SHA256_Update( &ctx, in, len );
|
||||
SHA256_Final( digest, &ctx );
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -79,51 +72,29 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
|
||||
/* If Klen > 64, the key is really SHA256(K). */
|
||||
if ( Klen > 64 )
|
||||
{
|
||||
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256_init( &ctx->ictx );
|
||||
sph_sha256( &ctx->ictx, K, Klen );
|
||||
sph_sha256_close( &ctx->ictx, khash );
|
||||
#else
|
||||
SHA256_Init( &ctx->ictx );
|
||||
SHA256_Update( &ctx->ictx, K, Klen );
|
||||
SHA256_Final( khash, &ctx->ictx );
|
||||
#endif
|
||||
K = khash;
|
||||
Klen = 32;
|
||||
|
||||
K = khash;
|
||||
Klen = 32;
|
||||
}
|
||||
|
||||
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256_init( &ctx->ictx );
|
||||
#else
|
||||
SHA256_Init( &ctx->ictx );
|
||||
#endif
|
||||
|
||||
for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36;
|
||||
|
||||
memset( pad + Klen, 0x36, 64 - Klen );
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256( &ctx->ictx, pad, 64 );
|
||||
#else
|
||||
SHA256_Update( &ctx->ictx, pad, 64 );
|
||||
#endif
|
||||
|
||||
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256_init( &ctx->octx );
|
||||
#else
|
||||
SHA256_Init( &ctx->octx );
|
||||
#endif
|
||||
|
||||
for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c;
|
||||
|
||||
memset( pad + Klen, 0x5c, 64 - Klen );
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256( &ctx->octx, pad, 64 );
|
||||
#else
|
||||
SHA256_Update( &ctx->octx, pad, 64 );
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Add bytes to the HMAC-SHA256 operation. */
|
||||
@@ -131,11 +102,7 @@ void
|
||||
HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len )
|
||||
{
|
||||
/* Feed data to the inner SHA256 operation. */
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256( &ctx->ictx, in, len );
|
||||
#else
|
||||
SHA256_Update( &ctx->ictx, in, len );
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Finish an HMAC-SHA256 operation. */
|
||||
@@ -144,20 +111,9 @@ HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx )
|
||||
{
|
||||
unsigned char ihash[32];
|
||||
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256_close( &ctx->ictx, ihash );
|
||||
sph_sha256( &ctx->octx, ihash, 32 );
|
||||
sph_sha256_close( &ctx->octx, digest );
|
||||
#else
|
||||
/* Finish the inner SHA256 operation. */
|
||||
SHA256_Final( ihash, &ctx->ictx );
|
||||
|
||||
/* Feed the inner hash to the outer SHA256 operation. */
|
||||
SHA256_Update( &ctx->octx, ihash, 32 );
|
||||
|
||||
/* Finish the outer SHA256 operation. */
|
||||
SHA256_Final( digest, &ctx->octx );
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
|
@@ -29,24 +29,14 @@
|
||||
#ifndef HMAC_SHA256_H__
|
||||
#define HMAC_SHA256_H__
|
||||
|
||||
//#define HMAC_SSL_SHA 1
|
||||
#define HMAC_SPH_SHA 1
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include "sph_sha2.h"
|
||||
#include <openssl/sha.h>
|
||||
|
||||
|
||||
typedef struct HMAC_SHA256Context
|
||||
{
|
||||
#if defined(HMAC_SPH_SHA)
|
||||
sph_sha256_context ictx;
|
||||
sph_sha256_context octx;
|
||||
#else
|
||||
SHA256_CTX ictx;
|
||||
SHA256_CTX octx;
|
||||
#endif
|
||||
} HMAC_SHA256_CTX;
|
||||
|
||||
void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
|
||||
|
@@ -59,6 +59,8 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data,
|
||||
size_t len );
|
||||
void sha256_4way_close( sha256_4way_context *sc, void *dst );
|
||||
void sha256_4way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_4way_transform( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
|
||||
#endif // SSE2
|
||||
|
||||
@@ -77,6 +79,8 @@ void sha256_8way_init( sha256_8way_context *sc );
|
||||
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
|
||||
void sha256_8way_close( sha256_8way_context *sc, void *dst );
|
||||
void sha256_8way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_8way_transform( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
@@ -95,6 +99,12 @@ void sha256_16way_init( sha256_16way_context *sc );
|
||||
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
|
||||
void sha256_16way_close( sha256_16way_context *sc, void *dst );
|
||||
void sha256_16way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_16way_transform( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in, const __m512i *state_mid );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
@@ -195,8 +195,28 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
|
||||
hash[i] = swab32(hash[i]);
|
||||
}
|
||||
|
||||
extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
|
||||
#if defined (__SHA__)
|
||||
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
|
||||
void sha256d(unsigned char *hash, const unsigned char *data, int len)
|
||||
{
|
||||
sph_sha256_context ctx __attribute__ ((aligned (64)));
|
||||
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, data, len );
|
||||
sph_sha256_close( &ctx, hash );
|
||||
|
||||
sph_sha256_init( &ctx );
|
||||
sph_sha256( &ctx, hash, 32 );
|
||||
sph_sha256_close( &ctx, hash );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
void sha256d(unsigned char *hash, const unsigned char *data, int len)
|
||||
{
|
||||
|
||||
uint32_t S[16], T[16];
|
||||
int i, r;
|
||||
|
||||
@@ -220,6 +240,8 @@ extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
|
||||
be32enc((uint32_t *)hash + i, T[i]);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static inline void sha256d_preextend(uint32_t *W)
|
||||
{
|
||||
W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
|
||||
|
345
algo/sha/sha256-hash-2way-ni.c
Normal file
345
algo/sha/sha256-hash-2way-ni.c
Normal file
@@ -0,0 +1,345 @@
|
||||
/* Intel SHA extensions using C intrinsics */
|
||||
/* Written and place in public domain by Jeffrey Walton */
|
||||
/* Based on code from Intel, and by Sean Gulley for */
|
||||
/* the miTLS project. */
|
||||
|
||||
// A stripped down version with byte swapping removed.
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
#include "sha256-hash-opt.h"
|
||||
|
||||
// Two independent SHA-256 compressions (lanes X and Y) using the x86 SHA
// extensions, with the instructions of both lanes interleaved.
//
//   out_X, out_Y : receive the 8 updated state words, order A..H.
//   msg_X, msg_Y : one 64-byte message block per lane, read as 16 32-bit
//                  words exactly as stored; no byte swap is applied here.
//   in_X, in_Y   : the 8 input state words, order A..H.
//
// Every pointer is accessed with aligned 128-bit loads/stores and must
// therefore be 16-byte aligned.
void sha256_ni2way_transform( uint32_t *out_X, uint32_t *out_Y,
                              const void *msg_X, const void *msg_Y,
                              const uint32_t *in_X, const uint32_t *in_Y )
{
    // Typed views of the message blocks. Indexing these replaces the
    // original arithmetic on void pointers (a GNU extension); element k
    // is at byte offset 16*k, the same addresses as before.
    const __m128i *blk_X = (const __m128i*) msg_X;
    const __m128i *blk_Y = (const __m128i*) msg_Y;

    __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
    __m128i MSG_X, MSG_Y, TMP_X, TMP_Y;
    __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
    __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
    __m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y;

    // Load initial values
    TMP_X    = _mm_load_si128( (const __m128i*) &in_X[0] );
    STATE1_X = _mm_load_si128( (const __m128i*) &in_X[4] );
    TMP_Y    = _mm_load_si128( (const __m128i*) &in_Y[0] );
    STATE1_Y = _mm_load_si128( (const __m128i*) &in_Y[4] );

    // Rearrange A..H into the ABEF / CDGH register layout that
    // sha256rnds2 operates on.
    TMP_X    = _mm_shuffle_epi32( TMP_X, 0xB1 );             // CDAB
    TMP_Y    = _mm_shuffle_epi32( TMP_Y, 0xB1 );             // CDAB
    STATE1_X = _mm_shuffle_epi32( STATE1_X, 0x1B );          // EFGH
    STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0x1B );          // EFGH
    STATE0_X = _mm_alignr_epi8( TMP_X, STATE1_X, 8 );        // ABEF
    STATE0_Y = _mm_alignr_epi8( TMP_Y, STATE1_Y, 8 );        // ABEF
    STATE1_X = _mm_blend_epi16( STATE1_X, TMP_X, 0xF0 );     // CDGH
    STATE1_Y = _mm_blend_epi16( STATE1_Y, TMP_Y, 0xF0 );     // CDGH

    // Save current hash
    ABEF_SAVE_X = STATE0_X;
    ABEF_SAVE_Y = STATE0_Y;
    CDGH_SAVE_X = STATE1_X;
    CDGH_SAVE_Y = STATE1_Y;

    // In every group below TMP_X first holds four round constants that
    // are shared by both lanes, then is reused as a scratch register.

    // Rounds 0-3
    TMSG0_X = _mm_load_si128( blk_X );
    TMSG0_Y = _mm_load_si128( blk_Y );
    TMP_X = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL );
    MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );

    // Rounds 4-7
    TMSG1_X = _mm_load_si128( blk_X + 1 );
    TMSG1_Y = _mm_load_si128( blk_Y + 1 );
    TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL );
    MSG_X = _mm_add_epi32( TMSG1_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
    TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X );
    TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y );

    // Rounds 8-11
    TMSG2_X = _mm_load_si128( blk_X + 2 );
    TMSG2_Y = _mm_load_si128( blk_Y + 2 );
    TMP_X = _mm_set_epi64x( 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL );
    MSG_X = _mm_add_epi32( TMSG2_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
    TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X );
    TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y );

    // Rounds 12-15
    TMSG3_X = _mm_load_si128( blk_X + 3 );
    TMSG3_Y = _mm_load_si128( blk_Y + 3 );
    TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL );
    MSG_X = _mm_add_epi32( TMSG3_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 );
    TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 );
    TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X );
    TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y );
    TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X );
    TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
    TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X );
    TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y );

    // Rounds 16-19
    TMP_X = _mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL );
    MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 );
    TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 );
    TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X );
    TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y );
    TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X );
    TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
    TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X );
    TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y );

    // Rounds 20-23
    TMP_X = _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL );
    MSG_X = _mm_add_epi32( TMSG1_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 );
    TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 );
    TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X );
    TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y );
    TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X );
    TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
    TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X );
    TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y );

    // Rounds 24-27
    TMP_X = _mm_set_epi64x( 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL );
    MSG_X = _mm_add_epi32( TMSG2_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 );
    TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 );
    TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X );
    TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y );
    TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X );
    TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
    TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X );
    TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y );

    // Rounds 28-31
    TMP_X = _mm_set_epi64x( 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL );
    MSG_X = _mm_add_epi32( TMSG3_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 );
    TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 );
    TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X );
    TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y );
    TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X );
    TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
    TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X );
    TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y );

    // Rounds 32-35
    TMP_X = _mm_set_epi64x( 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL );
    MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 );
    TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 );
    TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X );
    TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y );
    TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X );
    TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
    TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X );
    TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y );

    // Rounds 36-39
    TMP_X = _mm_set_epi64x( 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL );
    MSG_X = _mm_add_epi32( TMSG1_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 );
    TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 );
    TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X );
    TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y );
    TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X );
    TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
    TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X );
    TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y );

    // Rounds 40-43
    TMP_X = _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL );
    MSG_X = _mm_add_epi32( TMSG2_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 );
    TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 );
    TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X );
    TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y );
    TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X );
    TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
    TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X );
    TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y );

    // Rounds 44-47
    TMP_X = _mm_set_epi64x( 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL );
    MSG_X = _mm_add_epi32( TMSG3_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 );
    TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 );
    TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X );
    TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y );
    TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X );
    TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
    TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X );
    TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y );

    // Rounds 48-51
    TMP_X = _mm_set_epi64x( 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL );
    MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 );
    TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 );
    TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X );
    TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y );
    TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X );
    TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
    TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X );
    TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y );

    // Rounds 52-55
    TMP_X = _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL );
    MSG_X = _mm_add_epi32( TMSG1_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 );
    TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 );
    TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X );
    TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y );
    TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X );
    TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );

    // Rounds 56-59
    TMP_X = _mm_set_epi64x( 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL );
    MSG_X = _mm_add_epi32( TMSG2_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 );
    TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 );
    TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X );
    TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y );
    TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X );
    TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );

    // Rounds 60-63
    TMP_X = _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL );
    MSG_X = _mm_add_epi32( TMSG3_X, TMP_X );
    MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X );
    STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X );
    STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y );
    MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E );
    MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E );
    STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X );
    STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );

    // Add values back to state
    STATE0_X = _mm_add_epi32( STATE0_X, ABEF_SAVE_X );
    STATE1_X = _mm_add_epi32( STATE1_X, CDGH_SAVE_X );
    STATE0_Y = _mm_add_epi32( STATE0_Y, ABEF_SAVE_Y );
    STATE1_Y = _mm_add_epi32( STATE1_Y, CDGH_SAVE_Y );

    // Undo the ABEF / CDGH layout so the words are stored as A..H.
    TMP_X    = _mm_shuffle_epi32( STATE0_X, 0x1B );          // FEBA
    TMP_Y    = _mm_shuffle_epi32( STATE0_Y, 0x1B );          // FEBA
    STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 );          // DCHG
    STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 );          // DCHG
    STATE0_X = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 );     // DCBA
    STATE0_Y = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );     // DCBA
    STATE1_X = _mm_alignr_epi8( STATE1_X, TMP_X, 8 );        // HGFE
    STATE1_Y = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );        // HGFE

    // Save state
    _mm_store_si128( (__m128i*) &out_X[0], STATE0_X );
    _mm_store_si128( (__m128i*) &out_X[4], STATE1_X );
    _mm_store_si128( (__m128i*) &out_Y[0], STATE0_Y );
    _mm_store_si128( (__m128i*) &out_Y[4], STATE1_Y );
}
|
||||
|
||||
#endif
|
@@ -74,9 +74,20 @@ static const uint32_t K256[64] =
|
||||
// SHA-256 Ch: for every bit, take Y where X is 1 and Z where X is 0,
// computed as Z ^ ( X & ( Y ^ Z ) ).
#define CHs(X, Y, Z) \
   _mm_xor_si128( Z, _mm_and_si128( X, _mm_xor_si128( Y, Z ) ) )
|
||||
|
||||
/*
|
||||
#define MAJs(X, Y, Z) \
|
||||
_mm_or_si128( _mm_and_si128( X, Y ), \
|
||||
_mm_and_si128( _mm_or_si128( X, Y ), Z ) )
|
||||
*/
|
||||
/*
|
||||
#define MAJs(X, Y, Z) \
|
||||
_mm_xor_si128( Y, _mm_and_si128( _mm_xor_si128( X, Y ), \
|
||||
_mm_xor_si128( Y, Z ) ) )
|
||||
*/
|
||||
|
||||
// SHA-256 Maj computed as Y ^ ( ( X ^ Y ) & ( Y ^ Z ) ).
// The macro depends on two variables in the caller's scope:
//   Y_xor_Z  must already hold Y ^ Z on entry (Z itself is not used),
//   X_xor_Y  is assigned X ^ Y as a side effect so the caller can reuse it.
#define MAJs(X, Y, Z) \
   _mm_xor_si128( _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \
                                 Y_xor_Z ), Y )
|
||||
|
||||
#define BSG2_0(x) \
|
||||
_mm_xor_si128( _mm_xor_si128( \
|
||||
@@ -94,6 +105,7 @@ static const uint32_t K256[64] =
|
||||
_mm_xor_si128( _mm_xor_si128( \
|
||||
mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) )
|
||||
|
||||
/*
|
||||
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
|
||||
do { \
|
||||
__m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \
|
||||
@@ -122,9 +134,9 @@ do { \
|
||||
H = _mm_add_epi32( T1, T2 ); \
|
||||
D = _mm_add_epi32( D, T1 ); \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
|
||||
do { \
|
||||
__m128i T1, T2; \
|
||||
@@ -132,16 +144,98 @@ do { \
|
||||
T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
|
||||
K, W[i] ) ); \
|
||||
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = _mm_add_epi32( D, T1 ); \
|
||||
H = _mm_add_epi32( T1, T2 ); \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
|
||||
void sha256_4way_transform( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in )
|
||||
{
|
||||
__m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
|
||||
__m128i W[16];
|
||||
|
||||
memcpy_128( W, data, 16 );
|
||||
|
||||
A = state_in[0];
|
||||
B = state_in[1];
|
||||
C = state_in[2];
|
||||
D = state_in[3];
|
||||
E = state_in[4];
|
||||
F = state_in[5];
|
||||
G = state_in[6];
|
||||
H = state_in[7];
|
||||
Y_xor_Z = _mm_xor_si128( B, C );
|
||||
|
||||
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
|
||||
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
|
||||
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
|
||||
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
|
||||
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
|
||||
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
|
||||
for ( int j = 16; j < 64; j += 16 )
|
||||
{
|
||||
W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 );
|
||||
W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 );
|
||||
W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 );
|
||||
W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 );
|
||||
W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 );
|
||||
W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 );
|
||||
W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 );
|
||||
W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 );
|
||||
W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 );
|
||||
W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 );
|
||||
W[10] = SHA2s_MEXP( 8, 3, 11, 10 );
|
||||
W[11] = SHA2s_MEXP( 9, 4, 12, 11 );
|
||||
W[12] = SHA2s_MEXP( 10, 5, 13, 12 );
|
||||
W[13] = SHA2s_MEXP( 11, 6, 14, 13 );
|
||||
W[14] = SHA2s_MEXP( 12, 7, 15, 14 );
|
||||
W[15] = SHA2s_MEXP( 13, 8, 0, 15 );
|
||||
|
||||
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
|
||||
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
|
||||
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
|
||||
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
|
||||
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
|
||||
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
|
||||
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
|
||||
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
|
||||
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
|
||||
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
|
||||
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
|
||||
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
|
||||
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
|
||||
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
|
||||
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
|
||||
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
|
||||
}
|
||||
|
||||
state_out[0] = _mm_add_epi32( state_in[0], A );
|
||||
state_out[1] = _mm_add_epi32( state_in[1], B );
|
||||
state_out[2] = _mm_add_epi32( state_in[2], C );
|
||||
state_out[3] = _mm_add_epi32( state_in[3], D );
|
||||
state_out[4] = _mm_add_epi32( state_in[4], E );
|
||||
state_out[5] = _mm_add_epi32( state_in[5], F );
|
||||
state_out[6] = _mm_add_epi32( state_in[6], G );
|
||||
state_out[7] = _mm_add_epi32( state_in[7], H );
|
||||
}
|
||||
|
||||
static void
|
||||
sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] )
|
||||
{
|
||||
register __m128i A, B, C, D, E, F, G, H;
|
||||
register __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
|
||||
__m128i W[16];
|
||||
|
||||
mm128_block_bswap_32( W, in );
|
||||
@@ -170,6 +264,8 @@ sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] )
|
||||
H = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
}
|
||||
|
||||
Y_xor_Z = _mm_xor_si128( B, C );
|
||||
|
||||
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
|
||||
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
@@ -321,10 +417,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] =
|
||||
mm128_bswap_32( m128_const1_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] =
|
||||
mm128_bswap_32( m128_const1_32( low ) );
|
||||
sc->buf[ pad >> 2 ] = m128_const1_32( bswap_32( high ) );
|
||||
sc->buf[( pad+4 ) >> 2 ] = m128_const1_32( bswap_32( low ) );
|
||||
sha256_4way_round( sc, sc->buf, sc->val );
|
||||
|
||||
mm128_block_bswap_32( dst, sc->val );
|
||||
@@ -342,12 +436,39 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
|
||||
|
||||
// SHA-256 8 way
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
// 8-lane SHA-256 logical functions, AVX512VL variants.

// Ch: truth table 0xca selects Y where X is 1 and Z where X is 0.
#define CHx(X, Y, Z)    _mm256_ternarylogic_epi32( X, Y, Z, 0xca )

// Maj: truth table 0xe8 is the bitwise majority of X, Y and Z.
#define MAJx(X, Y, Z)   _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 )

// Big sigma 0: ror 2 ^ ror 13 ^ ror 22.
#define BSG2_0x(x) \
   mm256_xor3( mm256_ror_32(x,  2), mm256_ror_32(x, 13), mm256_ror_32(x, 22) )

// Big sigma 1: ror 6 ^ ror 11 ^ ror 25.
#define BSG2_1x(x) \
   mm256_xor3( mm256_ror_32(x,  6), mm256_ror_32(x, 11), mm256_ror_32(x, 25) )

// Small sigma 0: ror 7 ^ ror 18 ^ shr 3.
#define SSG2_0x(x) \
   mm256_xor3( mm256_ror_32(x,  7), mm256_ror_32(x, 18), \
               _mm256_srli_epi32(x,  3) )

// Small sigma 1: ror 17 ^ ror 19 ^ shr 10.
#define SSG2_1x(x) \
   mm256_xor3( mm256_ror_32(x, 17), mm256_ror_32(x, 19), \
               _mm256_srli_epi32(x, 10) )
|
||||
|
||||
#else // AVX2
|
||||
|
||||
#define CHx(X, Y, Z) \
|
||||
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
|
||||
|
||||
#define MAJx(X, Y, Z) \
|
||||
_mm256_or_si256( _mm256_and_si256( X, Y ), \
|
||||
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
|
||||
_mm256_xor_si256( Y, _mm256_and_si256( _mm256_xor_si256( X, Y ), \
|
||||
_mm256_xor_si256( Y, Z ) ) )
|
||||
/*
|
||||
#define MAJx(X, Y, Z) \
|
||||
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
|
||||
Y_xor_Z ) )
|
||||
*/
|
||||
|
||||
#define BSG2_0x(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
@@ -365,6 +486,8 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
mm256_ror_32(x, 17), mm256_ror_32(x, 19) ), _mm256_srli_epi32(x, 10) )
|
||||
|
||||
#endif // AVX512 else AVX2
|
||||
|
||||
// Message schedule expansion for 8 lanes: combines W[a], W[b], W[c] and
// W[d] of the W array in the caller's scope into the next schedule word.
// Expands to an expression; the caller supplies the terminating
// semicolon (the former trailing ';' in the macro produced an empty
// statement at every use and prevented use inside an expression).
#define SHA2x_MEXP( a, b, c, d ) \
   mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] )
|
||||
|
||||
@@ -379,8 +502,89 @@ do { \
|
||||
H = _mm256_add_epi32( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
void sha256_8way_transform( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in )
|
||||
{
|
||||
__m256i A, B, C, D, E, F, G, H;
|
||||
__m256i W[16];
|
||||
|
||||
memcpy_256( W, data, 16 );
|
||||
|
||||
A = state_in[0];
|
||||
B = state_in[1];
|
||||
C = state_in[2];
|
||||
D = state_in[3];
|
||||
E = state_in[4];
|
||||
F = state_in[5];
|
||||
G = state_in[6];
|
||||
H = state_in[7];
|
||||
|
||||
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
|
||||
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
|
||||
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
|
||||
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
|
||||
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
|
||||
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
|
||||
for ( int j = 16; j < 64; j += 16 )
|
||||
{
|
||||
W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
|
||||
W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
|
||||
W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
|
||||
W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
|
||||
W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
|
||||
W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
|
||||
W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
|
||||
W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 );
|
||||
W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 );
|
||||
W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
|
||||
W[10] = SHA2x_MEXP( 8, 3, 11, 10 );
|
||||
W[11] = SHA2x_MEXP( 9, 4, 12, 11 );
|
||||
W[12] = SHA2x_MEXP( 10, 5, 13, 12 );
|
||||
W[13] = SHA2x_MEXP( 11, 6, 14, 13 );
|
||||
W[14] = SHA2x_MEXP( 12, 7, 15, 14 );
|
||||
W[15] = SHA2x_MEXP( 13, 8, 0, 15 );
|
||||
|
||||
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
|
||||
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
|
||||
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
|
||||
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
|
||||
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
|
||||
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
|
||||
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
|
||||
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
|
||||
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
|
||||
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
|
||||
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
|
||||
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
|
||||
SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
|
||||
SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
|
||||
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
|
||||
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
|
||||
}
|
||||
|
||||
state_out[0] = _mm256_add_epi32( state_in[0], A );
|
||||
state_out[1] = _mm256_add_epi32( state_in[1], B );
|
||||
state_out[2] = _mm256_add_epi32( state_in[2], C );
|
||||
state_out[3] = _mm256_add_epi32( state_in[3], D );
|
||||
state_out[4] = _mm256_add_epi32( state_in[4], E );
|
||||
state_out[5] = _mm256_add_epi32( state_in[5], F );
|
||||
state_out[6] = _mm256_add_epi32( state_in[6], G );
|
||||
state_out[7] = _mm256_add_epi32( state_in[7], H );
|
||||
}
|
||||
|
||||
static void
|
||||
sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] )
|
||||
sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] )
|
||||
{
|
||||
register __m256i A, B, C, D, E, F, G, H;
|
||||
__m256i W[16];
|
||||
@@ -566,10 +770,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] =
|
||||
mm256_bswap_32( m256_const1_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] =
|
||||
mm256_bswap_32( m256_const1_32( low ) );
|
||||
sc->buf[ pad >> 2 ] = m256_const1_32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = m256_const1_32( bswap_32( low ) );
|
||||
|
||||
sha256_8way_round( sc, sc->buf, sc->val );
|
||||
|
||||
@@ -589,27 +791,22 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
|
||||
// SHA-256 16 way
|
||||
|
||||
#define CHx16(X, Y, Z) \
|
||||
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z )
|
||||
_mm512_ternarylogic_epi32( X, Y, Z, 0xca )
|
||||
|
||||
#define MAJx16(X, Y, Z) \
|
||||
_mm512_or_si512( _mm512_and_si512( X, Y ), \
|
||||
_mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
|
||||
_mm512_ternarylogic_epi32( X, Y, Z, 0xe8 )
|
||||
|
||||
#define BSG2_0x16(x) \
|
||||
_mm512_xor_si512( _mm512_xor_si512( \
|
||||
mm512_ror_32(x, 2), mm512_ror_32(x, 13) ), mm512_ror_32( x, 22) )
|
||||
mm512_xor3( mm512_ror_32(x, 2), mm512_ror_32(x, 13), mm512_ror_32(x, 22) )
|
||||
|
||||
#define BSG2_1x16(x) \
|
||||
_mm512_xor_si512( _mm512_xor_si512( \
|
||||
mm512_ror_32(x, 6), mm512_ror_32(x, 11) ), mm512_ror_32( x, 25) )
|
||||
mm512_xor3( mm512_ror_32(x, 6), mm512_ror_32(x, 11), mm512_ror_32(x, 25) )
|
||||
|
||||
#define SSG2_0x16(x) \
|
||||
_mm512_xor_si512( _mm512_xor_si512( \
|
||||
mm512_ror_32(x, 7), mm512_ror_32(x, 18) ), _mm512_srli_epi32(x, 3) )
|
||||
mm512_xor3( mm512_ror_32(x, 7), mm512_ror_32(x, 18), _mm512_srli_epi32(x, 3) )
|
||||
|
||||
#define SSG2_1x16(x) \
|
||||
_mm512_xor_si512( _mm512_xor_si512( \
|
||||
mm512_ror_32(x, 17), mm512_ror_32(x, 19) ), _mm512_srli_epi32(x, 10) )
|
||||
mm512_xor3( mm512_ror_32(x, 17), mm512_ror_32(x, 19), _mm512_srli_epi32(x, 10) )
|
||||
|
||||
// Message schedule expansion for 16 lanes: combines W[a], W[b], W[c] and
// W[d] of the W array in the caller's scope into the next schedule word.
// Expands to an expression; the caller supplies the terminating
// semicolon (the former trailing ';' in the macro produced an empty
// statement at every use and prevented use inside an expression).
#define SHA2x16_MEXP( a, b, c, d ) \
   mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] )
|
||||
@@ -625,10 +822,216 @@ do { \
|
||||
H = _mm512_add_epi32( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
// Transform one 16 lane by 64 byte message block and update state.
|
||||
// Calling function is responsible for initializing the state, setting
|
||||
// correct byte order, counting bits and padding of the final block.
|
||||
// It's faster for multiple rounds of sha256 (sha256d/t/q) by eliminating
|
||||
// redundant byte swapping.
|
||||
//
|
||||
void sha256_16way_transform( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in )
|
||||
{
|
||||
__m512i A, B, C, D, E, F, G, H;
|
||||
__m512i W[16];
|
||||
|
||||
memcpy_512( W, data, 16 );
|
||||
|
||||
A = state_in[0];
|
||||
B = state_in[1];
|
||||
C = state_in[2];
|
||||
D = state_in[3];
|
||||
E = state_in[4];
|
||||
F = state_in[5];
|
||||
G = state_in[6];
|
||||
H = state_in[7];
|
||||
|
||||
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
|
||||
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
|
||||
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
|
||||
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
|
||||
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
|
||||
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
|
||||
for ( int j = 16; j < 64; j += 16 )
|
||||
{
|
||||
W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 );
|
||||
W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 );
|
||||
W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 );
|
||||
W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 );
|
||||
W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 );
|
||||
W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 );
|
||||
W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 );
|
||||
W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 );
|
||||
W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 );
|
||||
W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 );
|
||||
W[10] = SHA2x16_MEXP( 8, 3, 11, 10 );
|
||||
W[11] = SHA2x16_MEXP( 9, 4, 12, 11 );
|
||||
W[12] = SHA2x16_MEXP( 10, 5, 13, 12 );
|
||||
W[13] = SHA2x16_MEXP( 11, 6, 14, 13 );
|
||||
W[14] = SHA2x16_MEXP( 12, 7, 15, 14 );
|
||||
W[15] = SHA2x16_MEXP( 13, 8, 0, 15 );
|
||||
|
||||
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
|
||||
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
|
||||
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
|
||||
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
|
||||
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
|
||||
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
|
||||
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
|
||||
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
|
||||
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
|
||||
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
|
||||
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
|
||||
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
|
||||
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
|
||||
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
|
||||
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
|
||||
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
|
||||
}
|
||||
|
||||
state_out[0] = _mm512_add_epi32( state_in[0], A );
|
||||
state_out[1] = _mm512_add_epi32( state_in[1], B );
|
||||
state_out[2] = _mm512_add_epi32( state_in[2], C );
|
||||
state_out[3] = _mm512_add_epi32( state_in[3], D );
|
||||
state_out[4] = _mm512_add_epi32( state_in[4], E );
|
||||
state_out[5] = _mm512_add_epi32( state_in[5], F );
|
||||
state_out[6] = _mm512_add_epi32( state_in[6], G );
|
||||
state_out[7] = _mm512_add_epi32( state_in[7], H );
|
||||
}
|
||||
|
||||
// Aggressive prehashing
|
||||
// Run SHA-256 rounds 0, 1 and 2 for 16 interleaved lanes and save the
// working variables, so that sha256_16way_final_rounds() can resume at
// round 3 without repeating them.
//
//   state_mid  out: the 8 working variables as they stand after round 2.
//   W          in:  message schedule words. The step macro picks up the
//                   identifier W implicitly; these three rounds presumably
//                   read only W[0..2] - confirm against SHA2s_16WAY_STEP.
//   state_in   in:  the 8 state vectors the rounds start from.
//
// No feed-forward addition of state_in is done here. All vectors are
// accessed as whole __m512i objects, i.e. with aligned 64 byte accesses.
void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W,
                                   const __m512i *state_in )
{
   __m512i A = state_in[0];
   __m512i B = state_in[1];
   __m512i C = state_in[2];
   __m512i D = state_in[3];
   __m512i E = state_in[4];
   __m512i F = state_in[5];
   __m512i G = state_in[6];
   __m512i H = state_in[7];

   // The variable roles rotate by one position per round.
   SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  0, 0 );
   SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  1, 0 );
   SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F,  2, 0 );

   state_mid[0] = A;
   state_mid[1] = B;
   state_mid[2] = C;
   state_mid[3] = D;
   state_mid[4] = E;
   state_mid[5] = F;
   state_mid[6] = G;
   state_mid[7] = H;
}
|
||||
|
||||
// Finish a 16 lane SHA-256 block transform whose first three rounds were
// already computed by sha256_16way_prehash_3rounds().
//
//   state_out  out: resulting state, i.e. working variables after round 63
//                   plus state_in.
//   data       in:  the 16 message words of the block (copied to a local
//                   schedule, the caller's buffer is not modified).
//   state_in   in:  state before this block, used only for the final
//                   feed-forward addition.
//   state_mid  in:  working variables after round 2.
//
// All vectors are accessed as whole __m512i objects, i.e. with aligned
// 64 byte accesses.
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
                     const __m512i *state_in, const __m512i *state_mid )
{
   __m512i W[16];
   __m512i A, B, C, D, E, F, G, H;

   memcpy_512( W, data, 16 );

   A = state_mid[0];
   B = state_mid[1];
   C = state_mid[2];
   D = state_mid[3];
   E = state_mid[4];
   F = state_mid[5];
   G = state_mid[6];
   H = state_mid[7];

   // Rounds 0 to 2 are already folded into state_mid, resume at round 3
   // with the variable rotation that round would normally have.
   SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E,  3, 0 );
   SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D,  4, 0 );
   SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C,  5, 0 );
   SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B,  6, 0 );
   SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A,  7, 0 );
   SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  8, 0 );
   SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  9, 0 );
   SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
   SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
   SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
   SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
   SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
   SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );

   // Rounds 16 to 63, 16 at a time: expand the schedule in place, then
   // run the rounds on the freshly expanded words.
   for ( int j = 16; j <= 48; j += 16 )
   {
      W[ 0] = SHA2x16_MEXP( 14,  9,  1,  0 );
      W[ 1] = SHA2x16_MEXP( 15, 10,  2,  1 );
      W[ 2] = SHA2x16_MEXP(  0, 11,  3,  2 );
      W[ 3] = SHA2x16_MEXP(  1, 12,  4,  3 );
      W[ 4] = SHA2x16_MEXP(  2, 13,  5,  4 );
      W[ 5] = SHA2x16_MEXP(  3, 14,  6,  5 );
      W[ 6] = SHA2x16_MEXP(  4, 15,  7,  6 );
      W[ 7] = SHA2x16_MEXP(  5,  0,  8,  7 );
      W[ 8] = SHA2x16_MEXP(  6,  1,  9,  8 );
      W[ 9] = SHA2x16_MEXP(  7,  2, 10,  9 );
      W[10] = SHA2x16_MEXP(  8,  3, 11, 10 );
      W[11] = SHA2x16_MEXP(  9,  4, 12, 11 );
      W[12] = SHA2x16_MEXP( 10,  5, 13, 12 );
      W[13] = SHA2x16_MEXP( 11,  6, 14, 13 );
      W[14] = SHA2x16_MEXP( 12,  7, 15, 14 );
      W[15] = SHA2x16_MEXP( 13,  8,  0, 15 );

      SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  0, j );
      SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  1, j );
      SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F,  2, j );
      SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E,  3, j );
      SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D,  4, j );
      SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C,  5, j );
      SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B,  6, j );
      SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A,  7, j );
      SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  8, j );
      SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  9, j );
      SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
      SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
      SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
      SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
      SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
      SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
   }

   // Feed-forward: add the state this block started from. All eight sums
   // are formed before anything is written to state_out.
   A = _mm512_add_epi32( A, state_in[0] );
   B = _mm512_add_epi32( B, state_in[1] );
   C = _mm512_add_epi32( C, state_in[2] );
   D = _mm512_add_epi32( D, state_in[3] );
   E = _mm512_add_epi32( E, state_in[4] );
   F = _mm512_add_epi32( F, state_in[5] );
   G = _mm512_add_epi32( G, state_in[6] );
   H = _mm512_add_epi32( H, state_in[7] );

   state_out[0] = A;
   state_out[1] = B;
   state_out[2] = C;
   state_out[3] = D;
   state_out[4] = E;
   state_out[5] = F;
   state_out[6] = G;
   state_out[7] = H;
}
|
||||
|
||||
static void
|
||||
sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] )
|
||||
{
|
||||
register __m512i A, B, C, D, E, F, G, H;
|
||||
register __m512i A, B, C, D, E, F, G, H;
|
||||
__m512i W[16];
|
||||
|
||||
mm512_block_bswap_32( W , in );
|
||||
@@ -657,6 +1060,7 @@ sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] )
|
||||
H = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
}
|
||||
|
||||
|
||||
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
|
||||
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
@@ -800,10 +1204,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] =
|
||||
mm512_bswap_32( m512_const1_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] =
|
||||
mm512_bswap_32( m512_const1_32( low ) );
|
||||
sc->buf[ pad >> 2 ] = m512_const1_32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = m512_const1_32( bswap_32( low ) );
|
||||
|
||||
sha256_16way_round( sc, sc->buf, sc->val );
|
||||
|
||||
|
@@ -3,23 +3,24 @@
|
||||
/* Based on code from Intel, and by Sean Gulley for */
|
||||
/* the miTLS project. */
|
||||
|
||||
// A drop in replacement for the function of the same name in sph_sha2.c.
|
||||
// A stripped down version with byte swapping removed.
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include "sha256-hash-opt.h"
|
||||
|
||||
static void sha2_round( const uint8_t input[], uint32_t state[8] )
|
||||
void sha256_opt_transform( uint32_t *state_out, const void *input,
|
||||
const uint32_t *state_in )
|
||||
{
|
||||
__m128i STATE0, STATE1;
|
||||
__m128i MSG, TMP, MASK;
|
||||
__m128i MSG, TMP;
|
||||
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
|
||||
__m128i ABEF_SAVE, CDGH_SAVE;
|
||||
|
||||
// Load initial values
|
||||
TMP = _mm_load_si128((__m128i*) &state[0]);
|
||||
STATE1 = _mm_load_si128((__m128i*) &state[4]);
|
||||
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
||||
TMP = _mm_load_si128((__m128i*) &state_in[0]);
|
||||
STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
|
||||
// MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
||||
|
||||
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
|
||||
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
|
||||
@@ -31,8 +32,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
|
||||
CDGH_SAVE = STATE1;
|
||||
|
||||
// Rounds 0-3
|
||||
MSG = _mm_load_si128((const __m128i*) (input+0));
|
||||
TMSG0 = _mm_shuffle_epi8(MSG, MASK);
|
||||
TMSG0 = _mm_load_si128((const __m128i*) (input+0));
|
||||
// TMSG0 = _mm_shuffle_epi8(MSG, MASK);
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
@@ -40,7 +41,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
|
||||
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
|
||||
// TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
@@ -49,7 +50,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
|
||||
|
||||
// Rounds 8-11
|
||||
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
|
||||
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
|
||||
// TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
@@ -58,7 +59,7 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
|
||||
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
|
||||
// TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
@@ -192,9 +193,8 @@ static void sha2_round( const uint8_t input[], uint32_t state[8] )
|
||||
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
|
||||
|
||||
// Save state
|
||||
_mm_store_si128((__m128i*) &state[0], STATE0);
|
||||
_mm_store_si128((__m128i*) &state[4], STATE1);
|
||||
_mm_store_si128((__m128i*) &state_out[0], STATE0);
|
||||
_mm_store_si128((__m128i*) &state_out[4], STATE1);
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
18
algo/sha/sha256-hash-opt.h
Normal file
18
algo/sha/sha256-hash-opt.h
Normal file
@@ -0,0 +1,18 @@
|
||||
#ifndef SHA2_HASH_OPT_H__
#define SHA2_HASH_OPT_H__ 1

#include <stddef.h>
#include "simd-utils.h"

// Block transforms built on the x86 SHA extensions, only available when
// the compiler targets a CPU with SHA support.
#if defined(__SHA__)

// Transform one 64 byte message block.
//   state_out  receives the 8 word state after the block.
//   input      the 64 byte block; no byte swapping is applied to it.
//   state_in   the 8 word state before the block.
// NOTE(review): the implementation uses aligned 128 bit loads and stores,
// so all three pointers presumably need 16 byte alignment - confirm.
void sha256_opt_transform( uint32_t *state_out, const void *input,
                           const uint32_t *state_in );

// 2 way with interleaved instructions.
// Presumably the same transform applied to two independent messages,
// X and Y, each with its own input state and output state - confirm
// against the implementation.
void sha256_ni2way_transform( uint32_t *out_X, uint32_t *out_Y,
                              const void *msg_X, const void *msg_Y,
                              const uint32_t *in_X, const uint32_t *in_Y );

#endif   // __SHA__

#endif   // SHA2_HASH_OPT_H__
|
252
algo/sha/sha256d-4way.c
Normal file
252
algo/sha/sha256d-4way.c
Normal file
@@ -0,0 +1,252 @@
|
||||
#include "sha256t-gate.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sha-hash-4way.h"
|
||||
|
||||
#if defined(SHA256D_16WAY)
|
||||
|
||||
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m512i vdata[20] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i last_byte = m512_const1_32( 0x80000000 );
|
||||
const __m512i sixteen = m512_const1_32( 16 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m512_const1_32( pdata[i] );
|
||||
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = m512_const1_64( 0x510E527F510E527F );
|
||||
initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
|
||||
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_16way_transform( midstate, vdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_512( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_512( block + 5, 10 );
|
||||
block[15] = m512_const1_32( 80*8 ); // bit count
|
||||
sha256_16way_transform( hash32, block, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_512( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||
sha256_16way_transform( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
n += 16;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256D_8WAY)
|
||||
|
||||
int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m256i block[16] __attribute__ ((aligned (64)));
|
||||
__m256i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m256i vdata[20] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = m256_const1_32( 0x80000000 );
|
||||
const __m256i eight = m256_const1_32( 8 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m256_const1_32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = m256_const1_64( 0x510E527F510E527F );
|
||||
initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
|
||||
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_8way_transform( midstate, vdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_256( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_256( block + 5, 10 );
|
||||
block[15] = m256_const1_32( 80*8 ); // bit count
|
||||
sha256_8way_transform( hash32, block, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_256( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = m256_const1_32( 32*8 ); // bit count
|
||||
sha256_8way_transform( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||
n += 8;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256D_4WAY)
|
||||
|
||||
int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m128i block[16] __attribute__ ((aligned (64)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m128i vdata[20] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = m128_const1_32( 0x80000000 );
|
||||
const __m128i four = m128_const1_32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m128_const1_32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = m128_const1_64( 0x510E527F510E527F );
|
||||
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
|
||||
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform( midstate, vdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_128( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_128( block + 5, 10 );
|
||||
block[15] = m128_const1_32( 80*8 ); // bit count
|
||||
sha256_4way_transform( hash32, block, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_128( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
sha256_4way_transform( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -7,133 +7,173 @@
|
||||
|
||||
#if defined(SHA256T_16WAY)
|
||||
|
||||
static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256t_16way_hash( void* output, const void* input )
|
||||
{
|
||||
uint32_t vhash[8*16] __attribute__ ((aligned (64)));
|
||||
sha256_16way_context ctx;
|
||||
memcpy( &ctx, &sha256_ctx16, sizeof ctx );
|
||||
|
||||
sha256_16way_update( &ctx, input + (64<<4), 16 );
|
||||
sha256_16way_close( &ctx, vhash );
|
||||
|
||||
sha256_16way_init( &ctx );
|
||||
sha256_16way_update( &ctx, vhash, 32 );
|
||||
sha256_16way_close( &ctx, vhash );
|
||||
|
||||
sha256_16way_init( &ctx );
|
||||
sha256_16way_update( &ctx, vhash, 32 );
|
||||
sha256_16way_close( &ctx, output );
|
||||
}
|
||||
|
||||
int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash32[8*16] __attribute__ ((aligned (32)));
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate[8] __attribute__ ((aligned (32)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = &(hash32[7<<4]);
|
||||
__m512i vdata[20] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
||||
__m512i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i last_byte = m512_const1_32( 0x80000000 );
|
||||
const __m512i sixteen = m512_const1_32( 16 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m512_const1_32( pdata[i] );
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
sha256_16way_init( &sha256_ctx16 );
|
||||
sha256_16way_update( &sha256_ctx16, vdata, 64 );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = m512_const1_64( 0x510E527F510E527F );
|
||||
initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
|
||||
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 byte block of data
|
||||
sha256_16way_transform( midstate, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate );
|
||||
|
||||
do
|
||||
{
|
||||
pdata[19] = n;
|
||||
sha256t_16way_hash( hash32, vdata );
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
|
||||
n += 16;
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_512( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_512( block + 5, 10 );
|
||||
block[15] = m512_const1_32( 80*8 ); // bit count
|
||||
sha256_16way_final_rounds( hash32, block, midstate, midstate2 );
|
||||
// sha256_16way_transform( hash32, block, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_512( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = m512_const1_32( 32*8 ); // bit count
|
||||
sha256_16way_transform( hash32, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy_512( block, hash32, 8 );
|
||||
sha256_16way_transform( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
n += 16;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256T_8WAY)
|
||||
|
||||
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256t_8way_hash( void* output, const void* input )
|
||||
{
|
||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||
sha256_8way_context ctx;
|
||||
memcpy( &ctx, &sha256_ctx8, sizeof ctx );
|
||||
|
||||
sha256_8way_update( &ctx, input + (64<<3), 16 );
|
||||
sha256_8way_close( &ctx, vhash );
|
||||
|
||||
sha256_8way_init( &ctx );
|
||||
sha256_8way_update( &ctx, vhash, 32 );
|
||||
sha256_8way_close( &ctx, vhash );
|
||||
|
||||
sha256_8way_init( &ctx );
|
||||
sha256_8way_update( &ctx, vhash, 32 );
|
||||
sha256_8way_close( &ctx, output );
|
||||
}
|
||||
|
||||
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash32[8*8] __attribute__ ((aligned (32)));
|
||||
__m256i block[16] __attribute__ ((aligned (64)));
|
||||
__m256i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = &(hash32[7<<3]);
|
||||
__m256i vdata[20] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
__m256i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = m256_const1_32( 0x80000000 );
|
||||
const __m256i eight = m256_const1_32( 8 );
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||
sha256_8way_init( &sha256_ctx8 );
|
||||
sha256_8way_update( &sha256_ctx8, vdata, 64 );
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m256_const1_32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = m256_const1_64( 0x510E527F510E527F );
|
||||
initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
|
||||
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_8way_transform( midstate, vdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
pdata[19] = n;
|
||||
sha256t_8way_hash( hash32, vdata );
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
|
||||
n += 8;
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_256( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_256( block + 5, 10 );
|
||||
block[15] = m256_const1_32( 80*8 ); // bit count
|
||||
sha256_8way_transform( hash32, block, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_256( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = m256_const1_32( 32*8 ); // bit count
|
||||
sha256_8way_transform( hash32, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy_256( block, hash32, 8 );
|
||||
sha256_8way_transform( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||
n += 8;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
@@ -144,82 +184,84 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
#if defined(SHA256T_4WAY)
|
||||
|
||||
static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256t_4way_hash( void* output, const void* input )
|
||||
{
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
sha256_4way_context ctx;
|
||||
memcpy( &ctx, &sha256_ctx4, sizeof ctx );
|
||||
|
||||
sha256_4way_update( &ctx, input + (64<<2), 16 );
|
||||
sha256_4way_close( &ctx, vhash );
|
||||
|
||||
sha256_4way_init( &ctx );
|
||||
sha256_4way_update( &ctx, vhash, 32 );
|
||||
sha256_4way_close( &ctx, vhash );
|
||||
|
||||
sha256_4way_init( &ctx );
|
||||
sha256_4way_update( &ctx, vhash, 32 );
|
||||
sha256_4way_close( &ctx, output );
|
||||
}
|
||||
|
||||
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
__m128i block[16] __attribute__ ((aligned (64)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m128i vdata[20] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = m128_const1_32( 0x80000000 );
|
||||
const __m128i four = m128_const1_32( 4 );
|
||||
|
||||
const uint64_t htmax[] = { 0,
|
||||
0xF,
|
||||
0xFF,
|
||||
0xFFF,
|
||||
0xFFFF,
|
||||
0x10000000 };
|
||||
const uint32_t masks[] = { 0xFFFFFFFF,
|
||||
0xFFFFFFF0,
|
||||
0xFFFFFF00,
|
||||
0xFFFFF000,
|
||||
0xFFFF0000,
|
||||
0 };
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = m128_const1_32( pdata[i] );
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
sha256_4way_init( &sha256_ctx4 );
|
||||
sha256_4way_update( &sha256_ctx4, vdata, 64 );
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
|
||||
// initialize state
|
||||
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = m128_const1_64( 0x510E527F510E527F );
|
||||
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
|
||||
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform( midstate, vdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
const uint32_t mask = masks[m];
|
||||
do {
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
|
||||
pdata[19] = n;
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy_128( block, vdata + 16, 4 );
|
||||
block[ 4] = last_byte;
|
||||
memset_zero_128( block + 5, 10 );
|
||||
block[15] = m128_const1_32( 80*8 ); // bit count
|
||||
sha256_4way_transform( hash32, block, midstate );
|
||||
|
||||
sha256t_4way_hash( hash, vdata );
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy_128( block, hash32, 8 );
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = m128_const1_32( 32*8 ); // bit count
|
||||
sha256_4way_transform( hash32, block, initstate );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( !( hash7[ lane ] & mask ) )
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy_128( block, hash32, 8 );
|
||||
sha256_4way_transform( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -5,17 +5,13 @@ bool register_sha256t_algo( algo_gate_t* gate )
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined(SHA256T_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256t_16way;
|
||||
gate->hash = (void*)&sha256t_16way_hash;
|
||||
#elif defined(__SHA__)
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256t;
|
||||
gate->hash = (void*)&sha256t_hash;
|
||||
#elif defined(SHA256T_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256t_8way;
|
||||
gate->hash = (void*)&sha256t_8way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_sha256t_4way;
|
||||
gate->hash = (void*)&sha256t_4way_hash;
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
@@ -17,7 +17,6 @@ bool register_sha256q_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(SHA256T_16WAY)
|
||||
|
||||
void sha256t_16way_hash( void *output, const void *input );
|
||||
int scanhash_sha256t_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void sha256q_16way_hash( void *output, const void *input );
|
||||
@@ -27,7 +26,6 @@ int scanhash_sha256q_16way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#if defined(SHA256T_8WAY)
|
||||
|
||||
void sha256t_8way_hash( void *output, const void *input );
|
||||
int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void sha256q_8way_hash( void *output, const void *input );
|
||||
@@ -37,7 +35,6 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#if defined(SHA256T_4WAY)
|
||||
|
||||
void sha256t_4way_hash( void *output, const void *input );
|
||||
int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void sha256q_4way_hash( void *output, const void *input );
|
||||
@@ -45,10 +42,13 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
int sha256t_hash( void *output, const void *input );
|
||||
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
int sha256q_hash( void *output, const void *input );
|
||||
int scanhash_sha256q( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
@@ -3,10 +3,14 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
//#include "algo/sha/sph_sha2.h"
|
||||
#include "sha256-hash-opt.h"
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
// Only used on CPUs with SHA
|
||||
|
||||
/*
|
||||
static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256t_midstate( const void* input )
|
||||
@@ -37,12 +41,21 @@ int sha256t_hash( void* output, const void* input )
|
||||
|
||||
return 1;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t edata[20] __attribute__((aligned(64)));
|
||||
uint32_t hash[8] __attribute__((aligned(64)));
|
||||
uint32_t block[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash32[8] __attribute__ ((aligned (32)));
|
||||
uint32_t initstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t midstate[8] __attribute__ ((aligned (32)));
|
||||
|
||||
|
||||
|
||||
// uint32_t edata[20] __attribute__((aligned(64)));
|
||||
// uint32_t hash[8] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -50,24 +63,148 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
__m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
sha256t_midstate( edata );
|
||||
// mm128_bswap32_80( edata, pdata );
|
||||
// sha256t_midstate( edata );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = 0x6A09E667;
|
||||
initstate[1] = 0xBB67AE85;
|
||||
initstate[2] = 0x3C6EF372;
|
||||
initstate[3] = 0xA54FF53A;
|
||||
initstate[4] = 0x510E527F;
|
||||
initstate[5] = 0x9B05688C;
|
||||
initstate[6] = 0x1F83D9AB;
|
||||
initstate[7] = 0x5BE0CD19;
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_opt_transform( midstate, pdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
edata[19] = n;
|
||||
if ( likely( sha256t_hash( hash, edata ) ) )
|
||||
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n );
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy( block, pdata + 16, 16 );
|
||||
block[ 4] = 0x80000000;
|
||||
memset( block + 5, 0, 40 );
|
||||
block[15] = 80*8; // bit count
|
||||
sha256_opt_transform( hash32, block, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy( block, hash32, 32 );
|
||||
block[ 8] = 0x80000000;
|
||||
memset( block + 9, 0, 24 );
|
||||
block[15] = 32*8; // bit count
|
||||
sha256_opt_transform( hash32, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy( block, hash32, 32 );
|
||||
sha256_opt_transform( hash32, block, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
casti_m128i( hash32, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash32, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash32, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash32, 1 ), shuf_bswap32 );
|
||||
|
||||
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
|
||||
submit_solution( work, hash32, mythr );
|
||||
n++;
|
||||
} while ( n < last_nonce && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t block0[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block1[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t initstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t midstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 1;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
__m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = 0x6A09E667;
|
||||
initstate[1] = 0xBB67AE85;
|
||||
initstate[2] = 0x3C6EF372;
|
||||
initstate[3] = 0xA54FF53A;
|
||||
initstate[4] = 0x510E527F;
|
||||
initstate[5] = 0x9B05688C;
|
||||
initstate[6] = 0x1F83D9AB;
|
||||
initstate[7] = 0x5BE0CD19;
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_opt_transform( midstate, pdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy( block0, pdata + 16, 16 );
|
||||
memcpy( block1, pdata + 16, 16 );
|
||||
block0[ 3] = n;
|
||||
block1[ 3] = n+1;
|
||||
block0[ 4] = block1[ 4] = 0x80000000;
|
||||
memset( block0 + 5, 0, 40 );
|
||||
memset( block1 + 5, 0, 40 );
|
||||
block0[15] = block1[15] = 80*8; // bit count
|
||||
sha256_ni2way_transform( hash0, hash1, block0, block1, midstate, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy( block0, hash0, 32 );
|
||||
memcpy( block1, hash1, 32 );
|
||||
block0[ 8] = block1[ 8] = 0x80000000;
|
||||
memset( block0 + 9, 0, 24 );
|
||||
memset( block1 + 9, 0, 24 );
|
||||
block0[15] = block1[15] = 32*8; // bit count
|
||||
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy( block0, hash0, 32 );
|
||||
memcpy( block1, hash1, 32 );
|
||||
sha256_ni2way_transform( hash0, hash1, block0, block1, initstate, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
casti_m128i( hash0, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash0, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
|
||||
casti_m128i( hash1, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash1, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
|
||||
|
||||
if ( unlikely( valid_hash( hash0, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash0, mythr );
|
||||
}
|
||||
if ( unlikely( valid_hash( hash1, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_solution( work, hash1, mythr );
|
||||
}
|
||||
n += 2;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -96,74 +96,22 @@ static const uint64_t K512[80] =
|
||||
// SHA-512 8 way 64 bit
|
||||
|
||||
#define CH8W(X, Y, Z) \
|
||||
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z )
|
||||
_mm512_ternarylogic_epi64( X, Y, Z, 0xca )
|
||||
|
||||
#define MAJ8W(X, Y, Z) \
|
||||
_mm512_or_si512( _mm512_and_si512( X, Y ), \
|
||||
_mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
|
||||
_mm512_ternarylogic_epi64( X, Y, Z, 0xe8 )
|
||||
|
||||
#define BSG8W_5_0(x) \
|
||||
_mm512_xor_si512( _mm512_xor_si512( \
|
||||
mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) )
|
||||
mm512_xor3( mm512_ror_64(x, 28), mm512_ror_64(x, 34), mm512_ror_64(x, 39) )
|
||||
|
||||
#define BSG8W_5_1(x) \
|
||||
_mm512_xor_si512( _mm512_xor_si512( \
|
||||
mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) )
|
||||
mm512_xor3( mm512_ror_64(x, 14), mm512_ror_64(x, 18), mm512_ror_64(x, 41) )
|
||||
|
||||
#define SSG8W_5_0(x) \
|
||||
_mm512_xor_si512( _mm512_xor_si512( \
|
||||
mm512_ror_64(x, 1), mm512_ror_64(x, 8) ), _mm512_srli_epi64(x, 7) )
|
||||
mm512_xor3( mm512_ror_64(x, 1), mm512_ror_64(x, 8), _mm512_srli_epi64(x, 7) )
|
||||
|
||||
#define SSG8W_5_1(x) \
|
||||
_mm512_xor_si512( _mm512_xor_si512( \
|
||||
mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) )
|
||||
|
||||
static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 )
|
||||
{
|
||||
__m512i w0a, w1a, w0b, w1b;
|
||||
w0a = mm512_ror_64( w0, 1 );
|
||||
w1a = mm512_ror_64( w1,19 );
|
||||
w0b = mm512_ror_64( w0, 8 );
|
||||
w1b = mm512_ror_64( w1,61 );
|
||||
w0a = _mm512_xor_si512( w0a, w0b );
|
||||
w1a = _mm512_xor_si512( w1a, w1b );
|
||||
w0b = _mm512_srli_epi64( w0, 7 );
|
||||
w1b = _mm512_srli_epi64( w1, 6 );
|
||||
w0a = _mm512_xor_si512( w0a, w0b );
|
||||
w1a = _mm512_xor_si512( w1a, w1b );
|
||||
return _mm512_add_epi64( w0a, w1a );
|
||||
}
|
||||
|
||||
|
||||
// Stores ror(x,1) ^ ror(x,8) ^ (x >> 7) into w0 for x = W[i-15] and into w1
// for x = W[i-14]. Expects an array W of __m512i in the enclosing scope.
// Both inputs are read before either output is written.
#define SSG8W_512x2_0( w0, w1, i ) do \
{ \
   const __m512i Wa = W[i-15]; \
   const __m512i Wb = W[i-14]; \
   const __m512i Ra = _mm512_xor_si512( mm512_ror_64( Wa, 1 ), \
                                        mm512_ror_64( Wa, 8 ) ); \
   const __m512i Rb = _mm512_xor_si512( mm512_ror_64( Wb, 1 ), \
                                        mm512_ror_64( Wb, 8 ) ); \
   const __m512i Sa = _mm512_srli_epi64( Wa, 7 ); \
   const __m512i Sb = _mm512_srli_epi64( Wb, 7 ); \
   w0 = _mm512_xor_si512( Ra, Sa ); \
   w1 = _mm512_xor_si512( Rb, Sb ); \
} while(0)
|
||||
|
||||
// Stores ror(x,19) ^ ror(x,61) ^ (x >> 6) into w0 for x = W[i-2] and into w1
// for x = W[i-1]. Expects an array W of __m512i in the enclosing scope.
// Both inputs are read before either output is written.
#define SSG8W_512x2_1( w0, w1, i ) do \
{ \
   const __m512i Wa = W[i-2]; \
   const __m512i Wb = W[i-1]; \
   const __m512i Ra = _mm512_xor_si512( mm512_ror_64( Wa,19 ), \
                                        mm512_ror_64( Wa,61 ) ); \
   const __m512i Rb = _mm512_xor_si512( mm512_ror_64( Wb,19 ), \
                                        mm512_ror_64( Wb,61 ) ); \
   const __m512i Sa = _mm512_srli_epi64( Wa, 6 ); \
   const __m512i Sb = _mm512_srli_epi64( Wb, 6 ); \
   w0 = _mm512_xor_si512( Ra, Sa ); \
   w1 = _mm512_xor_si512( Rb, Sb ); \
} while(0)
|
||||
mm512_xor3( mm512_ror_64(x, 19), mm512_ror_64(x, 61), _mm512_srli_epi64(x, 6) )
|
||||
|
||||
#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
||||
do { \
|
||||
@@ -187,8 +135,8 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
|
||||
mm512_block_bswap_64( W+8, in+8 );
|
||||
|
||||
for ( i = 16; i < 80; i++ )
|
||||
W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ),
|
||||
_mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
|
||||
W[i] = mm512_add4_64( SSG8W_5_0( W[i-15] ), SSG8W_5_1( W[i-2] ),
|
||||
W[ i- 7 ], W[ i-16 ] );
|
||||
|
||||
if ( ctx->initialized )
|
||||
{
|
||||
@@ -319,14 +267,20 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
||||
|
||||
// SHA-512 4 way 64 bit
|
||||
|
||||
/*
|
||||
|
||||
#define CH(X, Y, Z) \
|
||||
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
|
||||
|
||||
/*
|
||||
#define MAJ(X, Y, Z) \
|
||||
_mm256_or_si256( _mm256_and_si256( X, Y ), \
|
||||
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
|
||||
*/
|
||||
|
||||
#define MAJ(X, Y, Z) \
|
||||
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
|
||||
Y_xor_Z ) )
|
||||
|
||||
#define BSG5_0(x) \
|
||||
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
|
||||
_mm256_xor_si256( mm256_ror_64( x, 5 ), x ), 6 ), x ), 28 )
|
||||
@@ -334,7 +288,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
||||
#define BSG5_1(x) \
|
||||
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
|
||||
_mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 )
|
||||
*/
|
||||
|
||||
/*
|
||||
#define BSG5_0(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( \
|
||||
@@ -402,7 +356,7 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
|
||||
w1 = _mm256_xor_si256( X1a, X1b ); \
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
/*
|
||||
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
||||
do { \
|
||||
__m256i K = _mm256_set1_epi64x( K512[ i ] ); \
|
||||
@@ -431,7 +385,7 @@ do { \
|
||||
H = _mm256_add_epi64( T1, T2 ); \
|
||||
D = _mm256_add_epi64( D, T1 ); \
|
||||
} while (0)
|
||||
|
||||
*/
|
||||
/*
|
||||
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
||||
do { \
|
||||
@@ -445,7 +399,7 @@ do { \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
/*
|
||||
|
||||
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
||||
do { \
|
||||
__m256i T1, T2; \
|
||||
@@ -453,16 +407,17 @@ do { \
|
||||
T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \
|
||||
K, W[i] ) ); \
|
||||
T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = _mm256_add_epi64( D, T1 ); \
|
||||
H = _mm256_add_epi64( T1, T2 ); \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
|
||||
static void
|
||||
sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
|
||||
{
|
||||
int i;
|
||||
register __m256i A, B, C, D, E, F, G, H;
|
||||
register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
|
||||
__m256i W[80];
|
||||
|
||||
mm256_block_bswap_64( W , in );
|
||||
@@ -495,6 +450,8 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
|
||||
H = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
}
|
||||
|
||||
Y_xor_Z = _mm256_xor_si256( B, C );
|
||||
|
||||
for ( i = 0; i < 80; i += 8 )
|
||||
{
|
||||
SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
|
||||
|
@@ -40,8 +40,8 @@
|
||||
#endif
|
||||
|
||||
#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
|
||||
#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X)))
|
||||
|
||||
//#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X)))
|
||||
#define MAJ( X, Y, Z ) ( Y ^ ( ( X_xor_Y = X ^ Y ) & ( Y_xor_Z ) ) )
|
||||
#define ROTR SPH_ROTR32
|
||||
|
||||
#define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
|
||||
@@ -73,7 +73,194 @@ static const sph_u32 H256[8] = {
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
#include "sha256-hash-opt.c"
|
||||
#include "simd-utils.h"
|
||||
|
||||
// SHA-256 compression of one 64 byte block using the x86 SHA extensions.
//
// input: 64 message bytes in stream order; each 32-bit word is byte swapped
//        with MASK before use.
// state: the 8 working words A..H in state[0..7], updated in place.
//
// Neither pointer needs any particular alignment: input is a byte pointer
// and may point anywhere in a caller's buffer, so unaligned loads and stores
// are used throughout.
static void sha2_round( const uint8_t input[], uint32_t state[8] )
{
    __m128i STATE0, STATE1;
    __m128i MSG, TMP, MASK;
    __m128i TMSG0, TMSG1, TMSG2, TMSG3;
    __m128i ABEF_SAVE, CDGH_SAVE;

    // Load initial values
    TMP = _mm_loadu_si128((const __m128i*) &state[0]);
    STATE1 = _mm_loadu_si128((const __m128i*) &state[4]);
    MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);

    // Rearrange into the lane order expected by sha256rnds2
    TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
    STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
    STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
    STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH

    // Save current hash
    ABEF_SAVE = STATE0;
    CDGH_SAVE = STATE1;

    // Rounds 0-3
    MSG = _mm_loadu_si128((const __m128i*) (input+0));
    TMSG0 = _mm_shuffle_epi8(MSG, MASK);
    MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Rounds 4-7
    TMSG1 = _mm_loadu_si128((const __m128i*) (input+16));
    TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
    MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

    // Rounds 8-11
    TMSG2 = _mm_loadu_si128((const __m128i*) (input+32));
    TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
    MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

    // Rounds 12-15
    TMSG3 = _mm_loadu_si128((const __m128i*) (input+48));
    TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
    MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
    TMSG0 = _mm_add_epi32(TMSG0, TMP);
    TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

    // Rounds 16-19
    MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
    TMSG1 = _mm_add_epi32(TMSG1, TMP);
    TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

    // Rounds 20-23
    MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
    TMSG2 = _mm_add_epi32(TMSG2, TMP);
    TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

    // Rounds 24-27
    MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
    TMSG3 = _mm_add_epi32(TMSG3, TMP);
    TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

    // Rounds 28-31
    MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
    TMSG0 = _mm_add_epi32(TMSG0, TMP);
    TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

    // Rounds 32-35
    MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
    TMSG1 = _mm_add_epi32(TMSG1, TMP);
    TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

    // Rounds 36-39
    MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
    TMSG2 = _mm_add_epi32(TMSG2, TMP);
    TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

    // Rounds 40-43
    MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
    TMSG3 = _mm_add_epi32(TMSG3, TMP);
    TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

    // Rounds 44-47
    MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
    TMSG0 = _mm_add_epi32(TMSG0, TMP);
    TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

    // Rounds 48-51
    MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
    TMSG1 = _mm_add_epi32(TMSG1, TMP);
    TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

    // Rounds 52-55
    MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
    TMSG2 = _mm_add_epi32(TMSG2, TMP);
    TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Rounds 56-59
    MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
    TMSG3 = _mm_add_epi32(TMSG3, TMP);
    TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Rounds 60-63
    MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Add values back to state
    STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
    STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);

    // Undo the lane rearrangement so the words are stored as A..H
    TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
    STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
    STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // HGFE

    // Save state
    _mm_storeu_si128((__m128i*) &state[0], STATE0);
    _mm_storeu_si128((__m128i*) &state[4], STATE1);
}
|
||||
|
||||
#else // no SHA
|
||||
|
||||
@@ -132,6 +319,7 @@ static const sph_u32 K[64] = {
|
||||
t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \
|
||||
+ K[pcount + (pc)] + W[(pc) & 0x0F]); \
|
||||
t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
d = SPH_T32(d + t1); \
|
||||
h = SPH_T32(t1 + t2); \
|
||||
} while (0)
|
||||
@@ -142,7 +330,7 @@ static const sph_u32 K[64] = {
|
||||
SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc)
|
||||
|
||||
#define SHA2_ROUND_BODY(in, r) do { \
|
||||
sph_u32 A, B, C, D, E, F, G, H; \
|
||||
sph_u32 A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; \
|
||||
sph_u32 W[16]; \
|
||||
unsigned pcount; \
|
||||
\
|
||||
@@ -155,6 +343,7 @@ static const sph_u32 K[64] = {
|
||||
G = (r)[6]; \
|
||||
H = (r)[7]; \
|
||||
pcount = 0; \
|
||||
Y_xor_Z = B ^ C; \
|
||||
SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \
|
||||
SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \
|
||||
SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \
|
||||
@@ -202,7 +391,7 @@ static const sph_u32 K[64] = {
|
||||
#else // large footprint (default)
|
||||
|
||||
#define SHA2_ROUND_BODY(in, r) do { \
|
||||
sph_u32 A, B, C, D, E, F, G, H, T1, T2; \
|
||||
sph_u32 A, B, C, D, E, F, G, H, T1, T2, X_xor_Y, Y_xor_Z;; \
|
||||
sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \
|
||||
sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \
|
||||
\
|
||||
@@ -214,388 +403,453 @@ static const sph_u32 K[64] = {
|
||||
F = (r)[5]; \
|
||||
G = (r)[6]; \
|
||||
H = (r)[7]; \
|
||||
Y_xor_Z = B ^ C; \
|
||||
W00 = in(0); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0x428A2F98) + W00); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W01 = in(1); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0x71374491) + W01); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W02 = in(2); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0xB5C0FBCF) + W02); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W03 = in(3); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0xE9B5DBA5) + W03); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W04 = in(4); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x3956C25B) + W04); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W05 = in(5); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0x59F111F1) + W05); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W06 = in(6); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x923F82A4) + W06); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W07 = in(7); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0xAB1C5ED5) + W07); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W08 = in(8); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0xD807AA98) + W08); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W09 = in(9); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0x12835B01) + W09); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W10 = in(10); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0x243185BE) + W10); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W11 = in(11); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0x550C7DC3) + W11); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W12 = in(12); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x72BE5D74) + W12); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W13 = in(13); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0x80DEB1FE) + W13); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W14 = in(14); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x9BDC06A7) + W14); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W15 = in(15); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0xC19BF174) + W15); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0xE49B69C1) + W00); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0xEFBE4786) + W01); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0x0FC19DC6) + W02); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0x240CA1CC) + W03); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x2DE92C6F) + W04); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0x4A7484AA) + W05); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x5CB0A9DC) + W06); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0x76F988DA) + W07); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0x983E5152) + W08); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0xA831C66D) + W09); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0xB00327C8) + W10); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0xBF597FC7) + W11); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0xC6E00BF3) + W12); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0xD5A79147) + W13); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x06CA6351) + W14); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0x14292967) + W15); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0x27B70A85) + W00); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0x2E1B2138) + W01); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0x4D2C6DFC) + W02); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0x53380D13) + W03); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x650A7354) + W04); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0x766A0ABB) + W05); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x81C2C92E) + W06); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0x92722C85) + W07); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0xA2BFE8A1) + W08); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0xA81A664B) + W09); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0xC24B8B70) + W10); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0xC76C51A3) + W11); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0xD192E819) + W12); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0xD6990624) + W13); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0xF40E3585) + W14); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0x106AA070) + W15); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0x19A4C116) + W00); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0x1E376C08) + W01); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0x2748774C) + W02); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0x34B0BCB5) + W03); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x391C0CB3) + W04); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0x4ED8AA4A) + W05); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x5B9CCA4F) + W06); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0x682E6FF3) + W07); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0x748F82EE) + W08); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0x78A5636F) + W09); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0x84C87814) + W10); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0x8CC70208) + W11); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x90BEFFFA) + W12); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0xA4506CEB) + W13); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0xBEF9A3F7) + W14); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0xC67178F2) + W15); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
(r)[0] = SPH_T32((r)[0] + A); \
|
||||
@@ -691,6 +945,14 @@ sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
// sph_sha256_init(cc);
|
||||
}
|
||||
|
||||
void sph_sha256_full( void *dst, const void *data, size_t len )
|
||||
{
|
||||
sph_sha256_context cc;
|
||||
sph_sha256_init( &cc );
|
||||
sph_sha256( &cc, data, len );
|
||||
sph_sha256_close( &cc, dst );
|
||||
}
|
||||
|
||||
/* see sph_sha2.h */
|
||||
//void
|
||||
//sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8])
|
||||
|
@@ -205,6 +205,10 @@ void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]);
|
||||
#define sph_sha256_comp sph_sha224_comp
|
||||
#endif
|
||||
|
||||
void sph_sha256_full( void *dst, const void *data, size_t len );
|
||||
|
||||
|
||||
|
||||
#if SPH_64
|
||||
|
||||
/**
|
||||
|
@@ -38,7 +38,8 @@
|
||||
#if SPH_64
|
||||
|
||||
#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
|
||||
#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z)))
|
||||
//#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z)))
|
||||
/* Bitwise majority of X, Y and Z. Where X and Y agree the result is Y,
 * otherwise it is Z. Every argument is parenthesized so that expression
 * arguments (e.g. a | b) keep their own precedence inside the expansion. */
#define MAJ( X, Y, Z ) ( (Y) ^ ( ( (X) ^ (Y) ) & ( (Y) ^ (Z) ) ) )
|
||||
|
||||
#define ROTR64 SPH_ROTR64
|
||||
|
||||
|
@@ -310,12 +310,13 @@ do { \
|
||||
|
||||
#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
|
||||
do { \
|
||||
xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256( \
|
||||
xa0 = mm256_xor3( xm, xb1, _mm256_xor_si256( \
|
||||
_mm256_andnot_si256( xb3, xb2 ), \
|
||||
_mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \
|
||||
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \
|
||||
) ), _mm256_set1_epi32(3UL) ) ) ) ); \
|
||||
xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \
|
||||
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
|
||||
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
|
||||
_mm256_set1_epi32(5UL) ) ), \
|
||||
_mm256_set1_epi32(3UL) ) ) ); \
|
||||
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_0_8 do { \
|
||||
|
@@ -23,6 +23,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
|
||||
register __m512i K0, K1, K2, K3, K4, K5, K6, K7;
|
||||
__m512i *M = (__m512i*)msg;
|
||||
__m512i *H = (__m512i*)ctx->h;
|
||||
const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2,
|
||||
ctx->count1, ctx->count0 );
|
||||
int r;
|
||||
|
||||
P0 = H[0];
|
||||
@@ -62,16 +64,16 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
|
||||
_mm512_aesenc_epi128( K0, m512_zero ) ) );
|
||||
|
||||
if ( r == 0 )
|
||||
K0 = _mm512_xor_si512( K0, _mm512_set4_epi32(
|
||||
~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );
|
||||
K0 = _mm512_xor_si512( K0,
|
||||
_mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) );
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
|
||||
K1 = _mm512_xor_si512( K0,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
|
||||
|
||||
if ( r == 1 )
|
||||
K1 = _mm512_xor_si512( K1, _mm512_set4_epi32(
|
||||
~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );
|
||||
K1 = _mm512_xor_si512( K1, mm512_ror128_32(
|
||||
_mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
K2 = _mm512_xor_si512( K1,
|
||||
@@ -96,8 +98,8 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
|
||||
|
||||
if ( r == 2 )
|
||||
K7 = _mm512_xor_si512( K7, _mm512_set4_epi32(
|
||||
~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );
|
||||
K7 = _mm512_xor_si512( K7, mm512_swap128_64(
|
||||
_mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) );
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
|
||||
P1 = _mm512_xor_si512( P1, X );
|
||||
|
@@ -101,15 +101,6 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
|
||||
// round
|
||||
|
||||
// working proof of concept
|
||||
/*
|
||||
__m512i K = m512_const1_128( m[0] );
|
||||
__m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K );
|
||||
X = _mm512_aesenc_epi128( X, m512_zero );
|
||||
k00 = _mm512_castsi512_si128( K );
|
||||
x = _mm512_castsi512_si128( X );
|
||||
*/
|
||||
|
||||
k00 = m[0];
|
||||
x = _mm_xor_si128( p1, k00 );
|
||||
x = _mm_aesenc_si128( x, zero );
|
||||
|
@@ -747,11 +747,6 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
|
||||
|
||||
static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) };
|
||||
|
||||
|
||||
// static const m512_v16 code[] = { c1_16(185), c1_16(233),
|
||||
// c1_16(185), c1_16(233) };
|
||||
|
||||
|
||||
S0l = _mm512_xor_si512( S[0], M[0] );
|
||||
S0h = _mm512_xor_si512( S[1], M[1] );
|
||||
S1l = _mm512_xor_si512( S[2], M[2] );
|
||||
@@ -764,11 +759,16 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
|
||||
// targeted: local macros don't need a unique name
|
||||
#define S(i) S##i
|
||||
|
||||
#define F_0( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xca )
|
||||
#define F_1( B, C, D ) _mm512_ternarylogic_epi32( B, C, D, 0xe8 )
|
||||
|
||||
/*
|
||||
#define F_0(B, C, D) \
|
||||
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( C,D ), B ), D )
|
||||
#define F_1(B, C, D) \
|
||||
_mm512_or_si512( _mm512_and_si512( D, C ),\
|
||||
_mm512_and_si512( _mm512_or_si512( D,C ), B ) )
|
||||
*/
|
||||
|
||||
#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
|
||||
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
|
||||
|
@@ -6,10 +6,6 @@
|
||||
|
||||
#define PRINT_SOME 0
|
||||
|
||||
/* JDD: all occurrences of macro X in this file were renamed to XX
|
||||
* due to name conflict
|
||||
*/
|
||||
|
||||
int SupportedLength(int hashbitlen) {
|
||||
if (hashbitlen <= 0 || hashbitlen > 512)
|
||||
return 0;
|
||||
|
@@ -309,22 +309,16 @@ static const uint64_t IV512[] = {
|
||||
sc->bcount = bcount; \
|
||||
} while (0)
|
||||
|
||||
// AVX2 all scalar vars are now vectors representing 4 nonces in parallel
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
|
||||
do { \
|
||||
k8 = _mm512_xor_si512( _mm512_xor_si512( \
|
||||
_mm512_xor_si512( _mm512_xor_si512( k0, k1 ), \
|
||||
_mm512_xor_si512( k2, k3 ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( k4, k5 ), \
|
||||
_mm512_xor_si512( k6, k7 ) ) ), \
|
||||
m512_const1_64( 0x1BD11BDAA9FC1A22) ); \
|
||||
k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), mm512_xor3( k3, k4, k5 ), \
|
||||
mm512_xor3( k6, k7, m512_const1_64( 0x1BD11BDAA9FC1A22) ));\
|
||||
t2 = t0 ^ t1; \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define TFBIG_ADDKEY_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \
|
||||
do { \
|
||||
w0 = _mm512_add_epi64( w0, SKBI(k,s,0) ); \
|
||||
@@ -340,7 +334,6 @@ do { \
|
||||
m512_const1_64( s ) ) ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define TFBIG_MIX_8WAY(x0, x1, rc) \
|
||||
do { \
|
||||
x0 = _mm512_add_epi64( x0, x1 ); \
|
||||
|
741
algo/verthash/Verthash.c
Normal file
741
algo/verthash/Verthash.c
Normal file
@@ -0,0 +1,741 @@
|
||||
/*
|
||||
* Copyright 2018-2021 CryptoGraphics
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License as published by the Free
|
||||
* Software Foundation; either version 2 of the License, or (at your option)
|
||||
* any later version. See LICENSE for more details.
|
||||
*/
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include "Verthash.h"
|
||||
#include "mm_malloc.h"
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Verthash info management
|
||||
/*
 * Load the Verthash data file named by file_name into `info`.
 *
 * All fields of `info` are reset first. The file name is copied, the file is
 * opened (and generated via verthash_generate_data_file() when it is missing,
 * no data file was given explicitly and opt_verify is set), its size is
 * measured and its whole content is read into a 64-byte aligned buffer
 * obtained from _mm_malloc().
 *
 * Returns:
 *    0  success; info->fileName, data, dataSize and bitmask are populated.
 *   -1  invalid name, open/generate/read failure or undersized file
 *       (an error has been logged here).
 *    1  the file size could not be determined (nothing is logged here).
 *    2  the data buffer could not be allocated (nothing is logged here).
 *
 * On every failure the partially acquired resources are released and the
 * pointers in `info` are reset to NULL, so nothing is leaked and a later
 * verthash_info_free() remains safe.
 */
int verthash_info_init(verthash_info_t* info, const char* file_name)
{
    FILE *fileMiningData = NULL;
    size_t fileNameLen = 0;
    long fileSize = 0;
    int rc = -1;

    // init fields to 0
    info->fileName = NULL;
    info->data = NULL;
    info->dataSize = 0;
    info->bitmask = 0;

    if ( !file_name || !( fileNameLen = strlen( file_name ) ) )
    {
        applog( LOG_ERR, "Invalid file specification" );
        return -1;
    }

    info->fileName = malloc( fileNameLen + 1 );
    if ( !info->fileName )
    {
        applog( LOG_ERR, "Failed to allocate memory for Verthash data" );
        return -1;
    }

    // Copies the terminating NUL as well.
    memcpy( info->fileName, file_name, fileNameLen + 1 );

    fileMiningData = fopen_utf8( info->fileName, "rb" );
    if ( !fileMiningData )
    {
        if ( opt_data_file || !opt_verify )
        {
            if ( opt_data_file )
                applog( LOG_ERR, "Verthash data file not found or invalid: %s",
                        info->fileName );
            else
            {
                applog( LOG_ERR,
                    "No Verthash data file specified and default not found");
                applog( LOG_NOTICE,
                    "Add '--verify' to create default 'verthash.dat'");
            }
            goto fail;
        }

        applog( LOG_NOTICE, "Creating default 'verthash.dat' in current directory, this will take several minutes");
        if ( verthash_generate_data_file( info->fileName ) )
            goto fail;

        fileMiningData = fopen_utf8( info->fileName, "rb" );
        if ( !fileMiningData )
        {
            applog( LOG_ERR, "File system error opening %s", info->fileName );
            goto fail;
        }

        applog( LOG_NOTICE, "Verthash data file created successfully" );
    }

    // Get file size. ftell() returns long; keep the full width and treat a
    // failed seek the same way as a failed ftell().
    if ( fseek( fileMiningData, 0, SEEK_END ) != 0
      || ( fileSize = ftell( fileMiningData ) ) < 0
      || fseek( fileMiningData, 0, SEEK_SET ) != 0 )
    {
        rc = 1;
        goto fail;
    }

    // The bitmask computation below subtracts VH_HASH_OUT_SIZE from the size,
    // so a shorter file cannot be used.
    if ( fileSize < (long)VH_HASH_OUT_SIZE )
    {
        applog( LOG_ERR, "Verthash data file is too small: %s",
                info->fileName );
        goto fail;
    }

    // Allocate data
    info->data = _mm_malloc( (size_t)fileSize, 64 );
    if ( !info->data )
    {
        // Memory allocation fatal error.
        rc = 2;
        goto fail;
    }

    // Load data: a single item of fileSize bytes, so anything but 1 is a
    // short read.
    if ( fread( info->data, (size_t)fileSize, 1, fileMiningData ) != 1 )
    {
        applog( LOG_ERR, "File system error reading %s", info->fileName );
        goto fail;
    }

    fclose( fileMiningData );

    // Update fields
    info->bitmask = ((fileSize - VH_HASH_OUT_SIZE)/VH_BYTE_ALIGNMENT) + 1;
    info->dataSize = fileSize;

    applog( LOG_NOTICE, "Using Verthash data file '%s'", info->fileName );
    return 0;

fail:
    if ( fileMiningData )
        fclose( fileMiningData );
    if ( info->data )
        _mm_free( info->data );   // pairs with _mm_malloc() above
    info->data = NULL;
    free( info->fileName );
    info->fileName = NULL;
    return rc;
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
/*
 * Release everything owned by `info` and reset its fields.
 *
 * fileName comes from malloc() and is released with free(). data comes from
 * _mm_malloc() in verthash_info_init() and therefore has to be released with
 * _mm_free(); plain free() is only correct on platforms where the two
 * happen to share an implementation. Both pointers are cleared so a second
 * call is harmless.
 */
void verthash_info_free(verthash_info_t* info)
{
    free( info->fileName );
    info->fileName = NULL;

    if ( info->data )
        _mm_free( info->data );
    info->data = NULL;

    info->dataSize = 0;
    info->bitmask = 0;
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Verthash hash
|
||||
#define VH_P0_SIZE 64
#define VH_N_ITER 8
// Parenthesized so the product stays one operand when the macro is used
// inside a larger expression (e.g. as a divisor).
#define VH_N_SUBSET ( VH_P0_SIZE * VH_N_ITER )
#define VH_N_ROT 32
#define VH_N_INDEXES 4096
#define VH_BYTE_ALIGNMENT 16
|
||||
|
||||
// One FNV-1a style mixing step on 32-bit words: XOR the operands, then
// multiply by the 32-bit FNV prime 0x01000193. The product wraps modulo 2^32.
static inline uint32_t fnv1a(const uint32_t a, const uint32_t b)
{
    const uint32_t mixed = a ^ b;

    return mixed * 0x1000193u;
}
|
||||
|
||||
// Disabled: rotates every 32-bit word of the VH_N_SUBSET-byte buffer at p
// left by one bit, eight vectors per loop iteration. Compiled out with #if 0.
#if 0
static void rotate_indexes( uint32_t *p )
{
#if defined(__AVX2__)

    for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m256i); x += 8 )
    {
        __m256i *px = (__m256i*)p + x;

        px[0] = mm256_rol_32( px[0], 1 );
        px[1] = mm256_rol_32( px[1], 1 );
        px[2] = mm256_rol_32( px[2], 1 );
        px[3] = mm256_rol_32( px[3], 1 );
        px[4] = mm256_rol_32( px[4], 1 );
        px[5] = mm256_rol_32( px[5], 1 );
        px[6] = mm256_rol_32( px[6], 1 );
        px[7] = mm256_rol_32( px[7], 1 );
    }

#else

    for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m128i); x += 8 )
    {
        // Index from the parameter p; the previous name p0_index is not
        // declared anywhere in this function.
        __m128i *px = (__m128i*)p + x;

        px[0] = mm128_rol_32( px[0], 1 );
        px[1] = mm128_rol_32( px[1], 1 );
        px[2] = mm128_rol_32( px[2], 1 );
        px[3] = mm128_rol_32( px[3], 1 );
        px[4] = mm128_rol_32( px[4], 1 );
        px[5] = mm128_rol_32( px[5], 1 );
        px[6] = mm128_rol_32( px[6], 1 );
        px[7] = mm128_rol_32( px[7], 1 );
    }

#endif
/*
    for ( size_t x = 0; x < VH_N_SUBSET / sizeof(uint32_t); ++x )
        p[x] = ( p[x] << 1 ) | ( p[x] >> 31 );
*/
}
#endif
|
||||
|
||||
// Rotate the 32-bit value a left by r bits. The count is reduced modulo 32
// and the right-shift amount is masked as well, so r == 0 (or any multiple
// of 32) returns a unchanged instead of shifting by the full type width,
// which is undefined behaviour in C. Results for r in 1..31 are unchanged.
static inline uint32_t rotl32( uint32_t a, size_t r )
{
    r &= 31;
    return ( a << r ) | ( a >> ( ( 32 - r ) & 31 ) );
}
|
||||
|
||||
// Vectorized and targeted version of fnv1a.
// MULXOR updates `hash` in place with ( hash[j] ^ blob_off[j] ) * 0x1000193
// per 32-bit word. It expands at the point of use and relies on the locals
// `hash` and `blob_off` (plus `k` in the AVX2 branch) being in scope there.
// The vector branches process eight words; the scalar branch processes
// VH_HASH_OUT_SIZE / sizeof(uint32_t) words.
#if defined (__AVX2__)

// blob_off is formed in steps of VH_BYTE_ALIGNMENT (16) bytes by the ROUND
// macros, which is less than the 32 bytes an aligned __m256i access needs,
// so it is read with an explicit unaligned load.
#define MULXOR \
   *(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
                *(__m256i*)hash, \
                _mm256_loadu_si256( (const __m256i*)blob_off ) ), k );

// GCC and Clang announce SSE4.1 as __SSE4_1__; the historical spelling
// __SSE41__ is still accepted. The multiplier is built inline so this branch
// does not depend on a `k` declared under a differently spelled guard.
#elif defined(__SSE4_1__) || defined(__SSE41__)

#define MULXOR \
   casti_m128i( hash, 0 ) = _mm_mullo_epi32( _mm_xor_si128( \
              casti_m128i( hash, 0 ), casti_m128i( blob_off, 0 ) ), \
              _mm_set1_epi32( 0x1000193 ) ); \
   casti_m128i( hash, 1 ) = _mm_mullo_epi32( _mm_xor_si128( \
              casti_m128i( hash, 1 ), casti_m128i( blob_off, 1 ) ), \
              _mm_set1_epi32( 0x1000193 ) );

#else

#define MULXOR \
   for ( size_t j = 0; j < VH_HASH_OUT_SIZE / sizeof(uint32_t); j++ ) \
      hash[j] = fnv1a( hash[j], blob_off[j] );

#endif
|
||||
|
||||
// Folds the eight 32-bit words blob_off[0..7] into `accumulator`, in index
// order, one fnv1a step per word. Expands at the point of use and relies on
// the locals `accumulator` and `blob_off`; the caller supplies the final ';'.
#define UPDATE_ACCUMULATOR \
   accumulator = fnv1a( accumulator, blob_off[0] ); \
   accumulator = fnv1a( accumulator, blob_off[1] ); \
   accumulator = fnv1a( accumulator, blob_off[2] ); \
   accumulator = fnv1a( accumulator, blob_off[3] ); \
   accumulator = fnv1a( accumulator, blob_off[4] ); \
   accumulator = fnv1a( accumulator, blob_off[5] ); \
   accumulator = fnv1a( accumulator, blob_off[6] ); \
   accumulator = fnv1a( accumulator, blob_off[7] )
|
||||
|
||||
|
||||
// First pass: every subset word is used as-is, without rotation.
// For each word a blob position is selected from
// fnv1a( subset word, accumulator ) modulo mdiv, then the accumulator and
// the hash are updated from the data found at that position.
#define ROUND_0 \
   for ( size_t n = 0; n < VH_N_SUBSET / sizeof(uint32_t); n++ ) \
   { \
      const size_t slot = fnv1a( subset[n], accumulator ) % mdiv; \
      const uint32_t *blob_off = \
              blob + slot * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ); \
      UPDATE_ACCUMULATOR; \
      MULXOR; \
   }
|
||||
|
||||
// Later passes: each subset word is rotated left by rot bits at the point
// of use, so the subset array itself never has to be rewritten.
// Otherwise identical to ROUND_0.
#define ROUND_r( rot ) \
   for ( size_t n = 0; n < VH_N_SUBSET / sizeof(uint32_t); n++ ) \
   { \
      const size_t slot = \
              fnv1a( rotl32( subset[n], rot ), accumulator ) % mdiv; \
      const uint32_t *blob_off = \
              blob + slot * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) ); \
      UPDATE_ACCUMULATOR; \
      MULXOR; \
   }
|
||||
|
||||
void verthash_hash( const void *blob_bytes, const size_t blob_size,
|
||||
const void *input, void *output )
|
||||
{
|
||||
uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (64)));
|
||||
uint32_t subset[ VH_N_SUBSET / 4 ] __attribute__ ((aligned (64)));
|
||||
const uint32_t *blob = (const uint32_t*)blob_bytes;
|
||||
uint32_t accumulator = 0x811c9dc5;
|
||||
const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE )
|
||||
/ VH_BYTE_ALIGNMENT ) + 1;
|
||||
#if defined (__AVX2__)
|
||||
const __m256i k = _mm256_set1_epi32( 0x1000193 );
|
||||
#elif defined(__SSE41__)
|
||||
const __m128i k = _mm_set1_epi32( 0x1000193 );
|
||||
#endif
|
||||
|
||||
sha3( input, VH_HEADER_SIZE, hash, VH_HASH_OUT_SIZE );
|
||||
verthash_sha3_512_final_8( subset, ( (uint64_t*)input )[ 9 ] );
|
||||
|
||||
ROUND_0;
|
||||
for ( size_t r = 1; r < VH_N_ROT; ++r )
|
||||
ROUND_r( r );
|
||||
|
||||
memcpy( output, hash, VH_HASH_OUT_SIZE );
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Verthash data file generator
|
||||
|
||||
// Size in bytes of one graph node as stored in the data file.
#define NODE_SIZE 32

// State shared by the data file generator routines.
struct Graph
{
    FILE    *db;     // data file the nodes are written to and read from
    int64_t  log2;   // Log2( numXi( index ) ) + 1, set by NewGraph
    int64_t  pow2;   // 1 << log2; first node id, and the bit bfsToPost clears
    uint8_t *pk;     // NODE_SIZE bytes placed in front of every hash input
    int64_t  index;  // graph index handed to NewGraph
};
|
||||
|
||||
// Integer base-2 logarithm rounded down: counts how many times x can be
// halved while it is still greater than 1. Returns 0 for any x <= 1.
int64_t Log2(int64_t x)
{
    int64_t halvings = 0;

    while (x > 1)
    {
        x >>= 1;
        halvings++;
    }
    return halvings;
}
|
||||
|
||||
int64_t bfsToPost(struct Graph *g, const int64_t node)
|
||||
{
|
||||
return node & ~g->pow2;
|
||||
}
|
||||
|
||||
// Number of nodes for a graph of the given index:
// 2^index * ( index + 1 ) * index.
// The power of two is formed in 64 bits; shifting a plain int constant
// would overflow for index >= 31.
int64_t numXi(int64_t index)
{
    const int64_t pow2index = (int64_t)1 << index;

    return pow2index * (index + 1) * index;
}
|
||||
|
||||
void WriteId(struct Graph *g, uint8_t *Node, const int64_t id)
|
||||
{
|
||||
fseek(g->db, id * NODE_SIZE, SEEK_SET);
|
||||
fwrite(Node, 1, NODE_SIZE, g->db);
|
||||
}
|
||||
|
||||
// Store Node under node id: the id is mapped to a file slot with
// bfsToPost and the bytes are written with WriteId.
void WriteNode(struct Graph *g, uint8_t *Node, const int64_t id)
{
    WriteId(g, Node, bfsToPost(g, id));
}
|
||||
|
||||
// Record hash as the contents of node id. Same effect as WriteNode with
// the last two arguments swapped.
void NewNode(struct Graph *g, const int64_t id, uint8_t *hash)
{
    const int64_t slot = bfsToPost(g, id);

    WriteId(g, hash, slot);
}
|
||||
|
||||
uint8_t *GetId(struct Graph *g, const int64_t id)
|
||||
{
|
||||
fseek(g->db, id * NODE_SIZE, SEEK_SET);
|
||||
uint8_t *node = (uint8_t *)malloc(NODE_SIZE);
|
||||
const size_t bytes_read = fread(node, 1, NODE_SIZE, g->db);
|
||||
if(bytes_read != NODE_SIZE) {
|
||||
return NULL;
|
||||
}
|
||||
return node;
|
||||
}
|
||||
|
||||
// Fetch the contents of node id: the id is mapped to a file slot with
// bfsToPost and read with GetId. The result is whatever GetId returns for
// that slot.
uint8_t *GetNode(struct Graph *g, const int64_t id)
{
    return GetId(g, bfsToPost(g, id));
}
|
||||
|
||||
uint32_t WriteVarInt(uint8_t *buffer, int64_t val)
|
||||
{
|
||||
memset(buffer, 0, NODE_SIZE);
|
||||
uint64_t uval = ((uint64_t)(val)) << 1;
|
||||
if (val < 0)
|
||||
{
|
||||
uval = ~uval;
|
||||
}
|
||||
uint32_t i = 0;
|
||||
while (uval >= 0x80)
|
||||
{
|
||||
buffer[i] = (uint8_t)uval | 0x80;
|
||||
uval >>= 7;
|
||||
i++;
|
||||
}
|
||||
buffer[i] = (uint8_t)uval;
|
||||
return i;
|
||||
}
|
||||
|
||||
void ButterflyGraph(struct Graph *g, int64_t index, int64_t *count)
|
||||
{
|
||||
if (index == 0)
|
||||
{
|
||||
index = 1;
|
||||
}
|
||||
|
||||
int64_t numLevel = 2 * index;
|
||||
int64_t perLevel = (int64_t)(1 << (uint64_t)index);
|
||||
int64_t begin = *count - perLevel;
|
||||
int64_t level, i;
|
||||
|
||||
for (level = 1; level < numLevel; level++)
|
||||
{
|
||||
for (i = 0; i < perLevel; i++)
|
||||
{
|
||||
int64_t prev;
|
||||
int64_t shift = index - level;
|
||||
if (level > numLevel / 2)
|
||||
{
|
||||
shift = level - numLevel / 2;
|
||||
}
|
||||
if (((i >> (uint64_t)shift) & 1) == 0)
|
||||
{
|
||||
prev = i + (1 << (uint64_t)shift);
|
||||
}
|
||||
else
|
||||
{
|
||||
prev = i - (1 << (uint64_t)shift);
|
||||
}
|
||||
|
||||
uint8_t *parent0 = GetNode(g, begin + (level - 1) * perLevel + prev);
|
||||
uint8_t *parent1 = GetNode(g, *count - perLevel);
|
||||
uint8_t *buf = (uint8_t *)malloc(NODE_SIZE);
|
||||
WriteVarInt(buf, *count);
|
||||
uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 4);
|
||||
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||
memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE);
|
||||
memcpy(hashInput + (NODE_SIZE * 3), parent1, NODE_SIZE);
|
||||
|
||||
uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE);
|
||||
sha3(hashInput, NODE_SIZE * 4, hashOutput, NODE_SIZE);
|
||||
|
||||
NewNode(g, *count, hashOutput);
|
||||
(*count)++;
|
||||
|
||||
free(hashOutput);
|
||||
free(hashInput);
|
||||
free(parent0);
|
||||
free(parent1);
|
||||
free(buf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void XiGraphIter(struct Graph *g, int64_t index)
|
||||
{
|
||||
int64_t count = g->pow2;
|
||||
|
||||
int8_t stackSize = 5;
|
||||
int64_t *stack = (int64_t *)malloc(sizeof(int64_t) * stackSize);
|
||||
for (int i = 0; i < 5; i++)
|
||||
stack[i] = index;
|
||||
|
||||
int8_t graphStackSize = 5;
|
||||
int32_t *graphStack = (int32_t *)malloc(sizeof(int32_t) * graphStackSize);
|
||||
for (int i = 0; i < 5; i++)
|
||||
graphStack[i] = graphStackSize - i - 1;
|
||||
|
||||
int64_t i = 0;
|
||||
int64_t graph = 0;
|
||||
int64_t pow2index = 1 << ((uint64_t)index);
|
||||
|
||||
for (i = 0; i < pow2index; i++)
|
||||
{
|
||||
uint8_t *buf = (uint8_t *)malloc(NODE_SIZE);
|
||||
WriteVarInt(buf, count);
|
||||
uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 2);
|
||||
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||
uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE);
|
||||
|
||||
sha3(hashInput, NODE_SIZE * 2, hashOutput, NODE_SIZE);
|
||||
NewNode(g, count, hashOutput);
|
||||
count++;
|
||||
|
||||
free(hashOutput);
|
||||
free(hashInput);
|
||||
free(buf);
|
||||
}
|
||||
|
||||
if (index == 1)
|
||||
{
|
||||
ButterflyGraph(g, index, &count);
|
||||
return;
|
||||
}
|
||||
|
||||
while (stackSize != 0 && graphStackSize != 0)
|
||||
{
|
||||
|
||||
index = stack[stackSize - 1];
|
||||
graph = graphStack[graphStackSize - 1];
|
||||
|
||||
stackSize--;
|
||||
if (stackSize > 0)
|
||||
{
|
||||
int64_t *tempStack = (int64_t *)malloc(sizeof(int64_t) * (stackSize));
|
||||
memcpy(tempStack, stack, sizeof(int64_t) * (stackSize));
|
||||
free(stack);
|
||||
stack = tempStack;
|
||||
}
|
||||
|
||||
graphStackSize--;
|
||||
if (graphStackSize > 0)
|
||||
{
|
||||
int32_t *tempGraphStack = (int32_t *)malloc(sizeof(int32_t) * (graphStackSize));
|
||||
memcpy(tempGraphStack, graphStack, sizeof(int32_t) * (graphStackSize));
|
||||
free(graphStack);
|
||||
graphStack = tempGraphStack;
|
||||
}
|
||||
|
||||
int8_t indicesSize = 5;
|
||||
int64_t *indices = (int64_t *)malloc(sizeof(int64_t) * indicesSize);
|
||||
for (int i = 0; i < indicesSize; i++)
|
||||
indices[i] = index - 1;
|
||||
|
||||
int8_t graphsSize = 5;
|
||||
int32_t *graphs = (int32_t *)malloc(sizeof(int32_t) * graphsSize);
|
||||
for (int i = 0; i < graphsSize; i++)
|
||||
graphs[i] = graphsSize - i - 1;
|
||||
|
||||
int64_t pow2indexInner = 1 << ((uint64_t)index);
|
||||
int64_t pow2indexInner_1 = 1 << ((uint64_t)index - 1);
|
||||
|
||||
if (graph == 0)
|
||||
{
|
||||
uint64_t sources = count - pow2indexInner;
|
||||
for (i = 0; i < pow2indexInner_1; i++)
|
||||
{
|
||||
uint8_t *parent0 = GetNode(g, sources + i);
|
||||
uint8_t *parent1 = GetNode(g, sources + i + pow2indexInner_1);
|
||||
|
||||
uint8_t *buf = (uint8_t *)malloc(NODE_SIZE);
|
||||
WriteVarInt(buf, count);
|
||||
|
||||
uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 4);
|
||||
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||
memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE);
|
||||
memcpy(hashInput + (NODE_SIZE * 3), parent1, NODE_SIZE);
|
||||
|
||||
uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE);
|
||||
sha3(hashInput, NODE_SIZE * 4, hashOutput, NODE_SIZE);
|
||||
|
||||
NewNode(g, count, hashOutput);
|
||||
count++;
|
||||
|
||||
free(hashOutput);
|
||||
free(hashInput);
|
||||
free(parent0);
|
||||
free(parent1);
|
||||
free(buf);
|
||||
}
|
||||
}
|
||||
else if (graph == 1)
|
||||
{
|
||||
uint64_t firstXi = count;
|
||||
for (i = 0; i < pow2indexInner_1; i++)
|
||||
{
|
||||
uint64_t nodeId = firstXi + i;
|
||||
uint8_t *parent = GetNode(g, firstXi - pow2indexInner_1 + i);
|
||||
|
||||
uint8_t *buf = (uint8_t *)malloc(NODE_SIZE);
|
||||
WriteVarInt(buf, nodeId);
|
||||
|
||||
uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 3);
|
||||
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||
memcpy(hashInput + (NODE_SIZE * 2), parent, NODE_SIZE);
|
||||
|
||||
uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE);
|
||||
sha3(hashInput, NODE_SIZE * 3, hashOutput, NODE_SIZE);
|
||||
|
||||
NewNode(g, count, hashOutput);
|
||||
count++;
|
||||
|
||||
free(hashOutput);
|
||||
free(hashInput);
|
||||
free(parent);
|
||||
free(buf);
|
||||
}
|
||||
}
|
||||
else if (graph == 2)
|
||||
{
|
||||
uint64_t secondXi = count;
|
||||
for (i = 0; i < pow2indexInner_1; i++)
|
||||
{
|
||||
uint64_t nodeId = secondXi + i;
|
||||
uint8_t *parent = GetNode(g, secondXi - pow2indexInner_1 + i);
|
||||
|
||||
uint8_t *buf = (uint8_t *)malloc(NODE_SIZE);
|
||||
WriteVarInt(buf, nodeId);
|
||||
|
||||
uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 3);
|
||||
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||
memcpy(hashInput + (NODE_SIZE * 2), parent, NODE_SIZE);
|
||||
|
||||
uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE);
|
||||
sha3(hashInput, NODE_SIZE * 3, hashOutput, NODE_SIZE);
|
||||
|
||||
NewNode(g, count, hashOutput);
|
||||
count++;
|
||||
|
||||
free(hashOutput);
|
||||
free(hashInput);
|
||||
free(parent);
|
||||
free(buf);
|
||||
}
|
||||
}
|
||||
else if (graph == 3)
|
||||
{
|
||||
uint64_t secondButter = count;
|
||||
for (i = 0; i < pow2indexInner_1; i++)
|
||||
{
|
||||
uint64_t nodeId = secondButter + i;
|
||||
uint8_t *parent = GetNode(g, secondButter - pow2indexInner_1 + i);
|
||||
|
||||
uint8_t *buf = (uint8_t *)malloc(NODE_SIZE);
|
||||
WriteVarInt(buf, nodeId);
|
||||
|
||||
uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 3);
|
||||
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||
memcpy(hashInput + (NODE_SIZE * 2), parent, NODE_SIZE);
|
||||
|
||||
uint8_t *hashOutput = (uint8_t *)malloc(NODE_SIZE);
|
||||
sha3(hashInput, NODE_SIZE * 3, hashOutput, NODE_SIZE);
|
||||
|
||||
NewNode(g, count, hashOutput);
|
||||
count++;
|
||||
|
||||
free(hashOutput);
|
||||
free(hashInput);
|
||||
free(parent);
|
||||
free(buf);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
uint64_t sinks = count;
|
||||
uint64_t sources = sinks + pow2indexInner - numXi(index);
|
||||
for (i = 0; i < pow2indexInner_1; i++)
|
||||
{
|
||||
uint64_t nodeId0 = sinks + i;
|
||||
uint64_t nodeId1 = sinks + i + pow2indexInner_1;
|
||||
uint8_t *parent0 = GetNode(g, sinks - pow2indexInner_1 + i);
|
||||
uint8_t *parent1_0 = GetNode(g, sources + i);
|
||||
uint8_t *parent1_1 = GetNode(g, sources + i + pow2indexInner_1);
|
||||
|
||||
uint8_t *buf = (uint8_t *)malloc(NODE_SIZE);
|
||||
WriteVarInt(buf, nodeId0);
|
||||
|
||||
uint8_t *hashInput = (uint8_t *)malloc(NODE_SIZE * 4);
|
||||
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||
memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE);
|
||||
memcpy(hashInput + (NODE_SIZE * 3), parent1_0, NODE_SIZE);
|
||||
|
||||
uint8_t *hashOutput0 = (uint8_t *)malloc(NODE_SIZE);
|
||||
sha3(hashInput, NODE_SIZE * 4, hashOutput0, NODE_SIZE);
|
||||
|
||||
WriteVarInt(buf, nodeId1);
|
||||
|
||||
memcpy(hashInput, g->pk, NODE_SIZE);
|
||||
memcpy(hashInput + NODE_SIZE, buf, NODE_SIZE);
|
||||
memcpy(hashInput + (NODE_SIZE * 2), parent0, NODE_SIZE);
|
||||
memcpy(hashInput + (NODE_SIZE * 3), parent1_1, NODE_SIZE);
|
||||
|
||||
uint8_t *hashOutput1 = (uint8_t *)malloc(NODE_SIZE);
|
||||
sha3(hashInput, NODE_SIZE * 4, hashOutput1, NODE_SIZE);
|
||||
|
||||
NewNode(g, nodeId0, hashOutput0);
|
||||
NewNode(g, nodeId1, hashOutput1);
|
||||
count += 2;
|
||||
|
||||
free(parent0);
|
||||
free(parent1_0);
|
||||
free(parent1_1);
|
||||
free(buf);
|
||||
free(hashInput);
|
||||
free(hashOutput0);
|
||||
free(hashOutput1);
|
||||
}
|
||||
}
|
||||
|
||||
if ((graph == 0 || graph == 3) ||
|
||||
((graph == 1 || graph == 2) && index == 2))
|
||||
{
|
||||
ButterflyGraph(g, index - 1, &count);
|
||||
}
|
||||
else if (graph == 1 || graph == 2)
|
||||
{
|
||||
|
||||
int64_t *tempStack = (int64_t *)malloc(sizeof(int64_t) * (stackSize + indicesSize));
|
||||
memcpy(tempStack, stack, stackSize * sizeof(int64_t));
|
||||
memcpy(tempStack + stackSize, indices, indicesSize * sizeof(int64_t));
|
||||
stackSize += indicesSize;
|
||||
free(stack);
|
||||
stack = tempStack;
|
||||
|
||||
int32_t *tempGraphStack = (int32_t *)malloc(sizeof(int32_t) * (graphStackSize + graphsSize));
|
||||
memcpy(tempGraphStack, graphStack, graphStackSize * sizeof(int32_t));
|
||||
memcpy(tempGraphStack + graphStackSize, graphs, graphsSize * sizeof(int32_t));
|
||||
graphStackSize += graphsSize;
|
||||
free(graphStack);
|
||||
graphStack = tempGraphStack;
|
||||
}
|
||||
|
||||
free(indices);
|
||||
free(graphs);
|
||||
}
|
||||
|
||||
free(stack);
|
||||
free(graphStack);
|
||||
}
|
||||
|
||||
struct Graph *NewGraph(int64_t index, const char* targetFile, uint8_t *pk)
|
||||
{
|
||||
uint8_t exists = 0;
|
||||
FILE *db;
|
||||
if ((db = fopen_utf8(targetFile, "r")) != NULL)
|
||||
{
|
||||
fclose(db);
|
||||
exists = 1;
|
||||
}
|
||||
|
||||
db = fopen_utf8(targetFile, "wb+");
|
||||
int64_t size = numXi(index);
|
||||
int64_t log2 = Log2(size) + 1;
|
||||
int64_t pow2 = 1 << ((uint64_t)log2);
|
||||
|
||||
struct Graph *g = (struct Graph *)malloc(sizeof(struct Graph));
|
||||
|
||||
if ( !g ) return NULL;
|
||||
|
||||
g->db = db;
|
||||
g->log2 = log2;
|
||||
g->pow2 = pow2;
|
||||
g->pk = pk;
|
||||
g->index = index;
|
||||
|
||||
if (exists == 0)
|
||||
{
|
||||
XiGraphIter(g, index);
|
||||
}
|
||||
|
||||
fclose(db);
|
||||
return g;
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// use info for _mm_malloc, then verify file
|
||||
int verthash_generate_data_file(const char* output_file_name)
|
||||
{
|
||||
const char *hashInput = "Verthash Proof-of-Space Datafile";
|
||||
uint8_t *pk = (uint8_t*)malloc( NODE_SIZE );
|
||||
|
||||
if ( !pk )
|
||||
{
|
||||
applog( LOG_ERR, "Verthash data memory allocation failed");
|
||||
return -1;
|
||||
}
|
||||
|
||||
sha3( hashInput, 32, pk, NODE_SIZE );
|
||||
|
||||
int64_t index = 17;
|
||||
if ( !NewGraph( index, output_file_name, pk ) )
|
||||
{
|
||||
applog( LOG_ERR, "Verthash file creation failed");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
57
algo/verthash/Verthash.h
Normal file
57
algo/verthash/Verthash.h
Normal file
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright 2018-2021 CryptoGraphics
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License as published by the Free
|
||||
* Software Foundation; either version 2 of the License, or (at your option)
|
||||
* any later version. See LICENSE for more details.
|
||||
*/
|
||||
|
||||
#ifndef Verthash_INCLUDE_ONCE
|
||||
#define Verthash_INCLUDE_ONCE
|
||||
|
||||
#include "tiny_sha3/sha3.h"
|
||||
#include "fopen_utf8.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
// Verthash constants used to compute bitmask, used inside kernel during IO pass
|
||||
#define VH_HASH_OUT_SIZE 32
|
||||
#define VH_BYTE_ALIGNMENT 16
|
||||
#define VH_HEADER_SIZE 80
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Verthash data
|
||||
//! Verthash C API for data manipulation.
|
||||
typedef struct VerthashInfo
{
    char     *fileName;   // mining data file name, set by verthash_info_init
    uint8_t  *data;       // presumably the loaded data file - TODO confirm
    uint64_t  dataSize;   // presumably the size of data in bytes - TODO confirm
    uint32_t  bitmask;    // NOTE(review): how this is derived is not visible here
} verthash_info_t;
|
||||
|
||||
//! Must be called before usage. Reset all fields and set a mining data file name.
|
||||
//! Error codes
|
||||
//! 0 - Success(No error).
|
||||
//! 1 - File name is invalid.
|
||||
//! 2 - Memory allocation error
|
||||
int verthash_info_init(verthash_info_t* info, const char* file_name);
|
||||
|
||||
//! Reset all fields and free allocated data.
|
||||
void verthash_info_free(verthash_info_t* info);
|
||||
|
||||
//! Generate verthash data file and save it to specified location.
|
||||
int verthash_generate_data_file(const char* output_file_name);
|
||||
|
||||
void verthash_hash( const void *blob_bytes, const size_t blob_size,
|
||||
const void *input, void *output );
|
||||
|
||||
void verthash_sha3_512_prehash_72( const void *input );
|
||||
void verthash_sha3_512_final_8( void *hash, const uint64_t nonce );
|
||||
|
||||
#endif // !Verthash_INCLUDE_ONCE
|
||||
|
181
algo/verthash/fopen_utf8.c
Normal file
181
algo/verthash/fopen_utf8.c
Normal file
@@ -0,0 +1,181 @@
|
||||
#ifndef H_FOPEN_UTF8
|
||||
#define H_FOPEN_UTF8
|
||||
|
||||
#include "fopen_utf8.h"
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
// Classify the UTF-8 sequence that starts at c.
// Returns 1 to 4 for a lead byte followed by the matching number of
// continuation bytes (10xxxxxx), 0 when c[0] is itself a continuation
// byte, and -1 for a lead byte whose continuation bytes do not match or
// for a byte that fits no pattern. Bytes after c[0] are only examined
// while every earlier one was a valid continuation byte.
int utf8_char_size(const uint8_t *c)
{
    const uint8_t lead = c[0];
    int expected = 0;

    if ((lead & 0x80) == 0x00)
        return 1;

    if ((lead & 0xE0) == 0xC0)
        expected = 2;
    else if ((lead & 0xF0) == 0xE0)
        expected = 3;
    else if ((lead & 0xF8) == 0xF0)
        expected = 4;

    if (expected == 0)
        return ((lead & 0xC0) == 0x80) ? 0 : -1;

    for (int k = 1; k < expected; k++)
    {
        if ((c[k] & 0xC0) != 0x80)
            return -1;
    }
    return expected;
}
|
||||
|
||||
// Decode the UTF-8 sequence at c into a code point.
// Returns 0 when c is NULL and (uint32_t)-1 when utf8_char_size reports
// the sequence as a stray continuation byte or as corrupt. When index is
// non-NULL and the sequence is valid, *index is advanced by the number of
// bytes consumed minus one.
uint32_t utf8_to_unicode32(const uint8_t *c, size_t *index)
{
    if (c == NULL)
        return 0;

    const int size = utf8_char_size(c);

    if (size > 0 && index)
        *index += size - 1;

    if (size < 1 || size > 4)
        return (uint32_t)-1;

    if (size == 1)
        return c[0];

    // the lead byte carries 5, 4 or 3 payload bits for sizes 2, 3 and 4
    uint32_t v = c[0] & (0xFFu >> (size + 1));

    // every continuation byte contributes its low six bits
    for (int k = 1; k < size; k++)
        v = (v << 6) | (c[k] & 0x3Fu);

    return v;
}
|
||||
|
||||
// Number of 16-bit units used for code point c: 1 below 0x10000, 2 below
// 0x110000, and 0 for anything larger.
int codepoint_utf16_size(uint32_t c)
{
    if (c >= 0x110000)
        return 0;

    return (c >= 0x10000) ? 2 : 1;
}
|
||||
|
||||
// Write code point c at str as UTF-16 and return str (NULL if str is NULL).
// str must have room for up to three entries:
//   c <  0x10000   str[0] = c, followed by a terminator unless c is 0
//   c <  0x110000  surrogate pair in str[0..1], terminator in str[2]
//   otherwise      str[0] = 0
uint16_t *sprint_utf16(uint16_t *str, uint32_t c)
{
    if (str == NULL)
        return NULL;

    if (c < 0x10000)
    {
        str[0] = (uint16_t)c;
        if (c != 0)
            str[1] = 0;
    }
    else if (c < 0x110000)
    {
        const uint32_t offset = c - 0x10000;

        str[0] = (uint16_t)(0xD800 + (offset >> 10));
        str[1] = (uint16_t)(0xDC00 + (offset & 0x3FF));
        str[2] = 0;
    }
    else
    {
        str[0] = 0;
    }

    return str;
}
|
||||
|
||||
// Count the 16-bit units needed to hold the NUL-terminated UTF-8 string
// str as UTF-16, terminator not included. Bytes that do not decode to a
// code point below 0x110000 add nothing to the count.
size_t strlen_utf8_to_utf16(const uint8_t *str)
{
    size_t units = 0;
    size_t pos = 0;

    while (str[pos] != 0)
    {
        // the decoder moves pos to the last byte of the sequence
        const uint32_t cp = utf8_to_unicode32(&str[pos], &pos);

        units += codepoint_utf16_size(cp);
        pos++;
    }
    return units;
}
|
||||
|
||||
// Convert the NUL-terminated UTF-8 string utf8 to UTF-16.
// When utf16 is NULL a buffer of the required size is allocated with
// calloc and must be freed by the caller; otherwise the result is written
// to utf16, which must be large enough (no bounds are checked).
// Returns the output buffer, or NULL if utf8 is NULL or the allocation
// fails.
uint16_t *utf8_to_utf16(const uint8_t *utf8, uint16_t *utf16)
{
    size_t i, j;
    uint32_t c;

    if (utf8==NULL)
        return NULL;

    if (utf16==NULL)
    {
        utf16 = (uint16_t *) calloc(strlen_utf8_to_utf16(utf8) + 1, sizeof(uint16_t));
        if (utf16==NULL)
            return NULL;   // out of memory: do not write through NULL
    }

    // runs until the terminating NUL has been decoded (c == 0)
    for (i=0, j=0, c=1; c; i++)
    {
        c = utf8_to_unicode32(&utf8[i], &i);
        sprint_utf16(&utf16[j], c);
        j += codepoint_utf16_size(c);
    }

    return utf16;
}
|
||||
|
||||
// fopen() replacement that accepts a UTF-8 encoded path on every platform.
// On Windows the path and mode are converted to UTF-16 and passed to
// _wfopen; elsewhere they go to fopen unchanged.
// Returns NULL if either argument is NULL, if a conversion fails, if the
// mode does not fit the fixed conversion buffer (Windows), or if the
// underlying open fails.
FILE *fopen_utf8(const char *path, const char *mode)
{
    if (path==NULL || mode==NULL)
        return NULL;

#ifdef _WIN32
    wchar_t *wpath, wmode[8];
    FILE *file;

    // the converted mode plus its terminator must fit into wmode
    if (strlen_utf8_to_utf16((const uint8_t *) mode) >= sizeof wmode / sizeof wmode[0])
        return NULL;

    if (utf8_to_utf16((const uint8_t *) mode, (uint16_t *) wmode)==NULL)
        return NULL;

    wpath = (wchar_t *) utf8_to_utf16((const uint8_t *) path, NULL);
    if (wpath==NULL)
        return NULL;

    file = _wfopen(wpath, wmode);
    free(wpath);
    return file;
#else
    return fopen(path, mode);
#endif
}
|
||||
#endif
|
25
algo/verthash/fopen_utf8.h
Normal file
25
algo/verthash/fopen_utf8.h
Normal file
@@ -0,0 +1,25 @@
|
||||
#ifndef H_FOPEN_UTF8
|
||||
#define H_FOPEN_UTF8
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
int utf8_char_size(const uint8_t *c);
|
||||
uint32_t utf8_to_unicode32(const uint8_t *c, size_t *index);
|
||||
int codepoint_utf16_size(uint32_t c);
|
||||
uint16_t *sprint_utf16(uint16_t *str, uint32_t c);
|
||||
size_t strlen_utf8_to_utf16(const uint8_t *str);
|
||||
uint16_t *utf8_to_utf16(const uint8_t *utf8, uint16_t *utf16);
|
||||
|
||||
FILE *fopen_utf8(const char *path, const char *mode);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
301
algo/verthash/tiny_sha3/sha3-4way.c
Normal file
301
algo/verthash/tiny_sha3/sha3-4way.c
Normal file
@@ -0,0 +1,301 @@
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// sha3-4way.c
|
||||
// 19-Nov-11 Markku-Juhani O. Saarinen <mjos@iki.fi>
|
||||
// vectorization by JayDDee 2021-03-27
|
||||
//
|
||||
// Revised 07-Aug-15 to match with official release of FIPS PUB 202 "SHA3"
|
||||
// Revised 03-Sep-15 for portability + OpenSSL - style API
|
||||
|
||||
#include "sha3-4way.h"
|
||||
|
||||
// constants
|
||||
// Round constants, one per Keccak-f round; xored into lane 0 of the state
// in the Iota step.
static const uint64_t keccakf_rndc[24] =
{
    0x0000000000000001, 0x0000000000008082,
    0x800000000000808a, 0x8000000080008000,
    0x000000000000808b, 0x0000000080000001,
    0x8000000080008081, 0x8000000000008009,
    0x000000000000008a, 0x0000000000000088,
    0x0000000080008009, 0x000000008000000a,
    0x000000008000808b, 0x800000000000008b,
    0x8000000000008089, 0x8000000000008003,
    0x8000000000008002, 0x8000000000000080,
    0x000000000000800a, 0x800000008000000a,
    0x8000000080008081, 0x8000000000008080,
    0x0000000080000001, 0x8000000080008008
};
|
||||
|
||||
// Keccak-f permutation applied to the 25-word state st; every __m256i
// holds the same 64-bit state word for four independent lanes.
// Runs KECCAKF_ROUNDS rounds of Theta, Rho/Pi, Chi and Iota.
void sha3_4way_keccakf( __m256i st[25] )
{
   __m256i t, bc[5];

   for ( int round = 0; round < KECCAKF_ROUNDS; round++ )
   {
      // Theta: column parities ...
      for ( int x = 0; x < 5; x++ )
         bc[x] = _mm256_xor_si256( st[x],
                    mm256_xor4( st[ x+5 ], st[ x+10 ], st[ x+15 ], st[ x+20 ] ) );

      // ... combined and xored into every word of each column
      for ( int x = 0; x < 5; x++ )
      {
         t = _mm256_xor_si256( bc[ (x+4) % 5 ],
                               mm256_rol_64( bc[ (x+1) % 5 ], 1 ) );
         for ( int y = 0; y < 25; y += 5 )
            st[ y+x ] = _mm256_xor_si256( st[ y+x ], t );
      }

      // Rho Pi: the carried word t is rotated into position pos while the
      // previous content of that position becomes the next carried word.
#define RHO_PI( pos, rot ) \
      bc[0] = st[ pos ]; \
      st[ pos ] = mm256_rol_64( t, rot ); \
      t = bc[0]

      t = st[1];

      RHO_PI( 10,  1 );   RHO_PI(  7,  3 );   RHO_PI( 11,  6 );
      RHO_PI( 17, 10 );   RHO_PI( 18, 15 );   RHO_PI(  3, 21 );
      RHO_PI(  5, 28 );   RHO_PI( 16, 36 );   RHO_PI(  8, 45 );
      RHO_PI( 21, 55 );   RHO_PI( 24,  2 );   RHO_PI(  4, 14 );
      RHO_PI( 15, 27 );   RHO_PI( 23, 41 );   RHO_PI( 19, 56 );
      RHO_PI( 13,  8 );   RHO_PI( 12, 25 );   RHO_PI(  2, 43 );
      RHO_PI( 20, 62 );   RHO_PI( 14, 18 );   RHO_PI( 22, 39 );
      RHO_PI(  9, 61 );   RHO_PI(  6, 20 );   RHO_PI(  1, 44 );

#undef RHO_PI

      // Chi: each word of a row is xored with (~next & next-but-one),
      // computed from a snapshot of the row
      for ( int y = 0; y < 25; y += 5 )
      {
         for ( int x = 0; x < 5; x++ )
            bc[x] = st[ y+x ];
         for ( int x = 0; x < 5; x++ )
            st[ y+x ] = _mm256_xor_si256( st[ y+x ],
                    _mm256_andnot_si256( bc[ (x+1) % 5 ], bc[ (x+2) % 5 ] ) );
      }

      // Iota
      st[0] = _mm256_xor_si256( st[0],
                                _mm256_set1_epi64x( keccakf_rndc[ round ] ) );
   }
}
|
||||
|
||||
// Reset the context c for a digest of mdlen bytes: the state is zeroed,
// the rate is set to 200 - 2 * mdlen bytes and the absorb position to 0.
// Always returns 1.
int sha3_4way_init( sha3_4way_ctx_t *c, int mdlen )
{
   __m256i *word = c->st;

   for ( int left = 25; left > 0; left--, word++ )
      *word = m256_zero;

   c->pt    = 0;
   c->mdlen = mdlen;
   c->rsiz  = 200 - 2 * mdlen;
   return 1;
}
|
||||
|
||||
// Absorb len / 8 vectors from data into the state, running the
// permutation each time a full rate block has been xored in. data is read
// as an array of __m256i; any remainder of len below 8 is ignored.
// Always returns 1.
int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len )
{
   const __m256i *src = (const __m256i*)data;
   const int rate_words = c->rsiz / 8;
   const int words = len / 8;
   int pos = c->pt;

   for ( size_t n = 0; n < words; n++ )
   {
      c->st[ pos ] = _mm256_xor_si256( c->st[ pos ], src[ n ] );
      if ( ++pos >= rate_words )
      {
         sha3_4way_keccakf( c->st );
         pos = 0;
      }
   }

   c->pt = pos;
   return 1;
}
|
||||
|
||||
// Finish the hash: xor 6 into the word at the current absorb position and
// the top bit into the last word of the rate block, permute once, then
// copy mdlen * 4 bytes of state to md. Always returns 1.
int sha3_4way_final( void *md, sha3_4way_ctx_t *c )
{
   const int last = c->rsiz / 8 - 1;

   c->st[ c->pt ] = _mm256_xor_si256( c->st[ c->pt ], m256_const1_64( 6 ) );
   c->st[ last ]  = _mm256_xor_si256( c->st[ last ],
                                   m256_const1_64( 0x8000000000000000 ) );

   sha3_4way_keccakf( c->st );
   memcpy( md, c->st, c->mdlen * 4 );
   return 1;
}
|
||||
|
||||
void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen )
|
||||
{
|
||||
sha3_4way_ctx_t ctx;
|
||||
sha3_4way_init( &ctx, mdlen);
|
||||
sha3_4way_update( &ctx, in, inlen );
|
||||
sha3_4way_final( md, &ctx );
|
||||
return md;
|
||||
}
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Keccak-f permutation applied to the 25-word state st; every __m512i
// holds the same 64-bit state word for eight independent lanes.
// Runs KECCAKF_ROUNDS rounds of Theta, Rho/Pi, Chi and Iota.
void sha3_8way_keccakf( __m512i st[25] )
{
   __m512i t, bc[5];

   for ( int round = 0; round < KECCAKF_ROUNDS; round++ )
   {
      // Theta: column parities ...
      bc[0] = _mm512_xor_si512( st[0],
                           mm512_xor4( st[5], st[10], st[15], st[20] ) );
      bc[1] = _mm512_xor_si512( st[1],
                           mm512_xor4( st[6], st[11], st[16], st[21] ) );
      bc[2] = _mm512_xor_si512( st[2],
                           mm512_xor4( st[7], st[12], st[17], st[22] ) );
      bc[3] = _mm512_xor_si512( st[3],
                           mm512_xor4( st[8], st[13], st[18], st[23] ) );
      bc[4] = _mm512_xor_si512( st[4],
                           mm512_xor4( st[9], st[14], st[19], st[24] ) );

      // ... combined and xored into every word of each column
      for ( int x = 0; x < 5; x++ )
      {
         t = _mm512_xor_si512( bc[ (x+4) % 5 ],
                               _mm512_rol_epi64( bc[ (x+1) % 5 ], 1 ) );
         st[ x    ] = _mm512_xor_si512( st[ x    ], t );
         st[ x+5  ] = _mm512_xor_si512( st[ x+5  ], t );
         st[ x+10 ] = _mm512_xor_si512( st[ x+10 ], t );
         st[ x+15 ] = _mm512_xor_si512( st[ x+15 ], t );
         st[ x+20 ] = _mm512_xor_si512( st[ x+20 ], t );
      }

      // Rho Pi: the carried word t is rotated into position pos while the
      // previous content of that position becomes the next carried word.
#define RHO_PI( pos, rot ) \
      bc[0] = st[ pos ]; \
      st[ pos ] = _mm512_rol_epi64( t, rot ); \
      t = bc[0]

      t = st[1];

      RHO_PI( 10,  1 );   RHO_PI(  7,  3 );   RHO_PI( 11,  6 );
      RHO_PI( 17, 10 );   RHO_PI( 18, 15 );   RHO_PI(  3, 21 );
      RHO_PI(  5, 28 );   RHO_PI( 16, 36 );   RHO_PI(  8, 45 );
      RHO_PI( 21, 55 );   RHO_PI( 24,  2 );   RHO_PI(  4, 14 );
      RHO_PI( 15, 27 );   RHO_PI( 23, 41 );   RHO_PI( 19, 56 );
      RHO_PI( 13,  8 );   RHO_PI( 12, 25 );   RHO_PI(  2, 43 );
      RHO_PI( 20, 62 );   RHO_PI( 14, 18 );   RHO_PI( 22, 39 );
      RHO_PI(  9, 61 );   RHO_PI(  6, 20 );   RHO_PI(  1, 44 );

#undef RHO_PI

      // Chi: each word of a row is xored with (~next & next-but-one),
      // computed from a snapshot of the row
      for ( int y = 0; y < 25; y += 5 )
      {
         bc[0] = st[ y   ];
         bc[1] = st[ y+1 ];
         bc[2] = st[ y+2 ];
         bc[3] = st[ y+3 ];
         bc[4] = st[ y+4 ];

         st[ y   ] = _mm512_xor_si512( st[ y   ],
                                _mm512_andnot_si512( bc[1], bc[2] ) );
         st[ y+1 ] = _mm512_xor_si512( st[ y+1 ],
                                _mm512_andnot_si512( bc[2], bc[3] ) );
         st[ y+2 ] = _mm512_xor_si512( st[ y+2 ],
                                _mm512_andnot_si512( bc[3], bc[4] ) );
         st[ y+3 ] = _mm512_xor_si512( st[ y+3 ],
                                _mm512_andnot_si512( bc[4], bc[0] ) );
         st[ y+4 ] = _mm512_xor_si512( st[ y+4 ],
                                _mm512_andnot_si512( bc[0], bc[1] ) );
      }

      // Iota
      st[0] = _mm512_xor_si512( st[0],
                                _mm512_set1_epi64( keccakf_rndc[ round ] ) );
   }
}
|
||||
|
||||
// Initialize the context for SHA3
|
||||
|
||||
int sha3_8way_init( sha3_8way_ctx_t *c, int mdlen )
|
||||
{
|
||||
for ( int i = 0; i < 25; i++ ) c->st[ i ] = m512_zero;
|
||||
c->mdlen = mdlen;
|
||||
c->rsiz = 200 - 2 * mdlen;
|
||||
c->pt = 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// update state with more data
|
||||
|
||||
int sha3_8way_update( sha3_8way_ctx_t *c, const void *data, size_t len )
|
||||
{
|
||||
size_t i;
|
||||
int j = c->pt;
|
||||
const int rsiz = c->rsiz / 8;
|
||||
const int l = len / 8;
|
||||
|
||||
for ( i = 0; i < l; i++ )
|
||||
{
|
||||
c->st[ j ] = _mm512_xor_si512( c->st[ j ],
|
||||
( (const __m512i*)data )[i] );
|
||||
j++;
|
||||
if ( j >= rsiz )
|
||||
{
|
||||
sha3_8way_keccakf( c->st );
|
||||
j = 0;
|
||||
}
|
||||
}
|
||||
c->pt = j;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
// finalize and output a hash
|
||||
|
||||
int sha3_8way_final( void *md, sha3_8way_ctx_t *c )
|
||||
{
|
||||
c->st[ c->pt ] =
|
||||
_mm512_xor_si512( c->st[ c->pt ],
|
||||
m512_const1_64( 6 ) );
|
||||
c->st[ c->rsiz / 8 - 1 ] =
|
||||
_mm512_xor_si512( c->st[ c->rsiz / 8 - 1 ],
|
||||
m512_const1_64( 0x8000000000000000 ) );
|
||||
sha3_8way_keccakf( c->st );
|
||||
memcpy( md, c->st, c->mdlen * 8 );
|
||||
return 1;
|
||||
}
|
||||
|
||||
// compute a SHA-3 hash (md) of given byte length from "in"
|
||||
|
||||
void *sha3_8way( const void *in, size_t inlen, void *md, int mdlen )
|
||||
{
|
||||
sha3_8way_ctx_t sha3;
|
||||
sha3_8way_init( &sha3, mdlen);
|
||||
sha3_8way_update( &sha3, in, inlen );
|
||||
sha3_8way_final( md, &sha3 );
|
||||
return md;
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
#endif // AVX2
|
67
algo/verthash/tiny_sha3/sha3-4way.h
Normal file
67
algo/verthash/tiny_sha3/sha3-4way.h
Normal file
@@ -0,0 +1,67 @@
|
||||
// sha3.h
|
||||
// 19-Nov-11 Markku-Juhani O. Saarinen <mjos@iki.fi>
|
||||
// 2021-03-27 JayDDee
|
||||
//
|
||||
#ifndef SHA3_4WAY_H
|
||||
#define SHA3_4WAY_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifndef KECCAKF_ROUNDS
|
||||
#define KECCAKF_ROUNDS 24
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m256i st[25]; // 64-bit words * 4 lanes
|
||||
int pt, rsiz, mdlen; // these don't overflow
|
||||
} sha3_4way_ctx_t __attribute__ ((aligned (64)));;
|
||||
|
||||
// Compression function.
|
||||
void sha3_4way_keccakf( __m256i st[25] );
|
||||
|
||||
// OpenSSL-like interface
|
||||
int sha3_4way_init( sha3_4way_ctx_t *c, int mdlen ); // mdlen = hash output in bytes
|
||||
int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len );
|
||||
int sha3_4way_final( void *md, sha3_4way_ctx_t *c ); // digest goes to md
|
||||
|
||||
// compute a sha3 hash (md) of given byte length from "in"
|
||||
void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen );
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// state context
|
||||
typedef struct
|
||||
{
|
||||
__m512i st[25]; // 64-bit words * 8 lanes
|
||||
int pt, rsiz, mdlen; // these don't overflow
|
||||
} sha3_8way_ctx_t __attribute__ ((aligned (64)));;
|
||||
|
||||
// Compression function.
|
||||
void sha3_8way_keccakf( __m512i st[25] );
|
||||
|
||||
// OpenSSL-like interface
|
||||
int sha3_8way_init( sha3_8way_ctx_t *c, int mdlen ); // mdlen = hash output in bytes
|
||||
int sha3_8way_update( sha3_8way_ctx_t *c, const void *data, size_t len );
|
||||
int sha3_8way_final( void *md, sha3_8way_ctx_t *c ); // digest goes to md
|
||||
|
||||
// compute a sha3 hash (md) of given byte length from "in"
|
||||
void *sha3_8way( const void *in, size_t inlen, void *md, int mdlen );
|
||||
|
||||
#endif // AVX512
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
226
algo/verthash/tiny_sha3/sha3.c
Normal file
226
algo/verthash/tiny_sha3/sha3.c
Normal file
@@ -0,0 +1,226 @@
|
||||
// sha3.c
|
||||
// 19-Nov-11 Markku-Juhani O. Saarinen <mjos@iki.fi>
|
||||
|
||||
// Revised 07-Aug-15 to match with official release of FIPS PUB 202 "SHA3"
|
||||
// Revised 03-Sep-15 for portability + OpenSSL - style API
|
||||
|
||||
#include "sha3.h"
|
||||
#include <string.h>
|
||||
|
||||
// update the state with given number of rounds
|
||||
|
||||
void sha3_keccakf(uint64_t st[25])
|
||||
{
|
||||
// constants
|
||||
const uint64_t keccakf_rndc[24] = {
|
||||
0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
|
||||
0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
|
||||
0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
|
||||
0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
|
||||
0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
|
||||
0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
|
||||
0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
|
||||
0x8000000000008080, 0x0000000080000001, 0x8000000080008008
|
||||
};
|
||||
/*
|
||||
const int keccakf_rotc[24] = {
|
||||
1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
|
||||
27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
|
||||
};
|
||||
const int keccakf_piln[24] = {
|
||||
10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
|
||||
15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
|
||||
};
|
||||
*/
|
||||
|
||||
// variables
|
||||
int i, j, r;
|
||||
uint64_t t, bc[5];
|
||||
|
||||
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
|
||||
uint8_t *v;
|
||||
|
||||
// endianess conversion. this is redundant on little-endian targets
|
||||
for (i = 0; i < 25; i++) {
|
||||
v = (uint8_t *) &st[i];
|
||||
st[i] = ((uint64_t) v[0]) | (((uint64_t) v[1]) << 8) |
|
||||
(((uint64_t) v[2]) << 16) | (((uint64_t) v[3]) << 24) |
|
||||
(((uint64_t) v[4]) << 32) | (((uint64_t) v[5]) << 40) |
|
||||
(((uint64_t) v[6]) << 48) | (((uint64_t) v[7]) << 56);
|
||||
}
|
||||
#endif
|
||||
|
||||
// actual iteration
|
||||
for (r = 0; r < KECCAKF_ROUNDS; r++) {
|
||||
|
||||
// Theta
|
||||
for (i = 0; i < 5; i++)
|
||||
bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
|
||||
|
||||
for (i = 0; i < 5; i++) {
|
||||
t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
|
||||
for (j = 0; j < 25; j += 5)
|
||||
st[j + i] ^= t;
|
||||
}
|
||||
|
||||
|
||||
// Rho Pi
|
||||
#define RHO_PI( i, c ) \
|
||||
bc[0] = st[ i ]; \
|
||||
st[ i ] = ROTL64( t, c ); \
|
||||
t = bc[0]
|
||||
|
||||
t = st[1];
|
||||
|
||||
RHO_PI( 10, 1 );
|
||||
RHO_PI( 7, 3 );
|
||||
RHO_PI( 11, 6 );
|
||||
RHO_PI( 17, 10 );
|
||||
RHO_PI( 18, 15 );
|
||||
RHO_PI( 3, 21 );
|
||||
RHO_PI( 5, 28 );
|
||||
RHO_PI( 16, 36 );
|
||||
RHO_PI( 8, 45 );
|
||||
RHO_PI( 21, 55 );
|
||||
RHO_PI( 24, 2 );
|
||||
RHO_PI( 4, 14 );
|
||||
RHO_PI( 15, 27 );
|
||||
RHO_PI( 23, 41 );
|
||||
RHO_PI( 19, 56 );
|
||||
RHO_PI( 13, 8 );
|
||||
RHO_PI( 12, 25 );
|
||||
RHO_PI( 2, 43 );
|
||||
RHO_PI( 20, 62 );
|
||||
RHO_PI( 14, 18 );
|
||||
RHO_PI( 22, 39 );
|
||||
RHO_PI( 9, 61 );
|
||||
RHO_PI( 6, 20 );
|
||||
RHO_PI( 1, 44 );
|
||||
|
||||
#undef RHO_PI
|
||||
|
||||
/*
|
||||
for (i = 0; i < 24; i++) {
|
||||
j = keccakf_piln[i];
|
||||
bc[0] = st[j];
|
||||
st[j] = ROTL64(t, keccakf_rotc[i]);
|
||||
t = bc[0];
|
||||
}
|
||||
*/
|
||||
|
||||
// Chi
|
||||
for (j = 0; j < 25; j += 5) {
|
||||
for (i = 0; i < 5; i++)
|
||||
bc[i] = st[j + i];
|
||||
for (i = 0; i < 5; i++)
|
||||
st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
|
||||
}
|
||||
|
||||
// Iota
|
||||
st[0] ^= keccakf_rndc[r];
|
||||
}
|
||||
|
||||
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
|
||||
// endianess conversion. this is redundant on little-endian targets
|
||||
for (i = 0; i < 25; i++) {
|
||||
v = (uint8_t *) &st[i];
|
||||
t = st[i];
|
||||
v[0] = t & 0xFF;
|
||||
v[1] = (t >> 8) & 0xFF;
|
||||
v[2] = (t >> 16) & 0xFF;
|
||||
v[3] = (t >> 24) & 0xFF;
|
||||
v[4] = (t >> 32) & 0xFF;
|
||||
v[5] = (t >> 40) & 0xFF;
|
||||
v[6] = (t >> 48) & 0xFF;
|
||||
v[7] = (t >> 56) & 0xFF;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Initialize the context for SHA3
|
||||
|
||||
int sha3_init(sha3_ctx_t *c, int mdlen)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 25; i++)
|
||||
c->st.q[i] = 0;
|
||||
c->mdlen = mdlen;
|
||||
c->rsiz = 200 - 2 * mdlen;
|
||||
c->pt = 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
// update state with more data
|
||||
|
||||
int sha3_update(sha3_ctx_t *c, const void *data, size_t len)
|
||||
{
|
||||
size_t i;
|
||||
int j = c->pt / 8;
|
||||
const int rsiz = c->rsiz / 8;
|
||||
const int l = len / 8;
|
||||
|
||||
for ( i = 0; i < l; i++ )
|
||||
{
|
||||
c->st.q[ j++ ] ^= ( ((const uint64_t *) data) [i] );
|
||||
if ( j >= rsiz )
|
||||
{
|
||||
sha3_keccakf( c->st.q );
|
||||
j = 0;
|
||||
}
|
||||
}
|
||||
c->pt = j*8;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
// finalize and output a hash
|
||||
|
||||
int sha3_final(void *md, sha3_ctx_t *c)
|
||||
{
|
||||
c->st.q[ c->pt / 8 ] ^= 6;
|
||||
c->st.q[ c->rsiz / 8 - 1 ] ^= 0x8000000000000000;
|
||||
sha3_keccakf(c->st.q);
|
||||
memcpy( md, c->st.q, c->mdlen );
|
||||
return 1;
|
||||
}
|
||||
|
||||
// compute a SHA-3 hash (md) of given byte length from "in"
|
||||
|
||||
void *sha3(const void *in, size_t inlen, void *md, int mdlen)
|
||||
{
|
||||
sha3_ctx_t sha3;
|
||||
sha3_init(&sha3, mdlen);
|
||||
sha3_update(&sha3, in, inlen);
|
||||
sha3_final(md, &sha3);
|
||||
|
||||
return md;
|
||||
}
|
||||
|
||||
// SHAKE128 and SHAKE256 extensible-output functionality
|
||||
|
||||
void shake_xof(sha3_ctx_t *c)
|
||||
{
|
||||
c->st.b[c->pt] ^= 0x1F;
|
||||
c->st.b[c->rsiz - 1] ^= 0x80;
|
||||
sha3_keccakf(c->st.q);
|
||||
c->pt = 0;
|
||||
}
|
||||
|
||||
// Squeeze len output bytes into out, permuting the state again each time the
// rate block (c->rsiz bytes) has been fully read. The read position is saved
// in c->pt so consecutive calls continue the same output stream.
void shake_out(sha3_ctx_t *c, void *out, size_t len)
{
    uint8_t *dst = (uint8_t *) out;
    int pos = c->pt;

    for ( size_t left = len; left > 0; left-- )
    {
        if ( pos >= c->rsiz )
        {
            sha3_keccakf( c->st.q );
            pos = 0;
        }
        *dst++ = c->st.b[ pos++ ];
    }

    c->pt = pos;
}
|
||||
|
55
algo/verthash/tiny_sha3/sha3.h
Normal file
55
algo/verthash/tiny_sha3/sha3.h
Normal file
@@ -0,0 +1,55 @@
|
||||
// sha3.h
|
||||
// 19-Nov-11 Markku-Juhani O. Saarinen <mjos@iki.fi>
|
||||
|
||||
#ifndef SHA3_H
|
||||
#define SHA3_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifndef KECCAKF_ROUNDS
|
||||
#define KECCAKF_ROUNDS 24
|
||||
#endif
|
||||
|
||||
#ifndef ROTL64
|
||||
#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
|
||||
#endif
|
||||
|
||||
// SHA3 / SHAKE hashing context
typedef struct {
    union {
        uint8_t  b[200];    // the state viewed as bytes
        uint64_t q[25];     // the same state viewed as 64-bit words
    } st;
    int pt;                 // current position in the state
    int rsiz;               // rate, set to 200 - 2 * mdlen by sha3_init
    int mdlen;              // digest length in bytes
                            // (none of these overflow)
} sha3_ctx_t;
|
||||
|
||||
// Compression function.
|
||||
void sha3_keccakf(uint64_t st[25]);
|
||||
|
||||
// OpenSSL-like interface
|
||||
int sha3_init(sha3_ctx_t *c, int mdlen); // mdlen = hash output in bytes
|
||||
int sha3_update(sha3_ctx_t *c, const void *data, size_t len);
|
||||
int sha3_final(void *md, sha3_ctx_t *c); // digest goes to md
|
||||
|
||||
// compute a sha3 hash (md) of given byte length from "in"
|
||||
void *sha3(const void *in, size_t inlen, void *md, int mdlen);
|
||||
|
||||
// SHAKE128 and SHAKE256 extensible-output functions
|
||||
#define shake128_init(c) sha3_init(c, 16)
|
||||
#define shake256_init(c) sha3_init(c, 32)
|
||||
#define shake_update sha3_update
|
||||
|
||||
void shake_xof(sha3_ctx_t *c);
|
||||
void shake_out(sha3_ctx_t *c, void *out, size_t len);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
176
algo/verthash/verthash-gate.c
Normal file
176
algo/verthash/verthash-gate.c
Normal file
@@ -0,0 +1,176 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "Verthash.h"
|
||||
#include "tiny_sha3/sha3-4way.h"
|
||||
|
||||
static verthash_info_t verthashInfo;
|
||||
|
||||
// Verthash data file hash in bytes for verification
|
||||
// 0x48aa21d7afededb63976d48a8ff8ec29d5b02563af4a1110b056cd43e83155a5
|
||||
static const uint8_t verthashDatFileHash_bytes[32] =
|
||||
{ 0xa5, 0x55, 0x31, 0xe8, 0x43, 0xcd, 0x56, 0xb0,
|
||||
0x10, 0x11, 0x4a, 0xaf, 0x63, 0x25, 0xb0, 0xd5,
|
||||
0x29, 0xec, 0xf8, 0x8f, 0x8a, 0xd4, 0x76, 0x39,
|
||||
0xb6, 0xed, 0xed, 0xaf, 0xd7, 0x21, 0xaa, 0x48 };
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
static __thread sha3_4way_ctx_t sha3_mid_ctxA;
|
||||
static __thread sha3_4way_ctx_t sha3_mid_ctxB;
|
||||
|
||||
#else
|
||||
|
||||
static __thread sha3_ctx_t sha3_mid_ctx[8];
|
||||
|
||||
#endif
|
||||
|
||||
void verthash_sha3_512_prehash_72( const void *input )
|
||||
{
|
||||
#if defined(__AVX2__)
|
||||
|
||||
__m256i vin[10];
|
||||
mm256_intrlv80_4x64( vin, input );
|
||||
|
||||
sha3_4way_init( &sha3_mid_ctxA, 64 );
|
||||
sha3_4way_init( &sha3_mid_ctxB, 64 );
|
||||
|
||||
vin[0] = _mm256_add_epi8( vin[0], _mm256_set_epi64x( 4,3,2,1 ) );
|
||||
sha3_4way_update( &sha3_mid_ctxA, vin, 72 );
|
||||
|
||||
vin[0] = _mm256_add_epi8( vin[0], _mm256_set1_epi64x( 4 ) );
|
||||
sha3_4way_update( &sha3_mid_ctxB, vin, 72 );
|
||||
|
||||
#else
|
||||
|
||||
char in[80] __attribute__ ((aligned (64)));
|
||||
memcpy( in, input, 80 );
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
{
|
||||
in[0] += 1;
|
||||
sha3_init( &sha3_mid_ctx[i], 64 );
|
||||
sha3_update( &sha3_mid_ctx[i], in, 72 );
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void verthash_sha3_512_final_8( void *hash, const uint64_t nonce )
|
||||
{
|
||||
#if defined(__AVX2__)
|
||||
|
||||
__m256i vhashA[ 10 ] __attribute__ ((aligned (64)));
|
||||
__m256i vhashB[ 10 ] __attribute__ ((aligned (64)));
|
||||
|
||||
sha3_4way_ctx_t ctx;
|
||||
const __m256i vnonce = _mm256_set1_epi64x( nonce );
|
||||
|
||||
memcpy( &ctx, &sha3_mid_ctxA, sizeof ctx );
|
||||
sha3_4way_update( &ctx, &vnonce, 8 );
|
||||
sha3_4way_final( vhashA, &ctx );
|
||||
|
||||
memcpy( &ctx, &sha3_mid_ctxB, sizeof ctx );
|
||||
sha3_4way_update( &ctx, &vnonce, 8 );
|
||||
sha3_4way_final( vhashB, &ctx );
|
||||
|
||||
dintrlv_4x64( hash, hash+64, hash+128, hash+192, vhashA, 512 );
|
||||
dintrlv_4x64( hash+256, hash+320, hash+384, hash+448, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
{
|
||||
sha3_ctx_t ctx;
|
||||
memcpy( &ctx, &sha3_mid_ctx[i], sizeof ctx );
|
||||
sha3_update( &ctx, &nonce, 8 );
|
||||
sha3_final( hash + i*64, &ctx );
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
int scanhash_verthash( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t edata[20] __attribute__((aligned(64)));
|
||||
uint32_t hash[8] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 1;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
verthash_sha3_512_prehash_72( edata );
|
||||
|
||||
do
|
||||
{
|
||||
edata[19] = n;
|
||||
verthash_hash( verthashInfo.data, verthashInfo.dataSize,
|
||||
edata, hash );
|
||||
if ( valid_hash( hash, ptarget ) && !bench )
|
||||
{
|
||||
pdata[19] = bswap_32( n );
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
n++;
|
||||
} while ( n < last_nonce && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce;
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const char *default_verthash_data_file = "verthash.dat";
|
||||
|
||||
// Register verthash with the algo gate and load the Verthash data file
// (opt_data_file if given, otherwise "verthash.dat"). With opt_verify set,
// the SHA-256 of the loaded data is compared against the known file hash.
// Returns false if the data cannot be loaded or fails verification.
bool register_verthash_algo( algo_gate_t* gate )
{
   opt_target_factor = 256.0;
   gate->scanhash = (void*)&scanhash_verthash;
   gate->optimizations = AVX2_OPT;

   const char *verthash_data_file = opt_data_file ? opt_data_file
                                                  : default_verthash_data_file;

   const int vhLoadResult = verthash_info_init( &verthashInfo,
                                                verthash_data_file );

   // Load failed: report the error codes handled here and give up.
   if ( vhLoadResult != 0 )
   {
      switch ( vhLoadResult )
      {
         case 1:
            applog( LOG_ERR, "Verthash data file not found: %s",
                    verthash_data_file );
            if ( !opt_data_file )
               applog( LOG_NOTICE, "Add '--verify' to create verthash.dat");
            break;
         case 2:
            applog( LOG_ERR, "Failed to allocate memory for Verthash data" );
            break;
         default:
            // Other error codes are not reported.
            break;
      }
      return false;
   }

   if ( opt_verify )
   {
      uint8_t vhDataFileHash[32] = { 0 };

      applog( LOG_NOTICE, "Verifying Verthash data" );
      sph_sha256_full( vhDataFileHash, verthashInfo.data,
                       verthashInfo.dataSize );

      if ( memcmp( vhDataFileHash, verthashDatFileHash_bytes,
                   sizeof(verthashDatFileHash_bytes) ) != 0 )
      {
         applog( LOG_ERR, "Verthash data verification has failed" );
         return false;
      }
      applog( LOG_NOTICE, "Verthash data has been verified" );
   }

   printf("\n");
   return true;
}
|
||||
|
@@ -373,6 +373,45 @@ static inline void salsa20(salsa20_blk_t *restrict B,
|
||||
#define INTEGERIFY (uint32_t)X.d[0]
|
||||
#endif
|
||||
|
||||
// AVX512 ternary logic optimization
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define XOR_X_XOR_X( in1, in2 ) \
|
||||
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
|
||||
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
|
||||
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
|
||||
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 );
|
||||
|
||||
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
|
||||
X0 = _mm_ternarylogic_epi32( (in1).q[0], (in2).q[0], (in3).q[0], 0x96 ); \
|
||||
X1 = _mm_ternarylogic_epi32( (in1).q[1], (in2).q[1], (in3).q[1], 0x96 ); \
|
||||
X2 = _mm_ternarylogic_epi32( (in1).q[2], (in2).q[2], (in3).q[2], 0x96 ); \
|
||||
X3 = _mm_ternarylogic_epi32( (in1).q[3], (in2).q[3], (in3).q[3], 0x96 );
|
||||
|
||||
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
|
||||
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
|
||||
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
|
||||
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
|
||||
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); \
|
||||
SALSA20(out)
|
||||
|
||||
#else
|
||||
|
||||
#define XOR_X_XOR_X( in1, in2 ) \
|
||||
XOR_X( in1 ) \
|
||||
XOR_X( in2 )
|
||||
|
||||
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
|
||||
XOR_X_2( in1, in2 ) \
|
||||
XOR_X( in3 )
|
||||
|
||||
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
|
||||
XOR_X(in1) \
|
||||
XOR_X(in2) \
|
||||
SALSA20( out )
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Apply the Salsa20 core to the block provided in X ^ in.
|
||||
*/
|
||||
@@ -406,11 +445,15 @@ static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1,
|
||||
{
|
||||
DECL_X
|
||||
|
||||
XOR_X_2(Bin1[1], Bin2[1])
|
||||
XOR_X(Bin1[0])
|
||||
XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] )
|
||||
// XOR_X_2(Bin1[1], Bin2[1])
|
||||
// XOR_X(Bin1[0])
|
||||
SALSA20_XOR_MEM(Bin2[0], Bout[0])
|
||||
XOR_X(Bin1[1])
|
||||
SALSA20_XOR_MEM(Bin2[1], Bout[1])
|
||||
|
||||
// Factor out the XOR from salsa20 to do a xor3
|
||||
XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] )
|
||||
// XOR_X(Bin1[1])
|
||||
// SALSA20_XOR_MEM(Bin2[1], Bout[1])
|
||||
|
||||
return INTEGERIFY;
|
||||
}
|
||||
@@ -745,13 +788,15 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
|
||||
i = 0;
|
||||
r--;
|
||||
do {
|
||||
XOR_X(Bin1[i])
|
||||
XOR_X(Bin2[i])
|
||||
XOR_X_XOR_X( Bin1[i], Bin2[i] )
|
||||
// XOR_X(Bin1[i])
|
||||
// XOR_X(Bin2[i])
|
||||
PWXFORM
|
||||
WRITE_X(Bout[i])
|
||||
|
||||
XOR_X(Bin1[i + 1])
|
||||
XOR_X(Bin2[i + 1])
|
||||
XOR_X_XOR_X( Bin1[ i+1 ], Bin2[ i+1 ] )
|
||||
// XOR_X(Bin1[i + 1])
|
||||
// XOR_X(Bin2[i + 1])
|
||||
PWXFORM
|
||||
|
||||
if (unlikely(i >= r))
|
||||
|
@@ -35,7 +35,6 @@
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include <openssl/sha.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
@@ -63,7 +63,7 @@ mv cpuminer cpuminer-avx
|
||||
# Westmere SSE4.2 AES
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=westmere -Wall -fno-common" ./configure --with-curl
|
||||
CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-aes-sse42.exe
|
||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.7.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.17.1.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.15.7'
|
||||
PACKAGE_STRING='cpuminer-opt 3.15.7'
|
||||
PACKAGE_VERSION='3.17.1'
|
||||
PACKAGE_STRING='cpuminer-opt 3.17.1'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.15.7 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.17.1 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1404,7 +1404,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.15.7:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.17.1:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1509,7 +1509,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.15.7
|
||||
cpuminer-opt configure 3.17.1
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.15.7, which was
|
||||
It was created by cpuminer-opt $as_me 3.17.1, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2993,7 +2993,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.15.7'
|
||||
VERSION='3.17.1'
|
||||
|
||||
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.15.7, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.17.1, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -6756,7 +6756,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.15.7
|
||||
cpuminer-opt config.status 3.17.1
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.15.7])
|
||||
AC_INIT([cpuminer-opt], [3.17.1])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
567
cpu-miner.c
567
cpu-miner.c
@@ -112,21 +112,20 @@ char* opt_param_key = NULL;
|
||||
int opt_param_n = 0;
|
||||
int opt_param_r = 0;
|
||||
int opt_n_threads = 0;
|
||||
bool opt_reset_on_stale = false;
|
||||
bool opt_sapling = false;
|
||||
|
||||
// Windows doesn't support 128 bit affinity mask.
|
||||
// Need compile time and run time test.
|
||||
#if defined(__linux) && defined(GCC_INT128)
|
||||
#define AFFINITY_USES_UINT128 1
|
||||
uint128_t opt_affinity = -1;
|
||||
static uint128_t opt_affinity = -1;
|
||||
static bool affinity_uses_uint128 = true;
|
||||
#else
|
||||
uint64_t opt_affinity = -1;
|
||||
static uint64_t opt_affinity = -1;
|
||||
static bool affinity_uses_uint128 = false;
|
||||
#endif
|
||||
|
||||
int opt_priority = 0;
|
||||
int opt_priority = 0; // deprecated
|
||||
int num_cpus = 1;
|
||||
int num_cpugroups = 1;
|
||||
char *rpc_url = NULL;;
|
||||
@@ -134,6 +133,8 @@ char *rpc_userpass = NULL;
|
||||
char *rpc_user, *rpc_pass;
|
||||
char *short_url = NULL;
|
||||
char *coinbase_address;
|
||||
char *opt_data_file = NULL;
|
||||
bool opt_verify = false;
|
||||
|
||||
// pk_buffer_size is used as a version selector by b58 code, therefore
|
||||
// it must be set correctly to work.
|
||||
@@ -446,8 +447,10 @@ static bool work_decode( const json_t *val, struct work *work )
|
||||
|
||||
if ( !allow_mininginfo )
|
||||
net_diff = algo_gate.calc_network_diff( work );
|
||||
else
|
||||
net_diff = hash_to_diff( work->target );
|
||||
|
||||
work->targetdiff = hash_to_diff( work->target );
|
||||
work->targetdiff = net_diff;
|
||||
stratum_diff = last_targetdiff = work->targetdiff;
|
||||
work->sharediff = 0;
|
||||
algo_gate.decode_extra_data( work, &net_blocks );
|
||||
@@ -481,13 +484,17 @@ static bool get_mininginfo( CURL *curl, struct work *work )
|
||||
// "networkhashps": 56475980
|
||||
if ( res )
|
||||
{
|
||||
// net_diff is a global that is set from the work hash target by
|
||||
// both getwork and GBT. Don't overwrite it, define a local to override
|
||||
// the global.
|
||||
double net_diff = 0.;
|
||||
json_t *key = json_object_get( res, "difficulty" );
|
||||
if ( key )
|
||||
{
|
||||
if ( json_is_object( key ) )
|
||||
key = json_object_get( key, "proof-of-work" );
|
||||
if ( json_is_real( key ) )
|
||||
net_diff = work->targetdiff = json_real_value( key );
|
||||
net_diff = json_real_value( key );
|
||||
}
|
||||
|
||||
key = json_object_get( res, "networkhashps" );
|
||||
@@ -554,7 +561,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
if ( !s )
|
||||
continue;
|
||||
if ( !strcmp( s, "segwit" ) || !strcmp( s, "!segwit" ) )
|
||||
{
|
||||
segwit = true;
|
||||
if ( opt_debug )
|
||||
applog( LOG_INFO, "GBT: SegWit is enabled" );
|
||||
}
|
||||
}
|
||||
}
|
||||
// Segwit END
|
||||
@@ -903,7 +914,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
}
|
||||
for ( i = 0; i < ARRAY_SIZE( work->target ); i++ )
|
||||
work->target[7 - i] = be32dec( target + i );
|
||||
|
||||
net_diff = work->targetdiff = hash_to_diff( work->target );
|
||||
|
||||
tmp = json_object_get( val, "workid" );
|
||||
if ( tmp )
|
||||
{
|
||||
@@ -953,25 +965,25 @@ void scale_hash_for_display ( double* hashrate, char* prefix )
|
||||
else { *prefix = 'Y'; *hashrate /= 1e24; }
|
||||
}
|
||||
|
||||
// Format an elapsed time given in seconds into str, using the two largest
// applicable units: "0y000d", "0d00h", "0h00m" or "0m00s".
// str must be large enough for the result; no bounds checking is done.
static inline void sprintf_et( char *str, long unsigned int seconds )
{
   // sprintf doesn't like uint64_t, Linux thinks it's long, Windows long long.
   long unsigned int min = seconds / 60;
   long unsigned int sec = seconds % 60;
   long unsigned int hrs = min / 60;

   if ( hrs )
   {
      long unsigned int days = hrs / 24;
      long unsigned int years = days / 365;
      if ( years )        // 0y000d
         // Days left over after whole years: days % 365, not years % 365.
         sprintf( str, "%luy%lud", years, days % 365 );
      else if ( days )    // 0d00h
         sprintf( str, "%lud%02luh", days, hrs % 24 );
      else                // 0h00m
         sprintf( str, "%luh%02lum", hrs, min % 60 );
   }
   else                   // 0m00s
      sprintf( str, "%lum%02lus", min, sec );
}
|
||||
|
||||
const long double exp32 = EXP32; // 2**32
|
||||
@@ -1042,6 +1054,8 @@ void report_summary_log( bool force )
|
||||
applog( LOG_NOTICE,"CPU temp: curr %s max %d, Freq: %.3f/%.3f GHz",
|
||||
tempstr, hi_temp, lo_freq / 1e6, hi_freq / 1e6 );
|
||||
if ( curr_temp > hi_temp ) hi_temp = curr_temp;
|
||||
if ( ( opt_max_temp > 0.0 ) && ( curr_temp > opt_max_temp ) )
|
||||
restart_threads();
|
||||
prev_temp = curr_temp;
|
||||
}
|
||||
}
|
||||
@@ -1070,12 +1084,12 @@ void report_summary_log( bool force )
|
||||
|
||||
double share_time = (double)et.tv_sec + (double)et.tv_usec / 1e6;
|
||||
double ghrate = global_hashrate;
|
||||
double shrate = share_time == 0. ? 0. : exp32 * last_targetdiff
|
||||
* (double)(accepts) / share_time;
|
||||
double sess_hrate = uptime.tv_sec == 0. ? 0. : exp32 * norm_diff_sum
|
||||
/ (double)uptime.tv_sec;
|
||||
double submit_rate = share_time == 0. ? 0. : (double)submits*60.
|
||||
/ share_time;
|
||||
double target_diff = exp32 * last_targetdiff;
|
||||
double shrate = safe_div( target_diff * (double)(accepts),
|
||||
share_time, 0. );
|
||||
double sess_hrate = safe_div( exp32 * norm_diff_sum,
|
||||
(double)uptime.tv_sec, 0. );
|
||||
double submit_rate = safe_div( (double)submits * 60., share_time, 0. );
|
||||
char shr_units[4] = {0};
|
||||
char ghr_units[4] = {0};
|
||||
char sess_hr_units[4] = {0};
|
||||
@@ -1092,55 +1106,57 @@ void report_summary_log( bool force )
|
||||
applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url );
|
||||
applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str );
|
||||
applog2( LOG_INFO, "Share rate %.2f/min %.2f/min",
|
||||
submit_rate, (double)submitted_share_count*60. /
|
||||
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ) );
|
||||
submit_rate, (double)submitted_share_count*60. /
|
||||
( (double)uptime.tv_sec + (double)uptime.tv_usec / 1e6 ) );
|
||||
applog2( LOG_INFO, "Hash rate %7.2f%sh/s %7.2f%sh/s (%.2f%sh/s)",
|
||||
shrate, shr_units, sess_hrate, sess_hr_units,
|
||||
ghrate, ghr_units );
|
||||
shrate, shr_units, sess_hrate, sess_hr_units, ghrate, ghr_units );
|
||||
|
||||
if ( accepted_share_count < submitted_share_count )
|
||||
{
|
||||
double ltd = exp32 * last_targetdiff;
|
||||
double lost_ghrate = uptime.tv_sec == 0 ? 0.
|
||||
: ltd * (double)(submitted_share_count - accepted_share_count )
|
||||
: target_diff
|
||||
* (double)(submitted_share_count - accepted_share_count )
|
||||
/ (double)uptime.tv_sec;
|
||||
double lost_shrate = share_time == 0. ? 0.
|
||||
: ltd * (double)(submits - accepts ) / share_time;
|
||||
: target_diff * (double)(submits - accepts ) / share_time;
|
||||
char lshr_units[4] = {0};
|
||||
char lghr_units[4] = {0};
|
||||
scale_hash_for_display( &lost_shrate, lshr_units );
|
||||
scale_hash_for_display( &lost_ghrate, lghr_units );
|
||||
applog2( LOG_INFO, "Lost hash rate %7.2f%sh/s %7.2f%sh/s",
|
||||
lost_shrate, lshr_units, lost_ghrate, lghr_units );
|
||||
applog2( LOG_INFO, "Lost hash rate %7.2f%sh/s %7.2f%sh/s",
|
||||
lost_shrate, lshr_units, lost_ghrate, lghr_units );
|
||||
}
|
||||
|
||||
applog2( LOG_INFO,"Submitted %6d %6d",
|
||||
submits, submitted_share_count );
|
||||
applog2( LOG_INFO,"Accepted %6d %6d %5.1f%%",
|
||||
accepts, accepted_share_count,
|
||||
100. * accepted_share_count / submitted_share_count );
|
||||
applog2( LOG_INFO,"Submitted %7d %7d",
|
||||
submits, submitted_share_count );
|
||||
applog2( LOG_INFO, "Accepted %7d %7d %5.1f%%",
|
||||
accepts, accepted_share_count,
|
||||
100. * safe_div( (double)accepted_share_count,
|
||||
(double)submitted_share_count, 0. ) );
|
||||
if ( stale_share_count )
|
||||
applog2( LOG_INFO,"Stale %6d %6d %5.1f%%",
|
||||
stales, stale_share_count,
|
||||
100. * stale_share_count / submitted_share_count );
|
||||
applog2( LOG_INFO, "Stale %7d %7d %5.1f%%",
|
||||
stales, stale_share_count,
|
||||
100. * safe_div( (double)stale_share_count,
|
||||
(double)submitted_share_count, 0. ) );
|
||||
if ( rejected_share_count )
|
||||
applog2( LOG_INFO,"Rejected %6d %6d %5.1f%%",
|
||||
rejects, rejected_share_count,
|
||||
100. * rejected_share_count / submitted_share_count );
|
||||
applog2( LOG_INFO, "Rejected %7d %7d %5.1f%%",
|
||||
rejects, rejected_share_count,
|
||||
100. * safe_div( (double)rejected_share_count,
|
||||
(double)submitted_share_count, 0. ) );
|
||||
if ( solved_block_count )
|
||||
applog2( LOG_INFO,"Blocks Solved %6d %6d",
|
||||
solved, solved_block_count );
|
||||
applog2( LOG_INFO,"Blocks Solved %7d %7d",
|
||||
solved, solved_block_count );
|
||||
applog2( LOG_INFO, "Hi/Lo Share Diff %.5g / %.5g",
|
||||
highest_share, lowest_share );
|
||||
highest_share, lowest_share );
|
||||
|
||||
static int64_t no_acks = 0;
|
||||
if ( no_acks )
|
||||
{
|
||||
no_acks = submitted_share_count
|
||||
int mismatch = submitted_share_count
|
||||
- ( accepted_share_count + stale_share_count + rejected_share_count );
|
||||
if ( no_acks ) // 2 consecutive cycles non zero
|
||||
applog(LOG_WARNING,"Share count mismatch: %d, stats may be incorrect",
|
||||
no_acks );
|
||||
if ( mismatch )
|
||||
{
|
||||
if ( mismatch != 1 )
|
||||
applog(LOG_WARNING,"Share count mismatch: %d, stats may be inaccurate", mismatch );
|
||||
else
|
||||
applog(LOG_INFO,"Share count mismatch, submitted share may still be pending" );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1158,7 +1174,8 @@ static int share_result( int result, struct work *work,
|
||||
char bres[48];
|
||||
bool solved = false;
|
||||
bool stale = false;
|
||||
char *acol = NULL, *bcol = NULL, *scol = NULL, *rcol = NULL;
|
||||
char *acol, *bcol, *scol, *rcol;
|
||||
acol = bcol = scol = rcol = "\0";
|
||||
|
||||
pthread_mutex_lock( &stats_lock );
|
||||
|
||||
@@ -1200,7 +1217,7 @@ static int share_result( int result, struct work *work,
|
||||
sprintf( sres, "S%d", stale_share_count );
|
||||
sprintf( rres, "R%d", rejected_share_count );
|
||||
if unlikely( ( my_stats.net_diff > 0. )
|
||||
&& ( my_stats.share_diff >= net_diff ) )
|
||||
&& ( my_stats.share_diff >= my_stats.net_diff ) )
|
||||
{
|
||||
solved = true;
|
||||
solved_block_count++;
|
||||
@@ -1294,7 +1311,8 @@ static int share_result( int result, struct work *work,
|
||||
if ( reason ) applog( LOG_WARNING, "Reject reason: %s", reason );
|
||||
|
||||
diff_to_hash( str, my_stats.share_diff );
|
||||
applog2( LOG_INFO, "Hash: %08x%08x%08x...", str[7], str[6], str[5] );
|
||||
applog2( LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6],
|
||||
str[5], str[4], str[3],str[2], str[1], str[0] );
|
||||
|
||||
if ( work )
|
||||
targ = work->target;
|
||||
@@ -1303,7 +1321,8 @@ static int share_result( int result, struct work *work,
|
||||
diff_to_hash( str, my_stats.target_diff );
|
||||
targ = &str[0];
|
||||
}
|
||||
applog2( LOG_INFO, "Target: %08x%08x%08x...", targ[7], targ[6], targ[5] );
|
||||
applog2( LOG_INFO, "Target: %08x%08x%08x%08x%08x%08x", targ[7], targ[6],
|
||||
targ[5], targ[4], targ[3], targ[2], targ[1], targ[0] );
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
@@ -2076,10 +2095,10 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
sctx->block_height, net_diff, g_work->job_id );
|
||||
else if ( !opt_quiet )
|
||||
{
|
||||
unsigned char *xnonce2str = abin2hex( g_work->xnonce2,
|
||||
g_work->xnonce2_len );
|
||||
applog( LOG_INFO, "Extranonce2 %s, Block %d, Net Diff %.5g",
|
||||
xnonce2str, sctx->block_height, net_diff );
|
||||
unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
|
||||
g_work->xnonce2_len );
|
||||
applog( LOG_INFO, "Extranonce2 %s, Block %d, Job %s",
|
||||
xnonce2str, sctx->block_height, g_work->job_id );
|
||||
free( xnonce2str );
|
||||
}
|
||||
|
||||
@@ -2162,11 +2181,11 @@ static void *miner_thread( void *userdata )
|
||||
/* Set worker threads to nice 19 and then preferentially to SCHED_IDLE
|
||||
* and if that fails, then SCHED_BATCH. No need for this to be an
|
||||
* error if it fails */
|
||||
if (!opt_benchmark && opt_priority == 0)
|
||||
if ( !opt_priority )
|
||||
{
|
||||
setpriority(PRIO_PROCESS, 0, 19);
|
||||
if ( !thr_id && !opt_quiet )
|
||||
applog(LOG_INFO, "Miner thread priority %d (nice 19)", opt_priority );
|
||||
if ( !thr_id && opt_debug )
|
||||
applog(LOG_INFO, "Default miner thread priority %d (nice 19)", opt_priority );
|
||||
drop_policy();
|
||||
}
|
||||
else
|
||||
@@ -2183,9 +2202,12 @@ static void *miner_thread( void *userdata )
|
||||
case 4: prio = -10; break;
|
||||
case 5: prio = -15;
|
||||
}
|
||||
if ( !( thr_id || opt_quiet ) )
|
||||
applog( LOG_INFO, "Miner thread priority %d (nice %d)",
|
||||
if ( !thr_id )
|
||||
{
|
||||
applog( LOG_INFO, "User set miner thread priority %d (nice %d)",
|
||||
opt_priority, prio );
|
||||
applog( LOG_WARNING, "High priority mining threads may cause system instability");
|
||||
}
|
||||
#endif
|
||||
setpriority(PRIO_PROCESS, 0, prio);
|
||||
if ( opt_priority == 0 )
|
||||
@@ -2430,13 +2452,17 @@ static void *miner_thread( void *userdata )
|
||||
char hr_units[2] = {0,0};
|
||||
scale_hash_for_display( &hashrate, hr_units );
|
||||
sprintf( hr, "%.2f", hashrate );
|
||||
#if ((defined(_WIN64) || defined(__WINDOWS__)) || defined(_WIN32))
|
||||
#if (defined(_WIN64) || defined(__WINDOWS__) || defined(_WIN32))
|
||||
applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units );
|
||||
#else
|
||||
applog( LOG_NOTICE, "Total: %s %sH/s, CPU temp: %dC",
|
||||
hr, hr_units, (uint32_t)cpu_temp(0) );
|
||||
float lo_freq = 0., hi_freq = 0.;
|
||||
linux_cpu_hilo_freq( &lo_freq, &hi_freq );
|
||||
applog( LOG_NOTICE,
|
||||
"Total: %s %sH/s, Temp: %dC, Freq: %.3f/%.3f GHz",
|
||||
hr, hr_units, (uint32_t)cpu_temp(0), lo_freq / 1e6,
|
||||
hi_freq / 1e6 );
|
||||
#endif
|
||||
}
|
||||
}
|
||||
} // benchmark
|
||||
|
||||
// conditional mining
|
||||
@@ -2726,10 +2752,10 @@ static void *stratum_thread(void *userdata )
|
||||
stratum.url = strdup( rpc_url );
|
||||
applog(LOG_BLUE, "Connection changed to %s", short_url);
|
||||
}
|
||||
else // if ( !opt_quiet )
|
||||
else
|
||||
applog(LOG_WARNING, "Stratum connection reset");
|
||||
// reset stats queue as well
|
||||
s_get_ptr = s_put_ptr = 0;
|
||||
if ( s_get_ptr != s_put_ptr ) s_get_ptr = s_put_ptr = 0;
|
||||
}
|
||||
|
||||
while ( !stratum.curl )
|
||||
@@ -2776,13 +2802,15 @@ static void *stratum_thread(void *userdata )
|
||||
else
|
||||
{
|
||||
applog(LOG_WARNING, "Stratum connection interrupted");
|
||||
stratum_disconnect( &stratum );
|
||||
// stratum_disconnect( &stratum );
|
||||
stratum_need_reset = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
applog(LOG_ERR, "Stratum connection timeout");
|
||||
stratum_disconnect( &stratum );
|
||||
stratum_need_reset = true;
|
||||
// stratum_disconnect( &stratum );
|
||||
}
|
||||
|
||||
} // loop
|
||||
@@ -2790,6 +2818,187 @@ out:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void show_credits()
|
||||
{
|
||||
printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n");
|
||||
printf(" A CPU miner with multi algo support and optimized for CPUs\n");
|
||||
printf(" with AVX512, SHA and VAES extensions by JayDDee.\n");
|
||||
printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n");
|
||||
}
|
||||
|
||||
#define check_cpu_capability() cpu_capability( false )
|
||||
#define display_cpu_capability() cpu_capability( true )
|
||||
static bool cpu_capability( bool display_only )
|
||||
{
|
||||
char cpu_brand[0x40];
|
||||
bool cpu_has_sse2 = has_sse2();
|
||||
bool cpu_has_aes = has_aes_ni();
|
||||
bool cpu_has_sse42 = has_sse42();
|
||||
bool cpu_has_avx = has_avx();
|
||||
bool cpu_has_avx2 = has_avx2();
|
||||
bool cpu_has_sha = has_sha();
|
||||
bool cpu_has_avx512 = has_avx512();
|
||||
bool cpu_has_vaes = has_vaes();
|
||||
bool sw_has_aes = false;
|
||||
bool sw_has_sse2 = false;
|
||||
bool sw_has_sse42 = false;
|
||||
bool sw_has_avx = false;
|
||||
bool sw_has_avx2 = false;
|
||||
bool sw_has_avx512 = false;
|
||||
bool sw_has_sha = false;
|
||||
bool sw_has_vaes = false;
|
||||
set_t algo_features = algo_gate.optimizations;
|
||||
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
|
||||
bool algo_has_aes = set_incl( AES_OPT, algo_features );
|
||||
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
|
||||
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
|
||||
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
|
||||
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
|
||||
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
|
||||
bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
|
||||
bool use_aes;
|
||||
bool use_sse2;
|
||||
bool use_avx2;
|
||||
bool use_avx512;
|
||||
bool use_sha;
|
||||
bool use_vaes;
|
||||
bool use_none;
|
||||
|
||||
#ifdef __AES__
|
||||
sw_has_aes = true;
|
||||
#endif
|
||||
#ifdef __SSE2__
|
||||
sw_has_sse2 = true;
|
||||
#endif
|
||||
#ifdef __SSE4_2__
|
||||
sw_has_sse42 = true;
|
||||
#endif
|
||||
#ifdef __AVX__
|
||||
sw_has_avx = true;
|
||||
#endif
|
||||
#ifdef __AVX2__
|
||||
sw_has_avx2 = true;
|
||||
#endif
|
||||
#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
|
||||
sw_has_avx512 = true;
|
||||
#endif
|
||||
#ifdef __SHA__
|
||||
sw_has_sha = true;
|
||||
#endif
|
||||
#ifdef __VAES__
|
||||
sw_has_vaes = true;
|
||||
#endif
|
||||
|
||||
|
||||
// #if !((__AES__) || (__SSE2__))
|
||||
// printf("Neither __AES__ nor __SSE2__ defined.\n");
|
||||
// #endif
|
||||
|
||||
cpu_brand_string( cpu_brand );
|
||||
printf( "CPU: %s\n", cpu_brand );
|
||||
|
||||
printf("SW built on " __DATE__
|
||||
#ifdef _MSC_VER
|
||||
" with VC++ 2013\n");
|
||||
#elif defined(__GNUC__)
|
||||
" with GCC");
|
||||
printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
|
||||
#else
|
||||
printf("\n");
|
||||
#endif
|
||||
|
||||
printf("CPU features: ");
|
||||
if ( cpu_has_avx512 ) printf( " AVX512" );
|
||||
else if ( cpu_has_avx2 ) printf( " AVX2 " );
|
||||
else if ( cpu_has_avx ) printf( " AVX " );
|
||||
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( cpu_has_sse2 ) printf( " SSE2 " );
|
||||
if ( cpu_has_vaes ) printf( " VAES" );
|
||||
else if ( cpu_has_aes ) printf( " AES" );
|
||||
if ( cpu_has_sha ) printf( " SHA" );
|
||||
|
||||
printf("\nSW features: ");
|
||||
if ( sw_has_avx512 ) printf( " AVX512" );
|
||||
else if ( sw_has_avx2 ) printf( " AVX2 " );
|
||||
else if ( sw_has_avx ) printf( " AVX " );
|
||||
else if ( sw_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( sw_has_sse2 ) printf( " SSE2 " );
|
||||
if ( sw_has_vaes ) printf( " VAES" );
|
||||
else if ( sw_has_aes ) printf( " AES" );
|
||||
if ( sw_has_sha ) printf( " SHA" );
|
||||
|
||||
printf("\nAlgo features:");
|
||||
if ( algo_features == EMPTY_SET ) printf( " None" );
|
||||
else
|
||||
{
|
||||
if ( algo_has_avx512 ) printf( " AVX512" );
|
||||
else if ( algo_has_avx2 ) printf( " AVX2 " );
|
||||
else if ( algo_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( algo_has_sse2 ) printf( " SSE2 " );
|
||||
if ( algo_has_vaes ||
|
||||
algo_has_vaes256 ) printf( " VAES" );
|
||||
else if ( algo_has_aes ) printf( " AES" );
|
||||
if ( algo_has_sha ) printf( " SHA" );
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
if ( display_only ) return true;
|
||||
|
||||
// Check for CPU and build incompatibilities
|
||||
if ( !cpu_has_sse2 )
|
||||
{
|
||||
printf( "A CPU with SSE2 is required to use cpuminer-opt\n" );
|
||||
return false;
|
||||
}
|
||||
if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) )
|
||||
{
|
||||
printf( "The SW build requires a CPU with AES and AVX2!\n" );
|
||||
return false;
|
||||
}
|
||||
if ( sw_has_sse42 && !cpu_has_sse42 )
|
||||
{
|
||||
printf( "The SW build requires a CPU with SSE4.2!\n" );
|
||||
return false;
|
||||
}
|
||||
if ( sw_has_aes && !cpu_has_aes )
|
||||
{
|
||||
printf( "The SW build requires a CPU with AES!\n" );
|
||||
return false;
|
||||
}
|
||||
if ( sw_has_sha && !cpu_has_sha )
|
||||
{
|
||||
printf( "The SW build requires a CPU with SHA!\n" );
|
||||
return false;
|
||||
}
|
||||
|
||||
// Determine mining options
|
||||
use_sse2 = cpu_has_sse2 && algo_has_sse2;
|
||||
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
|
||||
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
|
||||
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
|
||||
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
|
||||
use_vaes = cpu_has_vaes && sw_has_vaes && ( algo_has_vaes
|
||||
|| algo_has_vaes256 );
|
||||
use_none = !( use_sse2 || use_aes || use_avx512 || use_avx2 ||
|
||||
use_sha || use_vaes );
|
||||
|
||||
// Display best options
|
||||
printf( "\nStarting miner with" );
|
||||
if ( use_none ) printf( " no optimizations" );
|
||||
else
|
||||
{
|
||||
if ( use_avx512 ) printf( " AVX512" );
|
||||
else if ( use_avx2 ) printf( " AVX2" );
|
||||
else if ( use_sse2 ) printf( " SSE2" );
|
||||
if ( use_vaes ) printf( " VAES" );
|
||||
else if ( use_aes ) printf( " AES" );
|
||||
if ( use_sha ) printf( " SHA" );
|
||||
}
|
||||
printf( "...\n\n" );
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void show_version_and_exit(void)
|
||||
{
|
||||
printf("\n built on " __DATE__
|
||||
@@ -2837,7 +3046,6 @@ void show_version_and_exit(void)
|
||||
#endif
|
||||
"\n\n");
|
||||
|
||||
/* dependencies versions */
|
||||
printf("%s\n", curl_version());
|
||||
#ifdef JANSSON_VERSION
|
||||
printf("jansson/%s ", JANSSON_VERSION);
|
||||
@@ -2849,7 +3057,6 @@ void show_version_and_exit(void)
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
void show_usage_and_exit(int status)
|
||||
{
|
||||
if (status)
|
||||
@@ -3186,14 +3393,12 @@ void parse_arg(int key, char *arg )
|
||||
ul = strtoull( p, NULL, 16 );
|
||||
else
|
||||
ul = atoll( arg );
|
||||
// if ( ul > ( 1ULL << num_cpus ) - 1ULL )
|
||||
// ul = -1LL;
|
||||
#if AFFINITY_USES_UINT128
|
||||
// replicate the low 64 bits to make a full 128 bit mask if there are more
|
||||
// than 64 CPUs, otherwise zero extend the upper half.
|
||||
opt_affinity = (uint128_t)ul;
|
||||
if ( num_cpus > 64 )
|
||||
opt_affinity = (opt_affinity << 64 ) | opt_affinity;
|
||||
opt_affinity |= opt_affinity << 64;
|
||||
#else
|
||||
opt_affinity = ul;
|
||||
#endif
|
||||
@@ -3237,11 +3442,15 @@ void parse_arg(int key, char *arg )
|
||||
case 1024:
|
||||
opt_randomize = true;
|
||||
break;
|
||||
case 1026:
|
||||
opt_reset_on_stale = true;
|
||||
case 1027: // data-file
|
||||
opt_data_file = strdup( arg );
|
||||
break;
|
||||
case 1028: // verify
|
||||
opt_verify = true;
|
||||
break;
|
||||
case 'V':
|
||||
show_version_and_exit();
|
||||
display_cpu_capability();
|
||||
exit(0);
|
||||
case 'h':
|
||||
show_usage_and_exit(0);
|
||||
|
||||
@@ -3358,185 +3567,6 @@ static int thread_create(struct thr_info *thr, void* func)
|
||||
return err;
|
||||
}
|
||||
|
||||
static void show_credits()
|
||||
{
|
||||
printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n");
|
||||
printf(" A CPU miner with multi algo support and optimized for CPUs\n");
|
||||
printf(" with AVX512, SHA and VAES extensions by JayDDee.\n");
|
||||
printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n");
|
||||
}
|
||||
|
||||
bool check_cpu_capability ()
|
||||
{
|
||||
char cpu_brand[0x40];
|
||||
bool cpu_has_sse2 = has_sse2();
|
||||
bool cpu_has_aes = has_aes_ni();
|
||||
bool cpu_has_sse42 = has_sse42();
|
||||
bool cpu_has_avx = has_avx();
|
||||
bool cpu_has_avx2 = has_avx2();
|
||||
bool cpu_has_sha = has_sha();
|
||||
bool cpu_has_avx512 = has_avx512();
|
||||
bool cpu_has_vaes = has_vaes();
|
||||
bool sw_has_aes = false;
|
||||
bool sw_has_sse2 = false;
|
||||
bool sw_has_sse42 = false;
|
||||
bool sw_has_avx = false;
|
||||
bool sw_has_avx2 = false;
|
||||
bool sw_has_avx512 = false;
|
||||
bool sw_has_sha = false;
|
||||
bool sw_has_vaes = false;
|
||||
set_t algo_features = algo_gate.optimizations;
|
||||
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
|
||||
bool algo_has_aes = set_incl( AES_OPT, algo_features );
|
||||
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
|
||||
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
|
||||
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
|
||||
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
|
||||
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
|
||||
bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
|
||||
bool use_aes;
|
||||
bool use_sse2;
|
||||
bool use_sse42;
|
||||
bool use_avx2;
|
||||
bool use_avx512;
|
||||
bool use_sha;
|
||||
bool use_vaes;
|
||||
bool use_none;
|
||||
|
||||
#ifdef __AES__
|
||||
sw_has_aes = true;
|
||||
#endif
|
||||
#ifdef __SSE2__
|
||||
sw_has_sse2 = true;
|
||||
#endif
|
||||
#ifdef __SSE4_2__
|
||||
sw_has_sse42 = true;
|
||||
#endif
|
||||
#ifdef __AVX__
|
||||
sw_has_avx = true;
|
||||
#endif
|
||||
#ifdef __AVX2__
|
||||
sw_has_avx2 = true;
|
||||
#endif
|
||||
#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
|
||||
sw_has_avx512 = true;
|
||||
#endif
|
||||
#ifdef __SHA__
|
||||
sw_has_sha = true;
|
||||
#endif
|
||||
#ifdef __VAES__
|
||||
sw_has_vaes = true;
|
||||
#endif
|
||||
|
||||
|
||||
// #if !((__AES__) || (__SSE2__))
|
||||
// printf("Neither __AES__ nor __SSE2__ defined.\n");
|
||||
// #endif
|
||||
|
||||
cpu_brand_string( cpu_brand );
|
||||
printf( "CPU: %s\n", cpu_brand );
|
||||
|
||||
printf("SW built on " __DATE__
|
||||
#ifdef _MSC_VER
|
||||
" with VC++ 2013\n");
|
||||
#elif defined(__GNUC__)
|
||||
" with GCC");
|
||||
printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
|
||||
#else
|
||||
printf("\n");
|
||||
#endif
|
||||
|
||||
printf("CPU features: ");
|
||||
if ( cpu_has_avx512 ) printf( " AVX512" );
|
||||
else if ( cpu_has_avx2 ) printf( " AVX2 " );
|
||||
else if ( cpu_has_avx ) printf( " AVX " );
|
||||
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( cpu_has_sse2 ) printf( " SSE2 " );
|
||||
if ( cpu_has_vaes ) printf( " VAES" );
|
||||
else if ( cpu_has_aes ) printf( " AES" );
|
||||
if ( cpu_has_sha ) printf( " SHA" );
|
||||
|
||||
printf("\nSW features: ");
|
||||
if ( sw_has_avx512 ) printf( " AVX512" );
|
||||
else if ( sw_has_avx2 ) printf( " AVX2 " );
|
||||
else if ( sw_has_avx ) printf( " AVX " );
|
||||
else if ( sw_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( sw_has_sse2 ) printf( " SSE2 " );
|
||||
if ( sw_has_vaes ) printf( " VAES" );
|
||||
else if ( sw_has_aes ) printf( " AES" );
|
||||
if ( sw_has_sha ) printf( " SHA" );
|
||||
|
||||
printf("\nAlgo features:");
|
||||
if ( algo_features == EMPTY_SET ) printf( " None" );
|
||||
else
|
||||
{
|
||||
if ( algo_has_avx512 ) printf( " AVX512" );
|
||||
else if ( algo_has_avx2 ) printf( " AVX2 " );
|
||||
else if ( algo_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( algo_has_sse2 ) printf( " SSE2 " );
|
||||
if ( algo_has_vaes ) printf( " VAES" );
|
||||
else if ( algo_has_aes ) printf( " AES" );
|
||||
if ( algo_has_sha ) printf( " SHA" );
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
// Check for CPU and build incompatibilities
|
||||
if ( !cpu_has_sse2 )
|
||||
{
|
||||
printf( "A CPU with SSE2 is required to use cpuminer-opt\n" );
|
||||
return false;
|
||||
}
|
||||
if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) )
|
||||
{
|
||||
printf( "The SW build requires a CPU with AES and AVX2!\n" );
|
||||
return false;
|
||||
}
|
||||
if ( sw_has_sse42 && !cpu_has_sse42 )
|
||||
{
|
||||
printf( "The SW build requires a CPU with SSE4.2!\n" );
|
||||
return false;
|
||||
}
|
||||
if ( sw_has_aes && !cpu_has_aes )
|
||||
{
|
||||
printf( "The SW build requires a CPU with AES!\n" );
|
||||
return false;
|
||||
}
|
||||
if ( sw_has_sha && !cpu_has_sha )
|
||||
{
|
||||
printf( "The SW build requires a CPU with SHA!\n" );
|
||||
return false;
|
||||
}
|
||||
|
||||
// Determine mining options
|
||||
use_sse2 = cpu_has_sse2 && algo_has_sse2;
|
||||
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
|
||||
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
|
||||
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
|
||||
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
|
||||
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
|
||||
use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes
|
||||
&& ( use_avx512 || algo_has_vaes256 );
|
||||
use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 ||
|
||||
use_sha || use_vaes );
|
||||
|
||||
// Display best options
|
||||
printf( "\nStarting miner with" );
|
||||
if ( use_none ) printf( " no optimizations" );
|
||||
else
|
||||
{
|
||||
if ( use_avx512 ) printf( " AVX512" );
|
||||
else if ( use_avx2 ) printf( " AVX2" );
|
||||
else if ( use_sse42 ) printf( " SSE4.2" );
|
||||
else if ( use_sse2 ) printf( " SSE2" );
|
||||
if ( use_vaes ) printf( " VAES" );
|
||||
else if ( use_aes ) printf( " AES" );
|
||||
if ( use_sha ) printf( " SHA" );
|
||||
}
|
||||
printf( "...\n\n" );
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void get_defconfig_path(char *out, size_t bufsize, char *argv0);
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
@@ -3598,6 +3628,11 @@ int main(int argc, char *argv[])
|
||||
fprintf(stderr, "%s: no algo supplied\n", argv[0]);
|
||||
show_usage_and_exit(1);
|
||||
}
|
||||
|
||||
if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1);
|
||||
|
||||
if ( !check_cpu_capability() ) exit(1);
|
||||
|
||||
if ( !opt_benchmark )
|
||||
{
|
||||
if ( !short_url )
|
||||
@@ -3637,7 +3672,7 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
|
||||
// All options must be set before starting the gate
|
||||
if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1);
|
||||
// if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1);
|
||||
|
||||
if ( coinbase_address )
|
||||
{
|
||||
@@ -3656,7 +3691,7 @@ int main(int argc, char *argv[])
|
||||
memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
|
||||
memcpy( &session_start, &last_submit_time, sizeof (struct timeval) );
|
||||
|
||||
if ( !check_cpu_capability() ) exit(1);
|
||||
// if ( !check_cpu_capability() ) exit(1);
|
||||
|
||||
pthread_mutex_init( &stats_lock, NULL );
|
||||
pthread_rwlock_init( &g_work_lock, NULL );
|
||||
|
14
miner.h
14
miner.h
@@ -307,6 +307,7 @@ extern json_t *json_rpc_call( CURL *curl, const char *url, const char *userpass,
|
||||
extern void cbin2hex(char *out, const char *in, size_t len);
|
||||
void bin2hex( char *s, const unsigned char *p, size_t len );
|
||||
char *abin2hex( const unsigned char *p, size_t len );
|
||||
char *bebin2hex( const unsigned char *p, size_t len );
|
||||
bool hex2bin( unsigned char *p, const char *hexstr, size_t len );
|
||||
bool jobj_binary( const json_t *obj, const char *key, void *buf,
|
||||
size_t buflen );
|
||||
@@ -573,6 +574,7 @@ enum algos {
|
||||
ALGO_TRIBUS,
|
||||
ALGO_VANILLA,
|
||||
ALGO_VELTOR,
|
||||
ALGO_VERTHASH,
|
||||
ALGO_WHIRLPOOL,
|
||||
ALGO_WHIRLPOOLX,
|
||||
ALGO_X11,
|
||||
@@ -665,6 +667,7 @@ static const char* const algo_names[] = {
|
||||
"tribus",
|
||||
"vanilla",
|
||||
"veltor",
|
||||
"verthash",
|
||||
"whirlpool",
|
||||
"whirlpoolx",
|
||||
"x11",
|
||||
@@ -735,7 +738,6 @@ extern uint32_t opt_work_size;
|
||||
extern double *thr_hashrates;
|
||||
extern double global_hashrate;
|
||||
extern double stratum_diff;
|
||||
extern bool opt_reset_on_stale;
|
||||
extern double net_diff;
|
||||
extern double net_hashrate;
|
||||
extern int opt_param_n;
|
||||
@@ -760,6 +762,8 @@ extern pthread_mutex_t stats_lock;
|
||||
extern bool opt_sapling;
|
||||
extern const int pk_buffer_size_max;
|
||||
extern int pk_buffer_size;
|
||||
extern char *opt_data_file;
|
||||
extern bool opt_verify;
|
||||
|
||||
static char const usage[] = "\
|
||||
Usage: cpuminer [OPTIONS]\n\
|
||||
@@ -824,6 +828,7 @@ Options:\n\
|
||||
tribus Denarius (DNR)\n\
|
||||
vanilla blake256r8vnl (VCash)\n\
|
||||
veltor\n\
|
||||
verthash\n\
|
||||
whirlpool\n\
|
||||
whirlpoolx\n\
|
||||
x11 Dash\n\
|
||||
@@ -896,12 +901,14 @@ Options:\n\
|
||||
--benchmark run in offline benchmark mode\n\
|
||||
--cpu-affinity set process affinity to cpu core(s), mask 0x3 for cores 0 and 1\n\
|
||||
--cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest)\n\
|
||||
-b, --api-bind IP/Port for the miner API (default: 127.0.0.1:4048)\n\
|
||||
-b, --api-bind=address[:port] IP address for the miner API, default port is 4048)\n\
|
||||
--api-remote Allow remote control\n\
|
||||
--max-temp=N Only mine if cpu temp is less than specified value (linux)\n\
|
||||
--max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\
|
||||
--max-diff=N Only mine if net difficulty is less than specified value\n\
|
||||
-c, --config=FILE load a JSON-format configuration file\n\
|
||||
--data-file path and name of data file\n\
|
||||
--verify enable additional time consuming start up tests\n\
|
||||
-V, --version display version information and exit\n\
|
||||
-h, --help display this help text and exit\n\
|
||||
";
|
||||
@@ -959,7 +966,6 @@ static struct option const options[] = {
|
||||
{ "retries", 1, NULL, 'r' },
|
||||
{ "retry-pause", 1, NULL, 1025 },
|
||||
{ "randomize", 0, NULL, 1024 },
|
||||
{ "reset-on-stale", 0, NULL, 1026 },
|
||||
{ "scantime", 1, NULL, 's' },
|
||||
#ifdef HAVE_SYSLOG_H
|
||||
{ "syslog", 0, NULL, 'S' },
|
||||
@@ -970,6 +976,8 @@ static struct option const options[] = {
|
||||
{ "url", 1, NULL, 'o' },
|
||||
{ "user", 1, NULL, 'u' },
|
||||
{ "userpass", 1, NULL, 'O' },
|
||||
{ "data-file", 1, NULL, 1027 },
|
||||
{ "verify", 0, NULL, 1028 },
|
||||
{ "version", 0, NULL, 'V' },
|
||||
{ 0, 0, 0, 0 }
|
||||
};
|
||||
|
@@ -1225,37 +1225,6 @@ static inline void intrlv_4x64( void *dst, const void *src0,
|
||||
d[31] = _mm_unpackhi_epi64( s2[7], s3[7] );
|
||||
}
|
||||
|
||||
/*
|
||||
static inline void intrlv_4x64( void *dst, void *src0,
|
||||
void *src1, void *src2, void *src3, int bit_len )
|
||||
{
|
||||
uint64_t *d = (uint64_t*)dst;
|
||||
uint64_t *s0 = (uint64_t*)src0;
|
||||
uint64_t *s1 = (uint64_t*)src1;
|
||||
uint64_t *s2 = (uint64_t*)src2;
|
||||
uint64_t *s3 = (uint64_t*)src3;
|
||||
d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s2[ 0]; d[ 3] = s3[ 0];
|
||||
d[ 4] = s0[ 1]; d[ 5] = s1[ 1]; d[ 6] = s2[ 1]; d[ 7] = s3[ 1];
|
||||
d[ 8] = s0[ 2]; d[ 9] = s1[ 2]; d[ 10] = s2[ 2]; d[ 11] = s3[ 2];
|
||||
d[ 12] = s0[ 3]; d[ 13] = s1[ 3]; d[ 14] = s2[ 3]; d[ 15] = s3[ 3];
|
||||
if ( bit_len <= 256 ) return;
|
||||
d[ 16] = s0[ 4]; d[ 17] = s1[ 4]; d[ 18] = s2[ 4]; d[ 19] = s3[ 4];
|
||||
d[ 20] = s0[ 5]; d[ 21] = s1[ 5]; d[ 22] = s2[ 5]; d[ 23] = s3[ 5];
|
||||
d[ 24] = s0[ 6]; d[ 25] = s1[ 6]; d[ 26] = s2[ 6]; d[ 27] = s3[ 6];
|
||||
d[ 28] = s0[ 7]; d[ 29] = s1[ 7]; d[ 30] = s2[ 7]; d[ 31] = s3[ 7];
|
||||
if ( bit_len <= 512 ) return;
|
||||
d[ 32] = s0[ 8]; d[ 33] = s1[ 8]; d[ 34] = s2[ 8]; d[ 35] = s3[ 8];
|
||||
d[ 36] = s0[ 9]; d[ 37] = s1[ 9]; d[ 38] = s2[ 9]; d[ 39] = s3[ 9];
|
||||
if ( bit_len <= 640 ) return;
|
||||
d[ 40] = s0[10]; d[ 41] = s1[10]; d[ 42] = s2[10]; d[ 43] = s3[10];
|
||||
d[ 44] = s0[11]; d[ 45] = s1[11]; d[ 46] = s2[11]; d[ 47] = s3[11];
|
||||
d[ 48] = s0[12]; d[ 49] = s1[12]; d[ 50] = s2[12]; d[ 51] = s3[12];
|
||||
d[ 52] = s0[13]; d[ 53] = s1[13]; d[ 54] = s2[13]; d[ 55] = s3[13];
|
||||
d[ 56] = s0[14]; d[ 57] = s1[14]; d[ 58] = s2[14]; d[ 59] = s3[14];
|
||||
d[ 60] = s0[15]; d[ 61] = s1[15]; d[ 62] = s2[15]; d[ 63] = s3[15];
|
||||
}
|
||||
*/
|
||||
|
||||
static inline void intrlv_4x64_512( void *dst, const void *src0,
|
||||
const void *src1, const void *src2, const void *src3 )
|
||||
{
|
||||
@@ -1282,26 +1251,6 @@ static inline void intrlv_4x64_512( void *dst, const void *src0,
|
||||
d[15] = _mm_unpackhi_epi64( s2[3], s3[3] );
|
||||
}
|
||||
|
||||
/*
|
||||
static inline void intrlv_4x64_512( void *dst, const void *src0,
|
||||
const void *src1, const void *src2, const void *src3 )
|
||||
{
|
||||
uint64_t *d = (uint64_t*)dst;
|
||||
const uint64_t *s0 = (const uint64_t*)src0;
|
||||
const uint64_t *s1 = (const uint64_t*)src1;
|
||||
const uint64_t *s2 = (const uint64_t*)src2;
|
||||
const uint64_t *s3 = (const uint64_t*)src3;
|
||||
d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s2[ 0]; d[ 3] = s3[ 0];
|
||||
d[ 4] = s0[ 1]; d[ 5] = s1[ 1]; d[ 6] = s2[ 1]; d[ 7] = s3[ 1];
|
||||
d[ 8] = s0[ 2]; d[ 9] = s1[ 2]; d[ 10] = s2[ 2]; d[ 11] = s3[ 2];
|
||||
d[ 12] = s0[ 3]; d[ 13] = s1[ 3]; d[ 14] = s2[ 3]; d[ 15] = s3[ 3];
|
||||
d[ 16] = s0[ 4]; d[ 17] = s1[ 4]; d[ 18] = s2[ 4]; d[ 19] = s3[ 4];
|
||||
d[ 20] = s0[ 5]; d[ 21] = s1[ 5]; d[ 22] = s2[ 5]; d[ 23] = s3[ 5];
|
||||
d[ 24] = s0[ 6]; d[ 25] = s1[ 6]; d[ 26] = s2[ 6]; d[ 27] = s3[ 6];
|
||||
d[ 28] = s0[ 7]; d[ 29] = s1[ 7]; d[ 30] = s2[ 7]; d[ 31] = s3[ 7];
|
||||
}
|
||||
*/
|
||||
|
||||
static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2,
|
||||
void *dst3, const void *src, const int bit_len )
|
||||
{
|
||||
@@ -1347,38 +1296,6 @@ static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2,
|
||||
d3[7] = _mm_unpackhi_epi64( s[29], s[31] );
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
static inline void dintrlv_4x64( void *dst0, void *dst1, void *dst2,
|
||||
void *dst3, const void *src, int bit_len )
|
||||
{
|
||||
uint64_t *d0 = (uint64_t*)dst0;
|
||||
uint64_t *d1 = (uint64_t*)dst1;
|
||||
uint64_t *d2 = (uint64_t*)dst2;
|
||||
uint64_t *d3 = (uint64_t*)dst3;
|
||||
const uint64_t *s = (const uint64_t*)src;
|
||||
d0[ 0] = s[ 0]; d1[ 0] = s[ 1]; d2[ 0] = s[ 2]; d3[ 0] = s[ 3];
|
||||
d0[ 1] = s[ 4]; d1[ 1] = s[ 5]; d2[ 1] = s[ 6]; d3[ 1] = s[ 7];
|
||||
d0[ 2] = s[ 8]; d1[ 2] = s[ 9]; d2[ 2] = s[10]; d3[ 2] = s[11];
|
||||
d0[ 3] = s[12]; d1[ 3] = s[13]; d2[ 3] = s[14]; d3[ 3] = s[15];
|
||||
if ( bit_len <= 256 ) return;
|
||||
d0[ 4] = s[16]; d1[ 4] = s[17]; d2[ 4] = s[18]; d3[ 4] = s[19];
|
||||
d0[ 5] = s[20]; d1[ 5] = s[21]; d2[ 5] = s[22]; d3[ 5] = s[23];
|
||||
d0[ 6] = s[24]; d1[ 6] = s[25]; d2[ 6] = s[26]; d3[ 6] = s[27];
|
||||
d0[ 7] = s[28]; d1[ 7] = s[29]; d2[ 7] = s[30]; d3[ 7] = s[31];
|
||||
if ( bit_len <= 512 ) return;
|
||||
d0[ 8] = s[32]; d1[ 8] = s[33]; d2[ 8] = s[34]; d3[ 8] = s[35];
|
||||
d0[ 9] = s[36]; d1[ 9] = s[37]; d2[ 9] = s[38]; d3[ 9] = s[39];
|
||||
if ( bit_len <= 640 ) return;
|
||||
d0[10] = s[40]; d1[10] = s[41]; d2[10] = s[42]; d3[10] = s[43];
|
||||
d0[11] = s[44]; d1[11] = s[45]; d2[11] = s[46]; d3[11] = s[47];
|
||||
d0[12] = s[48]; d1[12] = s[49]; d2[12] = s[50]; d3[12] = s[51];
|
||||
d0[13] = s[52]; d1[13] = s[53]; d2[13] = s[54]; d3[13] = s[55];
|
||||
d0[14] = s[56]; d1[14] = s[57]; d2[14] = s[58]; d3[14] = s[59];
|
||||
d0[15] = s[60]; d1[15] = s[61]; d2[15] = s[62]; d3[15] = s[63];
|
||||
}
|
||||
*/
|
||||
|
||||
static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2,
|
||||
void *dst3, const void *src )
|
||||
{
|
||||
@@ -1405,26 +1322,6 @@ static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2,
|
||||
d3[3] = _mm_unpackhi_epi64( s[13], s[15] );
|
||||
}
|
||||
|
||||
/*
|
||||
static inline void dintrlv_4x64_512( void *dst0, void *dst1, void *dst2,
|
||||
void *dst3, const void *src )
|
||||
{
|
||||
uint64_t *d0 = (uint64_t*)dst0;
|
||||
uint64_t *d1 = (uint64_t*)dst1;
|
||||
uint64_t *d2 = (uint64_t*)dst2;
|
||||
uint64_t *d3 = (uint64_t*)dst3;
|
||||
const uint64_t *s = (const uint64_t*)src;
|
||||
d0[ 0] = s[ 0]; d1[ 0] = s[ 1]; d2[ 0] = s[ 2]; d3[ 0] = s[ 3];
|
||||
d0[ 1] = s[ 4]; d1[ 1] = s[ 5]; d2[ 1] = s[ 6]; d3[ 1] = s[ 7];
|
||||
d0[ 2] = s[ 8]; d1[ 2] = s[ 9]; d2[ 2] = s[10]; d3[ 2] = s[11];
|
||||
d0[ 3] = s[12]; d1[ 3] = s[13]; d2[ 3] = s[14]; d3[ 3] = s[15];
|
||||
d0[ 4] = s[16]; d1[ 4] = s[17]; d2[ 4] = s[18]; d3[ 4] = s[19];
|
||||
d0[ 5] = s[20]; d1[ 5] = s[21]; d2[ 5] = s[22]; d3[ 5] = s[23];
|
||||
d0[ 6] = s[24]; d1[ 6] = s[25]; d2[ 6] = s[26]; d3[ 6] = s[27];
|
||||
d0[ 7] = s[28]; d1[ 7] = s[29]; d2[ 7] = s[30]; d3[ 7] = s[31];
|
||||
}
|
||||
*/
|
||||
|
||||
static inline void extr_lane_4x64( void *d, const void *s,
|
||||
const int lane, const int bit_len )
|
||||
{
|
||||
@@ -1440,9 +1337,41 @@ static inline void extr_lane_4x64( void *d, const void *s,
|
||||
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
// Doesn't really need AVX2, just SSSE3, but is only used with AVX2 code.
|
||||
|
||||
// There are alignment problems with the source buffer on Windows,
|
||||
// can't use 256 bit bswap.
|
||||
static inline void mm256_intrlv80_4x64( void *d, const void *src )
|
||||
{
|
||||
__m128i s0 = casti_m128i( src,0 );
|
||||
__m128i s1 = casti_m128i( src,1 );
|
||||
__m128i s2 = casti_m128i( src,2 );
|
||||
__m128i s3 = casti_m128i( src,3 );
|
||||
__m128i s4 = casti_m128i( src,4 );
|
||||
|
||||
casti_m128i( d, 0 ) =
|
||||
casti_m128i( d, 1 ) = _mm_shuffle_epi32( s0, 0x44 );
|
||||
casti_m128i( d, 2 ) =
|
||||
casti_m128i( d, 3 ) = _mm_shuffle_epi32( s0, 0xee );
|
||||
|
||||
casti_m128i( d, 4 ) =
|
||||
casti_m128i( d, 5 ) = _mm_shuffle_epi32( s1, 0x44 );
|
||||
casti_m128i( d, 6 ) =
|
||||
casti_m128i( d, 7 ) = _mm_shuffle_epi32( s1, 0xee );
|
||||
|
||||
casti_m128i( d, 8 ) =
|
||||
casti_m128i( d, 9 ) = _mm_shuffle_epi32( s2, 0x44 );
|
||||
casti_m128i( d, 10 ) =
|
||||
casti_m128i( d, 11 ) = _mm_shuffle_epi32( s2, 0xee );
|
||||
|
||||
casti_m128i( d, 12 ) =
|
||||
casti_m128i( d, 13 ) = _mm_shuffle_epi32( s3, 0x44 );
|
||||
casti_m128i( d, 14 ) =
|
||||
casti_m128i( d, 15 ) = _mm_shuffle_epi32( s3, 0xee );
|
||||
|
||||
casti_m128i( d, 16 ) =
|
||||
casti_m128i( d, 17 ) = _mm_shuffle_epi32( s4, 0x44 );
|
||||
casti_m128i( d, 18 ) =
|
||||
casti_m128i( d, 19 ) = _mm_shuffle_epi32( s4, 0xee );
|
||||
}
|
||||
|
||||
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
|
||||
{
|
||||
@@ -1636,40 +1565,6 @@ static inline void intrlv_8x64_512( void *dst, const void *src0,
|
||||
d[31] = _mm_unpackhi_epi64( s6[3], s7[3] );
|
||||
}
|
||||
|
||||
/*
|
||||
#define ILEAVE_8x64( i ) do \
|
||||
{ \
|
||||
uint64_t *d = (uint64_t*)(dst) + ( (i) << 3 ); \
|
||||
d[0] = *( (const uint64_t*)(s0) +(i) ); \
|
||||
d[1] = *( (const uint64_t*)(s1) +(i) ); \
|
||||
d[2] = *( (const uint64_t*)(s2) +(i) ); \
|
||||
d[3] = *( (const uint64_t*)(s3) +(i) ); \
|
||||
d[4] = *( (const uint64_t*)(s4) +(i) ); \
|
||||
d[5] = *( (const uint64_t*)(s5) +(i) ); \
|
||||
d[6] = *( (const uint64_t*)(s6) +(i) ); \
|
||||
d[7] = *( (const uint64_t*)(s7) +(i) ); \
|
||||
} while(0)
|
||||
|
||||
static inline void intrlv_8x64( void *dst, const void *s0,
|
||||
const void *s1, const void *s2, const void *s3, const void *s4,
|
||||
const void *s5, const void *s6, const void *s7, int bit_len )
|
||||
{
|
||||
ILEAVE_8x64( 0 ); ILEAVE_8x64( 1 );
|
||||
ILEAVE_8x64( 2 ); ILEAVE_8x64( 3 );
|
||||
if ( bit_len <= 256 ) return;
|
||||
ILEAVE_8x64( 4 ); ILEAVE_8x64( 5 );
|
||||
ILEAVE_8x64( 6 ); ILEAVE_8x64( 7 );
|
||||
if ( bit_len <= 512 ) return;
|
||||
ILEAVE_8x64( 8 ); ILEAVE_8x64( 9 );
|
||||
if ( bit_len <= 640 ) return;
|
||||
ILEAVE_8x64( 10 ); ILEAVE_8x64( 11 );
|
||||
ILEAVE_8x64( 12 ); ILEAVE_8x64( 13 );
|
||||
ILEAVE_8x64( 14 ); ILEAVE_8x64( 15 );
|
||||
}
|
||||
|
||||
#undef ILEAVE_8x64
|
||||
*/
|
||||
|
||||
|
||||
static inline void dintrlv_8x64( void *dst0, void *dst1, void *dst2,
|
||||
void *dst3, void *dst4, void *dst5, void *dst6, void *dst7,
|
||||
@@ -1815,39 +1710,6 @@ static inline void dintrlv_8x64_512( void *dst0, void *dst1, void *dst2,
|
||||
d7[3] = _mm_unpackhi_epi64( s[27], s[31] );
|
||||
}
|
||||
|
||||
/*
|
||||
#define DLEAVE_8x64( i ) do \
|
||||
{ \
|
||||
const uint64_t *s = (const uint64_t*)(src) + ( (i) << 3 ); \
|
||||
*( (uint64_t*)(d0) +(i) ) = s[0]; \
|
||||
*( (uint64_t*)(d1) +(i) ) = s[1]; \
|
||||
*( (uint64_t*)(d2) +(i) ) = s[2]; \
|
||||
*( (uint64_t*)(d3) +(i) ) = s[3]; \
|
||||
*( (uint64_t*)(d4) +(i) ) = s[4]; \
|
||||
*( (uint64_t*)(d5) +(i) ) = s[5]; \
|
||||
*( (uint64_t*)(d6) +(i) ) = s[6]; \
|
||||
*( (uint64_t*)(d7) +(i) ) = s[7]; \
|
||||
} while(0)
|
||||
|
||||
static inline void dintrlv_8x64( void *d0, void *d1, void *d2, void *d3,
|
||||
void *d4, void *d5, void *d6, void *d7, const void *src, int bit_len )
|
||||
{
|
||||
DLEAVE_8x64( 0 ); DLEAVE_8x64( 1 );
|
||||
DLEAVE_8x64( 2 ); DLEAVE_8x64( 3 );
|
||||
if ( bit_len <= 256 ) return;
|
||||
DLEAVE_8x64( 4 ); DLEAVE_8x64( 5 );
|
||||
DLEAVE_8x64( 6 ); DLEAVE_8x64( 7 );
|
||||
if ( bit_len <= 512 ) return;
|
||||
DLEAVE_8x64( 8 ); DLEAVE_8x64( 9 );
|
||||
if ( bit_len <= 640 ) return;
|
||||
DLEAVE_8x64( 10 ); DLEAVE_8x64( 11 );
|
||||
DLEAVE_8x64( 12 ); DLEAVE_8x64( 13 );
|
||||
DLEAVE_8x64( 14 ); DLEAVE_8x64( 15 );
|
||||
}
|
||||
|
||||
#undef DLEAVE_8x64
|
||||
*/
|
||||
|
||||
static inline void extr_lane_8x64( void *d, const void *s,
|
||||
const int lane, const int bit_len )
|
||||
{
|
||||
|
@@ -178,7 +178,7 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )
|
||||
// Basic operations without equivalent SIMD intrinsic
|
||||
|
||||
// Bitwise not (~v)
|
||||
#define mm128_not( v ) _mm_xor_si128( (v), m128_neg1 )
|
||||
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
|
||||
|
||||
// Unary negation of elements (-v)
|
||||
#define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v )
|
||||
@@ -237,6 +237,25 @@ static inline void memset_128( __m128i *dst, const __m128i a, const int n )
|
||||
static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
// a ^ b ^ c
|
||||
#define mm128_xor3( a, b, c ) \
|
||||
_mm_ternarylogic_epi64( a, b, c, 0x96 )
|
||||
|
||||
// a ^ ( b & c )
|
||||
#define mm128_xorand( a, b, c ) \
|
||||
_mm_ternarylogic_epi64( a, b, c, 0x78 )
|
||||
|
||||
#else
|
||||
|
||||
#define mm128_xor3( a, b, c ) \
|
||||
_mm_xor_si128( a, _mm_xor_si128( b, c ) )
|
||||
|
||||
#define mm128_xorand( a, b, c ) \
|
||||
_mm_xor_si128( a, _mm_and_si128( b, c ) )
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
// Bit rotations
|
||||
@@ -263,7 +282,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if defined(__AVX512VL__)
|
||||
//#if defined(__AVX512F__) && defined(__AVX512VL__)
|
||||
|
||||
#define mm128_ror_64 _mm_ror_epi64
|
||||
#define mm128_rol_64 _mm_rol_epi64
|
||||
@@ -291,16 +311,13 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
|
||||
#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
|
||||
#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
|
||||
//#define mm128_swap_64( v ) _mm_alignr_epi8( v, v, 8 )
|
||||
//#define mm128_ror_1x32( v ) _mm_alignr_epi8( v, v, 4 )
|
||||
//#define mm128_rol_1x32( v ) _mm_alignr_epi8( v, v, 12 )
|
||||
|
||||
// Swap 32 bit elements in 64 bit lanes
|
||||
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
// Rotate right by c bytes
|
||||
// Rotate right by c bytes, no SSE2 equivalent.
|
||||
static inline __m128i mm128_ror_x8( const __m128i v, const int c )
|
||||
{ return _mm_alignr_epi8( v, v, c ); }
|
||||
|
||||
|
@@ -18,7 +18,7 @@
|
||||
#define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) )
|
||||
#define mm256_mov32_256( i ) _mm256_castsi128_si256( mm128_mov32_128( i ) )
|
||||
|
||||
// Mo0ve low element of vector to integer.
|
||||
// Move low element of vector to integer.
|
||||
#define mm256_mov256_64( v ) mm128_mov128_64( _mm256_castsi256_si128( v ) )
|
||||
#define mm256_mov256_32( v ) mm128_mov128_32( _mm256_castsi256_si128( v ) )
|
||||
|
||||
@@ -42,7 +42,7 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
|
||||
// 128 bit vector argument
|
||||
#define m256_const1_128( v ) \
|
||||
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
|
||||
// 64 bit integer argument
|
||||
// 64 bit integer argument zero extended to 128 bits.
|
||||
#define m256_const1_i128( i ) m256_const1_128( mm128_mov64_128( i ) )
|
||||
#define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
|
||||
#define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
|
||||
@@ -136,9 +136,84 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
#define mm256_add4_8( a, b, c, d ) \
|
||||
_mm256_add_epi8( _mm256_add_epi8( a, b ), _mm256_add_epi8( c, d ) )
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
// AVX512 has ternary logic that supports any 3 input boolean expression.
|
||||
|
||||
// a ^ b ^ c
|
||||
#define mm256_xor3( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0x96 )
|
||||
|
||||
// legacy convenience only
|
||||
#define mm256_xor4( a, b, c, d ) \
|
||||
_mm256_xor_si256( a, mm256_xor3( b, c, d ) )
|
||||
|
||||
// a & b & c
|
||||
#define mm256_and3( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0x80 )
|
||||
|
||||
// a | b | c
|
||||
#define mm256_or3( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0xfe )
|
||||
|
||||
// a ^ ( b & c )
|
||||
#define mm256_xorand( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0x78 )
|
||||
|
||||
// a & ( b ^ c )
|
||||
#define mm256_andxor( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0x60 )
|
||||
|
||||
// a ^ ( b | c )
|
||||
#define mm256_xoror( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0x1e )
|
||||
|
||||
// a ^ ( ~b & c )
|
||||
#define mm256_xorandnot( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0xd2 )
|
||||
|
||||
// a | ( b & c )
|
||||
#define mm256_orand( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0xf8 )
|
||||
|
||||
// ~( a ^ b ), same as (~a) ^ b
|
||||
#define mm256_xnor( a, b ) \
|
||||
_mm256_ternarylogic_epi64( a, b, b, 0x81 )
|
||||
|
||||
#else
|
||||
|
||||
#define mm256_xor3( a, b, c ) \
|
||||
_mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
|
||||
|
||||
#define mm256_xor4( a, b, c, d ) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )
|
||||
|
||||
#define mm256_and3( a, b, c ) \
|
||||
_mm256_and_si256( a, _mm256_and_si256( b, c ) )
|
||||
|
||||
#define mm256_or3( a, b, c ) \
|
||||
_mm256_or_si256( a, _mm256_or_si256( b, c ) )
|
||||
|
||||
#define mm256_xorand( a, b, c ) \
|
||||
_mm256_xor_si256( a, _mm256_and_si256( b, c ) )
|
||||
|
||||
#define mm256_andxor( a, b, c ) \
|
||||
_mm256_and_si256( a, _mm256_xor_si256( b, c ))
|
||||
|
||||
#define mm256_xoror( a, b, c ) \
|
||||
_mm256_xor_si256( a, _mm256_or_si256( b, c ) )
|
||||
|
||||
#define mm256_xorandnot( a, b, c ) \
|
||||
_mm256_xor_si256( a, _mm256_andnot_si256( b, c ) )
|
||||
|
||||
#define mm256_orand( a, b, c ) \
|
||||
_mm256_or_si256( a, _mm256_and_si256( b, c ) )
|
||||
|
||||
#define mm256_xnor( a, b ) \
|
||||
mm256_not( _mm256_xor_si256( a, b ) )
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
// Bit rotations.
|
||||
//
|
||||
@@ -168,7 +243,10 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
_mm256_srli_epi32( v, 32-(c) ) )
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
// The spec says both F & VL are required, but just in case AMD
|
||||
// decides to implement ROL/R without AVX512F.
|
||||
#if defined(__AVX512VL__)
|
||||
//#if defined(__AVX512F__) && defined(__AVX512VL__)
|
||||
|
||||
// AVX512, control must be 8 bit immediate.
|
||||
|
||||
@@ -197,13 +275,8 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
|
||||
//
|
||||
// Rotate elements across all lanes.
|
||||
//
|
||||
// AVX2 has no full vector permute for elements less than 32 bits.
|
||||
// AVX512 has finer granularity full vector permutes.
|
||||
// AVX512 has full vector alignr which might be faster, especially for 32 bit
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
static inline __m256i mm256_swap_128( const __m256i v )
|
||||
{ return _mm256_alignr_epi64( v, v, 2 ); }
|
||||
@@ -220,12 +293,6 @@ static inline __m256i mm256_ror_1x32( const __m256i v )
|
||||
static inline __m256i mm256_rol_1x32( const __m256i v )
|
||||
{ return _mm256_alignr_epi32( v, v, 7 ); }
|
||||
|
||||
static inline __m256i mm256_ror_3x32( const __m256i v )
|
||||
{ return _mm256_alignr_epi32( v, v, 3 ); }
|
||||
|
||||
static inline __m256i mm256_rol_3x32( const __m256i v )
|
||||
{ return _mm256_alignr_epi32( v, v, 5 ); }
|
||||
|
||||
#else // AVX2
|
||||
|
||||
// Swap 128 bit elements in 256 bit vector.
|
||||
@@ -246,17 +313,7 @@ static inline __m256i mm256_rol_3x32( const __m256i v )
|
||||
m256_const_64( 0x0000000600000005, 0x0000000400000003, \
|
||||
0x0000000200000001, 0x0000000000000007 )
|
||||
|
||||
// Rotate 256 bit vector by three 32 bit elements (96 bits).
|
||||
#define mm256_ror_3x32( v ) \
|
||||
_mm256_permutevar8x32_epi32( v, \
|
||||
m256_const_64( 0x0000000200000001, 0x0000000000000007, \
|
||||
0x0000000600000005, 0x0000000400000003 )
|
||||
|
||||
#define mm256_rol_3x32( v ) \
|
||||
_mm256_permutevar8x32_epi32( v, \
|
||||
m256_const_64( 0x0000000400000003, 0x0000000200000001, \
|
||||
0x0000000000000007, 0x0000000600000005 )
|
||||
|
||||
|
||||
#endif // AVX512 else AVX2
|
||||
|
||||
//
|
||||
|
@@ -61,7 +61,7 @@
|
||||
//
|
||||
// Additionally, permutations using smaller vectors can be more efficient
|
||||
// if the permutation doesn't cross lane boundaries, typically 128 bits,
|
||||
// and the smnaller vector can use an imm comtrol.
|
||||
// and the smaller vector can use an imm control.
|
||||
//
|
||||
// If the permutation doesn't cross lane boundaries a shuffle instruction
|
||||
// can be used with imm control instead of permute.
|
||||
@@ -107,7 +107,7 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
|
||||
return v.m512i;
|
||||
}
|
||||
|
||||
// Equivalent of set1, broadcast lo element all elements.
|
||||
// Equivalent of set1, broadcast lo element to all elements.
|
||||
static inline __m512i m512_const1_256( const __m256i v )
|
||||
{ return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); }
|
||||
|
||||
@@ -166,7 +166,9 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
||||
// Basic operations without SIMD equivalent
|
||||
|
||||
// ~x
|
||||
#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
|
||||
// #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
|
||||
static inline __m512i mm512_not( const __m512i x )
|
||||
{ return _mm512_ternarylogic_epi64( x, x, x, 1 ); }
|
||||
|
||||
// -x
|
||||
#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
|
||||
@@ -221,11 +223,61 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
#define mm512_add4_8( a, b, c, d ) \
|
||||
_mm512_add_epi8( _mm512_add_epi8( a, b ), _mm512_add_epi8( c, d ) )
|
||||
|
||||
#define mm512_xor4( a, b, c, d ) \
|
||||
_mm512_xor_si512( _mm512_xor_si512( a, b ), _mm512_xor_si512( c, d ) )
|
||||
|
||||
|
||||
//
|
||||
// Ternary logic uses 8 bit truth table to define any 3 input logical
|
||||
// operation using any number or combination of AND, OR, XOR, NOT.
|
||||
|
||||
// a ^ b ^ c
|
||||
#define mm512_xor3( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0x96 )
|
||||
|
||||
// legacy convenience only
|
||||
#define mm512_xor4( a, b, c, d ) \
|
||||
_mm512_xor_si512( a, mm512_xor3( b, c, d ) )
|
||||
|
||||
// a & b & c
|
||||
#define mm512_and3( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0x80 )
|
||||
|
||||
// a | b | c
|
||||
#define mm512_or3( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0xfe )
|
||||
|
||||
// a ^ ( b & c )
|
||||
#define mm512_xorand( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0x78 )
|
||||
|
||||
// a & ( b ^ c )
|
||||
#define mm512_andxor( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0x60 )
|
||||
|
||||
// a ^ ( b | c )
|
||||
#define mm512_xoror( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0x1e )
|
||||
|
||||
// a ^ ( ~b & c ) [ xor( a, andnot( b, c ) ) ]
|
||||
#define mm512_xorandnot( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0xd2 )
|
||||
|
||||
// a | ( b & c )
|
||||
#define mm512_orand( a, b, c ) \
|
||||
_mm512_ternarylogic_epi64( a, b, c, 0xf8 )
|
||||
|
||||
// Some 2 input operations that don't have their own instruction mnemonic.
|
||||
|
||||
// Two input operations built on the three input ternary logic instruction
// by passing b as both the second and third operand. The truth table bit
// selected for each result bit is indexed by ( a << 2 ) | ( b << 1 ) | c,
// so with c == b only table bits 0, 3, 4 and 7 are ever selected,
// corresponding to (a,b) = (0,0), (0,1), (1,0) and (1,1).

// ~( a | b ): only (0,0) yields 1, table bit 0.
#define mm512_nor( a, b ) \
   _mm512_ternarylogic_epi64( a, b, b, 0x01 )

// ~( a ^ b ), same as (~a) ^ b: (0,0) and (1,1) yield 1, table bits 0 and 7.
#define mm512_xnor( a, b ) \
   _mm512_ternarylogic_epi64( a, b, b, 0x81 )

// ~( a & b ): everything except (1,1) yields 1, so table bit 7 must be clear
// and bits 0, 3 and 4 set. The previous table 0xef had bit 7 set and bit 4
// clear, which computes ( ~a | b ) instead of nand.
#define mm512_nand( a, b ) \
   _mm512_ternarylogic_epi64( a, b, b, 0x7f )
|
||||
|
||||
|
||||
// Bit rotations.
|
||||
|
||||
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
|
||||
|
@@ -5,6 +5,10 @@
|
||||
#define bswap_64( a ) __builtin_bswap64( a )
|
||||
#define bswap_32( a ) __builtin_bswap32( a )
|
||||
|
||||
// Safe division, integer or floating point: evaluates to safe_result instead
// of dividing when divisor compares equal to zero.
// divisor is evaluated twice, do not pass an expression with side effects.
#define safe_div( dividend, divisor, safe_result ) \
   ( (divisor) == 0 ? (safe_result) : ( (dividend) / (divisor) ) )
|
||||
|
||||
|
||||
///////////////////////////////////////
|
||||
//
|
||||
|
165
util.c
165
util.c
@@ -795,6 +795,15 @@ char *abin2hex(const unsigned char *p, size_t len)
|
||||
return s;
|
||||
}
|
||||
|
||||
// Convert len bytes at p to a newly allocated, NUL terminated, lower case
// hex string with the byte order reversed: the last byte of p is printed
// first. An empty input yields an empty string.
// Returns NULL if the allocation fails. The caller owns the returned buffer
// and must free() it.
char *bebin2hex(const unsigned char *p, size_t len)
{
   static const char hex_digits[] = "0123456789abcdef";
   char *s = (char*) malloc((len * 2) + 1);
   if (!s) return NULL;

   for ( size_t i = 0; i < len; i++ )
   {
      // Walk the input from its last byte towards its first.
      const unsigned char byte = p[ len - 1 - i ];
      s[ i*2     ] = hex_digits[ byte >> 4 ];
      s[ i*2 + 1 ] = hex_digits[ byte & 0x0f ];
   }
   // Always terminate, including for len == 0 where the loop never runs.
   s[ len * 2 ] = '\0';
   return s;
}
|
||||
|
||||
bool hex2bin(unsigned char *p, const char *hexstr, size_t len)
|
||||
{
|
||||
char hex_byte[3];
|
||||
@@ -943,6 +952,140 @@ bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t buflen)
|
||||
return true;
|
||||
}
|
||||
|
||||
// Advance the running checksum by one step: the low 25 bits of pre are
// shifted left by 5 and, for each of the five bits above them that is set,
// the matching constant is xored in.
static uint32_t bech32_polymod_step(uint32_t pre) {
    static const uint32_t generator[5] = {
        0x3b6a57b2UL, 0x26508e6dUL, 0x1ea119faUL, 0x3d4233ddUL, 0x2a1462b3UL
    };
    const uint8_t top = pre >> 25;
    uint32_t next = (pre & 0x1FFFFFF) << 5;

    // Only bits 0..4 of top take part, one constant per bit.
    for (int bit = 0; bit < 5; bit++) {
        if ((top >> bit) & 1) {
            next ^= generator[bit];
        }
    }
    return next;
}
|
||||
|
||||
// Reverse lookup table for the data characters, indexed by 7 bit character
// code. Each entry is the 5 bit value of that character, or -1 when the
// character is not part of the alphabet. The rows for upper case and lower
// case letters are identical, so both cases map to the same values.
static const int8_t bech32_charset_rev[128] = {
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
15, -1, 10, 17, 21, 20, 26, 30, 7, 5, -1, -1, -1, -1, -1, -1,
|
||||
-1, 29, -1, 24, 13, 25, 9, 8, 23, -1, 18, 22, 31, 27, 19, -1,
|
||||
1, 0, 3, 16, 11, 28, 12, 14, 6, 4, 2, -1, -1, -1, -1, -1,
|
||||
-1, 29, -1, 24, 13, 25, 9, 8, 23, -1, 18, 22, 31, 27, 19, -1,
|
||||
1, 0, 3, 16, 11, 28, 12, 14, 6, 4, 2, -1, -1, -1, -1, -1
|
||||
};
|
||||
|
||||
static bool bech32_decode(char *hrp, uint8_t *data, size_t *data_len, const char *input) {
|
||||
uint32_t chk = 1;
|
||||
size_t i;
|
||||
size_t input_len = strlen(input);
|
||||
size_t hrp_len;
|
||||
int have_lower = 0, have_upper = 0;
|
||||
if (input_len < 8 || input_len > 90) {
|
||||
return false;
|
||||
}
|
||||
*data_len = 0;
|
||||
while (*data_len < input_len && input[(input_len - 1) - *data_len] != '1') {
|
||||
++(*data_len);
|
||||
}
|
||||
hrp_len = input_len - (1 + *data_len);
|
||||
if (1 + *data_len >= input_len || *data_len < 6) {
|
||||
return false;
|
||||
}
|
||||
*(data_len) -= 6;
|
||||
for (i = 0; i < hrp_len; ++i) {
|
||||
int ch = input[i];
|
||||
if (ch < 33 || ch > 126) {
|
||||
return false;
|
||||
}
|
||||
if (ch >= 'a' && ch <= 'z') {
|
||||
have_lower = 1;
|
||||
} else if (ch >= 'A' && ch <= 'Z') {
|
||||
have_upper = 1;
|
||||
ch = (ch - 'A') + 'a';
|
||||
}
|
||||
hrp[i] = ch;
|
||||
chk = bech32_polymod_step(chk) ^ (ch >> 5);
|
||||
}
|
||||
hrp[i] = 0;
|
||||
chk = bech32_polymod_step(chk);
|
||||
for (i = 0; i < hrp_len; ++i) {
|
||||
chk = bech32_polymod_step(chk) ^ (input[i] & 0x1f);
|
||||
}
|
||||
++i;
|
||||
while (i < input_len) {
|
||||
int v = (input[i] & 0x80) ? -1 : bech32_charset_rev[(int)input[i]];
|
||||
if (input[i] >= 'a' && input[i] <= 'z') have_lower = 1;
|
||||
if (input[i] >= 'A' && input[i] <= 'Z') have_upper = 1;
|
||||
if (v == -1) {
|
||||
return false;
|
||||
}
|
||||
chk = bech32_polymod_step(chk) ^ v;
|
||||
if (i + 6 < input_len) {
|
||||
data[i - (1 + hrp_len)] = v;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
if (have_lower && have_upper) {
|
||||
return false;
|
||||
}
|
||||
return chk == 1;
|
||||
}
|
||||
|
||||
// Regroup a sequence of inbits wide values into outbits wide values.
//   out / outlen  output buffer and running count; values are appended at
//                 out[*outlen] and *outlen is advanced, it is not reset here.
//   in / inlen    input values, each expected to fit in inbits bits.
//   pad           non-zero: emit leftover bits as a final value, padded with
//                 zero bits on the right. Zero: fail if the leftover bits
//                 are non-zero or amount to inbits or more.
// Returns false only for invalid leftover bits when pad is zero.
static bool convert_bits(uint8_t *out, size_t *outlen, int outbits, const uint8_t *in, size_t inlen, int inbits, int pad) {
    const uint32_t mask = (((uint32_t)1) << outbits) - 1;
    uint32_t acc = 0;
    int pending = 0;

    for (size_t k = 0; k < inlen; k++) {
        acc = (acc << inbits) | in[k];
        pending += inbits;
        // Drain every complete output value currently held in the accumulator.
        for (; pending >= outbits; ) {
            pending -= outbits;
            out[*outlen] = (acc >> pending) & mask;
            *outlen += 1;
        }
    }

    // Fewer than outbits bits remain, left aligned into one output value.
    const uint32_t leftover = (acc << (outbits - pending)) & mask;

    if (pad) {
        if (pending != 0) {
            out[*outlen] = leftover;
            *outlen += 1;
        }
        return true;
    }
    return leftover == 0 && pending < inbits;
}
|
||||
|
||||
// Decode addr with bech32_decode and unpack it into a version and a byte
// string.
//   witver       receives the first decoded value, which must be 0..16.
//   witdata      receives the remaining values regrouped from 5 bit to 8 bit
//                units; at most 40 bytes are accepted.
//   witdata_len  receives the number of bytes stored in witdata.
// Returns false if decoding fails, the sizes are out of range, the regrouping
// leaves invalid padding, or version 0 data is not 20 or 32 bytes long.
// The decoded prefix is not compared against any expected value here.
static bool segwit_addr_decode(int *witver, uint8_t *witdata, size_t *witdata_len, const char *addr) {
    char prefix[84];
    uint8_t values[84];
    size_t count;

    if (!bech32_decode(prefix, values, &count, addr))
        return false;
    if (count < 1 || count > 65)
        return false;

    const uint8_t version = values[0];
    if (version > 16)
        return false;

    // Everything after the version value is payload, 5 bits per value.
    *witdata_len = 0;
    if (!convert_bits(witdata, witdata_len, 8, values + 1, count - 1, 5, 0))
        return false;

    const size_t nbytes = *witdata_len;
    if (nbytes < 2 || nbytes > 40)
        return false;
    if (version == 0 && !(nbytes == 20 || nbytes == 32))
        return false;

    *witver = version;
    return true;
}
|
||||
|
||||
static size_t bech32_to_script(uint8_t *out, size_t outsz, const char *addr) {
|
||||
uint8_t witprog[40];
|
||||
size_t witprog_len;
|
||||
int witver;
|
||||
|
||||
if (!segwit_addr_decode(&witver, witprog, &witprog_len, addr))
|
||||
return 0;
|
||||
if (outsz < witprog_len + 2)
|
||||
return 0;
|
||||
out[0] = witver ? (0x50 + witver) : 0;
|
||||
out[1] = witprog_len;
|
||||
memcpy(out + 2, witprog, witprog_len);
|
||||
|
||||
if ( opt_debug )
|
||||
applog( LOG_INFO, "Coinbase address uses Bech32 coding");
|
||||
|
||||
return witprog_len + 2;
|
||||
}
|
||||
|
||||
size_t address_to_script( unsigned char *out, size_t outsz, const char *addr )
|
||||
{
|
||||
unsigned char addrbin[ pk_buffer_size_max ];
|
||||
@@ -950,12 +1093,15 @@ size_t address_to_script( unsigned char *out, size_t outsz, const char *addr )
|
||||
size_t rv;
|
||||
|
||||
if ( !b58dec( addrbin, outsz, addr ) )
|
||||
return 0;
|
||||
return bech32_to_script( out, outsz, addr );
|
||||
|
||||
addrver = b58check( addrbin, outsz, addr );
|
||||
if ( addrver < 0 )
|
||||
return 0;
|
||||
|
||||
if ( opt_debug )
|
||||
applog( LOG_INFO, "Coinbase address uses B58 coding");
|
||||
|
||||
switch ( addrver )
|
||||
{
|
||||
case 5: /* Bitcoin script hash */
|
||||
@@ -1486,9 +1632,6 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i
|
||||
if ( !opt_quiet ) /* pool dynamic change */
|
||||
applog( LOG_INFO, "Stratum extranonce1= %s, extranonce2 size= %d",
|
||||
xnonce1, xn2_size);
|
||||
// if (pndx == 0 && opt_debug)
|
||||
// applog(LOG_DEBUG, "Stratum set nonce %s with extranonce2 size=%d",
|
||||
// xnonce1, xn2_size);
|
||||
|
||||
return true;
|
||||
out:
|
||||
@@ -1638,8 +1781,6 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p
|
||||
opt_extranonce = false;
|
||||
goto out;
|
||||
}
|
||||
if ( !opt_quiet )
|
||||
applog( LOG_INFO, "Extranonce subscription enabled" );
|
||||
|
||||
sret = stratum_recv_line( sctx );
|
||||
if ( sret )
|
||||
@@ -1657,10 +1798,14 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p
|
||||
if ( !stratum_handle_method( sctx, sret ) )
|
||||
applog( LOG_WARNING, "Stratum answer id is not correct!" );
|
||||
}
|
||||
res_val = json_object_get( extra, "result" );
|
||||
// if (opt_debug && (!res_val || json_is_false(res_val)))
|
||||
// applog(LOG_DEBUG, "extranonce subscribe not supported");
|
||||
json_decref( extra );
|
||||
else
|
||||
{
|
||||
res_val = json_object_get( extra, "result" );
|
||||
if ( opt_debug && ( !res_val || json_is_false( res_val ) ) )
|
||||
applog( LOG_DEBUG,
|
||||
"Method extranonce.subscribe is not supported" );
|
||||
}
|
||||
json_decref( extra );
|
||||
}
|
||||
free(sret);
|
||||
}
|
||||
|
80
verthash-help.txt
Normal file
80
verthash-help.txt
Normal file
@@ -0,0 +1,80 @@
|
||||
Quickstart:
|
||||
----------
|
||||
|
||||
First time mining verthash or don't have a Verthash data file:
|
||||
|
||||
--algo verthash --verify --url ...
|
||||
|
||||
Verthash data file already exists:
|
||||
|
||||
--algo verthash --data-file /path/to/verthash.dat --url ...
|
||||
|
||||
|
||||
Background:
|
||||
----------
|
||||
|
||||
Verthash algorithm requires a data file for hashing. This file is
|
||||
static, portable, and only needs to be created once.
|
||||
|
||||
A Verthash data file created by VerthashMiner can also be used by cpuminer-opt
|
||||
and used simultaneously by both miners.
|
||||
|
||||
Due to its size >1GB it is recommended one data file be created and
|
||||
stored in a permanent location accessible to any miner that wants to use it.
|
||||
|
||||
New command line options:
|
||||
------------------------
|
||||
|
||||
cpuminer-opt adds two new command line options for verthash. The names
|
||||
and some behaviour have changed from VerthashMiner.
|
||||
|
||||
--data-file /path/to/verthash.dat
|
||||
default when not used is verthash.dat in current working directory.
|
||||
|
||||
--verify
|
||||
verify integrity of file specified by --data-file, or if not specified
|
||||
the default data file if it exists, or create a default file and verify it
|
||||
if one does not yet exist. Data file verification is disabled by default.
|
||||
|
||||
Detailed usage:
|
||||
--------------
|
||||
|
||||
If a data file already exists it can be selected using the --data-file
|
||||
option to specify the path and name of the file.
|
||||
|
||||
--algo verthash --data-file /path/to/verthash.dat --url ...
|
||||
|
||||
If the --data-file option is not used the default is to use 'verthash.dat'
|
||||
from the current working directory.
|
||||
|
||||
If no data file exists it can be created by using the --verify option
|
||||
without the --data-file option. If the default data file is not found in
|
||||
the current directory it will be created.
|
||||
|
||||
--algo verthash --verify --url ...
|
||||
|
||||
Data file creation can take up to 30 minutes on a spinning hard drive.
|
||||
Once created the new data file will be verified and used immediately
|
||||
if a valid url and user were included on the command line.
|
||||
|
||||
A default data file can be created by omitting the url option. That will
|
||||
either verify an existing default data file or create one and verify it,
|
||||
then exit.
|
||||
|
||||
--algo verthash --verify
|
||||
|
||||
A data file will never be created if --data-file is specified. The miner
|
||||
will exit with an error if the file is not found. This is to avoid accidentally
|
||||
creating an unwanted data file due to a typo.
|
||||
|
||||
After creation the data file can be moved to a more convenient location and
|
||||
referenced by --data-file, or left where it is and used by default without the
|
||||
--data-file option.
|
||||
|
||||
Data file verification takes a few seconds and is disabled by default.
|
||||
VerthashMiner enables data file verification by default and has an option to
|
||||
disable it.
|
||||
|
||||
The --verify option is intended primarily to create a new file. It's
|
||||
not necessary or useful to verify a file every time the miner is started.
|
||||
|
@@ -31,6 +31,7 @@ mkdir release
|
||||
cp README.txt release/
|
||||
cp README.md release/
|
||||
cp RELEASE_NOTES release/
|
||||
cp verthash-help.txt release/
|
||||
cp $MINGW_LIB/zlib1.dll release/
|
||||
cp $MINGW_LIB/libwinpthread-1.dll release/
|
||||
cp $GCC_MINGW_LIB/libstdc++-6.dll release/
|
||||
@@ -106,7 +107,7 @@ mv cpuminer.exe release/cpuminer-avx.exe
|
||||
# Westmere SSE4.2 AES
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=westmere -Wall" ./configure $CONFIGURE_ARGS
|
||||
CFLAGS="-O3 -march=westmere -maes -Wall" ./configure $CONFIGURE_ARGS
|
||||
#CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
|
Reference in New Issue
Block a user