Mirror of https://github.com/JayDDee/cpuminer-opt.git
Synced 2025-09-17 23:44:27 +00:00

Compare commits

5 commits:

a90d75b8f5
bee78eac76
2d2e54f001
79164c24b5
7a1389998b

Makefile.am (87 lines changed)

@@ -38,7 +38,6 @@ cpuminer_SOURCES = \
algo/argon2/ar2/cores.c \
algo/argon2/ar2/ar2-scrypt-jane.c \
algo/argon2/ar2/blake2b.c \
algo/axiom.c \
algo/blake/sph_blake.c \
algo/blake/blake-hash-4way.c \
algo/blake/blake-gate.c \
@@ -47,8 +46,10 @@ cpuminer_SOURCES = \
algo/blake/sph_blake2b.c \
algo/blake/blake2b.c \
algo/blake/blake2s.c \
algo/blake/blakecoin-gate.c \
algo/blake/mod_blakecoin.c \
algo/blake/blakecoin.c \
algo/blake/blakecoin-4way.c \
algo/blake/decred-gate.c \
algo/blake/decred.c \
algo/blake/decred-4way.c \
@@ -56,6 +57,7 @@ cpuminer_SOURCES = \
algo/blake/pentablake-4way.c \
algo/blake/pentablake.c \
algo/bmw/sph_bmw.c \
algo/bmw/bmw-hash-4way.c \
algo/bmw/bmw256.c \
algo/cryptonight/cryptolight.c \
algo/cryptonight/cryptonight-common.c\
@@ -63,10 +65,8 @@ cpuminer_SOURCES = \
algo/cryptonight/cryptonight.c\
algo/cubehash/sph_cubehash.c \
algo/cubehash/sse2/cubehash_sse2.c\
algo/drop.c \
algo/echo/sph_echo.c \
algo/echo/aes_ni/hash.c\
algo/fresh.c \
algo/gost/sph_gost.c \
algo/groestl/sph_groestl.c \
algo/groestl/groestl.c \
@@ -79,7 +79,6 @@ cpuminer_SOURCES = \
algo/heavy/sph_hefty1.c \
algo/heavy/heavy.c \
algo/heavy/bastion.c \
algo/hmq1725.c \
algo/hodl/aes.c \
algo/hodl/hodl-gate.c \
algo/hodl/hodl-wolf.c \
@@ -102,21 +101,27 @@ cpuminer_SOURCES = \
algo/luffa/sse2/luffa_for_sse2.c \
algo/lyra2/lyra2.c \
algo/lyra2/sponge.c \
algo/lyra2/lyra2rev2-gate.c \
algo/lyra2/lyra2rev2.c \
algo/lyra2/lyra2rev2-4way.c \
algo/lyra2/lyra2re.c \
algo/lyra2/lyra2z-gate.c \
algo/lyra2/lyra2z.c \
algo/lyra2/lyra2z-4way.c \
algo/lyra2/lyra2z330.c \
algo/lyra2/lyra2h-gate.c \
algo/lyra2/lyra2h.c \
algo/lyra2/lyra2h-4way.c \
algo/m7m.c \
algo/neoscrypt.c \
algo/neoscrypt/neoscrypt.c \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
algo/nist5/zr5.c \
algo/pluck.c \
algo/polytimos/polytimos-gate.c \
algo/polytimos/polytimos.c \
algo/quark/quark-gate.c \
algo/quark/quark.c \
algo/quark/quark-4way.c \
algo/qubit/qubit.c \
algo/qubit/deep.c \
algo/ripemd/sph_ripemd.c \
@@ -127,7 +132,9 @@ cpuminer_SOURCES = \
algo/sha/sha2.c \
algo/sha/sha256t.c \
algo/shabal/sph_shabal.c \
algo/shabal/shabal-hash-4way.c \
algo/shavite/sph_shavite.c \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite.c \
algo/simd/sph_simd.c \
algo/simd/sse2/nist.c \
@@ -136,41 +143,77 @@ cpuminer_SOURCES = \
algo/skein/skein-hash-4way.c \
algo/skein/skein.c \
algo/skein/skein-4way.c \
algo/skein/skein-gate.c \
algo/skein/skein-gate.c \
algo/skein/skein2.c \
algo/skein/skein2-4way.c \
algo/skein/skein2-gate.c \
algo/skunk.c \
algo/sm3/sm3.c \
algo/sm3/sm3-hash-4way.c \
algo/tiger/sph_tiger.c \
algo/timetravel.c \
algo/timetravel10.c \
algo/tribus/tribus-gate.c \
algo/tribus/tribus.c \
algo/tribus/tribus-4way.c \
algo/veltor.c \
algo/whirlpool/sph_whirlpool.c \
algo/whirlpool/whirlpool-hash-4way.c \
algo/whirlpool/whirlpool-gate.c \
algo/whirlpool/whirlpool-4way.c \
algo/whirlpool/whirlpool.c \
algo/whirlpool/whirlpoolx.c \
algo/x11/phi1612.c \
algo/x11/x11-gate.c \
algo/x11/x11.c \
algo/x11/x11evo.c \
algo/x11/x11-4way.c \
algo/x11/x11gost-gate.c \
algo/x11/x11gost.c \
algo/x11/x11gost-4way.c \
algo/x11/c11-gate.c \
algo/x11/c11.c \
algo/x11/c11-4way.c \
algo/x11/tribus-gate.c \
algo/x11/tribus.c \
algo/x11/tribus-4way.c \
algo/x11/timetravel-gate.c \
algo/x11/timetravel.c \
algo/x11/timetravel-4way.c \
algo/x11/timetravel10-gate.c \
algo/x11/timetravel10.c \
algo/x11/timetravel10-4way.c \
algo/x11/fresh.c \
algo/x11/x11evo.c \
algo/x11/x11evo-4way.c \
algo/x11/x11evo-gate.c \
algo/x13/x13-gate.c \
algo/x13/x13.c \
algo/x13/x13-4way.c \
algo/x13/x13sm3-gate.c \
algo/x13/x13sm3.c \
algo/x13/x13sm3-4way.c \
algo/x13/phi1612-gate.c \
algo/x13/phi1612.c \
algo/x13/phi1612-4way.c \
algo/x13/skunk-gate.c \
algo/x13/skunk-4way.c \
algo/x13/skunk.c \
algo/x13/drop.c \
algo/x14/x14-gate.c \
algo/x14/x14.c \
algo/x14/x14-4way.c \
algo/x14/veltor-gate.c \
algo/x14/veltor.c \
algo/x14/veltor-4way.c \
algo/x14/polytimos-gate.c \
algo/x14/polytimos.c \
algo/x14/polytimos-4way.c \
algo/x14/axiom.c \
algo/x15/x15-gate.c \
algo/x15/x15.c \
algo/x15/x15-4way.c \
algo/x17/x17-gate.c \
algo/x17/x17.c \
algo/xevan.c \
algo/x17/x17-4way.c \
algo/x17/xevan-gate.c \
algo/x17/xevan.c \
algo/x17/xevan-4way.c \
algo/x17/hmq1725.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c\
algo/yescrypt/yescrypt-simd.c\
algo/zr5.c

algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-simd.c

disable_flags =

README.md (11 lines changed)

@@ -40,6 +40,7 @@ Supported Algorithms
        keccakc       Creative coin
        lbry          LBC, LBRY Credits
        luffa         Luffa
        lyra2h        Hppcoin
        lyra2re       lyra2
        lyra2rev2     lyra2v2, Vertcoin
        lyra2z        Zcoin (XZC)
@@ -67,7 +68,7 @@ Supported Algorithms
        timetravel10  Bitcore
        tribus        Denarius (DNR)
        vanilla       blake256r8vnl (VCash)
        veltor
        veltor        (VLT)
        whirlpool
        whirlpoolx
        x11           Dash
@@ -80,6 +81,7 @@ Supported Algorithms
        x17
        xevan         Bitsend
        yescrypt      Globalboost-Y (BSTY)
        yescryptr8    BitZeny (ZNY)\n\
        yescryptr16   Yenten (YTN)
        zr5           Ziftr

@@ -95,13 +97,16 @@ algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.

ARM CPUs are not supported.

2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
Centos are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.

3. Stratum pool, cpuminer-opt only supports stratum minning. Some algos
may work wallet mining but there are no guarantees.
MacOS, OSx is not supported.

3. Stratum pool. Some algos may work wallet mining using getwork.
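
As an illustration only (not part of this changeset; the algo, pool URL, wallet, and thread count below are placeholder values), a typical stratum invocation of the miner looks like:

    ./cpuminer -a lyra2z -o stratum+tcp://pool.example.com:4444 -u YOUR_WALLET_ADDRESS -p x -t 4
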
Errata
------

README.txt (20 lines changed)

@@ -17,17 +17,21 @@ supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
Users are recommended to use an unoptimized miner such as cpuminer-multi.

Exe name                Compile opts                         Arch name
Exe name                Compile flags                        Arch name

cpuminer-sse2.exe       -march=core2                         Core2
cpuminer-sse42.exe      -march=corei7                        Nehalem
cpuminer-aes-sse42.exe  -maes -msse4.2"                      Westmere
cpuminer-aes-avx.exe    -march=corei7-avx"                   Sandybridge, Ivybridge
cpuminer-aes-avx2.exe   "-march=core-avx2"                   Haswell, Broadwell, Skylake, Kabylake
cpuminer-4way.exe       "-march=core-avx2 -DFOUR_WAY"
cpuminer-sse2.exe       "-march=core2"                       Core2
cpuminer-sse42.exe      "-march=corei7"                      Nehalem
cpuminer-aes-sse42.exe  "-maes -msse4.2"                     Westmere
cpuminer-avx.exe        "-march=corei7-avx"                  Sandybridge, Ivybridge
cpuminer-avx2.exe       "-march=core-avx2"                   Haswell...
cpuminer-avx-sha        "-march=corei7-avx -msha"            Ryzen...
cpuminer-4way.exe       "-march=core-avx2 -DFOUR_WAY"        same as avx2
cpuminer-4way-sha.exe   "-march=core-avx2 -msha -DFOUR_WAY"  same as avx2-sha

4way requires a CPU with AES and AVX2. It is still under development and
only a few algos are supported. See change log in RELEASE_NOTES in source
package for supported algos.

There is no binary support available for SHA on AMD Ryzen CPUs.
Ryzen CPus perform better with AVX than AVX2 therefore an avx-sha build
is provided. Four way still uses AVX2.

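As a rough sketch of how these flags are applied when building from source (this assumes the standard autotools build shipped with the source package; the exact invocation may differ), an AVX2 4way build would look like:

    ./autogen.sh
    CFLAGS="-O3 -march=core-avx2 -DFOUR_WAY -Wall" ./configure --with-curl
    make

The pre-built exe names in the table simply correspond to different CFLAGS choices at compile time.
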
@@ -27,8 +27,9 @@ Compile Instructions

Requirements:

Intel Core2 or newer, or AMD Steamroller or newer CPU.
64 bit Linux or Windows operating system.
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
64 bit Linux or Windows operating system. Apple is not supported.

Building on linux prerequisites:

@@ -164,6 +165,35 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
Change Log
----------

v3.7.10

4way optimizations for lyra2rev2, lyra2h, quark, timetravel8, timetravel10
x11evo, blakecoin.
Faster x13sm3 (hsr).
Added share difficulty to accepted message.

v3.7.9

Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.
Additional 4way optimizations for X algos.
New algo yescryptr8 for BitZeny, not to be confused with original
yescrypt Globalboost-Y.

v3.7.8

Partial 4way optimization for most X algos including c11, xevan, phi, hsr

v3.7.7

Fixed regression caused by 64 CPU support.
Fixed lyra2h.

v3.7.6

Added lyra2h algo for Hppcoin.
Added support for more than 64 CPUs.
Optimized shavite512 with AES, improves x11 etc.

v3.7.5

New algo keccakc for Creative coin with 4way optimizations
@@ -171,7 +201,7 @@ New algo keccakc for Creative coin with 4way optimizations
Rewrote some AVX/AVX2 code for more consistent implementation and some
optimizing.

Enhanced capabilities check to support 4way, mor eprecise reporting of
Enhanced capabilities check to support 4way, more precise reporting of
features (not all algos use SSE2), and better error messages when using
an incompatible pre-built version (Windows users).

@@ -138,6 +138,10 @@ void init_algo_gate( algo_gate_t* gate )
|
||||
gate->work_cmp_size = STD_WORK_CMP_SIZE;
|
||||
}
|
||||
|
||||
// Ignore warnings for not yet defined register functions
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
|
||||
|
||||
// called by each thread that uses the gate
|
||||
bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
{
|
||||
@@ -151,11 +155,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
|
||||
switch (algo)
|
||||
{
|
||||
|
||||
// Ignore warnings for not yet defined register fucntions
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
|
||||
|
||||
case ALGO_ARGON2: register_argon2_algo ( gate ); break;
|
||||
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
|
||||
case ALGO_BASTION: register_bastion_algo ( gate ); break;
|
||||
@@ -180,6 +179,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
|
||||
case ALGO_LBRY: register_lbry_algo ( gate ); break;
|
||||
case ALGO_LUFFA: register_luffa_algo ( gate ); break;
|
||||
case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break;
|
||||
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
|
||||
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
|
||||
case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break;
|
||||
@@ -211,7 +211,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
|
||||
case ALGO_X11: register_x11_algo ( gate ); break;
|
||||
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
|
||||
case ALGO_X11GOST: register_sib_algo ( gate ); break;
|
||||
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
|
||||
case ALGO_X13: register_x13_algo ( gate ); break;
|
||||
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
|
||||
case ALGO_X14: register_x14_algo ( gate ); break;
|
||||
@@ -219,12 +219,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
case ALGO_X17: register_x17_algo ( gate ); break;
|
||||
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
|
||||
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
|
||||
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
|
||||
case ALGO_ZR5: register_zr5_algo ( gate ); break;
|
||||
|
||||
// restore warnings
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
default:
|
||||
applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
|
||||
return false;
|
||||
@@ -239,6 +236,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
return true;
|
||||
}
|
||||
|
||||
// restore warnings
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
// override std defaults with jr2 defaults
|
||||
bool register_json_rpc2( algo_gate_t *gate )
|
||||
{
|
||||
@@ -279,6 +279,7 @@ const char* const algo_alias_map[][2] =
|
||||
{
|
||||
// alias proper
|
||||
{ "bitcore", "timetravel10" },
|
||||
{ "bitzeny", "yescryptr8" },
|
||||
{ "blake256r8", "blakecoin" },
|
||||
{ "blake256r8vnl", "vanilla" },
|
||||
{ "blake256r14", "blake" },
|
||||
@@ -301,10 +302,9 @@ const char* const algo_alias_map[][2] =
|
||||
// { "sia", "blake2b" },
|
||||
{ "sib", "x11gost" },
|
||||
{ "timetravel8", "timetravel" },
|
||||
{ "yes", "yescrypt" },
|
||||
{ "ziftr", "zr5" },
|
||||
{ "yenten", "yescryptr16" },
|
||||
{ "yescryptr8", "yescrypt" },
|
||||
{ "yescryptr8k", "yescrypt" },
|
||||
{ "zcoin", "lyra2z" },
|
||||
{ "zoin", "lyra2z330" },
|
||||
{ NULL, NULL }
|
||||
|
@@ -1,107 +1,90 @@
|
||||
#include "blake-gate.h"
|
||||
#include "sph_blake.h"
|
||||
|
||||
#if defined (__AVX__)
|
||||
|
||||
#include "blake-hash-4way.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
||||
#if defined (BLAKE_4WAY)
|
||||
blake256r14_4way_context blake_ctx;
|
||||
|
||||
void blakehash_4way(void *state, const void *input)
|
||||
{
|
||||
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[4] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[4] __attribute__ ((aligned (32)));
|
||||
uint32_t hash2[4] __attribute__ ((aligned (32)));
|
||||
uint32_t hash3[4] __attribute__ ((aligned (32)));
|
||||
blake256_4way_context ctx;
|
||||
|
||||
blake256_4way_init( &ctx );
|
||||
blake256_4way( &ctx, input, 16 );
|
||||
blake256_4way_close( &ctx, vhash );
|
||||
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash1, 32 );
|
||||
memcpy( state+96, hash1, 32 );
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
blake256r14_4way_context ctx;
|
||||
memcpy( &ctx, &blake_ctx, sizeof ctx );
|
||||
blake256r14_4way( &ctx, input + (64<<2), 16 );
|
||||
blake256r14_4way_close( &ctx, vhash );
|
||||
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
}
|
||||
|
||||
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[4*4] __attribute__ ((aligned (32)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
// uint32_t HTarget = ptarget[7];
|
||||
uint32_t _ALIGN(32) endiandata[20];
|
||||
uint32_t HTarget = ptarget[7];
|
||||
uint32_t _ALIGN(32) edata[20];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
|
||||
// if (opt_benchmark)
|
||||
// HTarget = 0x7f;
|
||||
if (opt_benchmark)
|
||||
HTarget = 0x7f;
|
||||
|
||||
// we need big endian data...
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
swab32_array( edata, pdata, 20 );
|
||||
|
||||
mm_interleave_4x32( vdata, endiandata, endiandata, endiandata,
|
||||
endiandata, 640 );
|
||||
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
blake256r14_4way_init( &blake_ctx );
|
||||
blake256r14_4way( &blake_ctx, vdata, 64 );
|
||||
|
||||
uint32_t *noncep = vdata + 76; // 19*4
|
||||
do {
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep +2, n+1 );
|
||||
be32enc( noncep +4, n+2 );
|
||||
be32enc( noncep +6, n+3 );
|
||||
be32enc( noncep +1, n+1 );
|
||||
be32enc( noncep +2, n+2 );
|
||||
be32enc( noncep +3, n+3 );
|
||||
|
||||
blakehash_4way( hash, vdata );
|
||||
|
||||
if ( hash[7] == 0 )
|
||||
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
|
||||
{
|
||||
if ( fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
pdata[19] = n;
|
||||
}
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
pdata[19] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( (hash+8)[7] == 0 )
|
||||
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
if ( fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
}
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( (hash+16)[7] == 0 )
|
||||
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
if ( fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
}
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( (hash+24)[7] == 0 )
|
||||
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
if ( fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
}
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
|
||||
n += 4;
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
|
||||
} while ( (num_found == 0) && (n < max_nonce)
|
||||
&& !work_restart[thr_id].restart );
|
||||
|
@@ -17,7 +17,6 @@ bool register_blake_algo( algo_gate_t* gate )
|
||||
gate->optimizations = FOUR_WAY_OPT;
|
||||
gate->scanhash = (void*)&scanhash_blake_4way;
|
||||
gate->hash = (void*)&blakehash_4way;
|
||||
four_way_not_tested();
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_blake;
|
||||
gate->hash = (void*)&blakehash;
|
||||
|
@@ -36,7 +36,6 @@
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
|
||||
//#include "sph_blake.h"
|
||||
#include "blake-hash-4way.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
@@ -98,18 +97,6 @@ static const unsigned sigma[16][16] = {
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
|
||||
};
|
||||
|
||||
/*
|
||||
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
|
||||
14 10 4 8 9 15 13 6 1 12 0 2 11 7 5 3
|
||||
11 8 12 0 5 2 15 13 10 14 3 6 7 1 9 4
|
||||
7 9 3 1 13 12 11 14 2 6 5 10 4 0 15 8
|
||||
9 0 5 7 2 4 10 15 14 1 11 12 6 8 3 13
|
||||
2 12 6 10 0 11 8 3 4 13 7 5 15 14 1 9
|
||||
12 5 1 15 14 13 4 10 0 7 6 3 9 2 8 11
|
||||
13 11 7 14 12 1 3 9 5 0 15 4 8 6 2 10
|
||||
6 15 14 9 11 3 0 8 12 2 13 7 1 4 10 5
|
||||
10 2 8 4 7 6 1 5 15 11 9 14 3 12 13 0
|
||||
*/
|
||||
#endif
|
||||
|
||||
#define Z00 0
|
||||
@@ -504,14 +491,9 @@ do { \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
//#define BLAKE32_ROUNDS 8
|
||||
#ifndef BLAKE32_ROUNDS
|
||||
#define BLAKE32_ROUNDS 14
|
||||
#endif
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32
|
||||
|
||||
#define COMPRESS32_4WAY do { \
|
||||
#define COMPRESS32_4WAY( rounds ) do { \
|
||||
__m128i M[16]; \
|
||||
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
@@ -524,18 +506,18 @@ do { \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm_xor_si128( s0, _mmset_epi32( CS0, CS0, CS0, CS0 ) ); \
|
||||
V9 = _mm_xor_si128( s1, _mmset_epi32( CS1, CS1, CS1, CS1 ) ); \
|
||||
VA = _mm_xor_si128( s2, _mmset_epi32( CS2, CS2, CS2, CS2 ) ); \
|
||||
VB = _mm_xor_si128( s3, _mmset_epi32( CS3, CS3, CS3, CS3 ) ); \
|
||||
VC = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \
|
||||
_mmset_epi32( CS4, CS4, CS4, CS4 ) ); \
|
||||
VD = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \
|
||||
_mmset_epi32( CS5, CS5, CS5, CS5 ) ); \
|
||||
VE = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ) \
|
||||
, _mmset_epi32( CS6, CS6, CS6, CS6 ) ); \
|
||||
VF = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ), \
|
||||
_mmset_epi32( CS7, CS7, CS7, CS7 ) ); \
|
||||
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
|
||||
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
|
||||
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
|
||||
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
|
||||
VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
|
||||
_mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
|
||||
VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
|
||||
_mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
|
||||
VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ) \
|
||||
, _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
|
||||
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
|
||||
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
|
||||
M[0x0] = mm_byteswap_32( *(buf + 0) ); \
|
||||
M[0x1] = mm_byteswap_32( *(buf + 1) ); \
|
||||
M[0x2] = mm_byteswap_32( *(buf + 2) ); \
|
||||
@@ -552,7 +534,7 @@ do { \
|
||||
M[0xD] = mm_byteswap_32( *(buf + 13) ); \
|
||||
M[0xE] = mm_byteswap_32( *(buf + 14) ); \
|
||||
M[0xF] = mm_byteswap_32( *(buf + 15) ); \
|
||||
for (r = 0; r < BLAKE32_ROUNDS; r ++) \
|
||||
for (r = 0; r < rounds; r ++) \
|
||||
ROUND_S_4WAY(r); \
|
||||
H0 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S0, V0 ), V8 ), H0 ); \
|
||||
@@ -576,80 +558,70 @@ do { \
|
||||
|
||||
// current impl
|
||||
|
||||
#define COMPRESS32_4WAY do { \
|
||||
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
|
||||
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
|
||||
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
|
||||
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
|
||||
VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
|
||||
_mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
|
||||
VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
|
||||
_mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
|
||||
VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
|
||||
_mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
|
||||
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
|
||||
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
|
||||
M0 = mm_byteswap_32( * buf ); \
|
||||
M1 = mm_byteswap_32( *(buf+1) ); \
|
||||
M2 = mm_byteswap_32( *(buf+2) ); \
|
||||
M3 = mm_byteswap_32( *(buf+3) ); \
|
||||
M4 = mm_byteswap_32( *(buf+4) ); \
|
||||
M5 = mm_byteswap_32( *(buf+5) ); \
|
||||
M6 = mm_byteswap_32( *(buf+6) ); \
|
||||
M7 = mm_byteswap_32( *(buf+7) ); \
|
||||
M8 = mm_byteswap_32( *(buf+8) ); \
|
||||
M9 = mm_byteswap_32( *(buf+9) ); \
|
||||
MA = mm_byteswap_32( *(buf+10) ); \
|
||||
MB = mm_byteswap_32( *(buf+11) ); \
|
||||
MC = mm_byteswap_32( *(buf+12) ); \
|
||||
MD = mm_byteswap_32( *(buf+13) ); \
|
||||
ME = mm_byteswap_32( *(buf+14) ); \
|
||||
MF = mm_byteswap_32( *(buf+15) ); \
|
||||
ROUND_S_4WAY(0); \
|
||||
ROUND_S_4WAY(1); \
|
||||
ROUND_S_4WAY(2); \
|
||||
ROUND_S_4WAY(3); \
|
||||
ROUND_S_4WAY(4); \
|
||||
ROUND_S_4WAY(5); \
|
||||
ROUND_S_4WAY(6); \
|
||||
ROUND_S_4WAY(7); \
|
||||
if (BLAKE32_ROUNDS == 14) { \
|
||||
ROUND_S_4WAY(8); \
|
||||
ROUND_S_4WAY(9); \
|
||||
ROUND_S_4WAY(0); \
|
||||
ROUND_S_4WAY(1); \
|
||||
ROUND_S_4WAY(2); \
|
||||
ROUND_S_4WAY(3); \
|
||||
} \
|
||||
H0 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( V8, V0 ), S0 ), H0 ); \
|
||||
H1 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( V9, V1 ), S1 ), H1 ); \
|
||||
H2 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( VA, V2 ), S2 ), H2 ); \
|
||||
H3 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( VB, V3 ), S3 ), H3 ); \
|
||||
H4 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( VC, V4 ), S0 ), H4 ); \
|
||||
H5 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( VD, V5 ), S1 ), H5 ); \
|
||||
H6 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( VE, V6 ), S2 ), H6 ); \
|
||||
H7 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( VF, V7 ), S3 ), H7 ); \
|
||||
} while (0)
|
||||
#define COMPRESS32_4WAY( rounds ) \
|
||||
do { \
|
||||
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
|
||||
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
|
||||
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
|
||||
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
|
||||
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
|
||||
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
|
||||
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
|
||||
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
|
||||
M0 = mm_byteswap_32( * buf ); \
|
||||
M1 = mm_byteswap_32( *(buf+1) ); \
|
||||
M2 = mm_byteswap_32( *(buf+2) ); \
|
||||
M3 = mm_byteswap_32( *(buf+3) ); \
|
||||
M4 = mm_byteswap_32( *(buf+4) ); \
|
||||
M5 = mm_byteswap_32( *(buf+5) ); \
|
||||
M6 = mm_byteswap_32( *(buf+6) ); \
|
||||
M7 = mm_byteswap_32( *(buf+7) ); \
|
||||
M8 = mm_byteswap_32( *(buf+8) ); \
|
||||
M9 = mm_byteswap_32( *(buf+9) ); \
|
||||
MA = mm_byteswap_32( *(buf+10) ); \
|
||||
MB = mm_byteswap_32( *(buf+11) ); \
|
||||
MC = mm_byteswap_32( *(buf+12) ); \
|
||||
MD = mm_byteswap_32( *(buf+13) ); \
|
||||
ME = mm_byteswap_32( *(buf+14) ); \
|
||||
MF = mm_byteswap_32( *(buf+15) ); \
|
||||
ROUND_S_4WAY(0); \
|
||||
ROUND_S_4WAY(1); \
|
||||
ROUND_S_4WAY(2); \
|
||||
ROUND_S_4WAY(3); \
|
||||
ROUND_S_4WAY(4); \
|
||||
ROUND_S_4WAY(5); \
|
||||
ROUND_S_4WAY(6); \
|
||||
ROUND_S_4WAY(7); \
|
||||
if (rounds == 14) \
|
||||
{ \
|
||||
ROUND_S_4WAY(8); \
|
||||
ROUND_S_4WAY(9); \
|
||||
ROUND_S_4WAY(0); \
|
||||
ROUND_S_4WAY(1); \
|
||||
ROUND_S_4WAY(2); \
|
||||
ROUND_S_4WAY(3); \
|
||||
} \
|
||||
H0 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
|
||||
H1 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
|
||||
H2 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
|
||||
H3 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
|
||||
H4 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
|
||||
H5 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
|
||||
H6 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
|
||||
H7 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
@@ -710,18 +682,18 @@ do { \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_xor_si256( S0, _mm256_set_epi64( CB0, CB0, CB0, CB0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, _mm256_set_epi64( CB1, CB1, CB1, CB1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, _mm256_set_epi64( CB2, CB2, CB2, CB2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, _mm256_set_epi64( CB3, CB3, CB3, CB3 ) ); \
|
||||
VC = _mm256_xor_si128( _mm256_set_epi64( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64( CB4, CB4, CB4, CB4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set_epi64( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64( CB5, CB5, CB5, CB5 ) ); \
|
||||
VE = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
|
||||
_mm256_set256_epi64( CB6, CB6, CB6, CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
|
||||
_mm256_set256_epi64( CB7, CB7, CB7, CB7 ) ); \
|
||||
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
|
||||
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
|
||||
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
|
||||
M[0x0] = mm256_byteswap_64( *(buf+0) ); \
|
||||
M[0x1] = mm256_byteswap_64( *(buf+1) ); \
|
||||
M[0x2] = mm256_byteswap_64( *(buf+2) ); \
|
||||
@@ -845,15 +817,16 @@ static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 };
|
||||
|
||||
static void
|
||||
blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
|
||||
const sph_u32 *salt)
|
||||
const sph_u32 *salt, int rounds )
|
||||
{
|
||||
int i;
|
||||
for ( i = 0; i < 8; i++ )
|
||||
sc->H[i] = _mm_set_epi32( iv[i], iv[i], iv[i], iv[i] );
|
||||
for ( i = 0; i < 4; i++ )
|
||||
sc->S[i] = _mm_set_epi32( salt[i], salt[i], salt[i], salt[i] );
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
int i;
|
||||
for ( i = 0; i < 8; i++ )
|
||||
sc->H[i] = _mm_set1_epi32( iv[i] );
|
||||
for ( i = 0; i < 4; i++ )
|
||||
sc->S[i] = _mm_set1_epi32( salt[i] );
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
sc->rounds = rounds;
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -867,7 +840,6 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
|
||||
if ( len < buf_size - ptr )
|
||||
{
|
||||
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
|
||||
@@ -892,7 +864,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
|
||||
{
|
||||
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
|
||||
T1 = SPH_T32(T1 + 1);
|
||||
COMPRESS32_4WAY;
|
||||
COMPRESS32_4WAY( sc->rounds );
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
@@ -915,48 +887,44 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
unsigned z = 0x80 >> n;
|
||||
unsigned zz = ((ub & -z) | z) & 0xFF;
|
||||
u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz );
|
||||
u.buf[ptr>>2] = _mm_set1_epi32( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
|
||||
if ( ptr == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C32(0xFFFFFE00);
|
||||
sc->T1 = SPH_C32(0xFFFFFFFF);
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
||||
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C32(0xFFFFFE00) + bit_len;
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
|
||||
sc->T1 = SPH_T32(sc->T1 - 1);
|
||||
}
|
||||
else
|
||||
sc->T0 -= 512 - bit_len;
|
||||
|
||||
if ( ptr <= 48 )
|
||||
if ( ptr <= 52 )
|
||||
{
|
||||
memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
|
||||
memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
|
||||
if (out_size_w32 == 8)
|
||||
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
|
||||
_mm_set_epi32( 0x010000000, 0x01000000,
|
||||
0x010000000, 0x01000000 ) );
|
||||
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
|
||||
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
|
||||
_mm_set1_epi32( 0x01000000UL ) );
|
||||
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
|
||||
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
|
||||
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
||||
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
|
||||
sc->T0 = SPH_C32(0xFFFFFE00);
|
||||
sc->T1 = SPH_C32(0xFFFFFFFF);
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
||||
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||
memset_zero_128( u.buf, 56>>2 );
|
||||
if (out_size_w32 == 8)
|
||||
u.buf[52>>2] = _mm_set_epi32( 0x010000000, 0x01000000,
|
||||
0x010000000, 0x01000000 );
|
||||
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
|
||||
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
|
||||
u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
|
||||
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
|
||||
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
|
||||
blake32_4way( sc, u.buf, 64 );
|
||||
}
|
||||
out = (__m128i*)dst;
|
||||
@@ -974,9 +942,9 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
|
||||
{
|
||||
int i;
|
||||
for ( i = 0; i < 8; i++ )
|
||||
sc->H[i] = _mm256_set_epi64x( iv[i], iv[i], iv[i], iv[i] );
|
||||
sc->H[i] = _mm256_set1_epi64x( iv[i] );
|
||||
for ( i = 0; i < 4; i++ )
|
||||
sc->S[i] = _mm256_set_epi64x( salt[i], salt[i], salt[i], salt[i] );
|
||||
sc->S[i] = _mm256_set1_epi64x( salt[i] );
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
@@ -1048,12 +1016,12 @@ blake64_4way_close( blake_4way_big_context *sc,
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len;
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
|
||||
sc->T1 = SPH_T64(sc->T1 - 1);
|
||||
}
|
||||
else
|
||||
@@ -1065,10 +1033,7 @@ blake64_4way_close( blake_4way_big_context *sc,
|
||||
memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
|
||||
_mm256_set_epi64x( 0x0100000000000000,
|
||||
0x0100000000000000,
|
||||
0x0100000000000000,
|
||||
0x0100000000000000 ) );
|
||||
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
|
||||
*(u.buf+(112>>3)) = mm256_byteswap_64(
|
||||
_mm256_set_epi64x( th, th, th, th ) );
|
||||
*(u.buf+(120>>3)) = mm256_byteswap_64(
|
||||
@@ -1081,15 +1046,11 @@ blake64_4way_close( blake_4way_big_context *sc,
|
||||
memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
|
||||
|
||||
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
memset_zero_256( u.buf, 112>>3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
u.buf[104>>3] = _mm256_set_epi64x( 0x0100000000000000,
|
||||
0x0100000000000000,
|
||||
0x0100000000000000,
|
||||
0x0100000000000000 );
|
||||
|
||||
u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
|
||||
*(u.buf+(112>>3)) = mm256_byteswap_64(
|
||||
_mm256_set_epi64x( th, th, th, th ) );
|
||||
*(u.buf+(120>>3)) = mm256_byteswap_64(
|
||||
@@ -1104,10 +1065,11 @@ blake64_4way_close( blake_4way_big_context *sc,
|
||||
|
||||
#endif
|
||||
|
||||
// default 14 rounds, backward copatibility
|
||||
void
|
||||
blake256_4way_init(void *cc)
|
||||
{
|
||||
blake32_4way_init(cc, IV256, salt_zero_small);
|
||||
blake32_4way_init( cc, IV256, salt_zero_small, 14 );
|
||||
}
|
||||
|
||||
void
|
||||
@@ -1119,13 +1081,43 @@ blake256_4way(void *cc, const void *data, size_t len)
|
||||
void
|
||||
blake256_4way_close(void *cc, void *dst)
|
||||
{
|
||||
blake256_4way_addbits_and_close(cc, 0, 0, dst);
|
||||
blake32_4way_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
// 14 rounds blake, decred
|
||||
void blake256r14_4way_init(void *cc)
|
||||
{
|
||||
blake32_4way_init( cc, IV256, salt_zero_small, 14 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
blake256r14_4way(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_4way_close(cc, ub, n, dst, 8);
|
||||
blake32_4way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256r14_4way_close(void *cc, void *dst)
|
||||
{
|
||||
blake32_4way_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
// 8 rounds blakecoin, vanilla
|
||||
void blake256r8_4way_init(void *cc)
|
||||
{
|
||||
blake32_4way_init( cc, IV256, salt_zero_small, 8 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256r8_4way(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_4way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256r8_4way_close(void *cc, void *dst)
|
||||
{
|
||||
blake32_4way_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
@@ -35,7 +35,9 @@
|
||||
*/
|
||||
|
||||
#ifndef __BLAKE_HASH_4WAY__
|
||||
#define __BLAKE_HASH_4WAY___
|
||||
#define __BLAKE_HASH_4WAY__
|
||||
|
||||
#ifdef __AVX__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
@@ -45,38 +47,36 @@ extern "C"{
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for BLAKE-256.
|
||||
*/
|
||||
#define SPH_SIZE_blake256 256
|
||||
|
||||
#if SPH_64
|
||||
|
||||
/**
|
||||
* Output size (in bits) for BLAKE-512.
|
||||
*/
|
||||
#define SPH_SIZE_blake512 512
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __AVX__
|
||||
typedef struct {
|
||||
__m128i buf[16] __attribute__ ((aligned (64)));
|
||||
__m128i H[8];
|
||||
__m128i S[4];
|
||||
size_t ptr;
|
||||
sph_u32 T0, T1;
|
||||
__m128i buf[16] __attribute__ ((aligned (64)));
|
||||
__m128i H[8];
|
||||
__m128i S[4];
|
||||
size_t ptr;
|
||||
sph_u32 T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_4way_small_context;
|
||||
|
||||
// Default 14 rounds
|
||||
typedef blake_4way_small_context blake256_4way_context;
|
||||
|
||||
void blake256_4way_init(void *cc);
|
||||
void blake256_4way(void *cc, const void *data, size_t len);
|
||||
void blake256_4way_close(void *cc, void *dst);
|
||||
void blake256_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#endif
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_4way_small_context blake256r14_4way_context;
|
||||
void blake256r14_4way_init(void *cc);
|
||||
void blake256r14_4way(void *cc, const void *data, size_t len);
|
||||
void blake256r14_4way_close(void *cc, void *dst);
|
||||
|
||||
// 8 rounds, blakecoin, vanilla
|
||||
typedef blake_4way_small_context blake256r8_4way_context;
|
||||
void blake256r8_4way_init(void *cc);
|
||||
void blake256r8_4way(void *cc, const void *data, size_t len);
|
||||
void blake256r8_4way_close(void *cc, void *dst);
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
@@ -103,3 +103,5 @@ void blake512_4way_addbits_and_close(
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
algo/blake/blakecoin-4way.c (new file, 106 lines)
@@ -0,0 +1,106 @@
|
||||
#include "blakecoin-gate.h"
|
||||
|
||||
#if defined (__AVX__)
|
||||
|
||||
#include "blake-hash-4way.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
||||
blake256r8_4way_context blakecoin_ctx;
|
||||
|
||||
void blakecoin_4way_hash(void *state, const void *input)
|
||||
{
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
blake256r8_4way_context ctx;
|
||||
memcpy( &ctx, &blakecoin_ctx, sizeof ctx );
|
||||
blake256r8_4way( &ctx, input + (64<<2), 16 );
|
||||
blake256r8_4way_close( &ctx, vhash );
|
||||
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
}
|
||||
|
||||
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t HTarget = ptarget[7];
|
||||
uint32_t _ALIGN(32) edata[20];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
|
||||
if (opt_benchmark)
|
||||
HTarget = 0x7f;
|
||||
|
||||
// we need big endian data...
|
||||
swab32_array( edata, pdata, 20 );
|
||||
|
||||
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
blake256r8_4way_init( &blakecoin_ctx );
|
||||
blake256r8_4way( &blakecoin_ctx, vdata, 64 );
|
||||
|
||||
uint32_t *noncep = vdata + 76; // 19*4
|
||||
do {
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep +1, n+1 );
|
||||
be32enc( noncep +2, n+2 );
|
||||
be32enc( noncep +3, n+3 );
|
||||
|
||||
blakecoin_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
|
||||
} while ( (num_found == 0) && (n < max_nonce)
|
||||
&& !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
|
||||
// workaround to prevent flood of hash reports when nonce range exhasuted
|
||||
// and thread is spinning waiting for new work
|
||||
if ( ( n >= max_nonce ) && ( *hashes_done < 10 ) )
|
||||
{
|
||||
*hashes_done = 0;
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
algo/blake/blakecoin-gate.c (new file, 71 lines)
@@ -0,0 +1,71 @@
|
||||
#include "blakecoin-gate.h"
|
||||
#include <memory.h>
|
||||
|
||||
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
|
||||
int64_t blakecoin_get_max64 ()
|
||||
{
|
||||
return 0x7ffffLL;
|
||||
// return 0x3fffffLL;
|
||||
}
|
||||
|
||||
// Blakecoin 4 way hashes so fast it runs out of nonces.
|
||||
// This is an attempt to solve this but the result may be
|
||||
// to rehash old nonces until new work is received.
|
||||
void bc4w_get_new_work( struct work* work, struct work* g_work, int thr_id,
|
||||
uint32_t *end_nonce_ptr, bool clean_job )
|
||||
{
|
||||
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
|
||||
//
|
||||
// if ( have_stratum && ( *nonceptr >= *end_nonce_ptr ) )
|
||||
// algo_gate.stratum_gen_work( &stratum, g_work );
|
||||
|
||||
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
|
||||
|| ( *nonceptr >= *end_nonce_ptr )
|
||||
|| ( work->job_id != g_work->job_id ) && clean_job )
|
||||
/*
|
||||
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
|
||||
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr )
|
||||
|| ( work->job_id != g_work->job_id ) ) )
|
||||
*/
|
||||
{
|
||||
work_free( work );
|
||||
work_copy( work, g_work );
|
||||
*nonceptr = 0xffffffffU / opt_n_threads * thr_id;
|
||||
if ( opt_randomize )
|
||||
*nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
|
||||
*end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
|
||||
// try incrementing the xnonce to chsnge the data
|
||||
// for ( int i = 0; i < work->xnonce2_size && !( ++work->xnonce2[i] ); i++ );
|
||||
}
|
||||
else
|
||||
++(*nonceptr);
|
||||
}
|
||||
|
||||
|
||||
// vanilla uses default gen merkle root, otherwise identical to blakecoin
|
||||
bool register_vanilla_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKECOIN_4WAY)
|
||||
// four_way_not_tested();
|
||||
gate->optimizations = FOUR_WAY_OPT;
|
||||
gate->scanhash = (void*)&scanhash_blakecoin_4way;
|
||||
gate->hash = (void*)&blakecoin_4way_hash;
|
||||
// gate->get_new_work = (void*)&bc4w_get_new_work;
|
||||
// blakecoin_4way_init( &blake_4way_init_ctx );
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_blakecoin;
|
||||
gate->hash = (void*)&blakecoinhash;
|
||||
// blakecoin_init( &blake_init_ctx );
|
||||
#endif
|
||||
gate->optimizations = AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->get_max64 = (void*)&blakecoin_get_max64;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool register_blakecoin_algo( algo_gate_t* gate )
|
||||
{
|
||||
register_vanilla_algo( gate );
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
return true;
|
||||
}
|
||||
|
algo/blake/blakecoin-gate.h (new file, 21 lines)
@@ -0,0 +1,21 @@
|
||||
#ifndef __BLAKECOIN_GATE_H__
|
||||
#define __BLAKECOIN_GATE_H__
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(FOUR_WAY) && defined(__AVX__)
|
||||
#define BLAKECOIN_4WAY
|
||||
#endif
|
||||
|
||||
#if defined (BLAKECOIN_4WAY)
|
||||
void blakecoin_4way_hash(void *state, const void *input);
|
||||
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
#endif
|
||||
|
||||
void blakecoinhash( void *state, const void *input );
|
||||
int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
#endif
|
@@ -1,4 +1,4 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "blakecoin-gate.h"
|
||||
#define BLAKE32_ROUNDS 8
|
||||
#include "sph_blake.h"
|
||||
|
||||
@@ -98,7 +98,7 @@ void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
|
||||
SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
|
||||
int64_t blakecoin_get_max64 ()
|
||||
{
|
||||
@@ -121,4 +121,4 @@ bool register_blakecoin_algo( algo_gate_t* gate )
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
return true;
|
||||
}
|
||||
|
||||
*/
|
||||
|
@@ -1,5 +1,4 @@
|
||||
#include "decred-gate.h"
|
||||
#include "sph_blake.h"
|
||||
#include "blake-hash-4way.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
@@ -9,110 +8,58 @@
|
||||
#if defined (DECRED_4WAY)
|
||||
|
||||
static __thread blake256_4way_context blake_mid;
|
||||
static __thread bool ctx_midstate_done = false;
|
||||
|
||||
void decred_hash_4way( void *state, const void *input )
|
||||
{
|
||||
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[4] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[4] __attribute__ ((aligned (32)));
|
||||
uint32_t hash2[4] __attribute__ ((aligned (32)));
|
||||
uint32_t hash3[4] __attribute__ ((aligned (32)));
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash3[8] __attribute__ ((aligned (32)));
|
||||
void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
|
||||
int tail_len = 180 - DECRED_MIDSTATE_LEN;
|
||||
blake256_4way_context ctx __attribute__ ((aligned (64)));
|
||||
|
||||
sph_blake256_context ctx2 __attribute__ ((aligned (64)));
|
||||
uint32_t hash[16] __attribute__ ((aligned (64)));
|
||||
uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
|
||||
mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );
|
||||
|
||||
void *tail = input + DECRED_MIDSTATE_LEN;
|
||||
int tail_len = 180 - DECRED_MIDSTATE_LEN;
|
||||
// #define MIDSTATE_LEN 128
|
||||
/*
|
||||
uint8_t *ending = (uint8_t*) input;
|
||||
ending += MIDSTATE_LEN;
|
||||
|
||||
if ( !ctx_midstate_done )
|
||||
{
|
||||
blake256_4way_init( &blake_mid );
|
||||
blake256_4way( &blake_mid, input, DECRED_MIDSTATE_LEN );
|
||||
ctx_midstate_done = true;
|
||||
}
|
||||
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
|
||||
|
||||
blake256_4way( &ctx, tail, tail_len );
|
||||
blake256_4way_close( &ctx, vhash );
|
||||
*/
|
||||
|
||||
|
||||
sph_blake256_init( &ctx2 );
|
||||
sph_blake256( &ctx2, sin0, 180 );
|
||||
sph_blake256_close( &ctx2, hash );
|
||||
|
||||
blake256_4way_init( &ctx );
|
||||
blake256_4way( &ctx, input, 180 );
|
||||
blake256_4way_close( &ctx, vhash );
|
||||
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
/*
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( hash[i] != hash0[i] )
|
||||
printf(" hash mismatch, i = %u\n",i);
|
||||
|
||||
printf("hash: %08lx %08lx %08lx %08lx\n", *hash, *(hash+1),
|
||||
*(hash+2), *(hash+3) );
|
||||
printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
|
||||
*(hash0+2), *(hash0+3) );
|
||||
printf("\n");
|
||||
*/
|
||||
|
||||
// memcpy( state, hash0, 32 );
|
||||
// memcpy( state+32, hash1, 32 );
|
||||
// memcpy( state+64, hash1, 32 );
|
||||
// memcpy( state+96, hash1, 32 );
|
||||
|
||||
memcpy( state, hash, 32 );
|
||||
|
||||
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
}
|
||||
|
||||
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done)
|
||||
{
|
||||
uint32_t vdata[45*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[4*4] __attribute__ ((aligned (32)));
|
||||
uint32_t _ALIGN(64) endiandata[48];
|
||||
// uint32_t _ALIGN(64) hash32[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
|
||||
uint32_t vdata[48*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t _ALIGN(64) edata[48];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
|
||||
// #define DCR_NONCE_OFT32 35
|
||||
|
||||
ctx_midstate_done = false;
|
||||
|
||||
// memcpy(endiandata, pdata, 180);
|
||||
// copy to buffer guaranteed to be aligned.
|
||||
memcpy( edata, pdata, 180 );
|
||||
|
||||
// use the old way until new way updated for size.
|
||||
mm_interleave_4x32x( vdata, pdata, pdata, pdata, pdata, 180*8 );
|
||||
mm_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );
|
||||
|
||||
blake256_4way_init( &blake_mid );
|
||||
blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
|
||||
|
||||
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
|
||||
do {
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
* noncep = n;
|
||||
*(noncep+2) = n+1;
|
||||
*(noncep+4) = n+2;
|
||||
*(noncep+6) = n+3;
|
||||
*(noncep+1) = n+1;
|
||||
*(noncep+2) = n+2;
|
||||
*(noncep+3) = n+3;
|
||||
|
||||
decred_hash_4way( hash, vdata );
|
||||
|
||||
// endiandata[DCR_NONCE_OFT32] = n;
|
||||
// decred_hash(hash32, endiandata);
|
||||
|
||||
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
|
||||
{
|
||||
work_set_target_ratio( work, hash );
|
||||
@@ -121,28 +68,28 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
nonces[0] = n;
|
||||
pdata[DECRED_NONCE_INDEX] = n;
|
||||
}
|
||||
/* if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
|
||||
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n;
|
||||
nonces[1] = n+1;
|
||||
}
|
||||
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n;
|
||||
nonces[2] = n+2;
|
||||
}
|
||||
|
||||
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n;
|
||||
nonces[3] = n+3;
|
||||
}
|
||||
*/
|
||||
n += 4;
|
||||
} while ( (num_found == 0) && (n < max_nonce)
|
||||
&& !work_restart[thr_id].restart );
|
||||
|
@@ -1,4 +1,7 @@
|
||||
#include "pentablake-gate.h"
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
@@ -9,8 +12,6 @@
|
||||
|
||||
//#define DEBUG_ALGO
|
||||
|
||||
#ifdef PENTABLAKE_4WAY
|
||||
|
||||
extern void pentablakehash_4way( void *output, const void *input )
|
||||
{
|
||||
unsigned char _ALIGN(32) hash[128];
|
||||
|
@@ -4,7 +4,7 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(FOUR_WAY) && defined(__AVX__)
|
||||
#if defined(FOUR_WAY) && defined(__AVX2__)
|
||||
#define PENTABLAKE_4WAY
|
||||
#endif
|
||||
|
||||
|
@@ -872,6 +872,7 @@ blake32_close(sph_blake_small_context *sc,
|
||||
} else {
|
||||
sc->T0 -= 512 - bit_len;
|
||||
}
|
||||
|
||||
if (bit_len <= 446) {
|
||||
memset(u.buf + ptr + 1, 0, 55 - ptr);
|
||||
if (out_size_w32 == 8)
|
||||
|
algo/bmw/bmw-hash-4way.c (new file, 1146 lines): diff suppressed because it is too large.
algo/bmw/bmw-hash-4way.h (new file, 95 lines)
@@ -0,0 +1,95 @@
|
||||
/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
|
||||
/**
|
||||
* BMW interface. BMW (aka "Blue Midnight Wish") is a family of
|
||||
* functions which differ by their output size; this implementation
|
||||
* defines BMW for output sizes 224, 256, 384 and 512 bits.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_bmw.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef BMW_HASH_H__
|
||||
#define BMW_HASH_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
|
||||
#define SPH_SIZE_bmw256 256
|
||||
|
||||
#define SPH_SIZE_bmw512 512
|
||||
|
||||
typedef struct {
|
||||
__m128i buf[64];
|
||||
__m128i H[16];
|
||||
size_t ptr;
|
||||
sph_u32 bit_count; // assume bit_count fits in 32 bits
|
||||
} bmw_4way_small_context;
|
||||
|
||||
typedef bmw_4way_small_context bmw256_4way_context;
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16];
|
||||
__m256i H[16];
|
||||
size_t ptr;
|
||||
sph_u64 bit_count;
|
||||
} bmw_4way_big_context;
|
||||
|
||||
typedef bmw_4way_big_context bmw512_4way_context;
|
||||
|
||||
void bmw256_4way_init(void *cc);
|
||||
|
||||
void bmw256_4way(void *cc, const void *data, size_t len);
|
||||
|
||||
void bmw256_4way_close(void *cc, void *dst);
|
||||
|
||||
void bmw256_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
void bmw512_4way_init(void *cc);
|
||||
|
||||
void bmw512_4way(void *cc, const void *data, size_t len);
|
||||
|
||||
void bmw512_4way_close(void *cc, void *dst);
|
||||
|
||||
void bmw512_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
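The header above only declares the vectored BMW interface; a minimal usage sketch of the 256-bit variant follows for reference. It is illustrative only, not part of this commit, and assumes the mm_interleave_4x32 / mm_deinterleave_4x32 helpers from avxdefs.h that are used elsewhere in this diff (bit-length arguments, 4 lanes of 32-bit words).

// Illustrative sketch: hash four independent messages (up to 80 bytes each,
// limited by the local buffer) with the 4-way BMW-256 declared above.
#include "algo/bmw/bmw-hash-4way.h"
#include "avxdefs.h"

void bmw256_4way_example( void *out0, void *out1, void *out2, void *out3,
                          void *in0, void *in1, void *in2, void *in3,
                          size_t len )
{
   uint32_t vin [20*4] __attribute__ ((aligned (64)));   // 4 lanes x 80 bytes
   uint32_t vout[ 8*4] __attribute__ ((aligned (64)));   // 4 lanes x 32 bytes
   bmw256_4way_context ctx;

   // Interleave the four inputs into one 4x32-bit stream (length in bits,
   // matching the interleave calls seen in this diff).
   mm_interleave_4x32( vin, in0, in1, in2, in3, len*8 );

   bmw256_4way_init( &ctx );
   bmw256_4way( &ctx, vin, len );          // len = bytes per lane
   bmw256_4way_close( &ctx, vout );        // four interleaved 256-bit digests

   // Split the interleaved digests back out, one per lane.
   mm_deinterleave_4x32( out0, out1, out2, out3, vout, 256 );
}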
|
@@ -96,34 +96,18 @@ extern "C"{
do { \
__m256i cc = _mm256_set_epi64x( c, c, c, c ); \
x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_not( x2 ) ) ); \
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_not( x1 ), x2 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_andnot_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_not( x3 ) ) ); \
x2 = _mm256_xor_si256( x2, _mm256_andnot_si256( x3, x0 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
x2 = _mm256_xor_si256( x2, tmp ); \
} while (0)

/*
#define Sb(x0, x1, x2, x3, c) do { \
x3 = ~x3; \
x0 ^= (c) & ~x2; \
tmp = (c) ^ (x0 & x1); \
x0 ^= x2 & x3; \
x3 ^= ~x1 & x2; \
x1 ^= x0 & x2; \
x2 ^= x0 & ~x3; \
x0 ^= x1 | x3; \
x3 ^= x1 & x2; \
x1 ^= tmp & x0; \
x2 ^= tmp; \
} while (0)
*/

#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \
do { \
x4 = _mm256_xor_si256( x4, x1 ); \
@@ -136,20 +120,6 @@ do { \
x3 = _mm256_xor_si256( x3, x4 ); \
} while (0)


/*
#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) do { \
x4 ^= x1; \
x5 ^= x2; \
x6 ^= x3 ^ x0; \
x7 ^= x0; \
x0 ^= x5; \
x1 ^= x6; \
x2 ^= x7 ^ x4; \
x3 ^= x4; \
} while (0)
*/
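The net effect of the hunk above is to fold each and+not pair into a single _mm256_andnot_si256, which computes (~a) & b in one instruction. A small self-contained check of that identity, assuming nothing beyond an AVX2-capable compiler (not part of this commit):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
   __m256i x = _mm256_set1_epi64x( 0x0123456789abcdefULL );
   __m256i y = _mm256_set1_epi64x( 0x00ff00ff00ff00ffULL );
   __m256i c = _mm256_set1_epi64x( 0xdeadbeefcafebabeULL );

   // Old form: explicit not (xor with all ones) followed by and: x ^ (c & ~y).
   __m256i not_y = _mm256_xor_si256( y, _mm256_set1_epi64x( -1 ) );
   __m256i old_r = _mm256_xor_si256( x, _mm256_and_si256( c, not_y ) );

   // New form: single andnot, x ^ ((~y) & c).
   __m256i new_r = _mm256_xor_si256( x, _mm256_andnot_si256( y, c ) );

   uint64_t a[4], b[4];
   _mm256_storeu_si256( (__m256i*)a, old_r );
   _mm256_storeu_si256( (__m256i*)b, new_r );
   printf( "%s\n", memcmp( a, b, sizeof a ) == 0 ? "equal" : "different" );
   return 0;
}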

#if SPH_JH_64

static const sph_u64 C[] = {
@@ -23,12 +23,12 @@ void jha_hash_4way( void *out, const void *input )
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
|
||||
__m256i mask0, mask1;
|
||||
__m256i* vh = (__m256i*)vhash;
|
||||
__m256i* vh0 = (__m256i*)vhash0;
|
||||
__m256i* vh1 = (__m256i*)vhash1;
|
||||
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
|
||||
__m256i* vh = (__m256i*)vhash;
|
||||
__m256i* vhA = (__m256i*)vhashA;
|
||||
__m256i* vhB = (__m256i*)vhashB;
|
||||
__m256i vh_mask;
|
||||
|
||||
blake512_4way_context ctx_blake;
|
||||
hashState_groestl ctx_groestl;
|
||||
@@ -40,122 +40,69 @@ void jha_hash_4way( void *out, const void *input )
|
||||
keccak512_4way( &ctx_keccak, input, 80 );
|
||||
keccak512_4way_close( &ctx_keccak, vhash );
|
||||
|
||||
// memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
|
||||
// keccak512_4way( &ctx_keccak, input + (64<<2), 16 );
|
||||
// keccak512_4way_close( &ctx_keccak, vhash );
|
||||
|
||||
// Heavy & Light Pair Loop
|
||||
for ( int round = 0; round < 3; round++ )
|
||||
{
|
||||
// memset_zero_256( vh0, 20 );
|
||||
// memset_zero_256( vh1, 20 );
|
||||
|
||||
// Positive logic: if mask_i is set, select vh_i.
// Going from a bit to a mask reverses that logic: if the test bit is set,
// zero is put in mask0, meaning vh0 is not taken; mask1 is the inverse,
// so it becomes all ones, meaning vh1 is taken.
|
||||
mask0 = mm256_negate_64(
|
||||
_mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
|
||||
mask1 = mm256_not( mask0 );
|
||||
|
||||
// mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
|
||||
// _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );
|
||||
|
||||
// groestl (serial) v skein
|
||||
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
|
||||
vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero );
|
||||
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash0,
|
||||
(char*)hash0, 512 );
|
||||
|
||||
(char*)hash0, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash1,
|
||||
(char*)hash1, 512 );
|
||||
|
||||
(char*)hash1, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash2,
|
||||
(char*)hash2, 512 );
|
||||
(char*)hash2, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash3,
|
||||
(char*)hash3, 512 );
|
||||
|
||||
mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
// skein
|
||||
(char*)hash3, 512 );
|
||||
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
skein512_4way_init( &ctx_skein );
|
||||
skein512_4way( &ctx_skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx_skein, vhash1 );
|
||||
skein512_4way_close( &ctx_skein, vhashB );
|
||||
|
||||
// merge vectored hash
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
{
|
||||
vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
|
||||
_mm256_and_si256( vh1[i], mask1 ) );
|
||||
/*
|
||||
vha256[i] = _mm256_maskload_epi64(
|
||||
vhasha + i*4, mm256_not( mask ) );
|
||||
vhb256[i] = _mm256_maskload_epi64(
|
||||
vhashb + i*4, mask );
|
||||
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
|
||||
*/
|
||||
}
|
||||
|
||||
// blake v jh
|
||||
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
|
||||
|
||||
blake512_4way_init( &ctx_blake );
|
||||
blake512_4way( &ctx_blake, vhash, 64 );
|
||||
blake512_4way_close( &ctx_blake, vhash0 );
|
||||
blake512_4way_close( &ctx_blake, vhashA );
|
||||
|
||||
jh512_4way_init( &ctx_jh );
|
||||
jh512_4way( &ctx_jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx_jh, vhash1 );
|
||||
jh512_4way_close( &ctx_jh, vhashB );
|
||||
|
||||
// merge hash
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
{
|
||||
vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
|
||||
_mm256_and_si256( vh1[i], mask1 ) );
|
||||
/*
|
||||
vha256[i] = _mm256_maskload_epi64(
|
||||
vhasha + i*4, mm256_not( mask ) );
|
||||
vhb256[i] = _mm256_maskload_epi64(
|
||||
vhashb + i*4, mask );
|
||||
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
|
||||
*/
|
||||
}
|
||||
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
|
||||
}
|
||||
|
||||
mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
|
||||
|
||||
// memcpy( output, hash0, 32 );
|
||||
// memcpy( output+32, hash1, 32 );
|
||||
// memcpy( output+64, hash2, 32 );
|
||||
// memcpy( output+96, hash3, 32 );
|
||||
|
||||
}
|
||||
|
||||
int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t n = pdata[19];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t n = pdata[19];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
|
||||
uint64_t htmax[] = {
|
||||
uint64_t htmax[] = {
|
||||
0,
|
||||
0xF,
|
||||
0xFF,
|
||||
@@ -163,7 +110,7 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
0xFFFF,
|
||||
0x10000000
|
||||
};
|
||||
uint32_t masks[] = {
|
||||
uint32_t masks[] = {
|
||||
0xFFFFFFFF,
|
||||
0xFFFFFFF0,
|
||||
0xFFFFFF00,
|
||||
@@ -172,17 +119,12 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
0
|
||||
};
|
||||
|
||||
// we need bigendian data...
|
||||
for ( int i=0; i < 19; i++ )
|
||||
be32enc( &endiandata[i], pdata[i] );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
// precalc midstate for keccak
|
||||
// keccak512_4way_init( &jha_kec_mid );
|
||||
// keccak512_4way( &jha_kec_mid, vdata, 64 );
|
||||
|
||||
for ( int m = 0; m < 6; m++ )
|
||||
{
|
||||
if ( Htarg <= htmax[m] )
|
||||
@@ -196,7 +138,6 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
jha_hash_4way( hash, vdata );
|
||||
|
||||
pdata[19] = n;
|
||||
|
||||
if ( ( !(hash[7] & mask) )
|
||||
@@ -234,11 +175,9 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
n += 4;
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce )
|
||||
&& !work_restart[thr_id].restart );
|
||||
|
||||
break;
|
||||
}
|
||||
}

*hashes_done = n - first_nonce + 1;
return num_found;
}
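scanhash_jha_4way keeps the original scanhash's htmax/masks ladder: the loop over m picks the first htmax entry that is not below Htarg, and the matching mask gives a cheap reject test on the top hash word before the full target comparison. A scalar sketch of that selection is shown below; the middle table rows fall inside the elided hunks above and are filled in by assumption, following the pattern used by other cpuminer scanhash functions.

// Scalar sketch of the target ladder used above (illustrative only; the
// 0xFFF / 0xFFFFF000 / 0xFFFF0000 rows are assumptions hidden by the hunk cuts).
#include <stdint.h>

static const uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };
static const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                                  0xFFFFF000, 0xFFFF0000, 0 };

// Select the coarsest pre-screen mask whose htmax still covers Htarg.
static uint32_t select_mask( uint32_t Htarg )
{
   for ( int m = 0; m < 6; m++ )
      if ( Htarg <= htmax[m] )
         return masks[m];
   return 0;
}

// A candidate only reaches fulltest() if every masked bit of hash[7] is zero.
static int quick_reject( uint32_t hash7, uint32_t mask )
{
   return ( hash7 & mask ) != 0;
}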
algo/lyra2/lyra2h-4way.c (new file, 128 lines)
@@ -0,0 +1,128 @@
|
||||
#include "lyra2h-gate.h"
|
||||
|
||||
#ifdef LYRA2H_4WAY
|
||||
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "lyra2.h"
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
|
||||
__thread uint64_t* lyra2h_4way_matrix;
|
||||
|
||||
bool lyra2h_4way_thread_init()
|
||||
{
|
||||
return ( lyra2h_4way_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
|
||||
}
|
||||
|
||||
static __thread blake256_4way_context l2h_4way_blake_mid;
|
||||
|
||||
void lyra2h_4way_midstate( const void* input )
|
||||
{
|
||||
blake256_4way_init( &l2h_4way_blake_mid );
|
||||
blake256_4way( &l2h_4way_blake_mid, input, 64 );
|
||||
}
|
||||
|
||||
void lyra2h_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint32_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
|
||||
|
||||
memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
|
||||
blake256_4way( &ctx_blake, input + (64*4), 16 );
|
||||
blake256_4way_close( &ctx_blake, vhash );
|
||||
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
|
||||
LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
|
||||
LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
|
||||
LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
|
||||
LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 76; // 19*4
|
||||
uint32_t *noncep1 = vdata + 77;
|
||||
uint32_t *noncep2 = vdata + 78;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
|
||||
if ( opt_benchmark )
|
||||
ptarget[7] = 0x0000ff;
|
||||
|
||||
for ( int i=0; i < 19; i++ )
|
||||
be32enc( &edata[i], pdata[i] );
|
||||
|
||||
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
lyra2h_4way_midstate( vdata );
|
||||
|
||||
do {
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
be32enc( &edata[19], n );
|
||||
lyra2h_4way_hash( hash, vdata );
|
||||
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = pdata[19] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( (num_found == 0) && (n < max_nonce-4)
|
||||
&& !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
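A note on the offsets used above: in a 4-lane, 32-bit interleaved buffer every word of the original 80-byte header occupies four consecutive words, one per lane. That is why the cached blake midstate is extended from input + (64*4) (per-lane byte offset 64) and why the per-lane nonce pointers sit at vdata + 76..79 (word 19). A small illustrative helper, not part of the commit:

#include <stdint.h>

// Address of per-lane word 'word' for lane 'lane' in a 4x32-bit interleaved
// buffer: per-lane word index W maps to interleaved index W*4 + lane, and a
// per-lane byte offset B maps to interleaved byte offset B*4.
static inline uint32_t *lane_word_ptr( uint32_t *vdata, int word, int lane )
{
   return vdata + word*4 + lane;     // word 19, lane 0 -> vdata + 76
}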
|
||||
|
algo/lyra2/lyra2h-gate.c (new file, 25 lines)
@@ -0,0 +1,25 @@
|
||||
#include "lyra2h-gate.h"
|
||||
#include "lyra2.h"
|
||||
|
||||
void lyra2h_set_target( struct work* work, double job_diff )
|
||||
{
|
||||
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
|
||||
}
|
||||
|
||||
bool register_lyra2h_algo( algo_gate_t* gate )
|
||||
{
|
||||
#ifdef LYRA2H_4WAY
|
||||
gate->miner_thread_init = (void*)&lyra2h_4way_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2h_4way;
|
||||
gate->hash = (void*)&lyra2h_4way_hash;
|
||||
#else
|
||||
gate->miner_thread_init = (void*)&lyra2h_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2h;
|
||||
gate->hash = (void*)&lyra2h_hash;
|
||||
#endif
|
||||
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0xffffLL;
|
||||
gate->set_target = (void*)&lyra2h_set_target;
|
||||
return true;
|
||||
};
|
||||
|
algo/lyra2/lyra2h-gate.h (new file, 32 lines)
@@ -0,0 +1,32 @@
|
||||
#ifndef LYRA2H_GATE_H__
|
||||
#define LYRA2H_GATE_H__
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(HASH_4WAY)
|
||||
#define LYRA2H_4WAY
|
||||
#endif
|
||||
|
||||
#define LYRA2H_MATRIX_SIZE BLOCK_LEN_INT64 * 16 * 16 * 8
|
||||
|
||||
#if defined(LYRA2H_4WAY)
|
||||
|
||||
void lyra2h_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
bool lyra2h_4way_thread_init();
|
||||
|
||||
#endif
|
||||
|
||||
void lyra2h_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
bool lyra2h_thread_init();
|
||||
|
||||
#endif
|
||||
|
algo/lyra2/lyra2h.c (new file, 75 lines)
@@ -0,0 +1,75 @@
|
||||
#include "lyra2h-gate.h"
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "lyra2.h"
|
||||
#include "algo/blake/sph_blake.h"
|
||||
|
||||
__thread uint64_t* lyra2h_matrix;
|
||||
|
||||
bool lyra2h_thread_init()
|
||||
{
|
||||
lyra2h_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 );
|
||||
return lyra2h_matrix;
|
||||
}
|
||||
|
||||
static __thread sph_blake256_context lyra2h_blake_mid;
|
||||
|
||||
void lyra2h_midstate( const void* input )
|
||||
{
|
||||
sph_blake256_init( &lyra2h_blake_mid );
|
||||
sph_blake256( &lyra2h_blake_mid, input, 64 );
|
||||
}
|
||||
|
||||
void lyra2h_hash( void *state, const void *input )
|
||||
{
|
||||
uint32_t _ALIGN(64) hash[16];
|
||||
|
||||
sph_blake256_context ctx_blake __attribute__ ((aligned (64)));
|
||||
|
||||
memcpy( &ctx_blake, &lyra2h_blake_mid, sizeof lyra2h_blake_mid );
|
||||
sph_blake256( &ctx_blake, input + 64, 16 );
|
||||
sph_blake256_close( &ctx_blake, hash );
|
||||
|
||||
LYRA2Z( lyra2h_matrix, hash, 32, hash, 32, hash, 32, 16, 16, 16 );
|
||||
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
|
||||
if (opt_benchmark)
|
||||
ptarget[7] = 0x0000ff;
|
||||
|
||||
for (int i=0; i < 19; i++) {
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
}
|
||||
|
||||
lyra2h_midstate( endiandata );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], nonce);
|
||||
lyra2h_hash( hash, endiandata );
|
||||
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
work_set_target_ratio(work, hash);
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 1;
|
||||
}
|
||||
nonce++;
|
||||
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
@@ -106,6 +106,7 @@ int scanhash_lyra2re(int thr_id, struct work *work,
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
work_set_target_ratio( work, hash );
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
algo/lyra2/lyra2rev2-4way.c (new file, 177 lines)
@@ -0,0 +1,177 @@
|
||||
#include "lyra2rev2-gate.h"
|
||||
#include <memory.h>
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
|
||||
typedef struct {
|
||||
blake256_4way_context blake;
|
||||
keccak256_4way_context keccak;
|
||||
cubehashParam cube;
|
||||
skein256_4way_context skein;
|
||||
sph_bmw256_context bmw;
|
||||
|
||||
} lyra2v2_4way_ctx_holder;
|
||||
|
||||
static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
|
||||
|
||||
void init_lyra2rev2_4way_ctx()
|
||||
{
|
||||
// blake256_4way_init( &l2v2_4way_ctx.blake );
|
||||
keccak256_4way_init( &l2v2_4way_ctx.keccak );
|
||||
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
|
||||
skein256_4way_init( &l2v2_4way_ctx.skein );
|
||||
sph_bmw256_init( &l2v2_4way_ctx.bmw );
|
||||
}
|
||||
|
||||
void lyra2rev2_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint32_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash3[8] __attribute__ ((aligned (32)));
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash64[4*4] __attribute__ ((aligned (64)));
|
||||
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
|
||||
|
||||
blake256_4way( &ctx.blake, input + (64<<2), 16 );
|
||||
// blake256_4way( &ctx.blake, input, 80 );
|
||||
blake256_4way_close( &ctx.blake, vhash );
|
||||
|
||||
mm256_reinterleave_4x64( vhash64, vhash, 256 );
|
||||
keccak256_4way( &ctx.keccak, vhash64, 32 );
|
||||
keccak256_4way_close( &ctx.keccak, vhash64 );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
|
||||
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
|
||||
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
|
||||
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
|
||||
|
||||
LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
|
||||
LYRA2REV2( l2v2_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
|
||||
LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
|
||||
LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
|
||||
|
||||
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
|
||||
skein256_4way( &ctx.skein, vhash64, 32 );
|
||||
skein256_4way_close( &ctx.skein, vhash64 );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
||||
|
||||
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
|
||||
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
|
||||
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
|
||||
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
|
||||
|
||||
|
||||
sph_bmw256( &ctx.bmw, hash0, 32 );
|
||||
sph_bmw256_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
|
||||
sph_bmw256( &ctx.bmw, hash1, 32 );
|
||||
sph_bmw256_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
|
||||
sph_bmw256( &ctx.bmw, hash2, 32 );
|
||||
sph_bmw256_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
|
||||
sph_bmw256( &ctx.bmw, hash3, 32 );
|
||||
sph_bmw256_close( &ctx.bmw, hash3 );
|
||||
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 76; // 19*4
|
||||
uint32_t *noncep1 = vdata + 77;
|
||||
uint32_t *noncep2 = vdata + 78;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
|
||||
if ( opt_benchmark )
|
||||
( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
swab32_array( edata, pdata, 20 );
|
||||
|
||||
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
blake256_4way_init( &l2v2_4way_ctx.blake );
|
||||
blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
|
||||
|
||||
do {
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
lyra2rev2_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = pdata[19] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( (num_found == 0) && (n < max_nonce-4)
|
||||
&& !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
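In lyra2rev2_4way_hash above only blake, keccak and skein run vectored; cubehash, LYRA2REV2 and bmw still run one lane at a time, with the pristine context restored by memcpy from the static holder instead of re-running the init function each time. A sketch of that reset-by-copy pattern, assuming only the cubehash_sse2 API already used in this tree (illustrative, not the commit's code):

#include <string.h>
#include "algo/cubehash/sse2/cubehash_sse2.h"

// Run the serial cubehash pass over four per-lane hashes. 'pristine' holds a
// context initialised once with cubehashInit( ..., 256, 16, 32 ); copying it
// back before each lane is cheaper than re-initialising from scratch.
static void cubehash_4lanes( const cubehashParam *pristine,
                             void *h0, void *h1, void *h2, void *h3 )
{
   cubehashParam c;
   void *lane[4] = { h0, h1, h2, h3 };

   for ( int i = 0; i < 4; i++ )
   {
      memcpy( &c, pristine, sizeof c );   // restore the initial state
      cubehashUpdateDigest( &c, (byte*)lane[i], (const byte*)lane[i], 32 );
   }
}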
|
algo/lyra2/lyra2rev2-gate.c (new file, 38 lines)
@@ -0,0 +1,38 @@
|
||||
#include "lyra2rev2-gate.h"
|
||||
|
||||
__thread uint64_t* l2v2_wholeMatrix;
|
||||
|
||||
void lyra2rev2_set_target( struct work* work, double job_diff )
|
||||
{
|
||||
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
|
||||
}
|
||||
|
||||
bool lyra2rev2_thread_init()
|
||||
{
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
|
||||
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
|
||||
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
|
||||
l2v2_wholeMatrix = _mm_malloc( i, 64 );
|
||||
|
||||
return l2v2_wholeMatrix;
|
||||
}
|
||||
|
||||
bool register_lyra2rev2_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (LYRA2REV2_4WAY)
|
||||
init_lyra2rev2_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
|
||||
gate->hash = (void*)&lyra2rev2_4way_hash;
|
||||
#else
|
||||
init_lyra2rev2_ctx();
|
||||
gate->scanhash = (void*)&scanhash_lyra2rev2;
|
||||
gate->hash = (void*)&lyra2rev2_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
|
||||
gate->set_target = (void*)&lyra2rev2_set_target;
|
||||
return true;
|
||||
};
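For reference, the matrix sizing done in lyra2rev2_thread_init above is the same arithmetic written out below with size_t. BLOCK_LEN_INT64 is taken from lyra2.h and its value (12) is an assumption here; the sketch is purely illustrative.

#include <stddef.h>
#include <mm_malloc.h>

#define BLOCK_LEN_INT64 12   // assumption: value defined in lyra2.h

static void *alloc_l2v2_matrix( void )
{
   const size_t row_len_int64 = BLOCK_LEN_INT64 * 4;   // nCols = 4
   const size_t row_len_bytes = row_len_int64 * 8;     // 8 bytes per uint64
   const size_t matrix_bytes  = row_len_bytes * 4;     // nRows = 4
   return _mm_malloc( matrix_bytes, 64 );              // 64-byte aligned for AVX
}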
|
||||
|
||||
|
algo/lyra2/lyra2rev2-gate.h (new file, 35 lines)
@@ -0,0 +1,35 @@
|
||||
#ifndef LYRA2REV2_GATE_H__
|
||||
#define LYRA2REV2_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
#include "lyra2.h"
|
||||
|
||||
#if defined(HASH_4WAY)
|
||||
#define LYRA2REV2_4WAY
|
||||
#endif
|
||||
|
||||
extern __thread uint64_t* l2v2_wholeMatrix;
|
||||
|
||||
bool register_lyra2rev2_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(LYRA2REV2_4WAY)
|
||||
|
||||
void lyra2rev2_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_lyra2rev2_4way_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
void lyra2rev2_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_lyra2rev2_ctx();
|
||||
|
||||
#endif
|
||||
|
@@ -1,20 +1,12 @@
|
||||
#include "lyra2rev2-gate.h"
|
||||
#include <memory.h>
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "lyra2.h"
|
||||
#include "avxdefs.h"
|
||||
|
||||
// This is allocated when miner_thread starts up and is never freed.
// It is not a leak: the only way it could be allocated again is for the
// thread to exit, and that only happens when the entire program exits.
|
||||
__thread uint64_t* l2v2_wholeMatrix;
|
||||
//#include "lyra2.h"
|
||||
|
||||
typedef struct {
|
||||
cubehashParam cube1;
|
||||
@@ -106,6 +98,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
|
||||
if( fulltest(hash, ptarget) )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
work_set_target_ratio( work, hash );
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 1;
|
||||
}
|
||||
@@ -119,30 +112,3 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
|
||||
return 0;
|
||||
}
|
||||
|
||||
void lyra2rev2_set_target( struct work* work, double job_diff )
|
||||
{
|
||||
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
|
||||
}
|
||||
|
||||
bool lyra2rev2_thread_init()
|
||||
{
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
|
||||
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
|
||||
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
|
||||
l2v2_wholeMatrix = _mm_malloc( i, 64 );
|
||||
|
||||
return l2v2_wholeMatrix;
|
||||
}
|
||||
|
||||
bool register_lyra2rev2_algo( algo_gate_t* gate )
|
||||
{
|
||||
init_lyra2rev2_ctx();
|
||||
gate->optimizations = AVX_OPT | AVX2_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2rev2;
|
||||
gate->hash = (void*)&lyra2rev2_hash;
|
||||
gate->set_target = (void*)&lyra2rev2_set_target;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -4,13 +4,10 @@
|
||||
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
//#include "algo-gate-api.h"
|
||||
#include "lyra2.h"
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
//#include "avxdefs.h"
|
||||
|
||||
// Same size; the only difference is the name. Lyra2 itself is done serially.
|
||||
__thread uint64_t* lyra2z_4way_matrix;
|
||||
|
||||
bool lyra2z_4way_thread_init()
|
||||
@@ -26,12 +23,8 @@ void lyra2z_4way_midstate( const void* input )
|
||||
blake256_4way( &l2z_4way_blake_mid, input, 64 );
|
||||
}
|
||||
|
||||
// From block 2050 a new algo is used: blake plus new lyra2 parameters.
// The new input size is a power of 2, so the normal lyra2 code can be used.
|
||||
//void zcoin_hash(void *state, const void *input, uint32_t height)
|
||||
void lyra2z_4way_hash( void *state, const void *input )
|
||||
{
|
||||
// uint32_t _ALIGN(64) hash[16];
|
||||
uint32_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2[8] __attribute__ ((aligned (64)));
|
||||
@@ -39,27 +32,21 @@ void lyra2z_4way_hash( void *state, const void *input )
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
|
||||
|
||||
// memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
|
||||
// blake256_4way( &ctx_blake, input + (64*4), 16 );
|
||||
// blake256_4way_close( &ctx_blake, vhash );
|
||||
|
||||
blake256_4way_init( &ctx_blake );
|
||||
blake256_4way( &ctx_blake, input, 80 );
|
||||
memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
|
||||
blake256_4way( &ctx_blake, input + (64*4), 16 );
|
||||
blake256_4way_close( &ctx_blake, vhash );
|
||||
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
|
||||
LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
|
||||
// LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
|
||||
LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
|
||||
LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
|
||||
// LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
|
||||
LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
|
||||
// memcpy(state, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
@@ -67,7 +54,6 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
// uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
@@ -90,7 +76,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
|
||||
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
// lyra2z_4way_midstate( vdata );
|
||||
lyra2z_4way_midstate( vdata );
|
||||
|
||||
do {
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
@@ -99,46 +85,38 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
be32enc( &edata[19], n );
|
||||
lyra2z_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
|
||||
{
|
||||
printf("found 0\n");
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = pdata[19] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
/* if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
|
||||
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
printf("found 1\n");
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
*/
|
||||
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
printf("found 2\n");
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
/*
|
||||
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
printf("found 3\n");
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
*/
|
||||
n += 2;
|
||||
} while ( (num_found == 0) && (n < max_nonce-4)
|
||||
&& !work_restart[thr_id].restart);
|
||||
|
||||
@@ -148,21 +126,3 @@ printf("found 3\n");
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
work_set_target_ratio(work, hash);
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 1;
|
||||
}
|
||||
nonce++;
|
||||
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
|
@@ -9,18 +9,15 @@ void lyra2z_set_target( struct work* work, double job_diff )
|
||||
bool register_lyra2z_algo( algo_gate_t* gate )
|
||||
{
|
||||
#ifdef LYRA2Z_4WAY
|
||||
four_way_not_tested();
|
||||
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z_4way;
|
||||
gate->hash = (void*)&lyra2z_4way_hash;
|
||||
#else
|
||||
gate->optimizations = AVX_OPT | AVX2_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2z_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z;
|
||||
gate->hash = (void*)&lyra2z_hash;
|
||||
#endif
|
||||
|
||||
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0xffffLL;
|
||||
gate->set_target = (void*)&lyra2z_set_target;
|
||||
return true;
|
||||
|
@@ -82,41 +82,3 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
//int64_t get_max64_0xffffLL() { return 0xffffLL; };
|
||||
|
||||
void lyra2z_set_target( struct work* work, double job_diff )
|
||||
{
|
||||
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
|
||||
}
|
||||
|
||||
bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
|
||||
{
|
||||
work->height = sctx->bloc_height;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool lyra2z_thread_init()
|
||||
{
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
|
||||
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
|
||||
int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
|
||||
lyra2z_wholeMatrix = _mm_malloc( i, 64 );
|
||||
|
||||
return lyra2z_wholeMatrix;
|
||||
}
|
||||
|
||||
bool register_lyra2z_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2z_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z;
|
||||
gate->hash = (void*)&lyra2z_hash;
|
||||
gate->get_max64 = (void*)&get_max64_0xffffLL;
|
||||
gate->set_target = (void*)&lyra2z_set_target;
|
||||
// gate->prevent_dupes = (void*)&zcoin_get_work_height;
|
||||
return true;
|
||||
};
|
||||
*/
|
||||
|
@@ -65,13 +65,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
|
||||
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm256_rotl256_1x64( s1); \
|
||||
s1 = mm256_rotr256_1x64( s1); \
|
||||
s2 = mm256_swap_128( s2 ); \
|
||||
s3 = mm256_rotr256_1x64( s3 ); \
|
||||
s3 = mm256_rotl256_1x64( s3 ); \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm256_rotr256_1x64( s1 ); \
|
||||
s1 = mm256_rotl256_1x64( s1 ); \
|
||||
s2 = mm256_swap_128( s2 ); \
|
||||
s3 = mm256_rotl256_1x64( s3 );
|
||||
s3 = mm256_rotr256_1x64( s3 );
|
||||
|
||||
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
|
@@ -346,6 +346,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
|
||||
hash_str,
|
||||
target_str);
|
||||
}
|
||||
work_set_target_ratio( work, hash );
|
||||
pdata[19] = data[19];
|
||||
goto out;
|
||||
}
|
||||
|
@@ -2,7 +2,7 @@
|
||||
|
||||
bool register_nist5_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
#if defined (NIST5_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_nist5_4way;
|
||||
gate->hash = (void*)&nist5hash_4way;
|
||||
|
@@ -4,7 +4,7 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
|
||||
#if defined(HASH_4WAY) && defined(__AES__)
|
||||
#define NIST5_4WAY
|
||||
#endif
|
||||
|
||||
|
@@ -132,6 +132,7 @@ int scanhash_nist5(int thr_id, struct work *work,
|
||||
if (!(hash64[7] & mask)) {
|
||||
printf("[%d]",thr_id);
|
||||
if (fulltest(hash64, ptarget)) {
|
||||
work_set_target_ratio( work, hash64 );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
|
@@ -172,6 +172,7 @@ int scanhash_zr5( int thr_id, struct work *work,
|
||||
pdata[0] = tmpdata[0];
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
work_set_target_ratio( work, hash );
|
||||
if (opt_debug)
|
||||
applog(LOG_INFO, "found nonce %x", nonce);
|
||||
return 1;
|
@@ -1,12 +0,0 @@
|
||||
#ifndef __POLYTIMOS_GATE_H__
|
||||
#define __POLYTIMOS_GATE_H__
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
void polytimos_hash( void *state, const void *input );
|
||||
int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
void init_polytimos_context();
|
||||
|
||||
#endif
|
algo/quark/quark-4way.c (new file, 207 lines)
@@ -0,0 +1,207 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "quark-gate.h"
|
||||
|
||||
#if defined (__AVX2__) && defined (__AES__)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
jh512_4way_context jh;
|
||||
skein512_4way_context skein;
|
||||
keccak512_4way_context keccak;
|
||||
} quark_4way_ctx_holder;
|
||||
|
||||
quark_4way_ctx_holder quark_4way_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void init_quark_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &quark_4way_ctx.blake );
|
||||
bmw512_4way_init( &quark_4way_ctx.bmw );
|
||||
init_groestl( &quark_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &quark_4way_ctx.skein );
|
||||
jh512_4way_init( &quark_4way_ctx.jh );
|
||||
keccak512_4way_init( &quark_4way_ctx.keccak );
|
||||
}
|
||||
|
||||
void quark_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
|
||||
__m256i* vh = (__m256i*)vhash;
|
||||
__m256i* vhA = (__m256i*)vhashA;
|
||||
__m256i* vhB = (__m256i*)vhashB;
|
||||
__m256i vh_mask;
|
||||
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
|
||||
int i;
|
||||
quark_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
|
||||
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
|
||||
mm256_zero );
|
||||
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0,
|
||||
(char*)hash0, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1,
|
||||
(char*)hash1, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2,
|
||||
(char*)hash2, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3,
|
||||
(char*)hash3, 512 );
|
||||
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhashB );
|
||||
|
||||
for ( i = 0; i < 8; i++ )
|
||||
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
|
||||
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
|
||||
mm256_zero );
|
||||
|
||||
blake512_4way_init( &ctx.blake );
|
||||
blake512_4way( &ctx.blake, vhash, 64 );
|
||||
blake512_4way_close( &ctx.blake, vhashA );
|
||||
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhashB );
|
||||
|
||||
for ( i = 0; i < 8; i++ )
|
||||
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
|
||||
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
skein512_4way_init( &ctx.skein );
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
|
||||
mm256_zero );
|
||||
|
||||
keccak512_4way_init( &ctx.keccak );
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhashA );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhashB );
|
||||
|
||||
for ( i = 0; i < 8; i++ )
|
||||
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
|
||||
|
||||
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
|
||||
}
|
||||
|
||||
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done)
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
do
|
||||
{
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
quark_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( ( hash[7] & 0xFFFFFF00 ) == 0 && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( ( (hash+8)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( ( (hash+16)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( ( (hash+24)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce )
|
||||
&& !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
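quark_4way_hash above resolves quark's data-dependent branching without breaking the 4-way layout: both branch results are computed for every lane, then merged per lane with _mm256_blendv_epi8 under a mask built by _mm256_cmpeq_epi64. A self-contained illustration of that select, assuming only an AVX2 compiler (not part of this commit):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   // Pretend per-lane hash words: lanes 0 and 2 have bit 3 clear, 1 and 3 set.
   __m256i h    = _mm256_set_epi64x( 8, 0, 8, 0 );
   __m256i bit3 = _mm256_set1_epi64x( 8 );
   __m256i zero = _mm256_setzero_si256();

   // Mask lane is all ones where (h & 8) == 0, i.e. where branch B should win.
   __m256i mask = _mm256_cmpeq_epi64( _mm256_and_si256( h, bit3 ), zero );

   __m256i a = _mm256_set1_epi64x( (int64_t)0xAAAAAAAAAAAAAAAAULL ); // branch A (bit set)
   __m256i b = _mm256_set1_epi64x( (int64_t)0xBBBBBBBBBBBBBBBBULL ); // branch B (bit clear)

   // blendv takes bytes from b where the mask byte's top bit is 1, else from a.
   __m256i r = _mm256_blendv_epi8( a, b, mask );

   uint64_t out[4];
   _mm256_storeu_si256( (__m256i*)out, r );
   for ( int i = 0; i < 4; i++ )
      printf( "lane %d: %016llx\n", i, (unsigned long long)out[i] );
   return 0;
}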
|
algo/quark/quark-gate.c (new file, 17 lines)
@@ -0,0 +1,17 @@
|
||||
#include "quark-gate.h"
|
||||
|
||||
bool register_quark_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (QUARK_4WAY)
|
||||
init_quark_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_quark_4way;
|
||||
gate->hash = (void*)&quark_4way_hash;
|
||||
#else
|
||||
init_quark_ctx();
|
||||
gate->scanhash = (void*)&scanhash_quark;
|
||||
gate->hash = (void*)&quark_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
return true;
|
||||
};
|
||||
|
algo/quark/quark-gate.h (new file, 32 lines)
@@ -0,0 +1,32 @@
|
||||
#ifndef QUARK_GATE_H__
|
||||
#define QUARK_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(HASH_4WAY) && defined(__AES__)
|
||||
#define QUARK_4WAY
|
||||
#endif
|
||||
|
||||
bool register_quark_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(QUARK_4WAY)
|
||||
|
||||
void quark_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_quark_4way_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
void quark_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_quark_ctx();
|
||||
|
||||
#endif
|
||||
|
@@ -1,5 +1,5 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "algo-gate-api.h"
|
||||
#include "quark-gate.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
@@ -47,7 +47,7 @@ void init_quark_ctx()
|
||||
#endif
|
||||
}
|
||||
|
||||
inline static void quarkhash(void *state, const void *input)
|
||||
void quark_hash(void *state, const void *input)
|
||||
{
|
||||
unsigned char hashbuf[128];
|
||||
size_t hashptr;
|
||||
@@ -187,11 +187,12 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
do {
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
quarkhash(hash64, &endiandata);
|
||||
quark_hash(hash64, &endiandata);
|
||||
if ((hash64[7]&0xFFFFFF00)==0)
|
||||
{
|
||||
if (fulltest(hash64, ptarget))
|
||||
{
|
||||
work_set_target_ratio( work, hash64 );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
@@ -203,12 +204,3 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_quark_algo( algo_gate_t* gate )
|
||||
{
|
||||
init_quark_ctx();
|
||||
gate->optimizations = SSE2_OPT | AES_OPT;
|
||||
gate->scanhash = (void*)&scanhash_quark;
|
||||
gate->hash = (void*)&quarkhash;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -1,31 +1,20 @@
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/luffa/sph_luffa.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#else
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
|
||||
typedef struct
|
||||
{
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cubehash;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
#ifdef NO_AES_NI
|
||||
sph_echo512_context echo;
|
||||
#else
|
||||
@@ -133,6 +122,7 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
if (!(hash64[7] & mask)) {
|
||||
printf("[%d]",thr_id);
|
||||
if (fulltest(hash64, ptarget)) {
|
||||
work_set_target_ratio( work, hash64 );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
|
@@ -1,23 +1,16 @@
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/luffa/sph_luffa.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#else
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
|
||||
typedef struct
|
||||
@@ -141,6 +134,7 @@ int scanhash_qubit(int thr_id, struct work *work,
|
||||
if (!(hash64[7] & mask)) {
|
||||
printf("[%d]",thr_id);
|
||||
if (fulltest(hash64, ptarget)) {
|
||||
work_set_target_ratio( work, hash64 );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
|
@@ -754,6 +754,7 @@ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
if (unlikely(hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget))) {
|
||||
*hashes_done = n - pdata[19] + 1;
|
||||
pdata[19] = data[i * 20 + 19];
|
||||
work_set_target_ratio( work, hash );
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
@@ -114,7 +114,7 @@ available_implementations() {
|
||||
return flags;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
static int
|
||||
scrypt_test_mix() {
|
||||
static const uint8_t expected[16] = {
|
||||
@@ -145,4 +145,4 @@ scrypt_test_mix() {
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
*/
|
||||
|
@@ -26,7 +26,7 @@
|
||||
#include "scrypt-jane-pbkdf2.h"
|
||||
|
||||
#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */
|
||||
|
||||
/*
|
||||
static int
|
||||
scrypt_test_hash() {
|
||||
scrypt_hash_state st;
|
||||
@@ -45,4 +45,4 @@ scrypt_test_hash() {
|
||||
scrypt_hash_finish(&st, final);
|
||||
return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
|
||||
}
|
||||
|
||||
*/
|
||||
|
@@ -36,15 +36,15 @@ void sha256t_hash(void* output, const void* input, uint32_t len)
    memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );

    SHA256_Update( &ctx_sha256, input + midlen, tail );
    SHA256_Final( hashA, &ctx_sha256 );
    SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );

    memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
    SHA256_Update( &ctx_sha256, hashA, 32 );
    SHA256_Final( hashA, &ctx_sha256 );
    SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );

    memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
    SHA256_Update( &ctx_sha256, hashA, 32 );
    SHA256_Final( hashA, &ctx_sha256 );
    SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
#else
    sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
    memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );
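For context, the sha256t_hash() hunk above is a triple-SHA-256 chain, with hashA now cast to unsigned char* to match OpenSSL's SHA256_Final() prototype. Below is a self-contained sketch of the same chain without the cached midstate (sha256t_mid); it uses only the standard OpenSSL SHA256_* calls, and the midstate reuse and vectorised paths of the real code are omitted.

// Minimal triple-SHA256 sketch: out = SHA256(SHA256(SHA256(data))).
// The real sha256t_hash() additionally restores a precomputed midstate
// instead of rehashing the first part of the input every time.
#include <stdio.h>
#include <stddef.h>
#include <openssl/sha.h>

static void sha256t( unsigned char out[32], const void *data, size_t len )
{
    SHA256_CTX ctx;
    unsigned char h[32];

    SHA256_Init( &ctx );                // pass 1: raw input
    SHA256_Update( &ctx, data, len );
    SHA256_Final( h, &ctx );

    SHA256_Init( &ctx );                // pass 2: hash the 32-byte digest
    SHA256_Update( &ctx, h, 32 );
    SHA256_Final( h, &ctx );

    SHA256_Init( &ctx );                // pass 3
    SHA256_Update( &ctx, h, 32 );
    SHA256_Final( out, &ctx );
}

int main(void)
{
    static const unsigned char header[80] = { 0 };  // dummy 80-byte block header
    unsigned char out[32];
    sha256t( out, header, sizeof header );
    for ( int i = 0; i < 32; i++ ) printf( "%02x", out[i] );
    printf( "\n" );
    return 0;
}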
618
algo/shabal/shabal-hash-4way.c
Normal file
@@ -0,0 +1,618 @@
|
||||
/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
|
||||
/*
|
||||
* Shabal implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include "shabal-hash-4way.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Part of this code was automatically generated (the part between
|
||||
* the "BEGIN" and "END" markers).
|
||||
*/
|
||||
|
||||
#define sM 16
|
||||
|
||||
#define C32 SPH_C32
|
||||
#define T32 SPH_T32
|
||||
|
||||
#define O1 13
|
||||
#define O2 9
|
||||
#define O3 6
|
||||
|
||||
/*
|
||||
* We copy the state into local variables, so that the compiler knows
|
||||
* that it can optimize them at will.
|
||||
*/
|
||||
|
||||
/* BEGIN -- automatically generated code. */
|
||||
|
||||
#define DECL_STATE \
|
||||
__m128i A00, A01, A02, A03, A04, A05, A06, A07, \
|
||||
A08, A09, A0A, A0B; \
|
||||
__m128i B0, B1, B2, B3, B4, B5, B6, B7, \
|
||||
B8, B9, BA, BB, BC, BD, BE, BF; \
|
||||
__m128i C0, C1, C2, C3, C4, C5, C6, C7, \
|
||||
C8, C9, CA, CB, CC, CD, CE, CF; \
|
||||
__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
|
||||
M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
sph_u32 Wlow, Whigh;
|
||||
|
||||
#define READ_STATE(state) do { \
|
||||
A00 = (state)->A[0]; \
|
||||
A01 = (state)->A[1]; \
|
||||
A02 = (state)->A[2]; \
|
||||
A03 = (state)->A[3]; \
|
||||
A04 = (state)->A[4]; \
|
||||
A05 = (state)->A[5]; \
|
||||
A06 = (state)->A[6]; \
|
||||
A07 = (state)->A[7]; \
|
||||
A08 = (state)->A[8]; \
|
||||
A09 = (state)->A[9]; \
|
||||
A0A = (state)->A[10]; \
|
||||
A0B = (state)->A[11]; \
|
||||
B0 = (state)->B[0]; \
|
||||
B1 = (state)->B[1]; \
|
||||
B2 = (state)->B[2]; \
|
||||
B3 = (state)->B[3]; \
|
||||
B4 = (state)->B[4]; \
|
||||
B5 = (state)->B[5]; \
|
||||
B6 = (state)->B[6]; \
|
||||
B7 = (state)->B[7]; \
|
||||
B8 = (state)->B[8]; \
|
||||
B9 = (state)->B[9]; \
|
||||
BA = (state)->B[10]; \
|
||||
BB = (state)->B[11]; \
|
||||
BC = (state)->B[12]; \
|
||||
BD = (state)->B[13]; \
|
||||
BE = (state)->B[14]; \
|
||||
BF = (state)->B[15]; \
|
||||
C0 = (state)->C[0]; \
|
||||
C1 = (state)->C[1]; \
|
||||
C2 = (state)->C[2]; \
|
||||
C3 = (state)->C[3]; \
|
||||
C4 = (state)->C[4]; \
|
||||
C5 = (state)->C[5]; \
|
||||
C6 = (state)->C[6]; \
|
||||
C7 = (state)->C[7]; \
|
||||
C8 = (state)->C[8]; \
|
||||
C9 = (state)->C[9]; \
|
||||
CA = (state)->C[10]; \
|
||||
CB = (state)->C[11]; \
|
||||
CC = (state)->C[12]; \
|
||||
CD = (state)->C[13]; \
|
||||
CE = (state)->C[14]; \
|
||||
CF = (state)->C[15]; \
|
||||
Wlow = (state)->Wlow; \
|
||||
Whigh = (state)->Whigh; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE(state) do { \
|
||||
(state)->A[0] = A00; \
|
||||
(state)->A[1] = A01; \
|
||||
(state)->A[2] = A02; \
|
||||
(state)->A[3] = A03; \
|
||||
(state)->A[4] = A04; \
|
||||
(state)->A[5] = A05; \
|
||||
(state)->A[6] = A06; \
|
||||
(state)->A[7] = A07; \
|
||||
(state)->A[8] = A08; \
|
||||
(state)->A[9] = A09; \
|
||||
(state)->A[10] = A0A; \
|
||||
(state)->A[11] = A0B; \
|
||||
(state)->B[0] = B0; \
|
||||
(state)->B[1] = B1; \
|
||||
(state)->B[2] = B2; \
|
||||
(state)->B[3] = B3; \
|
||||
(state)->B[4] = B4; \
|
||||
(state)->B[5] = B5; \
|
||||
(state)->B[6] = B6; \
|
||||
(state)->B[7] = B7; \
|
||||
(state)->B[8] = B8; \
|
||||
(state)->B[9] = B9; \
|
||||
(state)->B[10] = BA; \
|
||||
(state)->B[11] = BB; \
|
||||
(state)->B[12] = BC; \
|
||||
(state)->B[13] = BD; \
|
||||
(state)->B[14] = BE; \
|
||||
(state)->B[15] = BF; \
|
||||
(state)->C[0] = C0; \
|
||||
(state)->C[1] = C1; \
|
||||
(state)->C[2] = C2; \
|
||||
(state)->C[3] = C3; \
|
||||
(state)->C[4] = C4; \
|
||||
(state)->C[5] = C5; \
|
||||
(state)->C[6] = C6; \
|
||||
(state)->C[7] = C7; \
|
||||
(state)->C[8] = C8; \
|
||||
(state)->C[9] = C9; \
|
||||
(state)->C[10] = CA; \
|
||||
(state)->C[11] = CB; \
|
||||
(state)->C[12] = CC; \
|
||||
(state)->C[13] = CD; \
|
||||
(state)->C[14] = CE; \
|
||||
(state)->C[15] = CF; \
|
||||
(state)->Wlow = Wlow; \
|
||||
(state)->Whigh = Whigh; \
|
||||
} while (0)
|
||||
|
||||
#define DECODE_BLOCK \
|
||||
do { \
|
||||
M0 = buf[ 0]; \
|
||||
M1 = buf[ 1]; \
|
||||
M2 = buf[ 2]; \
|
||||
M3 = buf[ 3]; \
|
||||
M4 = buf[ 4]; \
|
||||
M5 = buf[ 5]; \
|
||||
M6 = buf[ 6]; \
|
||||
M7 = buf[ 7]; \
|
||||
M8 = buf[ 8]; \
|
||||
M9 = buf[ 9]; \
|
||||
MA = buf[10]; \
|
||||
MB = buf[11]; \
|
||||
MC = buf[12]; \
|
||||
MD = buf[13]; \
|
||||
ME = buf[14]; \
|
||||
MF = buf[15]; \
|
||||
} while (0)
|
||||
|
||||
#define INPUT_BLOCK_ADD \
|
||||
do { \
|
||||
B0 = _mm_add_epi32( B0, M0 );\
|
||||
B1 = _mm_add_epi32( B1, M1 );\
|
||||
B2 = _mm_add_epi32( B2, M2 );\
|
||||
B3 = _mm_add_epi32( B3, M3 );\
|
||||
B4 = _mm_add_epi32( B4, M4 );\
|
||||
B5 = _mm_add_epi32( B5, M5 );\
|
||||
B6 = _mm_add_epi32( B6, M6 );\
|
||||
B7 = _mm_add_epi32( B7, M7 );\
|
||||
B8 = _mm_add_epi32( B8, M8 );\
|
||||
B9 = _mm_add_epi32( B9, M9 );\
|
||||
BA = _mm_add_epi32( BA, MA );\
|
||||
BB = _mm_add_epi32( BB, MB );\
|
||||
BC = _mm_add_epi32( BC, MC );\
|
||||
BD = _mm_add_epi32( BD, MD );\
|
||||
BE = _mm_add_epi32( BE, ME );\
|
||||
BF = _mm_add_epi32( BF, MF );\
|
||||
} while (0)
|
||||
|
||||
#define INPUT_BLOCK_SUB \
|
||||
do { \
|
||||
C0 = _mm_sub_epi32( C0, M0 ); \
|
||||
C1 = _mm_sub_epi32( C1, M1 ); \
|
||||
C2 = _mm_sub_epi32( C2, M2 ); \
|
||||
C3 = _mm_sub_epi32( C3, M3 ); \
|
||||
C4 = _mm_sub_epi32( C4, M4 ); \
|
||||
C5 = _mm_sub_epi32( C5, M5 ); \
|
||||
C6 = _mm_sub_epi32( C6, M6 ); \
|
||||
C7 = _mm_sub_epi32( C7, M7 ); \
|
||||
C8 = _mm_sub_epi32( C8, M8 ); \
|
||||
C9 = _mm_sub_epi32( C9, M9 ); \
|
||||
CA = _mm_sub_epi32( CA, MA ); \
|
||||
CB = _mm_sub_epi32( CB, MB ); \
|
||||
CC = _mm_sub_epi32( CC, MC ); \
|
||||
CD = _mm_sub_epi32( CD, MD ); \
|
||||
CE = _mm_sub_epi32( CE, ME ); \
|
||||
CF = _mm_sub_epi32( CF, MF ); \
|
||||
} while (0)
|
||||
|
||||
#define XOR_W \
|
||||
do { \
|
||||
A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
|
||||
A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
|
||||
} while (0)
|
||||
/*
|
||||
#define SWAP(v1, v2) do { \
|
||||
sph_u32 tmp = (v1); \
|
||||
(v1) = (v2); \
|
||||
(v2) = tmp; \
|
||||
} while (0)
|
||||
*/
|
||||
#define SWAP_BC \
|
||||
do { \
|
||||
mm_swap_128( B0, C0 ); \
|
||||
mm_swap_128( B1, C1 ); \
|
||||
mm_swap_128( B2, C2 ); \
|
||||
mm_swap_128( B3, C3 ); \
|
||||
mm_swap_128( B4, C4 ); \
|
||||
mm_swap_128( B5, C5 ); \
|
||||
mm_swap_128( B6, C6 ); \
|
||||
mm_swap_128( B7, C7 ); \
|
||||
mm_swap_128( B8, C8 ); \
|
||||
mm_swap_128( B9, C9 ); \
|
||||
mm_swap_128( BA, CA ); \
|
||||
mm_swap_128( BB, CB ); \
|
||||
mm_swap_128( BC, CC ); \
|
||||
mm_swap_128( BD, CD ); \
|
||||
mm_swap_128( BE, CE ); \
|
||||
mm_swap_128( BF, CF ); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
|
||||
do { \
|
||||
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
|
||||
_mm_andnot_si128( xb3, xb2 ), \
|
||||
_mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
|
||||
_mm_mullo_epi32( mm_rotl_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
|
||||
) ), _mm_set1_epi32(3UL) ) ) ) ); \
|
||||
xb0 = mm_not( _mm_xor_si128( xa0, mm_rotl_32( xb0, 1 ) ) ); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_0 do { \
|
||||
PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_1 do { \
|
||||
PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_2 do { \
|
||||
PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
|
||||
#define APPLY_P \
|
||||
do { \
|
||||
B0 = mm_rotr_32( B0, 15 ); \
|
||||
B1 = mm_rotr_32( B1, 15 ); \
|
||||
B2 = mm_rotr_32( B2, 15 ); \
|
||||
B3 = mm_rotr_32( B3, 15 ); \
|
||||
B4 = mm_rotr_32( B4, 15 ); \
|
||||
B5 = mm_rotr_32( B5, 15 ); \
|
||||
B6 = mm_rotr_32( B6, 15 ); \
|
||||
B7 = mm_rotr_32( B7, 15 ); \
|
||||
B8 = mm_rotr_32( B8, 15 ); \
|
||||
B9 = mm_rotr_32( B9, 15 ); \
|
||||
BA = mm_rotr_32( BA, 15 ); \
|
||||
BB = mm_rotr_32( BB, 15 ); \
|
||||
BC = mm_rotr_32( BC, 15 ); \
|
||||
BD = mm_rotr_32( BD, 15 ); \
|
||||
BE = mm_rotr_32( BE, 15 ); \
|
||||
BF = mm_rotr_32( BF, 15 ); \
|
||||
PERM_STEP_0; \
|
||||
PERM_STEP_1; \
|
||||
PERM_STEP_2; \
|
||||
A0B = _mm_add_epi32( A0B, C6 ); \
|
||||
A0A = _mm_add_epi32( A0A, C5 ); \
|
||||
A09 = _mm_add_epi32( A09, C4 ); \
|
||||
A08 = _mm_add_epi32( A08, C3 ); \
|
||||
A07 = _mm_add_epi32( A07, C2 ); \
|
||||
A06 = _mm_add_epi32( A06, C1 ); \
|
||||
A05 = _mm_add_epi32( A05, C0 ); \
|
||||
A04 = _mm_add_epi32( A04, CF ); \
|
||||
A03 = _mm_add_epi32( A03, CE ); \
|
||||
A02 = _mm_add_epi32( A02, CD ); \
|
||||
A01 = _mm_add_epi32( A01, CC ); \
|
||||
A00 = _mm_add_epi32( A00, CB ); \
|
||||
A0B = _mm_add_epi32( A0B, CA ); \
|
||||
A0A = _mm_add_epi32( A0A, C9 ); \
|
||||
A09 = _mm_add_epi32( A09, C8 ); \
|
||||
A08 = _mm_add_epi32( A08, C7 ); \
|
||||
A07 = _mm_add_epi32( A07, C6 ); \
|
||||
A06 = _mm_add_epi32( A06, C5 ); \
|
||||
A05 = _mm_add_epi32( A05, C4 ); \
|
||||
A04 = _mm_add_epi32( A04, C3 ); \
|
||||
A03 = _mm_add_epi32( A03, C2 ); \
|
||||
A02 = _mm_add_epi32( A02, C1 ); \
|
||||
A01 = _mm_add_epi32( A01, C0 ); \
|
||||
A00 = _mm_add_epi32( A00, CF ); \
|
||||
A0B = _mm_add_epi32( A0B, CE ); \
|
||||
A0A = _mm_add_epi32( A0A, CD ); \
|
||||
A09 = _mm_add_epi32( A09, CC ); \
|
||||
A08 = _mm_add_epi32( A08, CB ); \
|
||||
A07 = _mm_add_epi32( A07, CA ); \
|
||||
A06 = _mm_add_epi32( A06, C9 ); \
|
||||
A05 = _mm_add_epi32( A05, C8 ); \
|
||||
A04 = _mm_add_epi32( A04, C7 ); \
|
||||
A03 = _mm_add_epi32( A03, C6 ); \
|
||||
A02 = _mm_add_epi32( A02, C5 ); \
|
||||
A01 = _mm_add_epi32( A01, C4 ); \
|
||||
A00 = _mm_add_epi32( A00, C3 ); \
|
||||
} while (0)
|
||||
|
||||
#define INCR_W do { \
|
||||
if ((Wlow = T32(Wlow + 1)) == 0) \
|
||||
Whigh = T32(Whigh + 1); \
|
||||
} while (0)
|
||||
|
||||
static const sph_u32 A_init_256[] = {
|
||||
C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191),
|
||||
C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C),
|
||||
C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A)
|
||||
};
|
||||
|
||||
static const sph_u32 B_init_256[] = {
|
||||
C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F),
|
||||
C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002),
|
||||
C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890),
|
||||
C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5)
|
||||
};
|
||||
|
||||
static const sph_u32 C_init_256[] = {
|
||||
C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55),
|
||||
C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433),
|
||||
C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F),
|
||||
C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60)
|
||||
};
|
||||
|
||||
static const sph_u32 A_init_512[] = {
|
||||
C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
|
||||
C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
|
||||
C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
|
||||
};
|
||||
|
||||
static const sph_u32 B_init_512[] = {
|
||||
C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
|
||||
C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
|
||||
C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
|
||||
C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
|
||||
};
|
||||
|
||||
static const sph_u32 C_init_512[] = {
|
||||
C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
|
||||
C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
|
||||
C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
|
||||
C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
|
||||
};
|
||||
|
||||
static void
|
||||
shabal_4way_init( void *cc, unsigned size )
|
||||
{
|
||||
shabal_4way_context *sc = (shabal_4way_context*)cc;
|
||||
int i;
|
||||
|
||||
if ( size == 512 )
|
||||
{
|
||||
for ( i = 0; i < 12; i++ )
|
||||
sc->A[i] = _mm_set1_epi32( A_init_512[i] );
|
||||
for ( i = 0; i < 16; i++ )
|
||||
{
|
||||
sc->B[i] = _mm_set1_epi32( B_init_512[i] );
|
||||
sc->C[i] = _mm_set1_epi32( C_init_512[i] );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < 12; i++ )
|
||||
sc->A[i] = _mm_set1_epi32( A_init_256[i] );
|
||||
for ( i = 0; i < 16; i++ )
|
||||
{
|
||||
sc->B[i] = _mm_set1_epi32( B_init_256[i] );
|
||||
sc->C[i] = _mm_set1_epi32( C_init_256[i] );
|
||||
}
|
||||
}
|
||||
sc->Wlow = 1;
|
||||
sc->Whigh = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
shabal_4way_core( void *cc, const unsigned char *data, size_t len )
|
||||
{
|
||||
shabal_4way_context *sc = (shabal_4way_context*)cc;
|
||||
__m128i *buf;
|
||||
__m128i *vdata = (__m128i*)data;
|
||||
const int buf_size = 64;
|
||||
size_t ptr;
|
||||
DECL_STATE
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
|
||||
if ( len < (buf_size - ptr ) )
|
||||
{
|
||||
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
}
|
||||
READ_STATE(sc);
|
||||
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
|
||||
|
||||
ptr += clen;
|
||||
vdata += clen>>2;
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
DECODE_BLOCK;
|
||||
INPUT_BLOCK_ADD;
|
||||
XOR_W;
|
||||
APPLY_P;
|
||||
INPUT_BLOCK_SUB;
|
||||
SWAP_BC;
|
||||
INCR_W;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE(sc);
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
|
||||
unsigned size_words )
|
||||
{
|
||||
shabal_4way_context *sc = (shabal_4way_context*)cc;
|
||||
__m128i *buf;
|
||||
const int buf_size = 64;
|
||||
size_t ptr;
|
||||
int i;
|
||||
unsigned z, zz;
|
||||
DECL_STATE
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
z = 0x80 >> n;
|
||||
zz = ((ub & -z) | z) & 0xFF;
|
||||
buf[ptr>>2] = _mm_set1_epi32( zz );
|
||||
memset_zero_128( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
|
||||
READ_STATE(sc);
|
||||
DECODE_BLOCK;
|
||||
INPUT_BLOCK_ADD;
|
||||
XOR_W;
|
||||
APPLY_P;
|
||||
|
||||
for ( i = 0; i < 3; i ++ )
|
||||
{
|
||||
SWAP_BC;
|
||||
XOR_W;
|
||||
APPLY_P;
|
||||
}
|
||||
|
||||
__m128i *d = (__m128i*)dst;
|
||||
if ( size_words == 16 ) // 512
|
||||
{
|
||||
d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;
|
||||
d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7;
|
||||
d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB;
|
||||
d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF;
|
||||
}
|
||||
else // 256
|
||||
{
|
||||
d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB;
|
||||
d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
shabal256_4way_init( void *cc )
|
||||
{
|
||||
shabal_4way_init(cc, 256);
|
||||
}
|
||||
|
||||
void
|
||||
shabal256_4way( void *cc, const void *data, size_t len )
|
||||
{
|
||||
shabal_4way_core( cc, data, len );
|
||||
}
|
||||
|
||||
void
|
||||
shabal256_4way_close( void *cc, void *dst )
|
||||
{
|
||||
shabal_4way_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
void
|
||||
shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst )
|
||||
{
|
||||
shabal_4way_close(cc, ub, n, dst, 8);
|
||||
}
|
||||
|
||||
void
|
||||
shabal512_4way_init(void *cc)
|
||||
{
|
||||
shabal_4way_init(cc, 512);
|
||||
}
|
||||
|
||||
void
|
||||
shabal512_4way(void *cc, const void *data, size_t len)
|
||||
{
|
||||
shabal_4way_core(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
shabal512_4way_close(void *cc, void *dst)
|
||||
{
|
||||
shabal_4way_close(cc, 0, 0, dst, 16);
|
||||
}
|
||||
|
||||
void
|
||||
shabal512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
shabal_4way_close(cc, ub, n, dst, 16);
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
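The 4-way Shabal core above reads each message block as sixteen __m128i words, one 32-bit lane per independent input stream, so callers have to interleave four separate buffers before hashing. The sketch below shows that interleave step under stated assumptions: the helper name is invented for illustration, the lane-to-stream mapping is an assumption, and the in-tree callers use their own interleave helpers rather than this one.

// Sketch: pack 32-bit word i of four independent buffers into one __m128i,
// lane k holding stream k. Little-endian 32-bit words are assumed, since the
// 4-way core consumes buf[] directly without byte-swapping.
#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

static void interleave_4x32( __m128i *dst, const void *s0, const void *s1,
                             const void *s2, const void *s3, size_t bytes )
{
    const uint32_t *p0 = (const uint32_t*)s0, *p1 = (const uint32_t*)s1;
    const uint32_t *p2 = (const uint32_t*)s2, *p3 = (const uint32_t*)s3;
    for ( size_t i = 0; i < bytes / 4; i++ )
        dst[i] = _mm_set_epi32( (int)p3[i], (int)p2[i], (int)p1[i], (int)p0[i] );
}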
82
algo/shabal/shabal-hash-4way.h
Normal file
@@ -0,0 +1,82 @@
|
||||
/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */
|
||||
/**
|
||||
* Shabal interface. Shabal is a family of functions which differ by
|
||||
* their output size; this implementation defines Shabal for output
|
||||
* sizes 192, 224, 256, 384 and 512 bits.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_shabal.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef SHABAL_HASH_4WAY_H__
|
||||
#define SHABAL_HASH_4WAY_H__ 1
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#define SPH_SIZE_shabal256 256
|
||||
|
||||
#define SPH_SIZE_shabal512 512
|
||||
|
||||
typedef struct {
|
||||
__m128i buf[16] __attribute__ ((aligned (64)));
|
||||
__m128i A[12], B[16], C[16];
|
||||
sph_u32 Whigh, Wlow;
|
||||
size_t ptr;
|
||||
} shabal_4way_context;
|
||||
|
||||
typedef shabal_4way_context shabal256_4way_context;
|
||||
typedef shabal_4way_context shabal512_4way_context;
|
||||
|
||||
void shabal256_4way_init( void *cc );
|
||||
void shabal256_4way( void *cc, const void *data, size_t len );
|
||||
void shabal256_4way_close( void *cc, void *dst );
|
||||
void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
void shabal512_4way_init( void *cc );
|
||||
void shabal512_4way( void *cc, const void *data, size_t len );
|
||||
void shabal512_4way_close( void *cc, void *dst );
|
||||
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
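A minimal usage sketch of the 4-way Shabal API declared above. It assumes an AVX2 build and input already interleaved 32 bits per lane as described after the .c file; len is taken to be the per-lane byte count, which matches how shabal_4way_core() walks its 64-byte blocks.

// Hash four interleaved messages with one shabal512_4way pass. vdata holds
// the four inputs interleaved 32 bits per lane; vhash receives four 512-bit
// digests in the same interleaved layout.
#include "algo/shabal/shabal-hash-4way.h"

static void shabal512_4x( void *vhash, const void *vdata, size_t len_per_lane )
{
    shabal512_4way_context ctx;
    shabal512_4way_init( &ctx );
    shabal512_4way( &ctx, vdata, len_per_lane );
    shabal512_4way_close( &ctx, vhash );
}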
670
algo/shavite/sph-shavite-aesni.c
Normal file
@@ -0,0 +1,670 @@
|
||||
/* $Id: shavite.c 227 2010-06-16 17:28:38Z tp $ */
|
||||
/*
|
||||
* SHAvite-3 implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __AES__
|
||||
|
||||
#include "sph_shavite.h"
|
||||
#include "avxdefs.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHAVITE
|
||||
#define SPH_SMALL_FOOTPRINT_SHAVITE 1
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
#define C32 SPH_C32
|
||||
|
||||
/*
|
||||
* As of round 2 of the SHA-3 competition, the published reference
|
||||
* implementation and test vectors are wrong, because they use
|
||||
* big-endian AES tables while the internal decoding uses little-endian.
|
||||
* The code below follows the specification. To turn it into a code
|
||||
* which follows the reference implementation (the one called "BugFix"
|
||||
* on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out
|
||||
* the code below (from the '#define AES_BIG_ENDIAN...' to the definition
|
||||
* of the AES_ROUND_NOKEY macro) and replace it with the version which
|
||||
* is commented out afterwards.
|
||||
*/
|
||||
|
||||
#define AES_BIG_ENDIAN 0
|
||||
#include "algo/sha/aes_helper.c"
|
||||
|
||||
static const sph_u32 IV512[] = {
|
||||
C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC),
|
||||
C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC),
|
||||
C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47),
|
||||
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
|
||||
};
|
||||
|
||||
#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \
|
||||
sph_u32 t0 = (x0); \
|
||||
sph_u32 t1 = (x1); \
|
||||
sph_u32 t2 = (x2); \
|
||||
sph_u32 t3 = (x3); \
|
||||
AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define KEY_EXPAND_ELT(k0, k1, k2, k3) do { \
|
||||
sph_u32 kt; \
|
||||
AES_ROUND_NOKEY(k1, k2, k3, k0); \
|
||||
kt = (k0); \
|
||||
(k0) = (k1); \
|
||||
(k1) = (k2); \
|
||||
(k2) = (k3); \
|
||||
(k3) = kt; \
|
||||
} while (0)
|
||||
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT_SHAVITE
|
||||
|
||||
/*
|
||||
* This function assumes that "msg" is aligned for 32-bit access.
|
||||
*/
|
||||
static void
|
||||
c512(sph_shavite_big_context *sc, const void *msg)
|
||||
{
|
||||
sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
|
||||
sph_u32 p8, p9, pA, pB, pC, pD, pE, pF;
|
||||
sph_u32 rk[448];
|
||||
size_t u;
|
||||
int r, s;
|
||||
|
||||
#if SPH_LITTLE_ENDIAN
|
||||
memcpy(rk, msg, 128);
|
||||
#else
|
||||
for (u = 0; u < 32; u += 4) {
|
||||
rk[u + 0] = sph_dec32le_aligned(
|
||||
(const unsigned char *)msg + (u << 2) + 0);
|
||||
rk[u + 1] = sph_dec32le_aligned(
|
||||
(const unsigned char *)msg + (u << 2) + 4);
|
||||
rk[u + 2] = sph_dec32le_aligned(
|
||||
(const unsigned char *)msg + (u << 2) + 8);
|
||||
rk[u + 3] = sph_dec32le_aligned(
|
||||
(const unsigned char *)msg + (u << 2) + 12);
|
||||
}
|
||||
#endif
|
||||
u = 32;
|
||||
for (;;) {
|
||||
for (s = 0; s < 4; s ++) {
|
||||
sph_u32 x0, x1, x2, x3;
|
||||
|
||||
x0 = rk[u - 31];
|
||||
x1 = rk[u - 30];
|
||||
x2 = rk[u - 29];
|
||||
x3 = rk[u - 32];
|
||||
AES_ROUND_NOKEY(x0, x1, x2, x3);
|
||||
rk[u + 0] = x0 ^ rk[u - 4];
|
||||
rk[u + 1] = x1 ^ rk[u - 3];
|
||||
rk[u + 2] = x2 ^ rk[u - 2];
|
||||
rk[u + 3] = x3 ^ rk[u - 1];
|
||||
if (u == 32) {
|
||||
rk[ 32] ^= sc->count0;
|
||||
rk[ 33] ^= sc->count1;
|
||||
rk[ 34] ^= sc->count2;
|
||||
rk[ 35] ^= SPH_T32(~sc->count3);
|
||||
} else if (u == 440) {
|
||||
rk[440] ^= sc->count1;
|
||||
rk[441] ^= sc->count0;
|
||||
rk[442] ^= sc->count3;
|
||||
rk[443] ^= SPH_T32(~sc->count2);
|
||||
}
|
||||
u += 4;
|
||||
|
||||
x0 = rk[u - 31];
|
||||
x1 = rk[u - 30];
|
||||
x2 = rk[u - 29];
|
||||
x3 = rk[u - 32];
|
||||
AES_ROUND_NOKEY(x0, x1, x2, x3);
|
||||
rk[u + 0] = x0 ^ rk[u - 4];
|
||||
rk[u + 1] = x1 ^ rk[u - 3];
|
||||
rk[u + 2] = x2 ^ rk[u - 2];
|
||||
rk[u + 3] = x3 ^ rk[u - 1];
|
||||
if (u == 164) {
|
||||
rk[164] ^= sc->count3;
|
||||
rk[165] ^= sc->count2;
|
||||
rk[166] ^= sc->count1;
|
||||
rk[167] ^= SPH_T32(~sc->count0);
|
||||
} else if (u == 316) {
|
||||
rk[316] ^= sc->count2;
|
||||
rk[317] ^= sc->count3;
|
||||
rk[318] ^= sc->count0;
|
||||
rk[319] ^= SPH_T32(~sc->count1);
|
||||
}
|
||||
u += 4;
|
||||
}
|
||||
if (u == 448)
|
||||
break;
|
||||
for (s = 0; s < 8; s ++) {
|
||||
rk[u + 0] = rk[u - 32] ^ rk[u - 7];
|
||||
rk[u + 1] = rk[u - 31] ^ rk[u - 6];
|
||||
rk[u + 2] = rk[u - 30] ^ rk[u - 5];
|
||||
rk[u + 3] = rk[u - 29] ^ rk[u - 4];
|
||||
u += 4;
|
||||
}
|
||||
}
|
||||
|
||||
p0 = sc->h[0x0];
|
||||
p1 = sc->h[0x1];
|
||||
p2 = sc->h[0x2];
|
||||
p3 = sc->h[0x3];
|
||||
p4 = sc->h[0x4];
|
||||
p5 = sc->h[0x5];
|
||||
p6 = sc->h[0x6];
|
||||
p7 = sc->h[0x7];
|
||||
p8 = sc->h[0x8];
|
||||
p9 = sc->h[0x9];
|
||||
pA = sc->h[0xA];
|
||||
pB = sc->h[0xB];
|
||||
pC = sc->h[0xC];
|
||||
pD = sc->h[0xD];
|
||||
pE = sc->h[0xE];
|
||||
pF = sc->h[0xF];
|
||||
u = 0;
|
||||
for (r = 0; r < 14; r ++) {
|
||||
#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3) do { \
|
||||
sph_u32 x0, x1, x2, x3; \
|
||||
x0 = r0 ^ rk[u ++]; \
|
||||
x1 = r1 ^ rk[u ++]; \
|
||||
x2 = r2 ^ rk[u ++]; \
|
||||
x3 = r3 ^ rk[u ++]; \
|
||||
AES_ROUND_NOKEY(x0, x1, x2, x3); \
|
||||
x0 ^= rk[u ++]; \
|
||||
x1 ^= rk[u ++]; \
|
||||
x2 ^= rk[u ++]; \
|
||||
x3 ^= rk[u ++]; \
|
||||
AES_ROUND_NOKEY(x0, x1, x2, x3); \
|
||||
x0 ^= rk[u ++]; \
|
||||
x1 ^= rk[u ++]; \
|
||||
x2 ^= rk[u ++]; \
|
||||
x3 ^= rk[u ++]; \
|
||||
AES_ROUND_NOKEY(x0, x1, x2, x3); \
|
||||
x0 ^= rk[u ++]; \
|
||||
x1 ^= rk[u ++]; \
|
||||
x2 ^= rk[u ++]; \
|
||||
x3 ^= rk[u ++]; \
|
||||
AES_ROUND_NOKEY(x0, x1, x2, x3); \
|
||||
l0 ^= x0; \
|
||||
l1 ^= x1; \
|
||||
l2 ^= x2; \
|
||||
l3 ^= x3; \
|
||||
} while (0)
|
||||
|
||||
#define WROT(a, b, c, d) do { \
|
||||
sph_u32 t = d; \
|
||||
d = c; \
|
||||
c = b; \
|
||||
b = a; \
|
||||
a = t; \
|
||||
} while (0)
|
||||
|
||||
C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7);
|
||||
C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF);
|
||||
|
||||
WROT(p0, p4, p8, pC);
|
||||
WROT(p1, p5, p9, pD);
|
||||
WROT(p2, p6, pA, pE);
|
||||
WROT(p3, p7, pB, pF);
|
||||
|
||||
#undef C512_ELT
|
||||
#undef WROT
|
||||
}
|
||||
sc->h[0x0] ^= p0;
|
||||
sc->h[0x1] ^= p1;
|
||||
sc->h[0x2] ^= p2;
|
||||
sc->h[0x3] ^= p3;
|
||||
sc->h[0x4] ^= p4;
|
||||
sc->h[0x5] ^= p5;
|
||||
sc->h[0x6] ^= p6;
|
||||
sc->h[0x7] ^= p7;
|
||||
sc->h[0x8] ^= p8;
|
||||
sc->h[0x9] ^= p9;
|
||||
sc->h[0xA] ^= pA;
|
||||
sc->h[0xB] ^= pB;
|
||||
sc->h[0xC] ^= pC;
|
||||
sc->h[0xD] ^= pD;
|
||||
sc->h[0xE] ^= pE;
|
||||
sc->h[0xF] ^= pF;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static void
|
||||
c512( sph_shavite_big_context *sc, const void *msg )
|
||||
{
|
||||
__m128i p0, p1, p2, p3, x;
|
||||
__m128i k00, k01, k02, k03, k10, k11, k12, k13;
|
||||
__m128i *m = (__m128i*)msg;
|
||||
__m128i *h = (__m128i*)sc->h;
|
||||
int r;
|
||||
|
||||
p0 = h[0];
|
||||
p1 = h[1];
|
||||
p2 = h[2];
|
||||
p3 = h[3];
|
||||
|
||||
// round
|
||||
k00 = m[0];
|
||||
x = _mm_xor_si128( p1, k00 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k01 = m[1];
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k02 = m[2];
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k03 = m[3];
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p0 = _mm_xor_si128( p0, x );
|
||||
|
||||
k10 = m[4];
|
||||
x = _mm_xor_si128( p3, k10 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k11 = m[5];
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k12 = m[6];
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k13 = m[7];
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p2 = _mm_xor_si128( p2, x );
|
||||
|
||||
for ( r = 0; r < 3; r ++ )
|
||||
{
|
||||
// round 1, 5, 9
|
||||
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
|
||||
k00 = _mm_xor_si128( k00, k13 );
|
||||
|
||||
if ( r == 0 )
|
||||
k00 = _mm_xor_si128( k00, _mm_set_epi32(
|
||||
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
|
||||
|
||||
x = _mm_xor_si128( p0, k00 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
|
||||
k01 = _mm_xor_si128( k01, k00 );
|
||||
|
||||
if ( r == 1 )
|
||||
k01 = _mm_xor_si128( k01, _mm_set_epi32(
|
||||
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
|
||||
k02 = _mm_xor_si128( k02, k01 );
|
||||
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
|
||||
k03 = _mm_xor_si128( k03, k02 );
|
||||
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p3 = _mm_xor_si128( p3, x );
|
||||
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
|
||||
k10 = _mm_xor_si128( k10, k03 );
|
||||
|
||||
x = _mm_xor_si128( p2, k10 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
|
||||
k11 = _mm_xor_si128( k11, k10 );
|
||||
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
|
||||
k12 = _mm_xor_si128( k12, k11 );
|
||||
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
|
||||
k13 = _mm_xor_si128( k13, k12 );
|
||||
|
||||
if ( r == 2 )
|
||||
k13 = _mm_xor_si128( k13, _mm_set_epi32(
|
||||
~sc->count1, sc->count0, sc->count3, sc->count2 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p1 = _mm_xor_si128( p1, x );
|
||||
|
||||
// round 2, 6, 10
|
||||
|
||||
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
|
||||
x = _mm_xor_si128( p3, k00 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
p2 = _mm_xor_si128( p2, x );
|
||||
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
|
||||
x = _mm_xor_si128( p1, k10 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p0 = _mm_xor_si128( p0, x );
|
||||
|
||||
// round 3, 7, 11
|
||||
|
||||
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
|
||||
k00 = _mm_xor_si128( k00, k13 );
|
||||
|
||||
x = _mm_xor_si128( p2, k00 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
|
||||
k01 = _mm_xor_si128( k01, k00 );
|
||||
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
|
||||
k02 = _mm_xor_si128( k02, k01 );
|
||||
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
|
||||
k03 = _mm_xor_si128( k03, k02 );
|
||||
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p1 = _mm_xor_si128( p1, x );
|
||||
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
|
||||
k10 = _mm_xor_si128( k10, k03 );
|
||||
|
||||
x = _mm_xor_si128( p0, k10 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
|
||||
k11 = _mm_xor_si128( k11, k10 );
|
||||
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
|
||||
k12 = _mm_xor_si128( k12, k11 );
|
||||
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
|
||||
k13 = _mm_xor_si128( k13, k12 );
|
||||
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p3 = _mm_xor_si128( p3, x );
|
||||
|
||||
// round 4, 8, 12
|
||||
|
||||
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( p1, k00 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p0 = _mm_xor_si128( p0, x );
|
||||
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( p3, k10 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p2 = _mm_xor_si128( p2, x );
|
||||
}
|
||||
|
||||
// round 13
|
||||
|
||||
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
|
||||
k00 = _mm_xor_si128( k00, k13 );
|
||||
|
||||
x = _mm_xor_si128( p0, k00 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
|
||||
k01 = _mm_xor_si128( k01, k00 );
|
||||
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
|
||||
k02 = _mm_xor_si128( k02, k01 );
|
||||
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
|
||||
k03 = _mm_xor_si128( k03, k02 );
|
||||
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p3 = _mm_xor_si128( p3, x );
|
||||
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
|
||||
k10 = _mm_xor_si128( k10, k03 );
|
||||
|
||||
x = _mm_xor_si128( p2, k10 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
|
||||
k11 = _mm_xor_si128( k11, k10 );
|
||||
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
|
||||
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
|
||||
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
|
||||
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
|
||||
k13 = _mm_xor_si128( k13, k12 );
|
||||
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p1 = _mm_xor_si128( p1, x );
|
||||
|
||||
h[0] = _mm_xor_si128( h[0], p2 );
|
||||
h[1] = _mm_xor_si128( h[1], p3 );
|
||||
h[2] = _mm_xor_si128( h[2], p0 );
|
||||
h[3] = _mm_xor_si128( h[3], p1 );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static void
|
||||
shavite_big_aesni_init( sph_shavite_big_context *sc, const sph_u32 *iv )
|
||||
{
|
||||
memcpy( sc->h, iv, sizeof sc->h );
|
||||
sc->ptr = 0;
|
||||
sc->count0 = 0;
|
||||
sc->count1 = 0;
|
||||
sc->count2 = 0;
|
||||
sc->count3 = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
shavite_big_aesni_core( sph_shavite_big_context *sc, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
unsigned char *buf;
|
||||
size_t ptr;
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
while (len > 0) {
|
||||
size_t clen;
|
||||
|
||||
clen = (sizeof sc->buf) - ptr;
|
||||
if (clen > len)
|
||||
clen = len;
|
||||
memcpy(buf + ptr, data, clen);
|
||||
data = (const unsigned char *)data + clen;
|
||||
ptr += clen;
|
||||
len -= clen;
|
||||
if (ptr == sizeof sc->buf) {
|
||||
if ((sc->count0 = SPH_T32(sc->count0 + 1024)) == 0) {
|
||||
sc->count1 = SPH_T32(sc->count1 + 1);
|
||||
if (sc->count1 == 0) {
|
||||
sc->count2 = SPH_T32(sc->count2 + 1);
|
||||
if (sc->count2 == 0) {
|
||||
sc->count3 = SPH_T32(
|
||||
sc->count3 + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
c512(sc, buf);
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
shavite_big_aesni_close( sph_shavite_big_context *sc, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32 )
|
||||
{
|
||||
unsigned char *buf;
|
||||
size_t ptr, u;
|
||||
unsigned z;
|
||||
sph_u32 count0, count1, count2, count3;
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
count0 = (sc->count0 += SPH_T32(ptr << 3) + n);
|
||||
count1 = sc->count1;
|
||||
count2 = sc->count2;
|
||||
count3 = sc->count3;
|
||||
z = 0x80 >> n;
|
||||
z = ((ub & -z) | z) & 0xFF;
|
||||
if (ptr == 0 && n == 0) {
|
||||
buf[0] = 0x80;
|
||||
memset(buf + 1, 0, 109);
|
||||
sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
|
||||
} else if (ptr < 110) {
|
||||
buf[ptr ++] = z;
|
||||
memset(buf + ptr, 0, 110 - ptr);
|
||||
} else {
|
||||
buf[ptr ++] = z;
|
||||
memset(buf + ptr, 0, 128 - ptr);
|
||||
c512(sc, buf);
|
||||
memset(buf, 0, 110);
|
||||
sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
|
||||
}
|
||||
sph_enc32le(buf + 110, count0);
|
||||
sph_enc32le(buf + 114, count1);
|
||||
sph_enc32le(buf + 118, count2);
|
||||
sph_enc32le(buf + 122, count3);
|
||||
buf[126] = (unsigned char) (out_size_w32 << 5);
|
||||
buf[127] = (unsigned char) (out_size_w32 >> 3);
|
||||
c512(sc, buf);
|
||||
for (u = 0; u < out_size_w32; u ++)
|
||||
sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]);
|
||||
}
|
||||
|
||||
void
|
||||
sph_shavite512_aesni_init(void *cc)
|
||||
{
|
||||
shavite_big_aesni_init(cc, IV512);
|
||||
}
|
||||
|
||||
void
|
||||
sph_shavite512_aesni(void *cc, const void *data, size_t len)
|
||||
{
|
||||
shavite_big_aesni_core(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
sph_shavite512_aesni_close(void *cc, void *dst)
|
||||
{
|
||||
shavite_big_aesni_close(cc, 0, 0, dst, 16);
|
||||
}
|
||||
|
||||
void
|
||||
sph_shavite512_aesni_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst)
|
||||
{
|
||||
shavite_big_aesni_close(cc, ub, n, dst, 16);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
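A minimal single-message sketch using the AES-NI SHAvite-512 routines added above. It assumes a build with __AES__ defined (for example gcc -maes) so this file is compiled in, and it uses the public sph_shavite512_context type from sph_shavite.h for the state.

// Hash one short message with the AES-NI SHAvite-512 path.
#include <stdio.h>
#include "algo/shavite/sph_shavite.h"

int main(void)
{
    unsigned char digest[64];
    const char msg[] = "abc";
    sph_shavite512_context ctx;

    sph_shavite512_aesni_init( &ctx );
    sph_shavite512_aesni( &ctx, msg, 3 );
    sph_shavite512_aesni_close( &ctx, digest );

    for ( int i = 0; i < 64; i++ ) printf( "%02x", digest[i] );
    printf( "\n" );
    return 0;
}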
@@ -1731,21 +1731,21 @@ sph_shavite384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
|
||||
/* see sph_shavite.h */
|
||||
void
|
||||
sph_shavite512_init(void *cc)
|
||||
sph_shavite512_sw_init(void *cc)
|
||||
{
|
||||
shavite_big_init(cc, IV512);
|
||||
}
|
||||
|
||||
/* see sph_shavite.h */
|
||||
void
|
||||
sph_shavite512(void *cc, const void *data, size_t len)
|
||||
sph_shavite512_sw(void *cc, const void *data, size_t len)
|
||||
{
|
||||
shavite_big_core(cc, data, len);
|
||||
}
|
||||
|
||||
/* see sph_shavite.h */
|
||||
void
|
||||
sph_shavite512_close(void *cc, void *dst)
|
||||
sph_shavite512_sw_close(void *cc, void *dst)
|
||||
{
|
||||
shavite_big_close(cc, 0, 0, dst, 16);
|
||||
// shavite_big_init(cc, IV512);
|
||||
@@ -1753,7 +1753,7 @@ sph_shavite512_close(void *cc, void *dst)
|
||||
|
||||
/* see sph_shavite.h */
|
||||
void
|
||||
sph_shavite512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
sph_shavite512_sw_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
shavite_big_close(cc, ub, n, dst, 16);
|
||||
// shavite_big_init(cc, IV512);
|
||||
|
@@ -77,9 +77,9 @@ extern "C"{
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char buf[64]; /* first field, for alignment */
|
||||
unsigned char buf[64] __attribute__ ((aligned (64)));
|
||||
sph_u32 h[8] __attribute__ ((aligned (32)));
|
||||
size_t ptr;
|
||||
sph_u32 h[8];
|
||||
sph_u32 count0, count1;
|
||||
#endif
|
||||
} sph_shavite_small_context;
|
||||
@@ -108,9 +108,9 @@ typedef sph_shavite_small_context sph_shavite256_context;
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char buf[128]; /* first field, for alignment */
|
||||
unsigned char buf[128] __attribute__ ((aligned (64)));
|
||||
sph_u32 h[16] __attribute__ ((aligned (32)));
|
||||
size_t ptr;
|
||||
sph_u32 h[16];
|
||||
sph_u32 count0, count1, count2, count3;
|
||||
#endif
|
||||
} sph_shavite_big_context;
|
||||
@@ -262,51 +262,37 @@ void sph_shavite384_close(void *cc, void *dst);
|
||||
void sph_shavite384_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
/**
|
||||
* Initialize a SHAvite-512 context. This process performs no memory allocation.
|
||||
*
|
||||
* @param cc the SHAvite-512 context (pointer to a
|
||||
* <code>sph_shavite512_context</code>)
|
||||
*/
|
||||
void sph_shavite512_init(void *cc);
|
||||
|
||||
/**
|
||||
* Process some data bytes. It is acceptable that <code>len</code> is zero
|
||||
* (in which case this function does nothing).
|
||||
*
|
||||
* @param cc the SHAvite-512 context
|
||||
* @param data the input data
|
||||
* @param len the input data length (in bytes)
|
||||
*/
|
||||
void sph_shavite512(void *cc, const void *data, size_t len);
|
||||
|
||||
/**
|
||||
* Terminate the current SHAvite-512 computation and output the result into
|
||||
* the provided buffer. The destination buffer must be wide enough to
|
||||
* accommodate the result (64 bytes). The context is automatically
|
||||
* reinitialized.
|
||||
*
|
||||
* @param cc the SHAvite-512 context
|
||||
* @param dst the destination buffer
|
||||
*/
|
||||
void sph_shavite512_close(void *cc, void *dst);
|
||||
|
||||
/**
|
||||
* Add a few additional bits (0 to 7) to the current computation, then
|
||||
* terminate it and output the result in the provided buffer, which must
|
||||
* be wide enough to accommodate the result (64 bytes). If bit number i
|
||||
* in <code>ub</code> has value 2^i, then the extra bits are those
|
||||
* numbered 7 down to 8-n (this is the big-endian convention at the byte
|
||||
* level). The context is automatically reinitialized.
|
||||
*
|
||||
* @param cc the SHAvite-512 context
|
||||
* @param ub the extra bits
|
||||
* @param n the number of extra bits (0 to 7)
|
||||
* @param dst the destination buffer
|
||||
*/
|
||||
void sph_shavite512_addbits_and_close(
|
||||
// Always define sw but only define aesni when available
|
||||
// Define fptrs for aesni or sw, not both.
|
||||
void sph_shavite512_sw_init(void *cc);
|
||||
void sph_shavite512_sw(void *cc, const void *data, size_t len);
|
||||
void sph_shavite512_sw_close(void *cc, void *dst);
|
||||
void sph_shavite512_sw_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
|
||||
#ifdef __AES__
|
||||
void sph_shavite512_aesni_init(void *cc);
|
||||
void sph_shavite512_aesni(void *cc, const void *data, size_t len);
|
||||
void sph_shavite512_aesni_close(void *cc, void *dst);
|
||||
void sph_shavite512_aesni_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#define sph_shavite512_init sph_shavite512_aesni_init
|
||||
#define sph_shavite512 sph_shavite512_aesni
|
||||
#define sph_shavite512_close sph_shavite512_aesni_close
|
||||
#define sph_shavite512_addbits_and_close \
|
||||
sph_shavite512_aesni_addbits_and_close
|
||||
|
||||
#else
|
||||
|
||||
#define sph_shavite512_init sph_shavite512_sw_init
|
||||
#define sph_shavite512 sph_shavite512_sw
|
||||
#define sph_shavite512_close sph_shavite512_sw_close
|
||||
#define sph_shavite512_addbits_and_close \
|
||||
sph_shavite512_sw_addbits_and_close
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
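With the macro block above, existing call sites are untouched by the sw/aesni split: the generic sph_shavite512_* names resolve to the _aesni_ routines when __AES__ is defined and to the _sw_ fallbacks otherwise. A minimal caller sketch, identical in both builds, is shown below; the wrapper name is illustrative only.

// Callers keep using the generic names; the header's #define block selects
// the AES-NI or portable implementation at compile time.
#include "algo/shavite/sph_shavite.h"

static void shavite512_hash( void *out64, const void *in, size_t len )
{
    sph_shavite512_context ctx;
    sph_shavite512_init( &ctx );
    sph_shavite512( &ctx, in, len );
    sph_shavite512_close( &ctx, out64 );   // out64 receives the 64-byte digest
}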
@@ -6,12 +6,11 @@ int64_t skein_get_max64() { return 0x7ffffLL; }
|
||||
|
||||
bool register_skein_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX_OPT| AVX2_OPT | SHA_OPT;
|
||||
gate->optimizations = FOUR_WAY_OPT | SHA_OPT;
|
||||
#if defined (SKEIN_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_skein_4way;
|
||||
gate->hash = (void*)&skeinhash_4way;
|
||||
#else
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_skein;
|
||||
gate->hash = (void*)&skeinhash;
|
||||
#endif
|
||||
|
@@ -342,17 +342,6 @@ do { \
|
||||
do { \
|
||||
sph_u64 t0, t1, t2; \
|
||||
__m256i h8; \
|
||||
/* can LE be assumed? \
|
||||
dec64le does nothing when SPH_LITTLE endian is set, as it is. \
|
||||
__m256i m0 = _mm256_dec64le( buf ); \
|
||||
__m256i m1 = _mm256_dec64le( buf + 8*4 ); \
|
||||
__m256i m2 = _mm256_dec64le( buf + 16*4 ); \
|
||||
__m256i m3 = _mm256_dec64le( buf + 24*4 ); \
|
||||
__m256i m4 = _mm256_dec64le( buf + 32*4 ); \
|
||||
__m256i m5 = _mm256_dec64le( buf + 40*4 ); \
|
||||
__m256i m6 = _mm256_dec64le( buf + 48*4 ); \
|
||||
__m256i m7 = _mm256_dec64le( buf + 56*4 ); \
|
||||
*/ \
|
||||
__m256i m0 = buf[0]; \
|
||||
__m256i m1 = buf[1]; \
|
||||
__m256i m2 = buf[2]; \
|
||||
|
@@ -39,7 +39,9 @@
|
||||
*/
|
||||
|
||||
#ifndef __SKEIN_HASH_4WAY_H__
|
||||
#define __SKEIN_HASH_4WAY_H__
|
||||
#define __SKEIN_HASH_4WAY_H__ 1
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
@@ -53,14 +55,15 @@ extern "C"{
|
||||
#define SPH_SIZE_skein256 256
|
||||
#define SPH_SIZE_skein512 512
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[8] __attribute__ ((aligned (32)));
|
||||
__m256i h0, h1, h2, h3, h4, h5, h6, h7;
|
||||
size_t ptr;
|
||||
sph_u64 bcount;
|
||||
} skein512_4way_context;
|
||||
} sph_skein_4way_big_context;
|
||||
|
||||
typedef sph_skein_4way_big_context skein512_4way_context;
|
||||
typedef sph_skein_4way_big_context skein256_4way_context;
|
||||
|
||||
void skein512_4way_init(void *cc);
|
||||
void skein512_4way(void *cc, const void *data, size_t len);
|
||||
@@ -68,26 +71,15 @@ void skein512_4way_close(void *cc, void *dst);
|
||||
//void sph_skein512_addbits_and_close(
|
||||
// void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __AVX__
|
||||
|
||||
typedef struct {
|
||||
__m128i buf[8] __attribute__ ((aligned (32)));
|
||||
__m128i h0, h1, h2, h3, h4, h5, h6, h7;
|
||||
size_t ptr;
|
||||
sph_u64 bcount;
|
||||
} skein256_4way_context;
|
||||
|
||||
void skein256_4way_init(void *cc);
|
||||
void skein256_4way(void *cc, const void *data, size_t len);
|
||||
void skein256_4way_close(void *cc, void *dst);
|
||||
//void sph_skein256_addbits_and_close(
|
||||
// void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
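A usage sketch of the 4-way Skein-512 API from the header hunk above, which now makes skein256_4way_context and skein512_4way_context aliases of one AVX2 state. The include path is assumed from the repository layout, an AVX2 build is required, and the length is per lane with the four inputs interleaved 64 bits per lane (the 4-way buffer is __m256i based).

// Hash four interleaved 80-byte inputs with one skein512_4way pass.
#include "algo/skein/skein-hash-4way.h"

static void skein512_4x( void *vhash, const void *vdata )
{
    skein512_4way_context ctx;
    skein512_4way_init( &ctx );
    skein512_4way( &ctx, vdata, 80 );   // 80 bytes per lane (block header size)
    skein512_4way_close( &ctx, vhash ); // four 512-bit digests, interleaved
}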
231
algo/sm3/sm3-hash-4way.c
Normal file
@@ -0,0 +1,231 @@
|
||||
/* ====================================================================
|
||||
* Copyright (c) 2014 - 2017 The GmSSL Project. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this
|
||||
* software must display the following acknowledgment:
|
||||
* "This product includes software developed by the GmSSL Project.
|
||||
* (http://gmssl.org/)"
|
||||
*
|
||||
* 4. The name "GmSSL Project" must not be used to endorse or promote
|
||||
* products derived from this software without prior written
|
||||
* permission. For written permission, please contact
|
||||
* guanzhi1980@gmail.com.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "GmSSL"
|
||||
* nor may "GmSSL" appear in their names without prior written
|
||||
* permission of the GmSSL Project.
|
||||
*
|
||||
* 6. Redistributions of any form whatsoever must retain the following
|
||||
* acknowledgment:
|
||||
* "This product includes software developed by the GmSSL Project
|
||||
* (http://gmssl.org/)"
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
|
||||
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
* OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
#include "sm3-hash-4way.h"
|
||||
|
||||
#ifdef __AVX__
|
||||
|
||||
void sm3_4way_init( sm3_4way_ctx_t *ctx )
|
||||
{
|
||||
ctx->digest[0] = _mm_set1_epi32( 0x7380166F );
|
||||
ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 );
|
||||
ctx->digest[2] = _mm_set1_epi32( 0x172442D7 );
|
||||
ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 );
|
||||
ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC );
|
||||
ctx->digest[5] = _mm_set1_epi32( 0x163138AA );
|
||||
ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D );
|
||||
ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E );
|
||||
ctx->nblocks = 0;
|
||||
ctx->num = 0;
|
||||
}
|
||||
|
||||
void sm3_4way( void *cc, const void *data, size_t len )
|
||||
{
|
||||
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
|
||||
__m128i *block = (__m128i*)ctx->block;
|
||||
__m128i *vdata = (__m128i*)data;
|
||||
|
||||
if ( ctx->num )
|
||||
{
|
||||
unsigned int left = SM3_BLOCK_SIZE - ctx->num;
|
||||
if ( len < left )
|
||||
{
|
||||
memcpy_128( block + (ctx->num >> 2), vdata , len>>2 );
|
||||
ctx->num += len;
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy_128( block + (ctx->num >> 2), vdata , left>>2 );
|
||||
sm3_4way_compress( ctx->digest, block );
|
||||
ctx->nblocks++;
|
||||
vdata += left>>2;
|
||||
len -= left;
|
||||
}
|
||||
}
|
||||
while ( len >= SM3_BLOCK_SIZE )
|
||||
{
|
||||
sm3_4way_compress( ctx->digest, vdata );
|
||||
ctx->nblocks++;
|
||||
vdata += SM3_BLOCK_SIZE>>2;
|
||||
len -= SM3_BLOCK_SIZE;
|
||||
}
|
||||
ctx->num = len;
|
||||
if ( len )
|
||||
memcpy_128( block, vdata, len>>2 );
|
||||
}
|
||||
|
||||
void sm3_4way_close( void *cc, void *dst )
|
||||
{
|
||||
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
|
||||
__m128i *hash = (__m128i*)dst;
|
||||
__m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
|
||||
__m128i *block = (__m128i*)ctx->block;
|
||||
int i;
|
||||
|
||||
block[ctx->num >> 2] = _mm_set1_epi32( 0x80 );   // pad marker goes at word index num>>2; num counts bytes
|
||||
|
||||
if ( ctx->num + 8 <= SM3_BLOCK_SIZE )
|
||||
{
|
||||
memset_zero_128( block + (ctx->num >> 2) + 1,
|
||||
( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_128( block + (ctx->num >> 2) + 1,
|
||||
( SM3_BLOCK_SIZE - (ctx->num >> 2) - 1 ) );
|
||||
sm3_4way_compress( ctx->digest, block );
|
||||
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
|
||||
}
|
||||
|
||||
count[0] = mm_byteswap_32(
|
||||
_mm_set1_epi32( ctx->nblocks >> 23 ) );
|
||||
count[1] = mm_byteswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
|
||||
( ctx->num << 3 ) ) );
|
||||
sm3_4way_compress( ctx->digest, block );
|
||||
|
||||
for ( i = 0; i < 8 ; i++ )
|
||||
hash[i] = mm_byteswap_32( ctx->digest[i] );
|
||||
}
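For reference, the two count words written just before the final compress encode the total message length in bits, split big-endian across two 32-bit words. A scalar restatement of that arithmetic (illustrative only, not part of the diff):

#include <stdint.h>

// Mirrors the length encoding in sm3_4way_close() above:
// total bits = nblocks * 512 + num * 8, stored as two big-endian 32-bit words.
static inline void sm3_len_words( uint32_t nblocks, uint32_t num,
                                  uint32_t *hi, uint32_t *lo )
{
   uint64_t bit_len = ( (uint64_t)nblocks << 9 ) + ( (uint64_t)num << 3 );
   *hi = (uint32_t)( bit_len >> 32 );   // matches  nblocks >> 23
   *lo = (uint32_t)  bit_len;           // matches (nblocks << 9) + (num << 3)
}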
|
||||
|
||||
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 9 ), \
                                                mm_rotl_32( x, 17 ) ) )
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 15 ), \
                                                mm_rotl_32( x, 23 ) ) )

#define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) )
#define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \
                                               _mm_and_si128( x, z ) ), \
                                 _mm_and_si128( y, z ) )

#define GG0(x,y,z) FF0(x,y,z)
#define GG1(x,y,z) _mm_or_si128( _mm_and_si128( x, y ), \
                                 _mm_andnot_si128( x, z ) )
|
||||
|
||||
|
||||
void sm3_4way_compress( __m128i *digest, __m128i *block )
|
||||
{
|
||||
__m128i W[68], W1[64];
|
||||
__m128i A = digest[ 0 ];
|
||||
__m128i B = digest[ 1 ];
|
||||
__m128i C = digest[ 2 ];
|
||||
__m128i D = digest[ 3 ];
|
||||
__m128i E = digest[ 4 ];
|
||||
__m128i F = digest[ 5 ];
|
||||
__m128i G = digest[ 6 ];
|
||||
__m128i H = digest[ 7 ];
|
||||
__m128i SS1, SS2, TT1, TT2, T;
|
||||
int j;
|
||||
|
||||
for ( j = 0; j < 16; j++ )
|
||||
W[j] = mm_byteswap_32( block[j] );
|
||||
|
||||
for ( j = 16; j < 68; j++ )
|
||||
W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
|
||||
W[ j-9 ] ),
|
||||
mm_rotl_32( W[ j-3 ], 15 ) ) ),
|
||||
_mm_xor_si128( mm_rotl_32( W[ j-13 ], 7 ),
|
||||
W[ j-6 ] ) );
|
||||
|
||||
for( j = 0; j < 64; j++ )
|
||||
W1[j] = _mm_xor_si128( W[j], W[j+4] );
|
||||
|
||||
T = _mm_set1_epi32( 0x79CC4519UL );
|
||||
for( j =0; j < 16; j++ )
|
||||
{
|
||||
SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ),
|
||||
mm_rotl_32( T, j ) ), 7 );
|
||||
SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) );
|
||||
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ),
|
||||
SS2 ), W1[j] );
|
||||
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ),
|
||||
SS1 ), W[j] );
|
||||
D = C;
|
||||
C = mm_rotl_32( B, 9 );
|
||||
B = A;
|
||||
A = TT1;
|
||||
H = G;
|
||||
G = mm_rotl_32( F, 19 );
|
||||
F = E;
|
||||
E = P0( TT2 );
|
||||
}
|
||||
|
||||
T = _mm_set1_epi32( 0x7A879D8AUL );
|
||||
for( j =16; j < 64; j++ )
|
||||
{
|
||||
SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ),
|
||||
mm_rotl_32( T, j&31 ) ), 7 );
|
||||
SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) );
|
||||
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ),
|
||||
SS2 ), W1[j] );
|
||||
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ),
|
||||
SS1 ), W[j] );
|
||||
D = C;
|
||||
C = mm_rotl_32( B, 9 );
|
||||
B = A;
|
||||
A = TT1;
|
||||
H = G;
|
||||
G = mm_rotl_32( F, 19 );
|
||||
F = E;
|
||||
E = P0( TT2 );
|
||||
}
|
||||
|
||||
digest[0] = _mm_xor_si128( digest[0], A );
|
||||
digest[1] = _mm_xor_si128( digest[1], B );
|
||||
digest[2] = _mm_xor_si128( digest[2], C );
|
||||
digest[3] = _mm_xor_si128( digest[3], D );
|
||||
digest[4] = _mm_xor_si128( digest[4], E );
|
||||
digest[5] = _mm_xor_si128( digest[5], F );
|
||||
digest[6] = _mm_xor_si128( digest[6], G );
|
||||
digest[7] = _mm_xor_si128( digest[7], H );
|
||||
}
|
||||
|
||||
#endif
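To make the 4-way interface above concrete, here is a minimal usage sketch (illustrative only, not part of the diff). It hashes four 64-byte messages in parallel; sm3_4way() expects its input interleaved one 32-bit word per lane, and the packing is done by hand here instead of the avxdefs.h interleave helpers so the example stays self-contained. The function name sm3_4way_hash64 is made up for illustration.

#include <emmintrin.h>   // __m128i, _mm_set_epi32, _mm_storeu_si128
#include <stdint.h>
#include "sm3-hash-4way.h"

void sm3_4way_hash64( const uint32_t m0[16], const uint32_t m1[16],
                      const uint32_t m2[16], const uint32_t m3[16],
                      uint32_t out[4][8] )
{
   __m128i vdata[16] __attribute__ ((aligned (64)));  // 16 words x 4 lanes
   __m128i vhash[8];
   sm3_4way_ctx_t ctx;

   for ( int i = 0; i < 16; i++ )
      vdata[i] = _mm_set_epi32( m3[i], m2[i], m1[i], m0[i] );  // lane 0 = m0

   sm3_4way_init( &ctx );
   sm3_4way( &ctx, vdata, 64 );          // length is in bytes
   sm3_4way_close( &ctx, vhash );

   for ( int i = 0; i < 8; i++ )         // word i of every lane's digest
   {
      uint32_t w[4];
      _mm_storeu_si128( (__m128i*)w, vhash[i] );
      out[0][i] = w[0];  out[1][i] = w[1];
      out[2][i] = w[2];  out[3][i] = w[3];
   }
}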
|
||||
|
89   algo/sm3/sm3-hash-4way.h   Normal file
@@ -0,0 +1,89 @@
|
||||
/* ====================================================================
|
||||
* Copyright (c) 2014 - 2016 The GmSSL Project. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this
|
||||
* software must display the following acknowledgment:
|
||||
* "This product includes software developed by the GmSSL Project.
|
||||
* (http://gmssl.org/)"
|
||||
*
|
||||
* 4. The name "GmSSL Project" must not be used to endorse or promote
|
||||
* products derived from this software without prior written
|
||||
* permission. For written permission, please contact
|
||||
* guanzhi1980@gmail.com.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "GmSSL"
|
||||
* nor may "GmSSL" appear in their names without prior written
|
||||
* permission of the GmSSL Project.
|
||||
*
|
||||
* 6. Redistributions of any form whatsoever must retain the following
|
||||
* acknowledgment:
|
||||
* "This product includes software developed by the GmSSL Project
|
||||
* (http://gmssl.org/)"
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
|
||||
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
* OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*/
|
||||
|
||||
#ifndef SPH_SM3_HASH_4WAY_H
|
||||
#define SPH_SM3_HASH_4WAY_H
|
||||
|
||||
#define SM3_DIGEST_LENGTH 32
|
||||
#define SM3_BLOCK_SIZE 64
|
||||
#define SM3_CBLOCK (SM3_BLOCK_SIZE)
|
||||
#define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH)
|
||||
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "avxdefs.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
typedef struct {
    __m128i block[16] __attribute__ ((aligned (64)));
    __m128i digest[8];
    uint32_t nblocks;
    uint32_t num;
} sm3_4way_ctx_t;

void sm3_4way_init( sm3_4way_ctx_t *ctx );
//void sm3_4way_update( sm3_4way_ctx_t *ctx, const unsigned char* data,
//                      size_t data_len );
//void sm3_4way_final( sm3_4way_ctx_t *ctx,
//                     unsigned char digest[SM3_DIGEST_LENGTH] );
void sm3_4way_compress( __m128i *digest, __m128i *block );

void sm3_4way(void *cc, const void *data, size_t len);
void sm3_4way_close(void *cc, void *dst);

#ifdef __cplusplus
}
#endif
#endif
|
@@ -189,7 +189,7 @@ void sm3_compress(uint32_t digest[8], const unsigned char block[64])
|
||||
for(j =16; j < 64; j++) {
|
||||
|
||||
T[j] = 0x7A879D8A;
|
||||
SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7);
|
||||
SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j&31)), 7);
|
||||
SS2 = SS1 ^ ROTATELEFT(A,12);
|
||||
TT1 = FF1(A,B,C) + D + SS2 + W1[j];
|
||||
TT2 = GG1(E,F,G) + H + SS1 + W[j];
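The only functional change in this hunk is the j & 31 mask on the round-constant rotation: for rounds 16 through 63 the unmasked count reaches 32 and beyond, and shifting a 32-bit value by its full width is undefined behaviour in C. A rotation helper that is safe for any count captures the same fix; the helper name is made up for illustration and is not part of the diff.

#include <stdint.h>

// Rotate left by any count; the masks keep both shift amounts strictly below
// 32, which is what the j & 31 in the hunk above achieves for the SM3 round
// constant.
static inline uint32_t rotl32_any( uint32_t x, unsigned n )
{
   n &= 31;
   return n ? ( x << n ) | ( x >> ( 32 - n ) ) : x;
}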
|
||||
|
@@ -252,8 +252,8 @@ SPH_XCAT(HASH, _addbits_and_close)(void *cc,
|
||||
current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
|
||||
#endif
|
||||
|
||||
uint64_t *b= (uint64_t*)sc->buf;
|
||||
uint64_t *s= (uint64_t*)sc->state;
|
||||
//uint64_t *b= (uint64_t*)sc->buf;
|
||||
//uint64_t *s= (uint64_t*)sc->state;
|
||||
// printf("Sptr 1= %u\n",current);
|
||||
// printf("SBuf %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] );
|
||||
// printf("SBuf %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] );
|
||||
|
@@ -3468,9 +3468,10 @@ sph_ ## name ## _close(void *cc, void *dst) \
|
||||
for (i = 0; i < 8; i ++) \
|
||||
sph_enc64le((unsigned char *)dst + 8 * i, sc->state[i]); \
|
||||
}
|
||||
// sph_ ## name ## _init(cc); \
|
||||
//}
|
||||
|
||||
/*
|
||||
sph_ ## name ## _init(cc); \
|
||||
}
|
||||
*/
|
||||
MAKE_CLOSE(whirlpool)
|
||||
MAKE_CLOSE(whirlpool0)
|
||||
MAKE_CLOSE(whirlpool1)
|
||||
|
@@ -1,4 +1,7 @@
|
||||
#include "whirlpool-gate.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
@@ -6,8 +9,6 @@
|
||||
#include "sph_whirlpool.h"
|
||||
#include "whirlpool-hash-4way.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
static __thread whirlpool_4way_context whirl_mid;
|
||||
|
||||
void whirlpool_hash_4way( void *state, const void *input )
|
||||
@@ -50,7 +51,7 @@ void whirlpool_hash_4way( void *state, const void *input )
|
||||
}
|
||||
|
||||
int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
|
||||
unsigned long *hashes_done )
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
@@ -67,8 +68,8 @@ int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
|
||||
// if (opt_benchmark)
|
||||
// ((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
|
||||
for (int i=0; i < 19; i++)
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
|
@@ -2,14 +2,16 @@
|
||||
|
||||
bool register_whirlpool_algo( algo_gate_t* gate )
|
||||
{
|
||||
//#if defined (WHIRLPOOL_4WAY)
|
||||
// gate->scanhash = (void*)&scanhash_whirlpool_4way;
|
||||
// gate->hash = (void*)&whirlpool_hash_4way;
|
||||
//#else
|
||||
#if defined (WHIRLPOOL_4WAY)
|
||||
four_way_not_tested();
|
||||
gate->optimizations = FOUR_WAY_OPT;
|
||||
gate->scanhash = (void*)&scanhash_whirlpool_4way;
|
||||
gate->hash = (void*)&whirlpool_hash_4way;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_whirlpool;
|
||||
gate->hash = (void*)&whirlpool_hash;
|
||||
init_whirlpool_ctx();
|
||||
//#endif
|
||||
#endif
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -4,21 +4,25 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
/*
|
||||
#if defined(FOUR_WAY) && defined(__AVX2__)
|
||||
#define WHIRLPOOL_4WAY
|
||||
#endif
|
||||
*/
|
||||
|
||||
//#if defined (WHIRLPOOL_4WAY)
|
||||
#if defined (WHIRLPOOL_4WAY)
|
||||
|
||||
//void whirlpool_hash_4way(void *state, const void *input);
|
||||
void whirlpool_hash_4way(void *state, const void *input);
|
||||
|
||||
//int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done );
|
||||
//#endif
|
||||
int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
#else
|
||||
|
||||
void whirlpool_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_whirlpool( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
void init_whirlpool_ctx();
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@@ -3345,8 +3345,12 @@ do { \
|
||||
#define READ_STATE MUL8(READ_STATE_W)
|
||||
#define ROUND0 MUL8(ROUND0_W)
|
||||
#define UPDATE_STATE MUL8(UPDATE_STATE_W)
|
||||
/*
|
||||
#define BYTE(x, n) \
|
||||
_mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
|
||||
*/
|
||||
#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF)
|
||||
|
||||
|
||||
// A very complex, but structured, expression with a mix of scalar
|
||||
// and vector operations to retrieve specific 64 bit constants from
|
||||
@@ -3357,21 +3361,51 @@ do { \
|
||||
// Extract 64 bit vector elements from "in" representing offsets. Unmask the
|
||||
// low byte of each and scale for use as vector indexes.
|
||||
// Pack the data in a vector and return it.
|
||||
|
||||
/*
|
||||
#define t_row( inv, row ) \
|
||||
_mm256_and_si256( \
|
||||
_mm256_srli_epi64( inv, row << 3 ), _mm256_set1_epi64x( 0xFF ) )
|
||||
|
||||
// Extract vector element from "lane" of vector "in[row]" and use it to index
|
||||
// scalar array of constants "table" and return referenced 64 bit entry.
|
||||
#define t_lane( table, inv, row, lane ) \
|
||||
table[ _mm256_extract_epi64( t_row( inv, row ), lane ) ]
|
||||
*/
|
||||
|
||||
// Build a vector from elements of non-contiguous 64 bit data extracted from
|
||||
// scalar "table".
|
||||
// reference scalar version 1480 kH/s
|
||||
/*
|
||||
// version 1, extract with gather
|
||||
// 955 kH/s
|
||||
#define t_lane( inv, row, lane ) \
|
||||
BYTE( _mm256_extract_epi64( inv, lane ), row ) \
|
||||
|
||||
|
||||
#define t_vec( table, inv, row ) \
|
||||
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
|
||||
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
|
||||
t_lane( table, inv, row, 0 ) )
|
||||
_mm256_i32gather_epi64( table, _mm_set_epi32( t_lane( inv, row, 3 ), \
|
||||
t_lane( inv, row, 2 ), t_lane( inv, row, 1 ), \
|
||||
t_lane( inv, row, 0) ), 1 )
|
||||
*/
|
||||
/*
|
||||
// version 2, extract with set
|
||||
// 1100 kH/s
|
||||
#define t_lane( table, inv, row, lane ) \
|
||||
table[ BYTE( _mm256_extract_epi64( inv, lane ), row ) ] \
|
||||
|
||||
#define t_vec( table, inv, row ) \
|
||||
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
|
||||
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
|
||||
t_lane( table, inv, row, 0 ) )
|
||||
*/
|
||||
|
||||
// version 3, vector indexing with set
|
||||
// 1105 kH/s
|
||||
#define t_lane( table, inv, row, lane ) \
|
||||
table[ BYTE( inv[ lane ], row ) ] \
|
||||
|
||||
#define t_vec( table, inv, row ) \
|
||||
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
|
||||
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
|
||||
t_lane( table, inv, row, 0 ) )
|
||||
|
||||
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT_WHIRLPOOL
|
||||
|
||||
|
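For context on the kH/s figures quoted in the comments above: the "reference scalar version" they are measured against is the classic table-driven Whirlpool round, where each 64-bit output word is the XOR of eight table entries selected by single bytes of the state. A sketch of that reference pattern follows; the table layout and index order are assumptions for illustration, not the project's exact code.

#include <stdint.h>

// One 64-bit column of a table-driven Whirlpool round: XOR of eight lookups,
// each indexed by one byte of a (rotating) state word, matching the BYTE(x,n)
// convention used above (byte n is bits 8n..8n+7).
static inline uint64_t whirl_col_scalar( const uint64_t table[8][256],
                                         const uint64_t state[8], int col )
{
   uint64_t r = 0;
   for ( int row = 0; row < 8; row++ )
      r ^= table[row][ ( state[ (col - row) & 7 ] >> ( 8 * row ) ) & 0xFF ];
   return r;
}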
252   algo/x11/c11-4way.c   Normal file
@@ -0,0 +1,252 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "c11-gate.h"
|
||||
|
||||
#if defined (__AVX2__) && defined (__AES__)
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
hashState_echo echo;
|
||||
} c11_4way_ctx_holder;
|
||||
|
||||
c11_4way_ctx_holder c11_4way_ctx;
|
||||
|
||||
void init_c11_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &c11_4way_ctx.blake );
|
||||
bmw512_4way_init( &c11_4way_ctx.bmw );
|
||||
init_groestl( &c11_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &c11_4way_ctx.skein );
|
||||
jh512_4way_init( &c11_4way_ctx.jh );
|
||||
keccak512_4way_init( &c11_4way_ctx.keccak );
|
||||
init_luffa( &c11_4way_ctx.luffa, 512 );
|
||||
cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &c11_4way_ctx.shavite );
|
||||
init_sd( &c11_4way_ctx.simd, 512 );
|
||||
init_echo( &c11_4way_ctx.echo, 512 );
|
||||
}
|
||||
|
||||
void c11_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
c11_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
|
||||
|
||||
// 1 Blake 4way
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
// 4way
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
// 4 JH
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
// 5 Keccak
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// 6 Skein
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 7 Luffa
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence*)hash0, 64 );
|
||||
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, 64 );
|
||||
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, 64 );
|
||||
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, 64 );
|
||||
|
||||
// 8 Cubehash
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
// 9 Shavite
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
// 10 Simd
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, 512 );
|
||||
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash1,
|
||||
(const BitSequence *)hash1, 512 );
|
||||
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash2,
|
||||
(const BitSequence *)hash2, 512 );
|
||||
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash3,
|
||||
(const BitSequence *)hash3, 512 );
|
||||
|
||||
// 11 Echo
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
}
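The repeated mm256_interleave_4x64 / mm256_deinterleave_4x64 calls above move the data between two layouts: Blake, BMW, JH, Keccak and Skein run 4-way on interleaved vector data, while Groestl, Luffa, Cubehash, Shavite, SIMD and Echo run once per lane on contiguous buffers. A plain-C model of what the 64-bit interleave does is sketched below; the layout is an assumption based on how a 256-bit 4-lane load must be arranged, not the avxdefs.h implementation itself.

#include <stdint.h>
#include <stddef.h>

// Word i of lane k lands at dst[ 4*i + k ], so one 256-bit load of
// dst[4*i .. 4*i+3] fetches word i of all four lanes at once.
static void interleave_4x64_model( uint64_t *dst, const uint64_t *s0,
            const uint64_t *s1, const uint64_t *s2, const uint64_t *s3,
            size_t bit_len )
{
   size_t nwords = bit_len / 64;
   for ( size_t i = 0; i < nwords; i++ )
   {
      dst[ 4*i + 0 ] = s0[i];
      dst[ 4*i + 1 ] = s1[i];
      dst[ 4*i + 2 ] = s2[i];
      dst[ 4*i + 3 ] = s3[i];
   }
}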
|
||||
|
||||
int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
0xFFF, 0xFFFF, 0x10000000 };
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// big endian encode 0..18 uint32_t, 64 bits at a time
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
for (int m=0; m < 6; m++)
|
||||
if (Htarg <= htmax[m])
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
c11_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce )
|
||||
&& !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
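The 4-way scanhash reports results differently from the scalar one: instead of returning on the first hit it tests all four lanes per pass, flags hits in work->nfound, stores the winning nonces in work->nonces, and returns how many were found. A caller-side sketch is shown below; scan_and_submit() and submit_lane_nonce() are illustrative names, not functions in this codebase.

#include <stdint.h>
#include <stdbool.h>

void scan_and_submit( int thr_id, struct work *work, uint32_t max_nonce )
{
   uint64_t hashes = 0;
   int hits = scanhash_c11_4way( thr_id, work, max_nonce, &hashes );
   for ( int lane = 0; lane < 4 && hits > 0; lane++ )
      if ( work->nfound[lane] )                            // lane flagged by the scanner
         submit_lane_nonce( work, work->nonces[lane] );    // hypothetical submit helper
}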
|
18   algo/x11/c11-gate.c   Normal file
@@ -0,0 +1,18 @@
|
||||
#include "c11-gate.h"

bool register_c11_algo( algo_gate_t* gate )
{
#if defined (C11_4WAY)
  init_c11_4way_ctx();
  gate->scanhash = (void*)&scanhash_c11_4way;
  gate->hash     = (void*)&c11_4way_hash;
#else
  init_c11_ctx();
  gate->scanhash = (void*)&scanhash_c11;
  gate->hash     = (void*)&c11_hash;
#endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
};
|
||||
|
32   algo/x11/c11-gate.h   Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef C11_GATE_H__
|
||||
#define C11_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(HASH_4WAY) && defined(__AES__)
|
||||
#define C11_4WAY
|
||||
#endif
|
||||
|
||||
bool register_c11_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(C11_4WAY)
|
||||
|
||||
void c11_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_c11_4way_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
void c11_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_c11_ctx();
|
||||
|
||||
#endif
|
||||
|
@@ -1,4 +1,4 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "c11-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
@@ -64,7 +64,7 @@ void init_c11_ctx()
|
||||
#endif
|
||||
}
|
||||
|
||||
void c11hash( void *output, const void *input )
|
||||
void c11_hash( void *output, const void *input )
|
||||
{
|
||||
unsigned char hash[128] _ALIGN(64); // uint32_t hashA[16], hashB[16];
|
||||
// uint32_t _ALIGN(64) hash[16];
|
||||
@@ -157,12 +157,13 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
do
|
||||
{
|
||||
be32enc( &endiandata[19], nonce );
|
||||
c11hash( hash, endiandata );
|
||||
c11_hash( hash, endiandata );
|
||||
if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 1;
|
||||
work_set_target_ratio( work, hash );
|
||||
return 1;
|
||||
}
|
||||
nonce++;
|
||||
} while ( nonce < max_nonce && !(*restart) );
|
||||
@@ -171,13 +172,3 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_c11_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
init_c11_ctx();
|
||||
gate->scanhash = (void*)&scanhash_c11;
|
||||
gate->hash = (void*)&c11hash;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
274   algo/x11/timetravel-4way.c   Normal file
@@ -0,0 +1,274 @@
|
||||
#include "timetravel-gate.h"
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
} tt8_4way_ctx_holder;
|
||||
|
||||
tt8_4way_ctx_holder tt8_4way_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void init_tt8_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &tt8_4way_ctx.blake );
|
||||
bmw512_4way_init( &tt8_4way_ctx.bmw );
|
||||
init_groestl( &tt8_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &tt8_4way_ctx.skein );
|
||||
jh512_4way_init( &tt8_4way_ctx.jh );
|
||||
keccak512_4way_init( &tt8_4way_ctx.keccak );
|
||||
init_luffa( &tt8_4way_ctx.luffa, 512 );
|
||||
cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 );
|
||||
};
|
||||
|
||||
void timetravel_4way_hash(void *output, const void *input)
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t *vhashA, *vhashB;
|
||||
tt8_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
uint32_t dataLen = 64;
|
||||
int i;
|
||||
|
||||
memcpy( &ctx, &tt8_4way_ctx, sizeof(tt8_4way_ctx) );
|
||||
|
||||
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
|
||||
{
|
||||
if (i == 0)
|
||||
{
|
||||
dataLen = 80;
|
||||
vhashA = (uint64_t*)input;
|
||||
vhashB = vhashX;
|
||||
}
|
||||
else
|
||||
{
|
||||
dataLen = 64;
|
||||
if ( i % 2 == 0 )
|
||||
{
|
||||
vhashA = vhashY;
|
||||
vhashB = vhashX;
|
||||
}
|
||||
else
|
||||
{
|
||||
vhashA = vhashX;
|
||||
vhashB = vhashY;
|
||||
}
|
||||
}
|
||||
|
||||
switch ( permutation[i] )
|
||||
{
|
||||
case 0:
|
||||
blake512_4way( &ctx.blake, vhashA, dataLen );
|
||||
blake512_4way_close( &ctx.blake, vhashB );
|
||||
if ( i == 7 )
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashB, dataLen<<3 );
|
||||
break;
|
||||
case 1:
|
||||
bmw512_4way( &ctx.bmw, vhashA, dataLen );
|
||||
bmw512_4way_close( &ctx.bmw, vhashB );
|
||||
if ( i == 7 )
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashB, dataLen<<3 );
|
||||
break;
|
||||
case 2:
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashA, dataLen<<3 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0,
|
||||
(char*)hash0, dataLen<<3 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1,
|
||||
(char*)hash1, dataLen<<3 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2,
|
||||
(char*)hash2, dataLen<<3 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3,
|
||||
(char*)hash3, dataLen<<3 );
|
||||
if ( i != 7 )
|
||||
mm256_interleave_4x64( vhashB,
|
||||
hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
break;
|
||||
case 3:
|
||||
skein512_4way( &ctx.skein, vhashA, dataLen );
|
||||
skein512_4way_close( &ctx.skein, vhashB );
|
||||
if ( i == 7 )
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashB, dataLen<<3 );
|
||||
break;
|
||||
case 4:
|
||||
jh512_4way( &ctx.jh, vhashA, dataLen );
|
||||
jh512_4way_close( &ctx.jh, vhashB );
|
||||
if ( i == 7 )
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashB, dataLen<<3 );
|
||||
break;
|
||||
case 5:
|
||||
keccak512_4way( &ctx.keccak, vhashA, dataLen );
|
||||
keccak512_4way_close( &ctx.keccak, vhashB );
|
||||
if ( i == 7 )
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashB, dataLen<<3 );
|
||||
break;
|
||||
case 6:
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashA, dataLen<<3 );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence *)hash0, dataLen );
|
||||
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, dataLen );
|
||||
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, dataLen );
|
||||
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, dataLen );
|
||||
if ( i != 7 )
|
||||
mm256_interleave_4x64( vhashB,
|
||||
hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
break;
|
||||
case 7:
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashA, dataLen<<3 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
|
||||
(const byte*)hash0, dataLen );
|
||||
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
|
||||
(const byte*)hash1, dataLen );
|
||||
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
|
||||
(const byte*)hash2, dataLen );
|
||||
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
|
||||
(const byte*)hash3, dataLen );
|
||||
if ( i != 7 )
|
||||
mm256_interleave_4x64( vhashB,
|
||||
hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
break;
|
||||
default:
|
||||
applog(LOG_ERR,"SWERR: timetravel invalid permutation");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
memcpy( output, hash0, 32 );
|
||||
memcpy( output+32, hash1, 32 );
|
||||
memcpy( output+64, hash2, 32 );
|
||||
memcpy( output+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
int i;
|
||||
|
||||
if ( opt_benchmark )
|
||||
ptarget[7] = 0x0cff;
|
||||
|
||||
for ( int k = 0; k < 19; k++ )
|
||||
be32enc( &endiandata[k], pdata[k] );
|
||||
|
||||
const uint32_t timestamp = endiandata[17];
|
||||
if ( timestamp != s_ntime )
|
||||
{
|
||||
const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP )
|
||||
% TT8_FUNC_COUNT_PERMUTATIONS;
|
||||
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
|
||||
permutation[i] = i;
|
||||
for ( i = 0; i < steps; i++ )
|
||||
tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT );
|
||||
s_ntime = timestamp;
|
||||
}
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
do
|
||||
{
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
timetravel_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
|
78   algo/x11/timetravel-gate.c   Normal file
@@ -0,0 +1,78 @@
|
||||
#include "timetravel-gate.h"
|
||||
|
||||
void tt8_set_target( struct work* work, double job_diff )
|
||||
{
|
||||
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
|
||||
}
|
||||
|
||||
bool register_timetravel_algo( algo_gate_t* gate )
|
||||
{
|
||||
#ifdef TIMETRAVEL_4WAY
|
||||
init_tt8_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_timetravel_4way;
|
||||
gate->hash = (void*)&timetravel_4way_hash;
|
||||
#else
|
||||
init_tt8_ctx();
|
||||
gate->scanhash = (void*)&scanhash_timetravel;
|
||||
gate->hash = (void*)&timetravel_hash;
|
||||
#endif
|
||||
gate->set_target = (void*)&tt8_set_target;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0xffffLL;
|
||||
return true;
|
||||
};
|
||||
|
||||
inline void tt_swap( int *a, int *b )
|
||||
{
|
||||
int c = *a;
|
||||
*a = *b;
|
||||
*b = c;
|
||||
}
|
||||
|
||||
inline void reverse( int *pbegin, int *pend )
|
||||
{
|
||||
while ( (pbegin != pend) && (pbegin != --pend) )
|
||||
{
|
||||
tt_swap( pbegin, pend );
|
||||
pbegin++;
|
||||
}
|
||||
}
|
||||
|
||||
void tt8_next_permutation( int *pbegin, int *pend )
|
||||
{
|
||||
if ( pbegin == pend )
|
||||
return;
|
||||
|
||||
int *i = pbegin;
|
||||
++i;
|
||||
if ( i == pend )
|
||||
return;
|
||||
|
||||
i = pend;
|
||||
--i;
|
||||
|
||||
while (1)
|
||||
{
|
||||
int *j = i;
|
||||
--i;
|
||||
|
||||
if ( *i < *j )
|
||||
{
|
||||
int *k = pend;
|
||||
|
||||
while ( !(*i < *--k) ) /* do nothing */ ;
|
||||
|
||||
tt_swap( i, k );
|
||||
reverse(j, pend);
|
||||
return; // true
|
||||
}
|
||||
|
||||
if ( i == pbegin )
|
||||
{
|
||||
reverse(pbegin, pend);
|
||||
return; // false
|
||||
}
|
||||
// else?
|
||||
}
|
||||
}
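tt8_next_permutation above is a C port of std::next_permutation: it steps an int array to its next lexicographic ordering, with TT8_FUNC_COUNT_PERMUTATIONS equal to 40320 (8!, the number of orderings of the eight functions). The per-block function order is derived from the block's ntime exactly as the scanhash functions do; the small helper below restates that derivation, and its name is made up for illustration.

#include <stdint.h>
#include "timetravel-gate.h"

// Derive the timetravel-8 function order for a given block timestamp,
// mirroring the logic in scanhash_timetravel_4way / scanhash_timetravel.
void tt8_order_for_ntime( uint32_t ntime, int order[TT8_FUNC_COUNT] )
{
   int steps = (int)( ( ntime - TT8_FUNC_BASE_TIMESTAMP )
                      % TT8_FUNC_COUNT_PERMUTATIONS );
   for ( int i = 0; i < TT8_FUNC_COUNT; i++ )
      order[i] = i;                                 // identity: blake .. cubehash
   for ( int i = 0; i < steps; i++ )
      tt8_next_permutation( order, order + TT8_FUNC_COUNT );  // one lexicographic step
}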
|
||||
|
40   algo/x11/timetravel-gate.h   Normal file
@@ -0,0 +1,40 @@
|
||||
#ifndef TIMETRAVEL_GATE_H__
|
||||
#define TIMETRAVEL_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(HASH_4WAY) && defined(__AES__)
|
||||
#define TIMETRAVEL_4WAY
|
||||
#endif
|
||||
|
||||
// Machinecoin Genesis Timestamp
|
||||
#define TT8_FUNC_BASE_TIMESTAMP 1389040865
|
||||
|
||||
#define TT8_FUNC_COUNT 8
|
||||
#define TT8_FUNC_COUNT_PERMUTATIONS 40320
|
||||
|
||||
void tt8_next_permutation( int *pbegin, int *pend );
|
||||
|
||||
bool register_timetravel_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(TIMETRAVEL_4WAY)
|
||||
|
||||
void timetravel_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_tt8_4way_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
void timetravel_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_tt8_ctx();
|
||||
|
||||
#endif
|
||||
|
@@ -1,11 +1,9 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "timetravel-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "avxdefs.h"
|
||||
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
@@ -13,75 +11,14 @@
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#else
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#endif
|
||||
|
||||
// Machinecoin Genesis Timestamp
|
||||
#define HASH_FUNC_BASE_TIMESTAMP 1389040865
|
||||
|
||||
#define HASH_FUNC_COUNT 8
|
||||
#define HASH_FUNC_COUNT_PERMUTATIONS 40320
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
static __thread int permutation[HASH_FUNC_COUNT] = { 0 };
|
||||
|
||||
inline void tt_swap( int *a, int *b )
|
||||
{
|
||||
int c = *a;
|
||||
*a = *b;
|
||||
*b = c;
|
||||
}
|
||||
|
||||
inline void reverse( int *pbegin, int *pend )
|
||||
{
|
||||
while ( (pbegin != pend) && (pbegin != --pend) )
|
||||
{
|
||||
tt_swap( pbegin, pend );
|
||||
pbegin++;
|
||||
}
|
||||
}
|
||||
|
||||
static void next_permutation( int *pbegin, int *pend )
|
||||
{
|
||||
if ( pbegin == pend )
|
||||
return;
|
||||
|
||||
int *i = pbegin;
|
||||
++i;
|
||||
if ( i == pend )
|
||||
return;
|
||||
|
||||
i = pend;
|
||||
--i;
|
||||
|
||||
while (1)
|
||||
{
|
||||
int *j = i;
|
||||
--i;
|
||||
|
||||
if ( *i < *j )
|
||||
{
|
||||
int *k = pend;
|
||||
|
||||
while ( !(*i < *--k) ) /* do nothing */ ;
|
||||
|
||||
tt_swap( i, k );
|
||||
reverse(j, pend);
|
||||
return; // true
|
||||
}
|
||||
|
||||
if ( i == pbegin )
|
||||
{
|
||||
reverse(pbegin, pend);
|
||||
return; // false
|
||||
}
|
||||
// else?
|
||||
}
|
||||
}
|
||||
static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
|
||||
|
||||
typedef struct {
|
||||
sph_blake512_context blake;
|
||||
@@ -101,7 +38,7 @@ typedef struct {
|
||||
tt_ctx_holder tt_ctx __attribute__ ((aligned (64)));
|
||||
__thread tt_ctx_holder tt_mid __attribute__ ((aligned (64)));
|
||||
|
||||
void init_tt_ctx()
|
||||
void init_tt8_ctx()
|
||||
{
|
||||
sph_blake512_init( &tt_ctx.blake );
|
||||
sph_bmw512_init( &tt_ctx.bmw );
|
||||
@@ -119,7 +56,7 @@ void init_tt_ctx()
|
||||
|
||||
void timetravel_hash(void *output, const void *input)
|
||||
{
|
||||
uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[ 16 * TT8_FUNC_COUNT ] __attribute__ ((aligned (64)));
|
||||
uint32_t *hashA, *hashB;
|
||||
tt_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
uint32_t dataLen = 64;
|
||||
@@ -130,7 +67,7 @@ void timetravel_hash(void *output, const void *input)
|
||||
|
||||
memcpy( &ctx, &tt_ctx, sizeof(tt_ctx) );
|
||||
|
||||
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
|
||||
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
|
||||
{
|
||||
if (i == 0)
|
||||
{
|
||||
@@ -270,7 +207,7 @@ void timetravel_hash(void *output, const void *input)
|
||||
}
|
||||
}
|
||||
|
||||
memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32);
|
||||
memcpy(output, &hash[16 * (TT8_FUNC_COUNT - 1)], 32);
|
||||
}
|
||||
|
||||
int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
@@ -296,12 +233,12 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
const uint32_t timestamp = endiandata[17];
|
||||
if ( timestamp != s_ntime )
|
||||
{
|
||||
const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
|
||||
% HASH_FUNC_COUNT_PERMUTATIONS;
|
||||
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
|
||||
const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP )
|
||||
% TT8_FUNC_COUNT_PERMUTATIONS;
|
||||
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
|
||||
permutation[i] = i;
|
||||
for ( i = 0; i < steps; i++ )
|
||||
next_permutation( permutation, permutation + HASH_FUNC_COUNT );
|
||||
tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT );
|
||||
s_ntime = timestamp;
|
||||
|
||||
// do midstate precalc for first function
|
||||
@@ -359,6 +296,7 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
work_set_target_ratio( work, hash );
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
work_set_target_ratio( work, hash );
|
||||
return 1;
|
||||
}
|
||||
nonce++;
|
||||
@@ -370,19 +308,4 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
void timetravel_set_target( struct work* work, double job_diff )
|
||||
{
|
||||
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
|
||||
}
|
||||
|
||||
bool register_timetravel_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
init_tt_ctx();
|
||||
gate->scanhash = (void*)&scanhash_timetravel;
|
||||
gate->hash = (void*)&timetravel_hash;
|
||||
gate->set_target = (void*)&timetravel_set_target;
|
||||
gate->get_max64 = (void*)&get_max64_0xffffLL;
|
||||
return true;
|
||||
};
|
||||
|
316   algo/x11/timetravel10-4way.c   Normal file
@@ -0,0 +1,316 @@
|
||||
#include "timetravel10-gate.h"
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
} tt10_4way_ctx_holder;
|
||||
|
||||
tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void init_tt10_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &tt10_4way_ctx.blake );
|
||||
bmw512_4way_init( &tt10_4way_ctx.bmw );
|
||||
init_groestl( &tt10_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &tt10_4way_ctx.skein );
|
||||
jh512_4way_init( &tt10_4way_ctx.jh );
|
||||
keccak512_4way_init( &tt10_4way_ctx.keccak );
|
||||
init_luffa( &tt10_4way_ctx.luffa, 512 );
|
||||
cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &tt10_4way_ctx.shavite );
|
||||
init_sd( &tt10_4way_ctx.simd, 512 );
|
||||
};
|
||||
|
||||
void timetravel10_4way_hash(void *output, const void *input)
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t *vhashA, *vhashB;
|
||||
tt10_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
uint32_t dataLen = 64;
|
||||
int i;
|
||||
|
||||
memcpy( &ctx, &tt10_4way_ctx, sizeof(tt10_4way_ctx) );
|
||||
|
||||
for ( i = 0; i < TT10_FUNC_COUNT; i++ )
|
||||
{
|
||||
if (i == 0)
|
||||
{
|
||||
dataLen = 80;
|
||||
vhashA = (uint64_t*)input;
|
||||
vhashB = vhashX;
|
||||
}
|
||||
else
|
||||
{
|
||||
dataLen = 64;
|
||||
if ( i % 2 == 0 )
|
||||
{
|
||||
vhashA = vhashY;
|
||||
vhashB = vhashX;
|
||||
}
|
||||
else
|
||||
{
|
||||
vhashA = vhashX;
|
||||
vhashB = vhashY;
|
||||
}
|
||||
}
|
||||
|
||||
switch ( permutation[i] )
|
||||
{
|
||||
case 0:
|
||||
blake512_4way( &ctx.blake, vhashA, dataLen );
|
||||
blake512_4way_close( &ctx.blake, vhashB );
|
||||
if ( i == 9 )
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashB, dataLen<<3 );
|
||||
break;
|
||||
case 1:
|
||||
bmw512_4way( &ctx.bmw, vhashA, dataLen );
|
||||
bmw512_4way_close( &ctx.bmw, vhashB );
|
||||
if ( i == 9 )
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashB, dataLen<<3 );
|
||||
break;
|
||||
case 2:
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashA, dataLen<<3 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0,
|
||||
(char*)hash0, dataLen<<3 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1,
|
||||
(char*)hash1, dataLen<<3 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2,
|
||||
(char*)hash2, dataLen<<3 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3,
|
||||
(char*)hash3, dataLen<<3 );
|
||||
if ( i != 9 )
|
||||
mm256_interleave_4x64( vhashB,
|
||||
hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
break;
|
||||
case 3:
|
||||
skein512_4way( &ctx.skein, vhashA, dataLen );
|
||||
skein512_4way_close( &ctx.skein, vhashB );
|
||||
if ( i == 9 )
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashB, dataLen<<3 );
|
||||
break;
|
||||
case 4:
|
||||
jh512_4way( &ctx.jh, vhashA, dataLen );
|
||||
jh512_4way_close( &ctx.jh, vhashB );
|
||||
if ( i == 9 )
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashB, dataLen<<3 );
|
||||
break;
|
||||
case 5:
|
||||
keccak512_4way( &ctx.keccak, vhashA, dataLen );
|
||||
keccak512_4way_close( &ctx.keccak, vhashB );
|
||||
if ( i == 9 )
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashB, dataLen<<3 );
|
||||
break;
|
||||
case 6:
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashA, dataLen<<3 );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence *)hash0, dataLen );
|
||||
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, dataLen );
|
||||
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, dataLen );
|
||||
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, dataLen );
|
||||
if ( i != 9 )
|
||||
mm256_interleave_4x64( vhashB,
|
||||
hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
break;
|
||||
case 7:
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashA, dataLen<<3 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
|
||||
(const byte*)hash0, dataLen );
|
||||
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
|
||||
(const byte*)hash1, dataLen );
|
||||
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
|
||||
(const byte*)hash2, dataLen );
|
||||
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
|
||||
(const byte*)hash3, dataLen );
|
||||
if ( i != 9 )
|
||||
mm256_interleave_4x64( vhashB,
|
||||
hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
break;
|
||||
case 8:
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhashA, dataLen<<3 );
|
||||
sph_shavite512( &ctx.shavite, hash0, dataLen );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
|
||||
sph_shavite512( &ctx.shavite, hash1, dataLen );
|
||||
         sph_shavite512_close( &ctx.shavite, hash1 );
         memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
         sph_shavite512( &ctx.shavite, hash2, dataLen );
         sph_shavite512_close( &ctx.shavite, hash2 );
         memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
         sph_shavite512( &ctx.shavite, hash3, dataLen );
         sph_shavite512_close( &ctx.shavite, hash3 );
         if ( i != 9 )
            mm256_interleave_4x64( vhashB,
                                   hash0, hash1, hash2, hash3, dataLen<<3 );
         break;
      case 9:
         mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                  vhashA, dataLen<<3 );
         update_final_sd( &ctx.simd, (BitSequence *)hash0,
                          (const BitSequence *)hash0, dataLen<<3 );
         memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
         update_final_sd( &ctx.simd, (BitSequence *)hash1,
                          (const BitSequence *)hash1, dataLen<<3 );
         memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
         update_final_sd( &ctx.simd, (BitSequence *)hash2,
                          (const BitSequence *)hash2, dataLen<<3 );
         memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
         update_final_sd( &ctx.simd, (BitSequence *)hash3,
                          (const BitSequence *)hash3, dataLen<<3 );
         if ( i != 9 )
            mm256_interleave_4x64( vhashB,
                                   hash0, hash1, hash2, hash3, dataLen<<3 );
         break;
      default:
         applog(LOG_ERR,"SWERR: timetravel invalid permutation");
         break;
      }
   }

   memcpy( output, hash0, 32 );
   memcpy( output+32, hash1, 32 );
   memcpy( output+64, hash2, 32 );
   memcpy( output+96, hash3, 32 );
}

int scanhash_timetravel10_4way( int thr_id, struct work *work,
                                uint32_t max_nonce, uint64_t *hashes_done )
{
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;
   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
   uint32_t *noncep1 = vdata + 75;
   uint32_t *noncep2 = vdata + 77;
   uint32_t *noncep3 = vdata + 79;
   const uint32_t Htarg = ptarget[7];
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   int i;

   if ( opt_benchmark )
      ptarget[7] = 0x0cff;

   for ( int k = 0; k < 19; k++ )
      be32enc( &endiandata[k], pdata[k] );

   const uint32_t timestamp = endiandata[17];
   if ( timestamp != s_ntime )
   {
      const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP )
                      % TT10_FUNC_COUNT_PERMUTATIONS;
      for ( i = 0; i < TT10_FUNC_COUNT; i++ )
         permutation[i] = i;
      for ( i = 0; i < steps; i++ )
         tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT );
      s_ntime = timestamp;
   }

   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

   do
   {
      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep0, n );
      be32enc( noncep1, n+1 );
      be32enc( noncep2, n+2 );
      be32enc( noncep3, n+3 );

      timetravel10_4way_hash( hash, vdata );
      pdata[19] = n;

      if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
      {
         found[0] = true;
         num_found++;
         nonces[0] = n;
         work_set_target_ratio( work, hash );
      }
      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) )
      {
         found[1] = true;
         num_found++;
         nonces[1] = n+1;
         work_set_target_ratio( work, hash+8 );
      }
      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) )
      {
         found[2] = true;
         num_found++;
         nonces[2] = n+2;
         work_set_target_ratio( work, hash+16 );
      }
      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) )
      {
         found[3] = true;
         num_found++;
         nonces[3] = n+3;
         work_set_target_ratio( work, hash+24 );
      }
      n += 4;
   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );

   *hashes_done = n - first_nonce + 1;
   return num_found;
}

#endif
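The nonce pointers used above (vdata + 73, 75, 77, 79) follow directly from the 4x64 interleaved layout produced by mm256_interleave_4x64: the 80-byte header is 10 64-bit words per lane, the lanes are interleaved word by word, and the nonce is the high 32-bit half of word 9. The following scalar model is a sketch only, assuming a little-endian build and the word ordering implied by those offsets; the real AVX2 routine lives in avxdefs.h.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Scalar model of a 4-lane 64-bit interleave: dst[4*i + lane] = lane's word i. */
static void interleave_4x64_model( uint64_t *dst, const uint64_t *s0,
      const uint64_t *s1, const uint64_t *s2, const uint64_t *s3, int bit_len )
{
   int words = bit_len / 64;              /* 640 bits -> 10 words per lane */
   for ( int i = 0; i < words; i++ )
   {
      dst[ 4*i + 0 ] = s0[i];
      dst[ 4*i + 1 ] = s1[i];
      dst[ 4*i + 2 ] = s2[i];
      dst[ 4*i + 3 ] = s3[i];
   }
}

int main()
{
   uint32_t header[20] = {0};             /* 80-byte block header, nonce is word 19 */
   header[19] = 0x12345678;
   uint64_t lane[10];
   memcpy( lane, header, 80 );
   uint64_t vdata64[10*4];
   interleave_4x64_model( vdata64, lane, lane, lane, lane, 640 );
   uint32_t *vdata = (uint32_t*)vdata64;
   /* Header word 19 is the high half of 64-bit word 9, so lane 0's nonce sits at
      32-bit index 9*8 + 1 = 73; lanes 1..3 follow at 75, 77 and 79. */
   printf( "%08x %08x %08x %08x\n", vdata[73], vdata[75], vdata[77], vdata[79] );
   return 0;
}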
78  algo/x11/timetravel10-gate.c  Normal file
@@ -0,0 +1,78 @@
#include "timetravel10-gate.h"

void tt10_set_target( struct work* work, double job_diff )
{
   work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}

bool register_timetravel10_algo( algo_gate_t* gate )
{
#ifdef TIMETRAVEL10_4WAY
   init_tt10_4way_ctx();
   gate->scanhash = (void*)&scanhash_timetravel10_4way;
   gate->hash = (void*)&timetravel10_4way_hash;
#else
   init_tt10_ctx();
   gate->scanhash = (void*)&scanhash_timetravel10;
   gate->hash = (void*)&timetravel10_hash;
#endif
   gate->set_target = (void*)&tt10_set_target;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->get_max64 = (void*)&get_max64_0xffffLL;
   return true;
};

inline void tt10_swap( int *a, int *b )
{
   int c = *a;
   *a = *b;
   *b = c;
}

inline void reverse( int *pbegin, int *pend )
{
   while ( (pbegin != pend) && (pbegin != --pend) )
   {
      tt10_swap( pbegin, pend );
      pbegin++;
   }
}

void tt10_next_permutation( int *pbegin, int *pend )
{
   if ( pbegin == pend )
      return;

   int *i = pbegin;
   ++i;
   if ( i == pend )
      return;

   i = pend;
   --i;

   while (1)
   {
      int *j = i;
      --i;

      if ( *i < *j )
      {
         int *k = pend;

         while ( !(*i < *--k) ) /* do nothing */ ;

         tt10_swap( i, k );
         reverse(j, pend);
         return; // true
      }

      if ( i == pbegin )
      {
         reverse(pbegin, pend);
         return; // false
      }
      // else?
   }
}
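As a sanity check of the scheme above, the per-block hash order can be derived on its own: the step count is (ntime - TT10_FUNC_BASE_TIMESTAMP) mod TT10_FUNC_COUNT_PERMUTATIONS, and the identity permutation of 0..9 is advanced that many times with the next-permutation routine. The sketch below copies the constants from the gate header and uses an arbitrary demo timestamp, not data from a real block.

#include <stdint.h>
#include <stdio.h>

#define TT10_FUNC_BASE_TIMESTAMP 1492973331U
#define TT10_FUNC_COUNT 10
#define TT10_FUNC_COUNT_PERMUTATIONS 40320

static void swap_int( int *a, int *b ) { int c = *a; *a = *b; *b = c; }

static void rev( int *b, int *e )
{
   while ( (b != e) && (b != --e) ) { swap_int( b, e ); b++; }
}

/* Same algorithm as tt10_next_permutation above (std::next_permutation in C). */
static void next_perm( int *b, int *e )
{
   if ( b == e ) return;
   int *i = b; ++i;
   if ( i == e ) return;
   i = e; --i;
   while (1)
   {
      int *j = i; --i;
      if ( *i < *j )
      {
         int *k = e;
         while ( !(*i < *--k) );
         swap_int( i, k ); rev( j, e ); return;
      }
      if ( i == b ) { rev( b, e ); return; }
   }
}

int main()
{
   uint32_t ntime = 1493000000U;          /* example timestamp only */
   int perm[TT10_FUNC_COUNT];
   int steps = ( ntime - TT10_FUNC_BASE_TIMESTAMP ) % TT10_FUNC_COUNT_PERMUTATIONS;
   for ( int i = 0; i < TT10_FUNC_COUNT; i++ ) perm[i] = i;
   for ( int i = 0; i < steps; i++ ) next_perm( perm, perm + TT10_FUNC_COUNT );
   for ( int i = 0; i < TT10_FUNC_COUNT; i++ ) printf( "%d ", perm[i] );
   printf( "\n" );
   return 0;
}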
39  algo/x11/timetravel10-gate.h  Normal file
@@ -0,0 +1,39 @@
#ifndef TIMETRAVEL10_GATE_H__
#define TIMETRAVEL10_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(HASH_4WAY) && defined(__AES__)
#define TIMETRAVEL10_4WAY
#endif

// BitCore Genesis Timestamp
#define TT10_FUNC_BASE_TIMESTAMP 1492973331U
#define TT10_FUNC_COUNT 10
#define TT10_FUNC_COUNT_PERMUTATIONS 40320

void tt10_next_permutation( int *pbegin, int *pend );

bool register_timetravel10_algo( algo_gate_t* gate );

#if defined(TIMETRAVEL10_4WAY)

void timetravel10_4way_hash( void *state, const void *input );

int scanhash_timetravel10_4way( int thr_id, struct work *work,
                                uint32_t max_nonce, uint64_t *hashes_done );

void init_tt10_4way_ctx();

#endif

void timetravel10_hash( void *state, const void *input );

int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done );

void init_tt10_ctx();

#endif
@@ -1,11 +1,8 @@
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#include "timetravel10-gate.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "avxdefs.h"
|
||||
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
@@ -22,68 +19,8 @@
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#endif
|
||||
|
||||
// BitCore Genesis Timestamp
|
||||
#define HASH_FUNC_BASE_TIMESTAMP 1492973331U
|
||||
|
||||
#define HASH_FUNC_COUNT 10
|
||||
#define HASH_FUNC_COUNT_PERMUTATIONS 40320
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
static __thread int permutation[HASH_FUNC_COUNT] = { 0 };
|
||||
|
||||
inline void tt10_swap( int *a, int *b )
|
||||
{
|
||||
int c = *a;
|
||||
*a = *b;
|
||||
*b = c;
|
||||
}
|
||||
|
||||
inline void reverse( int *pbegin, int *pend )
|
||||
{
|
||||
while ( (pbegin != pend) && (pbegin != --pend) )
|
||||
{
|
||||
tt10_swap( pbegin, pend );
|
||||
pbegin++;
|
||||
}
|
||||
}
|
||||
|
||||
static void next_permutation( int *pbegin, int *pend )
|
||||
{
|
||||
if ( pbegin == pend )
|
||||
return;
|
||||
|
||||
int *i = pbegin;
|
||||
++i;
|
||||
if ( i == pend )
|
||||
return;
|
||||
|
||||
i = pend;
|
||||
--i;
|
||||
|
||||
while (1)
|
||||
{
|
||||
int *j = i;
|
||||
--i;
|
||||
|
||||
if ( *i < *j )
|
||||
{
|
||||
int *k = pend;
|
||||
|
||||
while ( !(*i < *--k) ) /* do nothing */ ;
|
||||
|
||||
tt10_swap( i, k );
|
||||
reverse(j, pend);
|
||||
return; // true
|
||||
}
|
||||
|
||||
if ( i == pbegin )
|
||||
{
|
||||
reverse(pbegin, pend);
|
||||
return; // false
|
||||
}
|
||||
// else?
|
||||
}
|
||||
}
|
||||
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
|
||||
|
||||
typedef struct {
|
||||
sph_blake512_context blake;
|
||||
@@ -125,7 +62,7 @@ void init_tt10_ctx()
|
||||
|
||||
void timetravel10_hash(void *output, const void *input)
|
||||
{
|
||||
uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[ 16 * TT10_FUNC_COUNT ] __attribute__ ((aligned (64)));
|
||||
uint32_t *hashA, *hashB;
|
||||
tt10_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
uint32_t dataLen = 64;
|
||||
@@ -136,7 +73,7 @@ void timetravel10_hash(void *output, const void *input)
|
||||
|
||||
memcpy( &ctx, &tt10_ctx, sizeof(tt10_ctx) );
|
||||
|
||||
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
|
||||
for ( i = 0; i < TT10_FUNC_COUNT; i++ )
|
||||
{
|
||||
if (i == 0)
|
||||
{
|
||||
@@ -302,7 +239,7 @@ void timetravel10_hash(void *output, const void *input)
|
||||
}
|
||||
}
|
||||
|
||||
memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32);
|
||||
memcpy(output, &hash[16 * (TT10_FUNC_COUNT - 1)], 32);
|
||||
}
|
||||
|
||||
int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
@@ -328,12 +265,12 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
const uint32_t timestamp = endiandata[17];
|
||||
if ( timestamp != s_ntime )
|
||||
{
|
||||
const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
|
||||
% HASH_FUNC_COUNT_PERMUTATIONS;
|
||||
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
|
||||
const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP )
|
||||
% TT10_FUNC_COUNT_PERMUTATIONS;
|
||||
for ( i = 0; i < TT10_FUNC_COUNT; i++ )
|
||||
permutation[i] = i;
|
||||
for ( i = 0; i < steps; i++ )
|
||||
next_permutation( permutation, permutation + HASH_FUNC_COUNT );
|
||||
tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT );
|
||||
s_ntime = timestamp;
|
||||
|
||||
// do midstate precalc for first function
|
||||
@@ -398,6 +335,7 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
work_set_target_ratio( work, hash );
|
||||
pdata[19] = nonce;
|
||||
work_set_target_ratio( work, hash );
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 1;
|
||||
}
|
||||
@@ -409,20 +347,3 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void timetravel10_set_target( struct work* work, double job_diff )
|
||||
{
|
||||
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
|
||||
}
|
||||
|
||||
bool register_timetravel10_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
init_tt10_ctx();
|
||||
gate->scanhash = (void*)&scanhash_timetravel10;
|
||||
gate->hash = (void*)&timetravel10_hash;
|
||||
gate->set_target = (void*)&timetravel10_set_target;
|
||||
gate->get_max64 = (void*)&get_max64_0xffffLL;
|
||||
return true;
|
||||
};
|
||||
|
@@ -10,8 +10,14 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/echo/aes_ni/hash_api.h"

//hashState_echo tribus_4way_ctx __attribute__ ((aligned (64)));
static __thread jh512_4way_context ctx_mid;

/*
void init_tribus_4way_ctx()
{
   init_echo( &tribus_4way_ctx, 512 );
}
*/
void tribus_hash_4way(void *state, const void *input)
{
   uint64_t hash0[8] __attribute__ ((aligned (64)));
@@ -1,22 +1,11 @@
#include "tribus-gate.h"
/*
bool tribus_thread_init()
{
   sph_jh512_init( &tribus_ctx.jh );
   sph_keccak512_init( &tribus_ctx.keccak );
#ifdef NO_AES_NI
   sph_echo512_init( &tribus_ctx.echo );
#else
   init_echo( &tribus_ctx.echo, 512 );
#endif
   return true;
}
*/

bool register_tribus_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->get_max64 = (void*)&get_max64_0x1ffff;
#if defined (TRIBUS_4WAY)
//   init_tribus_4way_ctx();
   gate->scanhash = (void*)&scanhash_tribus_4way;
   gate->hash = (void*)&tribus_hash_4way;
#else
@@ -4,12 +4,14 @@
#include "algo-gate-api.h"
#include <stdint.h>

#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
#if defined(HASH_4WAY) && defined(__AES__)
#define TRIBUS_4WAY
#endif

#if defined(TRIBUS_4WAY)

//void init_tribus_4way_ctx();

void tribus_hash_4way( void *state, const void *input );

int scanhash_tribus_4way( int thr_id, struct work *work, uint32_t max_nonce,
252  algo/x11/x11-4way.c  Normal file
@@ -0,0 +1,252 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "x11-gate.h"
|
||||
|
||||
#if defined (__AVX2__) && defined (__AES__)
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
hashState_echo echo;
|
||||
} x11_4way_ctx_holder;
|
||||
|
||||
x11_4way_ctx_holder x11_4way_ctx;
|
||||
|
||||
void init_x11_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x11_4way_ctx.blake );
|
||||
bmw512_4way_init( &x11_4way_ctx.bmw );
|
||||
init_groestl( &x11_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x11_4way_ctx.skein );
|
||||
jh512_4way_init( &x11_4way_ctx.jh );
|
||||
keccak512_4way_init( &x11_4way_ctx.keccak );
|
||||
init_luffa( &x11_4way_ctx.luffa, 512 );
|
||||
cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &x11_4way_ctx.shavite );
|
||||
init_sd( &x11_4way_ctx.simd, 512 );
|
||||
init_echo( &x11_4way_ctx.echo, 512 );
|
||||
}
|
||||
|
||||
void x11_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
x11_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) );
|
||||
|
||||
// 1 Blake 4way
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
// 4way
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
// 4 Skein
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
// 5 JH
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
// 6 Keccak
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 7 Luffa
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence*)hash0, 64 );
|
||||
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, 64 );
|
||||
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, 64 );
|
||||
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, 64 );
|
||||
|
||||
// 8 Cubehash
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
// 9 Shavite
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
// 10 Simd
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, 512 );
|
||||
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash1,
|
||||
(const BitSequence *)hash1, 512 );
|
||||
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash2,
|
||||
(const BitSequence *)hash2, 512 );
|
||||
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash3,
|
||||
(const BitSequence *)hash3, 512 );
|
||||
|
||||
// 11 Echo
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
0xFFF, 0xFFFF, 0x10000000 };
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// big endian encode 0..18 uint32_t, 64 bits at a time
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
for (int m=0; m < 6; m++)
|
||||
if (Htarg <= htmax[m])
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
x11_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce )
|
||||
&& !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
|
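The htmax/masks pair in scanhash_x11_4way above is a cheap pre-filter: the smaller Htarg (ptarget[7]) is, the more high-order bits of hash[7] must be zero, so ( hash[7] & mask ) == 0 rejects most candidates before the full 256-bit comparison in fulltest. A minimal sketch of the same idea in isolation; the tables are copied from the loop above, while the wrapper and its fulltest function-pointer parameter are hypothetical names for this sketch only.

#include <stdint.h>
#include <stdbool.h>

/* Pick the coarse rejection mask that matches the target difficulty,
   exactly as the htmax/masks tables above do. */
static uint32_t select_mask( uint32_t Htarg )
{
   const uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };
   const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                              0xFFFFF000, 0xFFFF0000, 0 };
   for ( int m = 0; m < 6; m++ )
      if ( Htarg <= htmax[m] )
         return masks[m];
   return 0;
}

/* Hypothetical wrapper: cheap mask test first, full test only on survivors. */
static bool candidate_ok( const uint32_t hash[8], const uint32_t target[8],
                          bool (*full_test)( const uint32_t*, const uint32_t* ) )
{
   uint32_t mask = select_mask( target[7] );
   if ( ( hash[7] & mask ) != 0 )
      return false;                /* high word already too large */
   return full_test( hash, target );
}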
18  algo/x11/x11-gate.c  Normal file
@@ -0,0 +1,18 @@
#include "x11-gate.h"

bool register_x11_algo( algo_gate_t* gate )
{
#if defined (X11_4WAY)
   init_x11_4way_ctx();
   gate->scanhash = (void*)&scanhash_x11_4way;
   gate->hash = (void*)&x11_4way_hash;
#else
   init_x11_ctx();
   gate->scanhash = (void*)&scanhash_x11;
   gate->hash = (void*)&x11_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->get_max64 = (void*)&get_max64_0x3ffff;
   return true;
};
32  algo/x11/x11-gate.h  Normal file
@@ -0,0 +1,32 @@
#ifndef X11_GATE_H__
#define X11_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(HASH_4WAY) && defined(__AES__)
#define X11_4WAY
#endif

bool register_x11_algo( algo_gate_t* gate );

#if defined(X11_4WAY)

void x11_4way_hash( void *state, const void *input );

int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done );

void init_x11_4way_ctx();

#endif

void x11_hash( void *state, const void *input );

int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done );

void init_x11_ctx();

#endif
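For readers unfamiliar with the algo_gate pattern that runs through all of these files: a register function fills a table of function pointers once, choosing the 4-way or scalar implementation at registration time, and the miner threads only ever call through the table. The miniature below is illustrative only; the struct layout and names are not the project's real algo_gate_t, which is defined in algo-gate-api.h and carries many more members.

#include <stdint.h>
#include <stdbool.h>

struct work;                            /* opaque for this sketch */

/* Illustrative miniature of the gate table. */
typedef struct {
   int  (*scanhash)( int thr_id, struct work *w,
                     uint32_t max_nonce, uint64_t *hashes_done );
   void (*hash)( void *output, const void *input );
} mini_gate_t;

int  scanhash_demo( int thr_id, struct work *w, uint32_t max_nonce,
                    uint64_t *hashes_done );
void hash_demo( void *output, const void *input );

/* Registration runs once and picks the implementation; worker threads
   afterwards call only through the pointers, never the functions directly. */
bool register_demo_algo( mini_gate_t *gate )
{
   gate->scanhash = scanhash_demo;
   gate->hash     = hash_demo;
   return true;
}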
@@ -1,5 +1,5 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#include "x11-gate.h"

#include <string.h>
#include <stdint.h>
@@ -61,7 +61,7 @@ void init_x11_ctx()
#endif
}

static void x11_hash( void *state, const void *input )
void x11_hash( void *state, const void *input )
{
   unsigned char hash[128] __attribute__ ((aligned (32)));
   unsigned char hashbuf[128] __attribute__ ((aligned (16)));
@@ -179,6 +179,7 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
   if ( fulltest( hash64, ptarget ) )
   {
      *hashes_done = n - first_nonce + 1;
      work_set_target_ratio( work, hash64 );
      return true;
   }
}
@@ -189,14 +190,3 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
   pdata[19] = n;
   return 0;
}

bool register_x11_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
   init_x11_ctx();
   gate->scanhash = (void*)&scanhash_x11;
   gate->hash = (void*)&x11_hash;
   gate->get_max64 = (void*)&get_max64_0x3ffff;
   return true;
};
340  algo/x11/x11evo-4way.c  Normal file
@@ -0,0 +1,340 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "x11evo-gate.h"
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <compat/portable_endian.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/sph_luffa.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
hashState_echo echo;
|
||||
} x11evo_4way_ctx_holder;
|
||||
|
||||
static x11evo_4way_ctx_holder x11evo_4way_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void init_x11evo_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x11evo_4way_ctx.blake );
|
||||
bmw512_4way_init( &x11evo_4way_ctx.bmw );
|
||||
init_groestl( &x11evo_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x11evo_4way_ctx.skein );
|
||||
jh512_4way_init( &x11evo_4way_ctx.jh );
|
||||
keccak512_4way_init( &x11evo_4way_ctx.keccak );
|
||||
init_luffa( &x11evo_4way_ctx.luffa, 512 );
|
||||
cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &x11evo_4way_ctx.shavite );
|
||||
init_sd( &x11evo_4way_ctx.simd, 512 );
|
||||
init_echo( &x11evo_4way_ctx.echo, 512 );
|
||||
}
|
||||
|
||||
static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
|
||||
void x11evo_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint32_t hash0[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash3[16] __attribute__ ((aligned (64)));
|
||||
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
|
||||
x11evo_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &x11evo_4way_ctx, sizeof(x11evo_4way_ctx) );
|
||||
|
||||
if ( s_seq == -1 )
|
||||
{
|
||||
uint32_t *data = (uint32_t*) input;
|
||||
const uint32_t ntime = data[17];
|
||||
evo_twisted_code( ntime, hashOrder );
|
||||
}
|
||||
|
||||
int i;
|
||||
int len = strlen( hashOrder );
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
char elem = hashOrder[i];
|
||||
uint8_t idx;
|
||||
if ( elem >= 'A' )
|
||||
idx = elem - 'A' + 10;
|
||||
else
|
||||
idx = elem - '0';
|
||||
|
||||
// int size = 64;
|
||||
|
||||
switch ( idx )
|
||||
{
|
||||
case 0:
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhash, 64<<3 );
|
||||
break;
|
||||
case 1:
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
if ( i >= len-1 )
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhash, 64<<3 );
|
||||
break;
|
||||
case 2:
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhash, 64<<3 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0,
|
||||
(char*)hash0, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1,
|
||||
(char*)hash1, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2,
|
||||
(char*)hash2, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3,
|
||||
(char*)hash3, 512 );
|
||||
if ( i < len-1 )
|
||||
mm256_interleave_4x64( vhash,
|
||||
hash0, hash1, hash2, hash3, 64<<3 );
|
||||
break;
|
||||
case 3:
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
if ( i >= len-1 )
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhash, 64<<3 );
|
||||
break;
|
||||
case 4:
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
if ( i >= len-1 )
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhash, 64<<3 );
|
||||
break;
|
||||
case 5:
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
if ( i >= len-1 )
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhash, 64<<3 );
|
||||
break;
|
||||
case 6:
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhash, 64<<3 );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence*)hash0, 64 );
|
||||
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
|
||||
sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, 64 );
|
||||
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
|
||||
sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, 64 );
|
||||
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
|
||||
sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, 64 );
|
||||
if ( i < len-1 )
|
||||
mm256_interleave_4x64( vhash,
|
||||
hash0, hash1, hash2, hash3, 64<<3 );
|
||||
break;
|
||||
case 7:
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhash, 64<<3 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
|
||||
(const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
|
||||
(const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
|
||||
(const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
|
||||
(const byte*) hash3, 64 );
|
||||
if ( i < len-1 )
|
||||
mm256_interleave_4x64( vhash,
|
||||
hash0, hash1, hash2, hash3, 64<<3 );
|
||||
break;
|
||||
case 8:
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhash, 64<<3 );
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
if ( i < len-1 )
|
||||
mm256_interleave_4x64( vhash,
|
||||
hash0, hash1, hash2, hash3, 64<<3 );
|
||||
break;
|
||||
case 9:
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhash, 64<<3 );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, 512 );
|
||||
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash1,
|
||||
(const BitSequence *)hash1, 512 );
|
||||
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash2,
|
||||
(const BitSequence *)hash2, 512 );
|
||||
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash3,
|
||||
(const BitSequence *)hash3, 512 );
|
||||
if ( i < len-1 )
|
||||
mm256_interleave_4x64( vhash,
|
||||
hash0, hash1, hash2, hash3, 64<<3 );
|
||||
break;
|
||||
case 10:
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
|
||||
vhash, 64<<3 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
if ( i < len-1 )
|
||||
mm256_interleave_4x64( vhash,
|
||||
hash0, hash1, hash2, hash3, 64<<3 );
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
}
|
||||
|
||||
//static const uint32_t diff1targ = 0x0000ffff;
|
||||
|
||||
int scanhash_x11evo_4way( int thr_id, struct work* work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
int ntime = endiandata[17];
|
||||
if ( ntime != s_ntime || s_seq == -1 )
|
||||
{
|
||||
evo_twisted_code( ntime, hashOrder );
|
||||
s_ntime = ntime;
|
||||
}
|
||||
|
||||
uint32_t hmask = 0xFFFFFFFF;
|
||||
if ( Htarg > 0 )
|
||||
{
|
||||
if ( Htarg <= 0xF )
|
||||
hmask = 0xFFFFFFF0;
|
||||
else if ( Htarg <= 0xFF )
|
||||
hmask = 0xFFFFFF00;
|
||||
else if ( Htarg <= 0xFFF )
|
||||
hmask = 0xFFFF000;
|
||||
else if ( Htarg <= 0xFFFF )
|
||||
hmask = 0xFFFF000;
|
||||
}
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
do
|
||||
{
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
x11evo_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( ( hash[7] & hmask ) == 0 && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( ( (hash+8)[7] & hmask ) == 0 && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( ( (hash+16)[7] & hmask ) == 0 && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( ( (hash+24)[7] & hmask ) == 0 && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce )
|
||||
&& !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
|
95  algo/x11/x11evo-gate.c  Normal file
@@ -0,0 +1,95 @@
#include "x11evo-gate.h"

int s_seq = -1;

static inline int getCurrentAlgoSeq( uint32_t current_time )
{
   // change once per day
   return (int) (current_time - X11EVO_INITIAL_DATE) / (60 * 60 * 24);
}

// swap_vars doesn't work here
void evo_swap( uint8_t *a, uint8_t *b )
{
   uint8_t __tmp = *a;
   *a = *b;
   *b = __tmp;
}

void initPerm( uint8_t n[], uint8_t count )
{
   int i;
   for ( i = 0; i<count; i++ )
      n[i] = i;
}

int nextPerm( uint8_t n[], uint32_t count )
{
   uint32_t tail = 0, i = 0, j = 0;

   if (unlikely( count <= 1 ))
      return 0;

   for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
   tail = i;

   if ( tail > 0 )
      for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
   evo_swap( &n[tail - 1], &n[j] );

   for ( i = tail, j = count - 1; i<j; i++, j-- )
      evo_swap( &n[i], &n[j] );

   return ( tail != 0 );
}

void getAlgoString( char *str, uint32_t count )
{
   uint8_t algoList[X11EVO_FUNC_COUNT];
   char *sptr;
   int j;
   int k;
   initPerm( algoList, X11EVO_FUNC_COUNT );

   for ( k = 0; k < count; k++ )
      nextPerm( algoList, X11EVO_FUNC_COUNT );

   sptr = str;
   for ( j = 0; j < X11EVO_FUNC_COUNT; j++ )
   {
      if ( algoList[j] >= 10 )
         sprintf( sptr, "%c", 'A' + (algoList[j] - 10) );
      else
         sprintf( sptr, "%u", algoList[j] );
      sptr++;
   }
   *sptr = 0;

   //applog(LOG_DEBUG, "nextPerm %s", str);
}

void evo_twisted_code( uint32_t ntime, char *permstr )
{
   int seq = getCurrentAlgoSeq( ntime );
   if ( s_seq != seq )
   {
      getAlgoString( permstr, seq );
      s_seq = seq;
   }
}

bool register_x11evo_algo( algo_gate_t* gate )
{
#if defined (X11EVO_4WAY)
   init_x11evo_4way_ctx();
   gate->scanhash = (void*)&scanhash_x11evo_4way;
   gate->hash = (void*)&x11evo_4way_hash;
#else
   init_x11evo_ctx();
   gate->scanhash = (void*)&scanhash_x11evo;
   gate->hash = (void*)&x11evo_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
   return true;
};
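The x11evo rotation changes once per day: getCurrentAlgoSeq counts whole days since X11EVO_INITIAL_DATE and getAlgoString advances the identity permutation of the 11 function indices that many steps, emitting one character per function ('0'-'9', then 'A' for index 10). The self-contained sketch below reproduces the order string for a given ntime; the permutation step mirrors nextPerm above, except that the swap is guarded when the sequence wraps around, and the demo timestamp is arbitrary.

#include <stdint.h>
#include <stdio.h>

#define X11EVO_INITIAL_DATE 1462060800
#define X11EVO_FUNC_COUNT 11

static void swap_u8( uint8_t *a, uint8_t *b ) { uint8_t t = *a; *a = *b; *b = t; }

/* Same permutation step as nextPerm above; tail == 0 means the last
   permutation was reached, so only the wrap-around reversal is done. */
static int next_perm( uint8_t n[], uint32_t count )
{
   uint32_t tail, i, j;
   if ( count <= 1 ) return 0;
   for ( i = count - 1; i > 0 && n[i-1] >= n[i]; i-- );
   tail = i;
   if ( tail > 0 )
   {
      for ( j = count - 1; j > tail && n[j] <= n[tail-1]; j-- );
      swap_u8( &n[tail-1], &n[j] );
   }
   for ( i = tail, j = count - 1; i < j; i++, j-- )
      swap_u8( &n[i], &n[j] );
   return tail != 0;
}

static void algo_string_for_day( char *str, uint32_t day_seq )
{
   uint8_t order[X11EVO_FUNC_COUNT];
   for ( int i = 0; i < X11EVO_FUNC_COUNT; i++ ) order[i] = i;
   for ( uint32_t k = 0; k < day_seq; k++ )
      next_perm( order, X11EVO_FUNC_COUNT );
   for ( int j = 0; j < X11EVO_FUNC_COUNT; j++ )
      str[j] = order[j] < 10 ? '0' + order[j] : 'A' + ( order[j] - 10 );
   str[X11EVO_FUNC_COUNT] = 0;
}

int main()
{
   uint32_t ntime = 1493000000U;          /* arbitrary demo timestamp */
   int day = (int)( ( ntime - X11EVO_INITIAL_DATE ) / ( 60 * 60 * 24 ) );
   char order[X11EVO_FUNC_COUNT + 1];
   algo_string_for_day( order, day );
   printf( "day %d -> order %s\n", day, order );
   return 0;
}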
39  algo/x11/x11evo-gate.h  Normal file
@@ -0,0 +1,39 @@
#ifndef X11EVO_GATE_H__
#define X11EVO_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(HASH_4WAY) && defined(__AES__)
#define X11EVO_4WAY
#endif

#define X11EVO_INITIAL_DATE 1462060800
#define X11EVO_FUNC_COUNT 11

extern int s_seq;

bool register_x11evo_algo( algo_gate_t* gate );

#if defined(X11EVO_4WAY)

void x11evo_4way_hash( void *state, const void *input );

int scanhash_x11evo_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done );

void init_x11evo_4way_ctx();

#endif

void x11evo_hash( void *state, const void *input );

int scanhash_x11evo( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done );

void init_x11evo_ctx();

void evo_twisted_code( uint32_t ntime, char *permstr );

#endif
@@ -1,5 +1,5 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "algo-gate-api.h"
|
||||
#include "x11evo-gate.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
@@ -26,9 +26,6 @@
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
|
||||
#define INITIAL_DATE 1462060800
|
||||
#define HASH_FUNC_COUNT 11
|
||||
|
||||
typedef struct {
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_context groestl;
|
||||
@@ -70,94 +67,10 @@ void init_x11evo_ctx()
|
||||
sph_shavite512_init( &x11evo_ctx.shavite );
|
||||
}
|
||||
|
||||
/*
|
||||
uint32_t getCurrentAlgoSeq(uint32_t current_time, uint32_t base_time)
|
||||
{
|
||||
return (current_time - base_time) / (60 * 60 * 24);
|
||||
}
|
||||
*/
|
||||
|
||||
static inline int getCurrentAlgoSeq( uint32_t current_time )
|
||||
{
|
||||
// change once per day
|
||||
return (int) (current_time - INITIAL_DATE) / (60 * 60 * 24);
|
||||
}
|
||||
|
||||
// swap_vars doesn't work here
|
||||
void evo_swap( uint8_t *a, uint8_t *b )
|
||||
{
|
||||
uint8_t __tmp = *a;
|
||||
*a = *b;
|
||||
*b = __tmp;
|
||||
}
|
||||
|
||||
void initPerm( uint8_t n[], uint8_t count )
|
||||
{
|
||||
int i;
|
||||
for ( i = 0; i<count; i++ )
|
||||
n[i] = i;
|
||||
}
|
||||
|
||||
int nextPerm( uint8_t n[], uint32_t count )
|
||||
{
|
||||
uint32_t tail = 0, i = 0, j = 0;
|
||||
|
||||
if (unlikely( count <= 1 ))
|
||||
return 0;
|
||||
|
||||
for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
|
||||
tail = i;
|
||||
|
||||
if ( tail > 0 )
|
||||
for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
|
||||
evo_swap( &n[tail - 1], &n[j] );
|
||||
|
||||
for ( i = tail, j = count - 1; i<j; i++, j-- )
|
||||
evo_swap( &n[i], &n[j] );
|
||||
|
||||
return ( tail != 0 );
|
||||
}
|
||||
|
||||
void getAlgoString( char *str, uint32_t count )
|
||||
{
|
||||
uint8_t algoList[HASH_FUNC_COUNT];
|
||||
char *sptr;
|
||||
int j;
|
||||
int k;
|
||||
initPerm( algoList, HASH_FUNC_COUNT );
|
||||
|
||||
for ( k = 0; k < count; k++ )
|
||||
nextPerm( algoList, HASH_FUNC_COUNT );
|
||||
|
||||
sptr = str;
|
||||
for ( j = 0; j < HASH_FUNC_COUNT; j++ )
|
||||
{
|
||||
if ( algoList[j] >= 10 )
|
||||
sprintf( sptr, "%c", 'A' + (algoList[j] - 10) );
|
||||
else
|
||||
sprintf( sptr, "%u", algoList[j] );
|
||||
sptr++;
|
||||
}
|
||||
*sptr = 0;
|
||||
|
||||
//applog(LOG_DEBUG, "nextPerm %s", str);
|
||||
}
|
||||
|
||||
static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
|
||||
static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
static int s_seq = -1;
|
||||
|
||||
static void evo_twisted_code(uint32_t ntime, char *permstr)
|
||||
{
|
||||
int seq = getCurrentAlgoSeq(ntime);
|
||||
if (s_seq != seq)
|
||||
{
|
||||
getAlgoString(permstr, seq);
|
||||
s_seq = seq;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void x11evo_hash( void *state, const void *input )
|
||||
void x11evo_hash( void *state, const void *input )
|
||||
{
|
||||
uint32_t hash[16] __attribute__ ((aligned (64)));
|
||||
x11evo_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
@@ -242,10 +155,10 @@ static inline void x11evo_hash( void *state, const void *input )
|
||||
memcpy( state, hash, 32 );
|
||||
}
|
||||
|
||||
static const uint32_t diff1targ = 0x0000ffff;
|
||||
//static const uint32_t diff1targ = 0x0000ffff;
|
||||
|
||||
int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
|
||||
unsigned long *hashes_done )
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t hash64[8] __attribute__((aligned(64)));
|
||||
@@ -274,19 +187,20 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
|
||||
else if ( Htarg <= 0xFFF )
|
||||
hmask = 0xFFFF000;
|
||||
else if ( Htarg <= 0xFFFF )
|
||||
hmask = 0xFFFF000;
|
||||
hmask = 0xFFFF000;
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
pdata[19] = ++n;
|
||||
be32enc( &endiandata[19], n );
|
||||
x11evo_hash( hash64, &endiandata );
|
||||
x11evo_hash( hash64, endiandata );
|
||||
if ( ( hash64[7] & hmask ) == 0 )
|
||||
{
|
||||
if ( fulltest( hash64, ptarget ) )
|
||||
{
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
work_set_target_ratio( work, hash64 );
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -296,13 +210,3 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_x11evo_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_x11evo;
|
||||
gate->hash = (void*)&x11evo_hash;
|
||||
init_x11evo_ctx();
|
||||
return true;
|
||||
};
|
||||
|
||||
|
259  algo/x11/x11gost-4way.c  Normal file
@@ -0,0 +1,259 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "x11gost-gate.h"
|
||||
|
||||
#if defined (__AVX2__) && defined (__AES__)
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
sph_gost512_context gost;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
hashState_echo echo;
|
||||
} x11gost_4way_ctx_holder;
|
||||
|
||||
x11gost_4way_ctx_holder x11gost_4way_ctx;
|
||||
|
||||
void init_x11gost_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x11gost_4way_ctx.blake );
|
||||
bmw512_4way_init( &x11gost_4way_ctx.bmw );
|
||||
init_groestl( &x11gost_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x11gost_4way_ctx.skein );
|
||||
jh512_4way_init( &x11gost_4way_ctx.jh );
|
||||
keccak512_4way_init( &x11gost_4way_ctx.keccak );
|
||||
sph_gost512_init( &x11gost_4way_ctx.gost );
|
||||
init_luffa( &x11gost_4way_ctx.luffa, 512 );
|
||||
cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &x11gost_4way_ctx.shavite );
|
||||
init_sd( &x11gost_4way_ctx.simd, 512 );
|
||||
init_echo( &x11gost_4way_ctx.echo, 512 );
|
||||
}
|
||||
|
||||
void x11gost_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
x11gost_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );
|
||||
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
// 4way
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_gost512( &ctx.gost, hash0, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash0 );
|
||||
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash1, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash1 );
|
||||
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash2, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash2 );
|
||||
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash3, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash3 );
|
||||
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence*)hash0, 64 );
|
||||
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, 64 );
|
||||
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, 64 );
|
||||
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, 64 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, 512 );
|
||||
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash1,
|
||||
(const BitSequence *)hash1, 512 );
|
||||
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash2,
|
||||
(const BitSequence *)hash2, 512 );
|
||||
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash3,
|
||||
(const BitSequence *)hash3, 512 );
|
||||
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
0xFFF, 0xFFFF, 0x10000000 };
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// big endian encode 0..18 uint32_t, 64 bits at a time
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
for (int m=0; m < 6; m++)
|
||||
if (Htarg <= htmax[m])
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
x11gost_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce )
|
||||
&& !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
|
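A pattern worth noting in x11gost_4way_hash, and in the other 4-way hashers in this set: each serial primitive's context is initialised once at startup and then restored with memcpy before every lane, which is cheaper than re-running the init routine four times per nonce batch. A generic sketch of the pattern with a placeholder context type; the names and the toy context are hypothetical, standing in for contexts such as sph_gost512_context.

#include <string.h>
#include <stdint.h>
#include <stddef.h>

/* Placeholder for a real hash context. */
typedef struct { uint64_t state[16]; uint32_t count; } toy_ctx;

static void toy_init( toy_ctx *c ) { memset( c, 0, sizeof *c ); /* expensive setup */ }
static void toy_hash( toy_ctx *c, void *out, const void *in, size_t len )
{ (void)c; (void)out; (void)in; (void)len; /* compress + finalize would go here */ }

static toy_ctx saved_ctx;              /* filled once, like init_x11gost_4way_ctx() */

void init_once( void ) { toy_init( &saved_ctx ); }

/* Per 4-lane batch: restore the pristine context by copy instead of re-initialising. */
void hash_four_lanes( void *h0, void *h1, void *h2, void *h3, size_t len )
{
   toy_ctx ctx;
   memcpy( &ctx, &saved_ctx, sizeof ctx );  toy_hash( &ctx, h0, h0, len );
   memcpy( &ctx, &saved_ctx, sizeof ctx );  toy_hash( &ctx, h1, h1, len );
   memcpy( &ctx, &saved_ctx, sizeof ctx );  toy_hash( &ctx, h2, h2, len );
   memcpy( &ctx, &saved_ctx, sizeof ctx );  toy_hash( &ctx, h3, h3, len );
}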
18  algo/x11/x11gost-gate.c  Normal file
@@ -0,0 +1,18 @@
#include "x11gost-gate.h"

bool register_x11gost_algo( algo_gate_t* gate )
{
#if defined (X11GOST_4WAY)
   init_x11gost_4way_ctx();
   gate->scanhash = (void*)&scanhash_x11gost_4way;
   gate->hash = (void*)&x11gost_4way_hash;
#else
   init_x11gost_ctx();
   gate->scanhash = (void*)&scanhash_x11gost;
   gate->hash = (void*)&x11gost_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->get_max64 = (void*)&get_max64_0x3ffff;
   return true;
};
32  algo/x11/x11gost-gate.h  Normal file
@@ -0,0 +1,32 @@
#ifndef X11GOST_GATE_H__
#define X11GOST_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(HASH_4WAY) && defined(__AES__)
#define X11GOST_4WAY
#endif

bool register_x11gost_algo( algo_gate_t* gate );

#if defined(X11GOST_4WAY)

void x11gost_4way_hash( void *state, const void *input );

int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done );

void init_x11gost_4way_ctx();

#endif

void x11gost_hash( void *state, const void *input );

int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done );

void init_x11gost_ctx();

#endif
@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x11gost-gate.h"

#include <stdlib.h>
#include <stdint.h>
@@ -37,28 +37,28 @@ typedef struct {
   hashState_echo echo;
   hashState_groestl groestl;
#endif
} sib_ctx_holder;
} x11gost_ctx_holder;

sib_ctx_holder sib_ctx;
x11gost_ctx_holder x11gost_ctx;

void init_sib_ctx()
void init_x11gost_ctx()
{
   sph_gost512_init(&sib_ctx.gost);
   sph_shavite512_init(&sib_ctx.shavite);
   init_luffa( &sib_ctx.luffa, 512 );
   cubehashInit( &sib_ctx.cube, 512, 16, 32 );
   init_sd( &sib_ctx.simd, 512 );
   sph_gost512_init( &x11gost_ctx.gost );
   sph_shavite512_init( &x11gost_ctx.shavite );
   init_luffa( &x11gost_ctx.luffa, 512 );
   cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
   init_sd( &x11gost_ctx.simd, 512 );
#ifdef NO_AES_NI
   sph_groestl512_init( &sib_ctx.groestl );
   sph_echo512_init( &sib_ctx.echo );
   sph_groestl512_init( &x11gost_ctx.groestl );
   sph_echo512_init( &x11gost_ctx.echo );
#else
   init_echo( &sib_ctx.echo, 512 );
   init_groestl( &sib_ctx.groestl, 64 );
   init_echo( &x11gost_ctx.echo, 512 );
   init_groestl( &x11gost_ctx.groestl, 64 );
#endif

}

void sibhash(void *output, const void *input)
void x11gost_hash(void *output, const void *input)
{
   unsigned char hash[128] __attribute__ ((aligned (64)));
   #define hashA hash
@@ -69,8 +69,8 @@ void sibhash(void *output, const void *input)
   sph_u64 hashctA;
   sph_u64 hashctB;

   sib_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &sib_ctx, sizeof(sib_ctx) );
   x11gost_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &x11gost_ctx, sizeof(x11gost_ctx) );

   DECL_BLK;
   BLK_I;
@@ -135,8 +135,8 @@ void sibhash(void *output, const void *input)
   memcpy(output, hashA, 32);
}

int scanhash_sib(int thr_id, struct work *work,
                 uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done)
{
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -156,11 +156,12 @@ int scanhash_sib(int thr_id, struct work *work,
   do {
      uint32_t hash[8];
      be32enc(&endiandata[19], nonce);
      sibhash(hash, endiandata);
      x11gost_hash(hash, endiandata);

      if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
         pdata[19] = nonce;
         *hashes_done = pdata[19] - first_nonce;
         work_set_target_ratio( work, hash );
         return 1;
      }
      nonce++;
@@ -172,12 +173,3 @@ int scanhash_sib(int thr_id, struct work *work,
   return 0;
}

bool register_sib_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
   init_sib_ctx();
   gate->scanhash = (void*)&scanhash_sib;
   gate->hash = (void*)&sibhash;
   gate->get_max64 = (void*)&get_max64_0x3ffff;
   return true;
}
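A side note on the pattern visible in init_x11gost_ctx() and x11gost_hash(): all contexts are initialised once at registration time and then memcpy'd into a local holder for every nonce, which is cheaper than re-running each *_init() per hash. A toy standalone sketch of the idiom follows; toy_ctx_t and its arithmetic are invented for illustration and are not taken from the miner.

#include <stdint.h>
#include <string.h>

/* Illustrative stand-in for the real context holder, which bundles the
   sph_* / AES-NI states for every algorithm in the chain. */
typedef struct
{
   uint64_t state[8];
   uint32_t counter;
} toy_ctx_t;

static toy_ctx_t toy_base_ctx;   /* filled once, at registration time */

void toy_init_ctx( void )
{
   memset( &toy_base_ctx, 0, sizeof(toy_base_ctx) );
   toy_base_ctx.state[0] = 0x6a09e667f3bcc908ULL;   /* arbitrary IV for the demo */
}

void toy_hash( void *output, const void *input, size_t len )
{
   /* Copying the pre-initialised contexts replaces a fresh *_init() per call,
      which is the point of the memcpy at the top of x11gost_hash(). */
   toy_ctx_t ctx;
   memcpy( &ctx, &toy_base_ctx, sizeof(toy_base_ctx) );

   const unsigned char *p = input;
   for ( size_t i = 0; i < len; i++ )
      ctx.state[ i & 7 ] += p[i];
   memcpy( output, ctx.state, 32 );
}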
186
algo/x13/phi1612-4way.c
Normal file
186
algo/x13/phi1612-4way.c
Normal file
@@ -0,0 +1,186 @@
#include "x13-gate.h"

#if defined(__AVX2__) && defined(__AES__)

#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/gost/sph_gost.h"
#include "algo/echo/aes_ni/hash_api.h"

typedef struct {
   skein512_4way_context skein;
   jh512_4way_context jh;
   cubehashParam cube;
   sph_fugue512_context fugue;
   sph_gost512_context gost;
   hashState_echo echo;
} phi1612_4way_ctx_holder;

phi1612_4way_ctx_holder phi1612_4way_ctx __attribute__ ((aligned (64)));

void init_phi1612_4way_ctx()
{
   skein512_4way_init( &phi1612_4way_ctx.skein );
   jh512_4way_init( &phi1612_4way_ctx.jh );
   cubehashInit( &phi1612_4way_ctx.cube, 512, 16, 32 );
   sph_fugue512_init( &phi1612_4way_ctx.fugue );
   sph_gost512_init( &phi1612_4way_ctx.gost );
   init_echo( &phi1612_4way_ctx.echo, 512 );
};

void phi1612_4way_hash( void *state, const void *input )
{
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t hash2[8] __attribute__ ((aligned (64)));
   uint64_t hash3[8] __attribute__ ((aligned (64)));
   uint64_t vhash[8*4] __attribute__ ((aligned (64)));
   phi1612_4way_ctx_holder ctx;
   memcpy( &ctx, &phi1612_4way_ctx, sizeof(phi1612_4way_ctx) );

   // Skein parallel 4way
   skein512_4way( &ctx.skein, input, 80 );
   skein512_4way_close( &ctx.skein, vhash );

   // JH
   jh512_4way( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );

   // Serial to the end
   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

   // Cubehash
   cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
   memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
   cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
   memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
   cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
   memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
   cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );

   // Fugue
   sph_fugue512( &ctx.fugue, hash0, 64 );
   sph_fugue512_close( &ctx.fugue, hash0 );
   sph_fugue512_init( &ctx.fugue );
   sph_fugue512( &ctx.fugue, hash1, 64 );
   sph_fugue512_close( &ctx.fugue, hash1 );
   sph_fugue512_init( &ctx.fugue );
   sph_fugue512( &ctx.fugue, hash2, 64 );
   sph_fugue512_close( &ctx.fugue, hash2 );
   sph_fugue512_init( &ctx.fugue );
   sph_fugue512( &ctx.fugue, hash3, 64 );
   sph_fugue512_close( &ctx.fugue, hash3 );

   // Gost
   sph_gost512( &ctx.gost, hash0, 64 );
   sph_gost512_close( &ctx.gost, hash0 );
   sph_gost512_init( &ctx.gost );
   sph_gost512( &ctx.gost, hash1, 64 );
   sph_gost512_close( &ctx.gost, hash1 );
   sph_gost512_init( &ctx.gost );
   sph_gost512( &ctx.gost, hash2, 64 );
   sph_gost512_close( &ctx.gost, hash2 );
   sph_gost512_init( &ctx.gost );
   sph_gost512( &ctx.gost, hash3, 64 );
   sph_gost512_close( &ctx.gost, hash3 );

   // Echo
   update_final_echo( &ctx.echo, (BitSequence *)hash0,
                      (const BitSequence *) hash0, 512 );
   init_echo( &ctx.echo, 512 );
   update_final_echo( &ctx.echo, (BitSequence *)hash1,
                      (const BitSequence *) hash1, 512 );
   init_echo( &ctx.echo, 512 );
   update_final_echo( &ctx.echo, (BitSequence *)hash2,
                      (const BitSequence *) hash2, 512 );
   init_echo( &ctx.echo, 512 );
   update_final_echo( &ctx.echo, (BitSequence *)hash3,
                      (const BitSequence *) hash3, 512 );

   memcpy( state, hash0, 32 );
   memcpy( state+32, hash1, 32 );
   memcpy( state+64, hash2, 32 );
   memcpy( state+96, hash3, 32 );
}

int scanhash_phi1612_4way( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done )
{
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t _ALIGN(64) endiandata[20];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;
   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
   uint32_t *noncep1 = vdata + 75;
   uint32_t *noncep2 = vdata + 77;
   uint32_t *noncep3 = vdata + 79;
   const uint32_t Htarg = ptarget[7];

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0cff;

   for ( int k = 0; k < 19; k++ )
      be32enc( &endiandata[k], pdata[k] );

   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

   do {
      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep0, n );
      be32enc( noncep1, n+1 );
      be32enc( noncep2, n+2 );
      be32enc( noncep3, n+3 );

      phi1612_4way_hash( hash, vdata );
      pdata[19] = n;

      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
      {
         found[0] = true;
         num_found++;
         nonces[0] = n;
         work_set_target_ratio( work, hash );
      }
      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
      {
         found[1] = true;
         num_found++;
         nonces[1] = n+1;
         work_set_target_ratio( work, hash+8 );
      }
      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
      {
         found[2] = true;
         num_found++;
         nonces[2] = n+2;
         work_set_target_ratio( work, hash+16 );
      }
      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
      {
         found[3] = true;
         num_found++;
         nonces[3] = n+3;
         work_set_target_ratio( work, hash+24 );
      }
      n += 4;
   } while ( ( num_found == 0 ) && ( n < max_nonce )
             && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
   return num_found;
}

#endif
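The 4-way code above moves data between two layouts: four separate 8x64-bit hashes, and one interleaved buffer in which word i of every lane occupies consecutive 64-bit slots so a single 256-bit load feeds all four lanes. The following plain scalar sketch describes that layout; it is an assumption about what mm256_interleave_4x64 / mm256_deinterleave_4x64 compute, not the AVX2 implementation itself.

#include <stdint.h>

/* Reference interleave: dst[4*i + lane] = src_lane[i].
   `bits` is the per-lane data size in bits (640 for the block header,
   512 for the intermediate hashes above). */
void interleave_4x64_ref( uint64_t *dst, const uint64_t *s0, const uint64_t *s1,
                          const uint64_t *s2, const uint64_t *s3, int bits )
{
   int n = bits / 64;                 /* 64-bit words per lane */
   for ( int i = 0; i < n; i++ )
   {
      dst[ 4*i + 0 ] = s0[i];
      dst[ 4*i + 1 ] = s1[i];
      dst[ 4*i + 2 ] = s2[i];
      dst[ 4*i + 3 ] = s3[i];
   }
}

/* Reference deinterleave: the inverse mapping back to four flat lanes. */
void deinterleave_4x64_ref( uint64_t *d0, uint64_t *d1, uint64_t *d2,
                            uint64_t *d3, const uint64_t *src, int bits )
{
   int n = bits / 64;
   for ( int i = 0; i < n; i++ )
   {
      d0[i] = src[ 4*i + 0 ];
      d1[i] = src[ 4*i + 1 ];
      d2[i] = src[ 4*i + 2 ];
      d3[i] = src[ 4*i + 3 ];
   }
}

Under this layout the nonce pointers fall at vdata + 73/75/77/79: pdata[19] sits in 64-bit header word 9, so lane 0's copy starts at 32-bit offset 9*8 = 72, and the nonce is the upper half of that word, which is the "+ 1" recorded in the "// 9*8 + 1" comment.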
18
algo/x13/phi1612-gate.c
Normal file
18
algo/x13/phi1612-gate.c
Normal file
@@ -0,0 +1,18 @@
#include "phi1612-gate.h"

bool register_phi1612_algo( algo_gate_t* gate )
{
#if defined(PHI1612_4WAY)
   init_phi1612_4way_ctx();
   gate->scanhash = (void*)&scanhash_phi1612_4way;
   gate->hash = (void*)&phi1612_4way_hash;
#else
   init_phi1612_ctx();
   gate->scanhash = (void*)&scanhash_phi1612;
   gate->hash = (void*)&phi1612_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->get_max64 = (void*)&get_max64_0x3ffff;
   return true;
};
Some files were not shown because too many files have changed in this diff.