Compare commits

...

5 Commits

Author SHA1 Message Date
Jay D Dee
a90d75b8f5 v3.7.10 2018-01-16 15:11:44 -05:00
Jay D Dee
bee78eac76 v3.7.9 2018-01-08 22:04:43 -05:00
Jay D Dee
2d2e54f001 v3.7.8 2017-12-30 19:19:46 -05:00
Jay D Dee
79164c24b5 v3.7.7 2017-12-17 12:00:42 -05:00
Jay D Dee
7a1389998b v3.7.6 2017-12-14 18:28:51 -05:00
150 changed files with 10987 additions and 2547 deletions

View File

@@ -38,7 +38,6 @@ cpuminer_SOURCES = \
algo/argon2/ar2/cores.c \
algo/argon2/ar2/ar2-scrypt-jane.c \
algo/argon2/ar2/blake2b.c \
algo/axiom.c \
algo/blake/sph_blake.c \
algo/blake/blake-hash-4way.c \
algo/blake/blake-gate.c \
@@ -47,8 +46,10 @@ cpuminer_SOURCES = \
algo/blake/sph_blake2b.c \
algo/blake/blake2b.c \
algo/blake/blake2s.c \
algo/blake/blakecoin-gate.c \
algo/blake/mod_blakecoin.c \
algo/blake/blakecoin.c \
algo/blake/blakecoin-4way.c \
algo/blake/decred-gate.c \
algo/blake/decred.c \
algo/blake/decred-4way.c \
@@ -56,6 +57,7 @@ cpuminer_SOURCES = \
algo/blake/pentablake-4way.c \
algo/blake/pentablake.c \
algo/bmw/sph_bmw.c \
algo/bmw/bmw-hash-4way.c \
algo/bmw/bmw256.c \
algo/cryptonight/cryptolight.c \
algo/cryptonight/cryptonight-common.c\
@@ -63,10 +65,8 @@ cpuminer_SOURCES = \
algo/cryptonight/cryptonight.c\
algo/cubehash/sph_cubehash.c \
algo/cubehash/sse2/cubehash_sse2.c\
algo/drop.c \
algo/echo/sph_echo.c \
algo/echo/aes_ni/hash.c\
algo/fresh.c \
algo/gost/sph_gost.c \
algo/groestl/sph_groestl.c \
algo/groestl/groestl.c \
@@ -79,7 +79,6 @@ cpuminer_SOURCES = \
algo/heavy/sph_hefty1.c \
algo/heavy/heavy.c \
algo/heavy/bastion.c \
algo/hmq1725.c \
algo/hodl/aes.c \
algo/hodl/hodl-gate.c \
algo/hodl/hodl-wolf.c \
@@ -102,21 +101,27 @@ cpuminer_SOURCES = \
algo/luffa/sse2/luffa_for_sse2.c \
algo/lyra2/lyra2.c \
algo/lyra2/sponge.c \
algo/lyra2/lyra2rev2-gate.c \
algo/lyra2/lyra2rev2.c \
algo/lyra2/lyra2rev2-4way.c \
algo/lyra2/lyra2re.c \
algo/lyra2/lyra2z-gate.c \
algo/lyra2/lyra2z.c \
algo/lyra2/lyra2z-4way.c \
algo/lyra2/lyra2z330.c \
algo/lyra2/lyra2h-gate.c \
algo/lyra2/lyra2h.c \
algo/lyra2/lyra2h-4way.c \
algo/m7m.c \
algo/neoscrypt.c \
algo/neoscrypt/neoscrypt.c \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
algo/nist5/zr5.c \
algo/pluck.c \
algo/polytimos/polytimos-gate.c \
algo/polytimos/polytimos.c \
algo/quark/quark-gate.c \
algo/quark/quark.c \
algo/quark/quark-4way.c \
algo/qubit/qubit.c \
algo/qubit/deep.c \
algo/ripemd/sph_ripemd.c \
@@ -127,7 +132,9 @@ cpuminer_SOURCES = \
algo/sha/sha2.c \
algo/sha/sha256t.c \
algo/shabal/sph_shabal.c \
algo/shabal/shabal-hash-4way.c \
algo/shavite/sph_shavite.c \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite.c \
algo/simd/sph_simd.c \
algo/simd/sse2/nist.c \
@@ -136,41 +143,77 @@ cpuminer_SOURCES = \
algo/skein/skein-hash-4way.c \
algo/skein/skein.c \
algo/skein/skein-4way.c \
algo/skein/skein-gate.c \
algo/skein/skein-gate.c \
algo/skein/skein2.c \
algo/skein/skein2-4way.c \
algo/skein/skein2-gate.c \
algo/skunk.c \
algo/sm3/sm3.c \
algo/sm3/sm3-hash-4way.c \
algo/tiger/sph_tiger.c \
algo/timetravel.c \
algo/timetravel10.c \
algo/tribus/tribus-gate.c \
algo/tribus/tribus.c \
algo/tribus/tribus-4way.c \
algo/veltor.c \
algo/whirlpool/sph_whirlpool.c \
algo/whirlpool/whirlpool-hash-4way.c \
algo/whirlpool/whirlpool-gate.c \
algo/whirlpool/whirlpool-4way.c \
algo/whirlpool/whirlpool.c \
algo/whirlpool/whirlpoolx.c \
algo/x11/phi1612.c \
algo/x11/x11-gate.c \
algo/x11/x11.c \
algo/x11/x11evo.c \
algo/x11/x11-4way.c \
algo/x11/x11gost-gate.c \
algo/x11/x11gost.c \
algo/x11/x11gost-4way.c \
algo/x11/c11-gate.c \
algo/x11/c11.c \
algo/x11/c11-4way.c \
algo/x11/tribus-gate.c \
algo/x11/tribus.c \
algo/x11/tribus-4way.c \
algo/x11/timetravel-gate.c \
algo/x11/timetravel.c \
algo/x11/timetravel-4way.c \
algo/x11/timetravel10-gate.c \
algo/x11/timetravel10.c \
algo/x11/timetravel10-4way.c \
algo/x11/fresh.c \
algo/x11/x11evo.c \
algo/x11/x11evo-4way.c \
algo/x11/x11evo-gate.c \
algo/x13/x13-gate.c \
algo/x13/x13.c \
algo/x13/x13-4way.c \
algo/x13/x13sm3-gate.c \
algo/x13/x13sm3.c \
algo/x13/x13sm3-4way.c \
algo/x13/phi1612-gate.c \
algo/x13/phi1612.c \
algo/x13/phi1612-4way.c \
algo/x13/skunk-gate.c \
algo/x13/skunk-4way.c \
algo/x13/skunk.c \
algo/x13/drop.c \
algo/x14/x14-gate.c \
algo/x14/x14.c \
algo/x14/x14-4way.c \
algo/x14/veltor-gate.c \
algo/x14/veltor.c \
algo/x14/veltor-4way.c \
algo/x14/polytimos-gate.c \
algo/x14/polytimos.c \
algo/x14/polytimos-4way.c \
algo/x14/axiom.c \
algo/x15/x15-gate.c \
algo/x15/x15.c \
algo/x15/x15-4way.c \
algo/x17/x17-gate.c \
algo/x17/x17.c \
algo/xevan.c \
algo/x17/x17-4way.c \
algo/x17/xevan-gate.c \
algo/x17/xevan.c \
algo/x17/xevan-4way.c \
algo/x17/hmq1725.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c\
algo/yescrypt/yescrypt-simd.c\
algo/zr5.c
algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-simd.c
disable_flags =

View File

@@ -40,6 +40,7 @@ Supported Algorithms
keccakc Creative coin
lbry LBC, LBRY Credits
luffa Luffa
lyra2h Hppcoin
lyra2re lyra2
lyra2rev2 lyra2v2, Vertcoin
lyra2z Zcoin (XZC)
@@ -67,7 +68,7 @@ Supported Algorithms
timetravel10 Bitcore
tribus Denarius (DNR)
vanilla blake256r8vnl (VCash)
veltor
veltor (VLT)
whirlpool
whirlpoolx
x11 Dash
@@ -80,6 +81,7 @@ Supported Algorithms
x17
xevan Bitsend
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)\n\
yescryptr16 Yenten (YTN)
zr5 Ziftr
@@ -95,13 +97,16 @@ algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.
ARM CPUs are not supported.
2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
Centos are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
3. Stratum pool, cpuminer-opt only supports stratum minning. Some algos
may work wallet mining but there are no guarantees.
MacOS, OSx is not supported.
3. Stratum pool. Some algos may work wallet mining using getwork.
Errata
------

View File

@@ -17,17 +17,21 @@ supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
Users are recommended to use an unoptimized miner such as cpuminer-multi.
Exe name Compile opts Arch name
Exe name Compile flags Arch name
cpuminer-sse2.exe -march=core2 Core2
cpuminer-sse42.exe -march=corei7 Nehalem
cpuminer-aes-sse42.exe -maes -msse4.2" Westmere
cpuminer-aes-avx.exe -march=corei7-avx" Sandybridge, Ivybridge
cpuminer-aes-avx2.exe "-march=core-avx2" Haswell, Broadwell, Skylake, Kabylake
cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY"
cpuminer-sse2.exe "-march=core2" Core2
cpuminer-sse42.exe "-march=corei7" Nehalem
cpuminer-aes-sse42.exe "-maes -msse4.2" Westmere
cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell...
cpuminer-avx-sha "-march=corei7-avx -msha" Ryzen...
cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY" same as avx2
cpuminer-4way-sha.exe "-march=core-avx2 -msha -DFOUR_WAY" same as avx2-sha
4way requires a CPU with AES and AVX2. It is still under development and
only a few algos are supported. See change log in RELEASE_NOTES in source
package for supported algos.
There is no binary support available for SHA on AMD Ryzen CPUs.
Ryzen CPus perform better with AVX than AVX2 therefore an avx-sha build
is provided. Four way still uses AVX2.

View File

@@ -27,8 +27,9 @@ Compile Instructions
Requirements:
Intel Core2 or newer, or AMD Steamroller or newer CPU.
64 bit Linux or Windows operating system.
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
64 bit Linux or Windows operating system. Apple is not supported.
Building on linux prerequisites:
@@ -164,6 +165,35 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
Change Log
----------
v3.7.10
4way optimizations for lyra2rev2, lyra2h, quark, timetravel8, timetravel10
x11evo, blakecoin.
Faster x13sm3 (hsr).
Added share difficulty to accepted message.
v3.7.9
Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.
Additional 4way optimizations for X algos.
New algo yescryptr8 for BitZeny, not to be confused with original
yescrypt Globalboost-Y.
v3.7.8
Partial 4way optimization for most X algos including c11, xevan, phi, hsr
v3.7.7
Fixed regression caused by 64 CPU support.
Fixed lyra2h.
v3.7.6
Added lyra2h algo for Hppcoin.
Added support for more than 64 CPUs.
Optimized shavite512 with AES, improves x11 etc.
v3.7.5
New algo keccakc for Creative coin with 4way optimizations
@@ -171,7 +201,7 @@ New algo keccakc for Creative coin with 4way optimizations
Rewrote some AVX/AVX2 code for more consistent implementation and some
optimizing.
Enhanced capabilities check to support 4way, mor eprecise reporting of
Enhanced capabilities check to support 4way, more precise reporting of
features (not all algos use SSE2), and better error messages when using
an incompatible pre-built version (Windows users).

View File

@@ -138,6 +138,10 @@ void init_algo_gate( algo_gate_t* gate )
gate->work_cmp_size = STD_WORK_CMP_SIZE;
}
// Ignore warnings for not yet defined register functions
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
// called by each thread that uses the gate
bool register_algo_gate( int algo, algo_gate_t *gate )
{
@@ -151,11 +155,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
switch (algo)
{
// Ignore warnings for not yet defined register fucntions
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
case ALGO_ARGON2: register_argon2_algo ( gate ); break;
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
case ALGO_BASTION: register_bastion_algo ( gate ); break;
@@ -180,6 +179,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
case ALGO_LBRY: register_lbry_algo ( gate ); break;
case ALGO_LUFFA: register_luffa_algo ( gate ); break;
case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break;
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break;
@@ -211,7 +211,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
case ALGO_X11: register_x11_algo ( gate ); break;
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
case ALGO_X11GOST: register_sib_algo ( gate ); break;
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
case ALGO_X13: register_x13_algo ( gate ); break;
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
case ALGO_X14: register_x14_algo ( gate ); break;
@@ -219,12 +219,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X17: register_x17_algo ( gate ); break;
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_ZR5: register_zr5_algo ( gate ); break;
// restore warnings
#pragma GCC diagnostic pop
default:
applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
return false;
@@ -239,6 +236,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
return true;
}
// restore warnings
#pragma GCC diagnostic pop
// override std defaults with jr2 defaults
bool register_json_rpc2( algo_gate_t *gate )
{
@@ -279,6 +279,7 @@ const char* const algo_alias_map[][2] =
{
// alias proper
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },
{ "blake256r8", "blakecoin" },
{ "blake256r8vnl", "vanilla" },
{ "blake256r14", "blake" },
@@ -301,10 +302,9 @@ const char* const algo_alias_map[][2] =
// { "sia", "blake2b" },
{ "sib", "x11gost" },
{ "timetravel8", "timetravel" },
{ "yes", "yescrypt" },
{ "ziftr", "zr5" },
{ "yenten", "yescryptr16" },
{ "yescryptr8", "yescrypt" },
{ "yescryptr8k", "yescrypt" },
{ "zcoin", "lyra2z" },
{ "zoin", "lyra2z330" },
{ NULL, NULL }

View File

@@ -1,107 +1,90 @@
#include "blake-gate.h"
#include "sph_blake.h"
#if defined (__AVX__)
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#if defined (BLAKE_4WAY)
blake256r14_4way_context blake_ctx;
void blakehash_4way(void *state, const void *input)
{
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
uint32_t hash0[4] __attribute__ ((aligned (32)));
uint32_t hash1[4] __attribute__ ((aligned (32)));
uint32_t hash2[4] __attribute__ ((aligned (32)));
uint32_t hash3[4] __attribute__ ((aligned (32)));
blake256_4way_context ctx;
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 16 );
blake256_4way_close( &ctx, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash1, 32 );
memcpy( state+96, hash1, 32 );
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r14_4way_context ctx;
memcpy( &ctx, &blake_ctx, sizeof ctx );
blake256r14_4way( &ctx, input + (64<<2), 16 );
blake256r14_4way_close( &ctx, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[4*4] __attribute__ ((aligned (32)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
// uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) endiandata[20];
uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
// if (opt_benchmark)
// HTarget = 0x7f;
if (opt_benchmark)
HTarget = 0x7f;
// we need big endian data...
swab32_array( endiandata, pdata, 20 );
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, endiandata, endiandata, endiandata,
endiandata, 640 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256r14_4way_init( &blake_ctx );
blake256r14_4way( &blake_ctx, vdata, 64 );
uint32_t *noncep = vdata + 76; // 19*4
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep, n );
be32enc( noncep +2, n+1 );
be32enc( noncep +4, n+2 );
be32enc( noncep +6, n+3 );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
blakehash_4way( hash, vdata );
if ( hash[7] == 0 )
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
{
if ( fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
}
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] == 0 )
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
if ( fulltest( hash, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
}
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] == 0 )
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
if ( fulltest( hash, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
}
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] == 0 )
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
if ( fulltest( hash, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
}
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
*hashes_done = n - first_nonce + 1;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );

View File

@@ -17,7 +17,6 @@ bool register_blake_algo( algo_gate_t* gate )
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_blake_4way;
gate->hash = (void*)&blakehash_4way;
four_way_not_tested();
#else
gate->scanhash = (void*)&scanhash_blake;
gate->hash = (void*)&blakehash;

View File

@@ -36,7 +36,6 @@
#include <string.h>
#include <limits.h>
//#include "sph_blake.h"
#include "blake-hash-4way.h"
#ifdef __cplusplus
@@ -98,18 +97,6 @@ static const unsigned sigma[16][16] = {
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
/*
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
14 10 4 8 9 15 13 6 1 12 0 2 11 7 5 3
11 8 12 0 5 2 15 13 10 14 3 6 7 1 9 4
7 9 3 1 13 12 11 14 2 6 5 10 4 0 15 8
9 0 5 7 2 4 10 15 14 1 11 12 6 8 3 13
2 12 6 10 0 11 8 3 4 13 7 5 15 14 1 9
12 5 1 15 14 13 4 10 0 7 6 3 9 2 8 11
13 11 7 14 12 1 3 9 5 0 15 4 8 6 2 10
6 15 14 9 11 3 0 8 12 2 13 7 1 4 10 5
10 2 8 4 7 6 1 5 15 11 9 14 3 12 13 0
*/
#endif
#define Z00 0
@@ -504,14 +491,9 @@ do { \
(state)->T1 = T1; \
} while (0)
//#define BLAKE32_ROUNDS 8
#ifndef BLAKE32_ROUNDS
#define BLAKE32_ROUNDS 14
#endif
#if SPH_COMPACT_BLAKE_32
#define COMPRESS32_4WAY do { \
#define COMPRESS32_4WAY( rounds ) do { \
__m128i M[16]; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
@@ -524,18 +506,18 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( s0, _mmset_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( s1, _mmset_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( s2, _mmset_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( s3, _mmset_epi32( CS3, CS3, CS3, CS3 ) ); \
VC = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \
_mmset_epi32( CS4, CS4, CS4, CS4 ) ); \
VD = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \
_mmset_epi32( CS5, CS5, CS5, CS5 ) ); \
VE = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ) \
, _mmset_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ), \
_mmset_epi32( CS7, CS7, CS7, CS7 ) ); \
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ) \
, _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M[0x0] = mm_byteswap_32( *(buf + 0) ); \
M[0x1] = mm_byteswap_32( *(buf + 1) ); \
M[0x2] = mm_byteswap_32( *(buf + 2) ); \
@@ -552,7 +534,7 @@ do { \
M[0xD] = mm_byteswap_32( *(buf + 13) ); \
M[0xE] = mm_byteswap_32( *(buf + 14) ); \
M[0xF] = mm_byteswap_32( *(buf + 15) ); \
for (r = 0; r < BLAKE32_ROUNDS; r ++) \
for (r = 0; r < rounds; r ++) \
ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( S0, V0 ), V8 ), H0 ); \
@@ -576,80 +558,70 @@ do { \
// current impl
#define COMPRESS32_4WAY do { \
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M0 = mm_byteswap_32( * buf ); \
M1 = mm_byteswap_32( *(buf+1) ); \
M2 = mm_byteswap_32( *(buf+2) ); \
M3 = mm_byteswap_32( *(buf+3) ); \
M4 = mm_byteswap_32( *(buf+4) ); \
M5 = mm_byteswap_32( *(buf+5) ); \
M6 = mm_byteswap_32( *(buf+6) ); \
M7 = mm_byteswap_32( *(buf+7) ); \
M8 = mm_byteswap_32( *(buf+8) ); \
M9 = mm_byteswap_32( *(buf+9) ); \
MA = mm_byteswap_32( *(buf+10) ); \
MB = mm_byteswap_32( *(buf+11) ); \
MC = mm_byteswap_32( *(buf+12) ); \
MD = mm_byteswap_32( *(buf+13) ); \
ME = mm_byteswap_32( *(buf+14) ); \
MF = mm_byteswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
ROUND_S_4WAY(4); \
ROUND_S_4WAY(5); \
ROUND_S_4WAY(6); \
ROUND_S_4WAY(7); \
if (BLAKE32_ROUNDS == 14) { \
ROUND_S_4WAY(8); \
ROUND_S_4WAY(9); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
} \
H0 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( V8, V0 ), S0 ), H0 ); \
H1 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( V9, V1 ), S1 ), H1 ); \
H2 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VA, V2 ), S2 ), H2 ); \
H3 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VB, V3 ), S3 ), H3 ); \
H4 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VC, V4 ), S0 ), H4 ); \
H5 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VD, V5 ), S1 ), H5 ); \
H6 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VE, V6 ), S2 ), H6 ); \
H7 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VF, V7 ), S3 ), H7 ); \
} while (0)
#define COMPRESS32_4WAY( rounds ) \
do { \
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
M0 = mm_byteswap_32( * buf ); \
M1 = mm_byteswap_32( *(buf+1) ); \
M2 = mm_byteswap_32( *(buf+2) ); \
M3 = mm_byteswap_32( *(buf+3) ); \
M4 = mm_byteswap_32( *(buf+4) ); \
M5 = mm_byteswap_32( *(buf+5) ); \
M6 = mm_byteswap_32( *(buf+6) ); \
M7 = mm_byteswap_32( *(buf+7) ); \
M8 = mm_byteswap_32( *(buf+8) ); \
M9 = mm_byteswap_32( *(buf+9) ); \
MA = mm_byteswap_32( *(buf+10) ); \
MB = mm_byteswap_32( *(buf+11) ); \
MC = mm_byteswap_32( *(buf+12) ); \
MD = mm_byteswap_32( *(buf+13) ); \
ME = mm_byteswap_32( *(buf+14) ); \
MF = mm_byteswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
ROUND_S_4WAY(4); \
ROUND_S_4WAY(5); \
ROUND_S_4WAY(6); \
ROUND_S_4WAY(7); \
if (rounds == 14) \
{ \
ROUND_S_4WAY(8); \
ROUND_S_4WAY(9); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
} \
H0 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
H1 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
H2 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
H3 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
H4 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
H5 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
H6 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
H7 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
} while (0)
#endif
@@ -710,18 +682,18 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si128( _mm256_set_epi64( T0, T0, T0, T0 ), \
_mm256_set_epi64( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64( T0, T0, T0, T0 ), \
_mm256_set_epi64( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
_mm256_set256_epi64( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
_mm256_set256_epi64( CB7, CB7, CB7, CB7 ) ); \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_byteswap_64( *(buf+0) ); \
M[0x1] = mm256_byteswap_64( *(buf+1) ); \
M[0x2] = mm256_byteswap_64( *(buf+2) ); \
@@ -845,15 +817,16 @@ static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 };
static void
blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt)
const sph_u32 *salt, int rounds )
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm_set_epi32( iv[i], iv[i], iv[i], iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm_set_epi32( salt[i], salt[i], salt[i], salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm_set1_epi32( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm_set1_epi32( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
sc->rounds = rounds;
}
static void
@@ -867,7 +840,6 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
buf = sc->buf;
ptr = sc->ptr;
if ( len < buf_size - ptr )
{
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
@@ -892,7 +864,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
{
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
T1 = SPH_T32(T1 + 1);
COMPRESS32_4WAY;
COMPRESS32_4WAY( sc->rounds );
ptr = 0;
}
}
@@ -915,48 +887,44 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
unsigned z = 0x80 >> n;
unsigned zz = ((ub & -z) | z) & 0xFF;
u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz );
u.buf[ptr>>2] = _mm_set1_epi32( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00);
sc->T1 = SPH_C32(0xFFFFFFFF);
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00) + bit_len;
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
sc->T1 = SPH_T32(sc->T1 - 1);
}
else
sc->T0 -= 512 - bit_len;
if ( ptr <= 48 )
if ( ptr <= 52 )
{
memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
_mm_set_epi32( 0x010000000, 0x01000000,
0x010000000, 0x01000000 ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
_mm_set1_epi32( 0x01000000UL ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00);
sc->T1 = SPH_C32(0xFFFFFFFF);
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_128( u.buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_set_epi32( 0x010000000, 0x01000000,
0x010000000, 0x01000000 );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf, 64 );
}
out = (__m128i*)dst;
@@ -974,9 +942,9 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm256_set_epi64x( iv[i], iv[i], iv[i], iv[i] );
sc->H[i] = _mm256_set1_epi64x( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm256_set_epi64x( salt[i], salt[i], salt[i], salt[i] );
sc->S[i] = _mm256_set1_epi64x( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
@@ -1048,12 +1016,12 @@ blake64_4way_close( blake_4way_big_context *sc,
th = sc->T1;
if (ptr == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len;
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
sc->T1 = SPH_T64(sc->T1 - 1);
}
else
@@ -1065,10 +1033,7 @@ blake64_4way_close( blake_4way_big_context *sc,
memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
_mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 ) );
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64(
@@ -1081,15 +1046,11 @@ blake64_4way_close( blake_4way_big_context *sc,
memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
memset_zero_256( u.buf, 112>>3 );
if ( out_size_w64 == 8 )
u.buf[104>>3] = _mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 );
u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64(
@@ -1104,10 +1065,11 @@ blake64_4way_close( blake_4way_big_context *sc,
#endif
// default 14 rounds, backward copatibility
void
blake256_4way_init(void *cc)
{
blake32_4way_init(cc, IV256, salt_zero_small);
blake32_4way_init( cc, IV256, salt_zero_small, 14 );
}
void
@@ -1119,13 +1081,43 @@ blake256_4way(void *cc, const void *data, size_t len)
void
blake256_4way_close(void *cc, void *dst)
{
blake256_4way_addbits_and_close(cc, 0, 0, dst);
blake32_4way_close(cc, 0, 0, dst, 8);
}
// 14 rounds blake, decred
void blake256r14_4way_init(void *cc)
{
blake32_4way_init( cc, IV256, salt_zero_small, 14 );
}
void
blake256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
blake256r14_4way(void *cc, const void *data, size_t len)
{
blake32_4way_close(cc, ub, n, dst, 8);
blake32_4way(cc, data, len);
}
void
blake256r14_4way_close(void *cc, void *dst)
{
blake32_4way_close(cc, 0, 0, dst, 8);
}
// 8 rounds blakecoin, vanilla
void blake256r8_4way_init(void *cc)
{
blake32_4way_init( cc, IV256, salt_zero_small, 8 );
}
void
blake256r8_4way(void *cc, const void *data, size_t len)
{
blake32_4way(cc, data, len);
}
void
blake256r8_4way_close(void *cc, void *dst)
{
blake32_4way_close(cc, 0, 0, dst, 8);
}
#if defined (__AVX2__)

View File

@@ -35,7 +35,9 @@
*/
#ifndef __BLAKE_HASH_4WAY__
#define __BLAKE_HASH_4WAY___
#define __BLAKE_HASH_4WAY__
#ifdef __AVX__
#ifdef __cplusplus
extern "C"{
@@ -45,38 +47,36 @@ extern "C"{
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
/**
* Output size (in bits) for BLAKE-256.
*/
#define SPH_SIZE_blake256 256
#if SPH_64
/**
* Output size (in bits) for BLAKE-512.
*/
#define SPH_SIZE_blake512 512
#endif
#ifdef __AVX__
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i H[8];
__m128i S[4];
size_t ptr;
sph_u32 T0, T1;
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i H[8];
__m128i S[4];
size_t ptr;
sph_u32 T0, T1;
int rounds; // 14 for blake, 8 for blakecoin & vanilla
} blake_4way_small_context;
// Default 14 rounds
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *cc);
void blake256_4way(void *cc, const void *data, size_t len);
void blake256_4way_close(void *cc, void *dst);
void blake256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
// 14 rounds, blake, decred
typedef blake_4way_small_context blake256r14_4way_context;
void blake256r14_4way_init(void *cc);
void blake256r14_4way(void *cc, const void *data, size_t len);
void blake256r14_4way_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_4way_small_context blake256r8_4way_context;
void blake256r8_4way_init(void *cc);
void blake256r8_4way(void *cc, const void *data, size_t len);
void blake256r8_4way_close(void *cc, void *dst);
#ifdef __AVX2__
@@ -103,3 +103,5 @@ void blake512_4way_addbits_and_close(
#endif
#endif
#endif

106
algo/blake/blakecoin-4way.c Normal file
View File

@@ -0,0 +1,106 @@
#include "blakecoin-gate.h"
#if defined (__AVX__)
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
blake256r8_4way_context blakecoin_ctx;
void blakecoin_4way_hash(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r8_4way_context ctx;
memcpy( &ctx, &blakecoin_ctx, sizeof ctx );
blake256r8_4way( &ctx, input + (64<<2), 16 );
blake256r8_4way_close( &ctx, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
if (opt_benchmark)
HTarget = 0x7f;
// we need big endian data...
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256r8_4way_init( &blakecoin_ctx );
blake256r8_4way( &blakecoin_ctx, vdata, 64 );
uint32_t *noncep = vdata + 76; // 19*4
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
blakecoin_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
// workaround to prevent flood of hash reports when nonce range exhasuted
// and thread is spinning waiting for new work
if ( ( n >= max_nonce ) && ( *hashes_done < 10 ) )
{
*hashes_done = 0;
sleep(1);
}
return num_found;
}
#endif

View File

@@ -0,0 +1,71 @@
#include "blakecoin-gate.h"
#include <memory.h>
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blakecoin_get_max64 ()
{
return 0x7ffffLL;
// return 0x3fffffLL;
}
// Blakecoin 4 way hashes so fast it runs out of nonces.
// This is an attempt to solve this but the result may be
// to rehash old nonces until new work is received.
void bc4w_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *end_nonce_ptr, bool clean_job )
{
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
//
// if ( have_stratum && ( *nonceptr >= *end_nonce_ptr ) )
// algo_gate.stratum_gen_work( &stratum, g_work );
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
|| ( *nonceptr >= *end_nonce_ptr )
|| ( work->job_id != g_work->job_id ) && clean_job )
/*
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr )
|| ( work->job_id != g_work->job_id ) ) )
*/
{
work_free( work );
work_copy( work, g_work );
*nonceptr = 0xffffffffU / opt_n_threads * thr_id;
if ( opt_randomize )
*nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
*end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
// try incrementing the xnonce to chsnge the data
// for ( int i = 0; i < work->xnonce2_size && !( ++work->xnonce2[i] ); i++ );
}
else
++(*nonceptr);
}
// vanilla uses default gen merkle root, otherwise identical to blakecoin
bool register_vanilla_algo( algo_gate_t* gate )
{
#if defined(BLAKECOIN_4WAY)
// four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_blakecoin_4way;
gate->hash = (void*)&blakecoin_4way_hash;
// gate->get_new_work = (void*)&bc4w_get_new_work;
// blakecoin_4way_init( &blake_4way_init_ctx );
#else
gate->scanhash = (void*)&scanhash_blakecoin;
gate->hash = (void*)&blakecoinhash;
// blakecoin_init( &blake_init_ctx );
#endif
gate->optimizations = AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&blakecoin_get_max64;
return true;
}
bool register_blakecoin_algo( algo_gate_t* gate )
{
register_vanilla_algo( gate );
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return true;
}

View File

@@ -0,0 +1,21 @@
#ifndef __BLAKECOIN_GATE_H__
#define __BLAKECOIN_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX__)
#define BLAKECOIN_4WAY
#endif
#if defined (BLAKECOIN_4WAY)
void blakecoin_4way_hash(void *state, const void *input);
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void blakecoinhash( void *state, const void *input );
int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "blakecoin-gate.h"
#define BLAKE32_ROUNDS 8
#include "sph_blake.h"
@@ -98,7 +98,7 @@ void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
}
*/
/*
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blakecoin_get_max64 ()
{
@@ -121,4 +121,4 @@ bool register_blakecoin_algo( algo_gate_t* gate )
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return true;
}
*/

View File

@@ -1,5 +1,4 @@
#include "decred-gate.h"
#include "sph_blake.h"
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
@@ -9,110 +8,58 @@
#if defined (DECRED_4WAY)
static __thread blake256_4way_context blake_mid;
static __thread bool ctx_midstate_done = false;
void decred_hash_4way( void *state, const void *input )
{
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
uint32_t hash0[4] __attribute__ ((aligned (32)));
uint32_t hash1[4] __attribute__ ((aligned (32)));
uint32_t hash2[4] __attribute__ ((aligned (32)));
uint32_t hash3[4] __attribute__ ((aligned (32)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
int tail_len = 180 - DECRED_MIDSTATE_LEN;
blake256_4way_context ctx __attribute__ ((aligned (64)));
sph_blake256_context ctx2 __attribute__ ((aligned (64)));
uint32_t hash[16] __attribute__ ((aligned (64)));
uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );
void *tail = input + DECRED_MIDSTATE_LEN;
int tail_len = 180 - DECRED_MIDSTATE_LEN;
// #define MIDSTATE_LEN 128
/*
uint8_t *ending = (uint8_t*) input;
ending += MIDSTATE_LEN;
if ( !ctx_midstate_done )
{
blake256_4way_init( &blake_mid );
blake256_4way( &blake_mid, input, DECRED_MIDSTATE_LEN );
ctx_midstate_done = true;
}
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
blake256_4way( &ctx, tail, tail_len );
blake256_4way_close( &ctx, vhash );
*/
sph_blake256_init( &ctx2 );
sph_blake256( &ctx2, sin0, 180 );
sph_blake256_close( &ctx2, hash );
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 180 );
blake256_4way_close( &ctx, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
/*
for ( int i = 0; i < 8; i++ )
if ( hash[i] != hash0[i] )
printf(" hash mismatch, i = %u\n",i);
printf("hash: %08lx %08lx %08lx %08lx\n", *hash, *(hash+1),
*(hash+2), *(hash+3) );
printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
*(hash0+2), *(hash0+3) );
printf("\n");
*/
// memcpy( state, hash0, 32 );
// memcpy( state+32, hash1, 32 );
// memcpy( state+64, hash1, 32 );
// memcpy( state+96, hash1, 32 );
memcpy( state, hash, 32 );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t vdata[45*4] __attribute__ ((aligned (64)));
uint32_t hash[4*4] __attribute__ ((aligned (32)));
uint32_t _ALIGN(64) endiandata[48];
// uint32_t _ALIGN(64) hash32[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
uint32_t n = first_nonce;
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t vdata[48*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t _ALIGN(64) edata[48];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
uint32_t n = first_nonce;
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
// #define DCR_NONCE_OFT32 35
ctx_midstate_done = false;
// memcpy(endiandata, pdata, 180);
// copy to buffer guaranteed to be aligned.
memcpy( edata, pdata, 180 );
// use the old way until new way updated for size.
mm_interleave_4x32x( vdata, pdata, pdata, pdata, pdata, 180*8 );
mm_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );
blake256_4way_init( &blake_mid );
blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
do {
found[0] = found[1] = found[2] = found[3] = false;
* noncep = n;
*(noncep+2) = n+1;
*(noncep+4) = n+2;
*(noncep+6) = n+3;
*(noncep+1) = n+1;
*(noncep+2) = n+2;
*(noncep+3) = n+3;
decred_hash_4way( hash, vdata );
// endiandata[DCR_NONCE_OFT32] = n;
// decred_hash(hash32, endiandata);
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
{
work_set_target_ratio( work, hash );
@@ -121,28 +68,28 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
nonces[0] = n;
pdata[DECRED_NONCE_INDEX] = n;
}
/* if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
work_set_target_ratio( work, hash+8 );
found[1] = true;
num_found++;
nonces[1] = n;
nonces[1] = n+1;
}
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
work_set_target_ratio( work, hash+16 );
found[2] = true;
num_found++;
nonces[2] = n;
nonces[2] = n+2;
}
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
work_set_target_ratio( work, hash+24 );
found[3] = true;
num_found++;
nonces[3] = n;
nonces[3] = n+3;
}
*/
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );

View File

@@ -1,4 +1,7 @@
#include "pentablake-gate.h"
#ifdef __AVX2__
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -9,8 +12,6 @@
//#define DEBUG_ALGO
#ifdef PENTABLAKE_4WAY
extern void pentablakehash_4way( void *output, const void *input )
{
unsigned char _ALIGN(32) hash[128];

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX__)
#if defined(FOUR_WAY) && defined(__AVX2__)
#define PENTABLAKE_4WAY
#endif

View File

@@ -872,6 +872,7 @@ blake32_close(sph_blake_small_context *sc,
} else {
sc->T0 -= 512 - bit_len;
}
if (bit_len <= 446) {
memset(u.buf + ptr + 1, 0, 55 - ptr);
if (out_size_w32 == 8)

1146
algo/bmw/bmw-hash-4way.c Normal file

File diff suppressed because it is too large Load Diff

95
algo/bmw/bmw-hash-4way.h Normal file
View File

@@ -0,0 +1,95 @@
/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
/**
* BMW interface. BMW (aka "Blue Midnight Wish") is a family of
* functions which differ by their output size; this implementation
* defines BMW for output sizes 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_bmw.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef BMW_HASH_H__
#define BMW_HASH_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#ifdef __AVX2__
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#define SPH_SIZE_bmw256 256
#define SPH_SIZE_bmw512 512
typedef struct {
__m128i buf[64];
__m128i H[16];
size_t ptr;
sph_u32 bit_count; // assume bit_count fits in 32 bits
} bmw_4way_small_context;
typedef bmw_4way_small_context bmw256_4way_context;
typedef struct {
__m256i buf[16];
__m256i H[16];
size_t ptr;
sph_u64 bit_count;
} bmw_4way_big_context;
typedef bmw_4way_big_context bmw512_4way_context;
void bmw256_4way_init(void *cc);
void bmw256_4way(void *cc, const void *data, size_t len);
void bmw256_4way_close(void *cc, void *dst);
void bmw256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
void bmw512_4way_init(void *cc);
void bmw512_4way(void *cc, const void *data, size_t len);
void bmw512_4way_close(void *cc, void *dst);
void bmw512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -96,34 +96,18 @@ extern "C"{
do { \
__m256i cc = _mm256_set_epi64x( c, c, c, c ); \
x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_not( x2 ) ) ); \
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_not( x1 ), x2 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_andnot_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_not( x3 ) ) ); \
x2 = _mm256_xor_si256( x2, _mm256_andnot_si256( x3, x0 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
x2 = _mm256_xor_si256( x2, tmp ); \
} while (0)
/*
#define Sb(x0, x1, x2, x3, c) do { \
x3 = ~x3; \
x0 ^= (c) & ~x2; \
tmp = (c) ^ (x0 & x1); \
x0 ^= x2 & x3; \
x3 ^= ~x1 & x2; \
x1 ^= x0 & x2; \
x2 ^= x0 & ~x3; \
x0 ^= x1 | x3; \
x3 ^= x1 & x2; \
x1 ^= tmp & x0; \
x2 ^= tmp; \
} while (0)
*/
#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \
do { \
x4 = _mm256_xor_si256( x4, x1 ); \
@@ -136,20 +120,6 @@ do { \
x3 = _mm256_xor_si256( x3, x4 ); \
} while (0)
/*
#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) do { \
x4 ^= x1; \
x5 ^= x2; \
x6 ^= x3 ^ x0; \
x7 ^= x0; \
x0 ^= x5; \
x1 ^= x6; \
x2 ^= x7 ^ x4; \
x3 ^= x4; \
} while (0)
*/
#if SPH_JH_64
static const sph_u64 C[] = {

View File

@@ -23,12 +23,12 @@ void jha_hash_4way( void *out, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
__m256i mask0, mask1;
__m256i* vh = (__m256i*)vhash;
__m256i* vh0 = (__m256i*)vhash0;
__m256i* vh1 = (__m256i*)vhash1;
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
blake512_4way_context ctx_blake;
hashState_groestl ctx_groestl;
@@ -40,122 +40,69 @@ void jha_hash_4way( void *out, const void *input )
keccak512_4way( &ctx_keccak, input, 80 );
keccak512_4way_close( &ctx_keccak, vhash );
// memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
// keccak512_4way( &ctx_keccak, input + (64<<2), 16 );
// keccak512_4way_close( &ctx_keccak, vhash );
// Heavy & Light Pair Loop
for ( int round = 0; round < 3; round++ )
{
// memset_zero_256( vh0, 20 );
// memset_zero_256( vh1, 20 );
// positive logic, if maski select vhi
// going from bit to mask reverses logic such that if the test bit is set
// zero will be put in mask0, meaning don't take vh0. mask1 is
// inverted so 1 will be put in mask1 meaning take it.
mask0 = mm256_negate_64(
_mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
mask1 = mm256_not( mask0 );
// mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
// _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );
// groestl (serial) v skein
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash0,
(char*)hash0, 512 );
(char*)hash0, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash1,
(char*)hash1, 512 );
(char*)hash1, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash2,
(char*)hash2, 512 );
(char*)hash2, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
// skein
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, vhash1 );
skein512_4way_close( &ctx_skein, vhashB );
// merge vectored hash
for ( int i = 0; i < 8; i++ )
{
vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
_mm256_and_si256( vh1[i], mask1 ) );
/*
vha256[i] = _mm256_maskload_epi64(
vhasha + i*4, mm256_not( mask ) );
vhb256[i] = _mm256_maskload_epi64(
vhashb + i*4, mask );
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
*/
}
// blake v jh
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
blake512_4way_init( &ctx_blake );
blake512_4way( &ctx_blake, vhash, 64 );
blake512_4way_close( &ctx_blake, vhash0 );
blake512_4way_close( &ctx_blake, vhashA );
jh512_4way_init( &ctx_jh );
jh512_4way( &ctx_jh, vhash, 64 );
jh512_4way_close( &ctx_jh, vhash1 );
jh512_4way_close( &ctx_jh, vhashB );
// merge hash
for ( int i = 0; i < 8; i++ )
{
vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
_mm256_and_si256( vh1[i], mask1 ) );
/*
vha256[i] = _mm256_maskload_epi64(
vhasha + i*4, mm256_not( mask ) );
vhb256[i] = _mm256_maskload_epi64(
vhashb + i*4, mask );
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
*/
}
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
}
mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
// memcpy( output, hash0, 32 );
// memcpy( output+32, hash1, 32 );
// memcpy( output+64, hash2, 32 );
// memcpy( output+96, hash3, 32 );
}
int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
uint64_t htmax[] = {
uint64_t htmax[] = {
0,
0xF,
0xFF,
@@ -163,7 +110,7 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
@@ -172,17 +119,12 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
0
};
// we need bigendian data...
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// precalc midstate for keccak
// keccak512_4way_init( &jha_kec_mid );
// keccak512_4way( &jha_kec_mid, vdata, 64 );
for ( int m = 0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
@@ -196,7 +138,6 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
be32enc( noncep3, n+3 );
jha_hash_4way( hash, vdata );
pdata[19] = n;
if ( ( !(hash[7] & mask) )
@@ -234,11 +175,9 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
}
*hashes_done = n - first_nonce + 1;
return num_found;
}

128
algo/lyra2/lyra2h-4way.c Normal file
View File

@@ -0,0 +1,128 @@
#include "lyra2h-gate.h"
#ifdef LYRA2H_4WAY
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
__thread uint64_t* lyra2h_4way_matrix;
bool lyra2h_4way_thread_init()
{
return ( lyra2h_4way_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
}
static __thread blake256_4way_context l2h_4way_blake_mid;
void lyra2h_4way_midstate( const void* input )
{
blake256_4way_init( &l2h_4way_blake_mid );
blake256_4way( &l2h_4way_blake_mid, input, 64 );
}
void lyra2h_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 19; i++ )
be32enc( &edata[i], pdata[i] );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
lyra2h_4way_midstate( vdata );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( &edata[19], n );
lyra2h_4way_hash( hash, vdata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

25
algo/lyra2/lyra2h-gate.c Normal file
View File

@@ -0,0 +1,25 @@
#include "lyra2h-gate.h"
#include "lyra2.h"
void lyra2h_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_lyra2h_algo( algo_gate_t* gate )
{
#ifdef LYRA2H_4WAY
gate->miner_thread_init = (void*)&lyra2h_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h_4way;
gate->hash = (void*)&lyra2h_4way_hash;
#else
gate->miner_thread_init = (void*)&lyra2h_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h;
gate->hash = (void*)&lyra2h_hash;
#endif
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2h_set_target;
return true;
};

32
algo/lyra2/lyra2h-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef LYRA2H_GATE_H__
#define LYRA2H_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY)
#define LYRA2H_4WAY
#endif
#define LYRA2H_MATRIX_SIZE BLOCK_LEN_INT64 * 16 * 16 * 8
#if defined(LYRA2H_4WAY)
void lyra2h_4way_hash( void *state, const void *input );
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_4way_thread_init();
#endif
void lyra2h_hash( void *state, const void *input );
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_thread_init();
#endif

75
algo/lyra2/lyra2h.c Normal file
View File

@@ -0,0 +1,75 @@
#include "lyra2h-gate.h"
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
__thread uint64_t* lyra2h_matrix;
bool lyra2h_thread_init()
{
lyra2h_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 );
return lyra2h_matrix;
}
static __thread sph_blake256_context lyra2h_blake_mid;
void lyra2h_midstate( const void* input )
{
sph_blake256_init( &lyra2h_blake_mid );
sph_blake256( &lyra2h_blake_mid, input, 64 );
}
void lyra2h_hash( void *state, const void *input )
{
uint32_t _ALIGN(64) hash[16];
sph_blake256_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &lyra2h_blake_mid, sizeof lyra2h_blake_mid );
sph_blake256( &ctx_blake, input + 64, 16 );
sph_blake256_close( &ctx_blake, hash );
LYRA2Z( lyra2h_matrix, hash, 32, hash, 32, hash, 32, 16, 16, 16 );
memcpy(state, hash, 32);
}
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
if (opt_benchmark)
ptarget[7] = 0x0000ff;
for (int i=0; i < 19; i++) {
be32enc(&endiandata[i], pdata[i]);
}
lyra2h_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
lyra2h_hash( hash, endiandata );
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

View File

@@ -106,6 +106,7 @@ int scanhash_lyra2re(int thr_id, struct work *work,
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio( work, hash );
return 1;
}
}

177
algo/lyra2/lyra2rev2-4way.c Normal file
View File

@@ -0,0 +1,177 @@
#include "lyra2rev2-gate.h"
#include <memory.h>
#ifdef __AVX2__
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
cubehashParam cube;
skein256_4way_context skein;
sph_bmw256_context bmw;
} lyra2v2_4way_ctx_holder;
static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
void init_lyra2rev2_4way_ctx()
{
// blake256_4way_init( &l2v2_4way_ctx.blake );
keccak256_4way_init( &l2v2_4way_ctx.keccak );
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_4way_ctx.skein );
sph_bmw256_init( &l2v2_4way_ctx.bmw );
}
void lyra2rev2_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash64[4*4] __attribute__ ((aligned (64)));
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
blake256_4way( &ctx.blake, input + (64<<2), 16 );
// blake256_4way( &ctx.blake, input, 80 );
blake256_4way_close( &ctx.blake, vhash );
mm256_reinterleave_4x64( vhash64, vhash, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
sph_bmw256( &ctx.bmw, hash0, 32 );
sph_bmw256_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
sph_bmw256( &ctx.bmw, hash1, 32 );
sph_bmw256_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
sph_bmw256( &ctx.bmw, hash2, 32 );
sph_bmw256_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
sph_bmw256( &ctx.bmw, hash3, 32 );
sph_bmw256_close( &ctx.bmw, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
lyra2rev2_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

View File

@@ -0,0 +1,38 @@
#include "lyra2rev2-gate.h"
__thread uint64_t* l2v2_wholeMatrix;
void lyra2rev2_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool lyra2rev2_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v2_wholeMatrix = _mm_malloc( i, 64 );
return l2v2_wholeMatrix;
}
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV2_4WAY)
init_lyra2rev2_4way_ctx();
gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
gate->hash = (void*)&lyra2rev2_4way_hash;
#else
init_lyra2rev2_ctx();
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->set_target = (void*)&lyra2rev2_set_target;
return true;
};

View File

@@ -0,0 +1,35 @@
#ifndef LYRA2REV2_GATE_H__
#define LYRA2REV2_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#include "lyra2.h"
#if defined(HASH_4WAY)
#define LYRA2REV2_4WAY
#endif
extern __thread uint64_t* l2v2_wholeMatrix;
bool register_lyra2rev2_algo( algo_gate_t* gate );
#if defined(LYRA2REV2_4WAY)
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_lyra2rev2_4way_ctx();
#endif
void lyra2rev2_hash( void *state, const void *input );
int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_lyra2rev2_ctx();
#endif

View File

@@ -1,20 +1,12 @@
#include "lyra2rev2-gate.h"
#include <memory.h>
#include "algo-gate-api.h"
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "lyra2.h"
#include "avxdefs.h"
// This gets allocated when miner_thread starts up and is never freed.
// It's not a leak because the only way to allocate it again is to exit
// the thread and that only occurs when the entire program exits.
__thread uint64_t* l2v2_wholeMatrix;
//#include "lyra2.h"
typedef struct {
cubehashParam cube1;
@@ -106,6 +98,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
if( fulltest(hash, ptarget) )
{
pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce;
return 1;
}
@@ -119,30 +112,3 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
return 0;
}
void lyra2rev2_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool lyra2rev2_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v2_wholeMatrix = _mm_malloc( i, 64 );
return l2v2_wholeMatrix;
}
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
init_lyra2rev2_ctx();
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
gate->set_target = (void*)&lyra2rev2_set_target;
return true;
};

View File

@@ -4,13 +4,10 @@
#include <memory.h>
#include <mm_malloc.h>
//#include "algo-gate-api.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
//#include "avxdefs.h"
// same size, only difference is the name, lyra2 is done serially
__thread uint64_t* lyra2z_4way_matrix;
bool lyra2z_4way_thread_init()
@@ -26,12 +23,8 @@ void lyra2z_4way_midstate( const void* input )
blake256_4way( &l2z_4way_blake_mid, input, 64 );
}
// block 2050 new algo, blake plus new lyra parms. new input
// is power of 2 so normal lyra can be used
//void zcoin_hash(void *state, const void *input, uint32_t height)
void lyra2z_4way_hash( void *state, const void *input )
{
// uint32_t _ALIGN(64) hash[16];
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
@@ -39,27 +32,21 @@ void lyra2z_4way_hash( void *state, const void *input )
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
// memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
// blake256_4way( &ctx_blake, input + (64*4), 16 );
// blake256_4way_close( &ctx_blake, vhash );
blake256_4way_init( &ctx_blake );
blake256_4way( &ctx_blake, input, 80 );
memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
// LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
// LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
// memcpy(state, hash, 32);
}
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -67,7 +54,6 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
// uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -90,7 +76,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
// lyra2z_4way_midstate( vdata );
lyra2z_4way_midstate( vdata );
do {
found[0] = found[1] = found[2] = found[3] = false;
@@ -99,46 +85,38 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( &edata[19], n );
lyra2z_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
printf("found 0\n");
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
/* if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
printf("found 1\n");
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
*/
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
printf("found 2\n");
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
/*
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
printf("found 3\n");
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
*/
n += 2;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
@@ -148,21 +126,3 @@ printf("found 3\n");
#endif
/*
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
*/

View File

@@ -9,18 +9,15 @@ void lyra2z_set_target( struct work* work, double job_diff )
bool register_lyra2z_algo( algo_gate_t* gate )
{
#ifdef LYRA2Z_4WAY
four_way_not_tested();
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;
gate->hash = (void*)&lyra2z_4way_hash;
#else
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2z_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
#endif
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2z_set_target;
return true;

View File

@@ -82,41 +82,3 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
/*
//int64_t get_max64_0xffffLL() { return 0xffffLL; };
void lyra2z_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
{
work->height = sctx->bloc_height;
return false;
}
bool lyra2z_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
lyra2z_wholeMatrix = _mm_malloc( i, 64 );
return lyra2z_wholeMatrix;
}
bool register_lyra2z_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2z_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2z_set_target;
// gate->prevent_dupes = (void*)&zcoin_get_work_height;
return true;
};
*/

View File

@@ -65,13 +65,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rotl256_1x64( s1); \
s1 = mm256_rotr256_1x64( s1); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rotr256_1x64( s3 ); \
s3 = mm256_rotl256_1x64( s3 ); \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rotr256_1x64( s1 ); \
s1 = mm256_rotl256_1x64( s1 ); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rotl256_1x64( s3 );
s3 = mm256_rotr256_1x64( s3 );
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \

View File

@@ -346,6 +346,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
hash_str,
target_str);
}
work_set_target_ratio( work, hash );
pdata[19] = data[19];
goto out;
}

View File

@@ -2,7 +2,7 @@
bool register_nist5_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
#if defined (NIST5_4WAY)
gate->scanhash = (void*)&scanhash_nist5_4way;
gate->hash = (void*)&nist5hash_4way;

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
#if defined(HASH_4WAY) && defined(__AES__)
#define NIST5_4WAY
#endif

View File

@@ -132,6 +132,7 @@ int scanhash_nist5(int thr_id, struct work *work,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -172,6 +172,7 @@ int scanhash_zr5( int thr_id, struct work *work,
pdata[0] = tmpdata[0];
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
work_set_target_ratio( work, hash );
if (opt_debug)
applog(LOG_INFO, "found nonce %x", nonce);
return 1;

View File

@@ -1,12 +0,0 @@
#ifndef __POLYTIMOS_GATE_H__
#define __POLYTIMOS_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
void polytimos_hash( void *state, const void *input );
int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_polytimos_context();
#endif

207
algo/quark/quark-4way.c Normal file
View File

@@ -0,0 +1,207 @@
#include "cpuminer-config.h"
#include "quark-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
jh512_4way_context jh;
skein512_4way_context skein;
keccak512_4way_context keccak;
} quark_4way_ctx_holder;
quark_4way_ctx_holder quark_4way_ctx __attribute__ ((aligned (64)));
void init_quark_4way_ctx()
{
blake512_4way_init( &quark_4way_ctx.blake );
bmw512_4way_init( &quark_4way_ctx.bmw );
init_groestl( &quark_4way_ctx.groestl, 64 );
skein512_4way_init( &quark_4way_ctx.skein );
jh512_4way_init( &quark_4way_ctx.jh );
keccak512_4way_init( &quark_4way_ctx.keccak );
}
void quark_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
int i;
quark_4way_ctx_holder ctx;
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
quark_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & 0xFFFFFF00 ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash );
}
if ( ( (hash+16)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash );
}
if ( ( (hash+24)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

17
algo/quark/quark-gate.c Normal file
View File

@@ -0,0 +1,17 @@
#include "quark-gate.h"
bool register_quark_algo( algo_gate_t* gate )
{
#if defined (QUARK_4WAY)
init_quark_4way_ctx();
gate->scanhash = (void*)&scanhash_quark_4way;
gate->hash = (void*)&quark_4way_hash;
#else
init_quark_ctx();
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};

32
algo/quark/quark-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef QUARK_GATE_H__
#define QUARK_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define QUARK_4WAY
#endif
bool register_quark_algo( algo_gate_t* gate );
#if defined(QUARK_4WAY)
void quark_4way_hash( void *state, const void *input );
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_quark_4way_ctx();
#endif
void quark_hash( void *state, const void *input );
int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_quark_ctx();
#endif

View File

@@ -1,5 +1,5 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#include "quark-gate.h"
#include <stdio.h>
#include <string.h>
@@ -47,7 +47,7 @@ void init_quark_ctx()
#endif
}
inline static void quarkhash(void *state, const void *input)
void quark_hash(void *state, const void *input)
{
unsigned char hashbuf[128];
size_t hashptr;
@@ -187,11 +187,12 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
quarkhash(hash64, &endiandata);
quark_hash(hash64, &endiandata);
if ((hash64[7]&0xFFFFFF00)==0)
{
if (fulltest(hash64, ptarget))
{
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}
@@ -203,12 +204,3 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
bool register_quark_algo( algo_gate_t* gate )
{
init_quark_ctx();
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quarkhash;
return true;
};

View File

@@ -1,31 +1,20 @@
#include "algo-gate-api.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/shavite/sph_shavite.h"
#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
#endif
typedef struct
{
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
hashState_sd simd;
#ifdef NO_AES_NI
sph_echo512_context echo;
#else
@@ -133,6 +122,7 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -1,23 +1,16 @@
#include "algo-gate-api.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/shavite/sph_shavite.h"
#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
#endif
typedef struct
@@ -141,6 +134,7 @@ int scanhash_qubit(int thr_id, struct work *work,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -754,6 +754,7 @@ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
if (unlikely(hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget))) {
*hashes_done = n - pdata[19] + 1;
pdata[19] = data[i * 20 + 19];
work_set_target_ratio( work, hash );
return 1;
}
}

View File

@@ -114,7 +114,7 @@ available_implementations() {
return flags;
}
#endif
/*
static int
scrypt_test_mix() {
static const uint8_t expected[16] = {
@@ -145,4 +145,4 @@ scrypt_test_mix() {
return ret;
}
*/

View File

@@ -26,7 +26,7 @@
#include "scrypt-jane-pbkdf2.h"
#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */
/*
static int
scrypt_test_hash() {
scrypt_hash_state st;
@@ -45,4 +45,4 @@ scrypt_test_hash() {
scrypt_hash_finish(&st, final);
return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
}
*/

View File

@@ -36,15 +36,15 @@ void sha256t_hash(void* output, const void* input, uint32_t len)
memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );
SHA256_Update( &ctx_sha256, input + midlen, tail );
SHA256_Final( hashA, &ctx_sha256 );
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
SHA256_Update( &ctx_sha256, hashA, 32 );
SHA256_Final( hashA, &ctx_sha256 );
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
SHA256_Update( &ctx_sha256, hashA, 32 );
SHA256_Final( hashA, &ctx_sha256 );
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
#else
sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );

View File

@@ -0,0 +1,618 @@
/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
/*
* Shabal implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#ifdef __AVX2__
#include "shabal-hash-4way.h"
#ifdef __cplusplus
extern "C"{
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
/*
* Part of this code was automatically generated (the part between
* the "BEGIN" and "END" markers).
*/
#define sM 16
#define C32 SPH_C32
#define T32 SPH_T32
#define O1 13
#define O2 9
#define O3 6
/*
* We copy the state into local variables, so that the compiler knows
* that it can optimize them at will.
*/
/* BEGIN -- automatically generated code. */
#define DECL_STATE \
__m128i A00, A01, A02, A03, A04, A05, A06, A07, \
A08, A09, A0A, A0B; \
__m128i B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \
__m128i C0, C1, C2, C3, C4, C5, C6, C7, \
C8, C9, CA, CB, CC, CD, CE, CF; \
__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u32 Wlow, Whigh;
#define READ_STATE(state) do { \
A00 = (state)->A[0]; \
A01 = (state)->A[1]; \
A02 = (state)->A[2]; \
A03 = (state)->A[3]; \
A04 = (state)->A[4]; \
A05 = (state)->A[5]; \
A06 = (state)->A[6]; \
A07 = (state)->A[7]; \
A08 = (state)->A[8]; \
A09 = (state)->A[9]; \
A0A = (state)->A[10]; \
A0B = (state)->A[11]; \
B0 = (state)->B[0]; \
B1 = (state)->B[1]; \
B2 = (state)->B[2]; \
B3 = (state)->B[3]; \
B4 = (state)->B[4]; \
B5 = (state)->B[5]; \
B6 = (state)->B[6]; \
B7 = (state)->B[7]; \
B8 = (state)->B[8]; \
B9 = (state)->B[9]; \
BA = (state)->B[10]; \
BB = (state)->B[11]; \
BC = (state)->B[12]; \
BD = (state)->B[13]; \
BE = (state)->B[14]; \
BF = (state)->B[15]; \
C0 = (state)->C[0]; \
C1 = (state)->C[1]; \
C2 = (state)->C[2]; \
C3 = (state)->C[3]; \
C4 = (state)->C[4]; \
C5 = (state)->C[5]; \
C6 = (state)->C[6]; \
C7 = (state)->C[7]; \
C8 = (state)->C[8]; \
C9 = (state)->C[9]; \
CA = (state)->C[10]; \
CB = (state)->C[11]; \
CC = (state)->C[12]; \
CD = (state)->C[13]; \
CE = (state)->C[14]; \
CF = (state)->C[15]; \
Wlow = (state)->Wlow; \
Whigh = (state)->Whigh; \
} while (0)
#define WRITE_STATE(state) do { \
(state)->A[0] = A00; \
(state)->A[1] = A01; \
(state)->A[2] = A02; \
(state)->A[3] = A03; \
(state)->A[4] = A04; \
(state)->A[5] = A05; \
(state)->A[6] = A06; \
(state)->A[7] = A07; \
(state)->A[8] = A08; \
(state)->A[9] = A09; \
(state)->A[10] = A0A; \
(state)->A[11] = A0B; \
(state)->B[0] = B0; \
(state)->B[1] = B1; \
(state)->B[2] = B2; \
(state)->B[3] = B3; \
(state)->B[4] = B4; \
(state)->B[5] = B5; \
(state)->B[6] = B6; \
(state)->B[7] = B7; \
(state)->B[8] = B8; \
(state)->B[9] = B9; \
(state)->B[10] = BA; \
(state)->B[11] = BB; \
(state)->B[12] = BC; \
(state)->B[13] = BD; \
(state)->B[14] = BE; \
(state)->B[15] = BF; \
(state)->C[0] = C0; \
(state)->C[1] = C1; \
(state)->C[2] = C2; \
(state)->C[3] = C3; \
(state)->C[4] = C4; \
(state)->C[5] = C5; \
(state)->C[6] = C6; \
(state)->C[7] = C7; \
(state)->C[8] = C8; \
(state)->C[9] = C9; \
(state)->C[10] = CA; \
(state)->C[11] = CB; \
(state)->C[12] = CC; \
(state)->C[13] = CD; \
(state)->C[14] = CE; \
(state)->C[15] = CF; \
(state)->Wlow = Wlow; \
(state)->Whigh = Whigh; \
} while (0)
#define DECODE_BLOCK \
do { \
M0 = buf[ 0]; \
M1 = buf[ 1]; \
M2 = buf[ 2]; \
M3 = buf[ 3]; \
M4 = buf[ 4]; \
M5 = buf[ 5]; \
M6 = buf[ 6]; \
M7 = buf[ 7]; \
M8 = buf[ 8]; \
M9 = buf[ 9]; \
MA = buf[10]; \
MB = buf[11]; \
MC = buf[12]; \
MD = buf[13]; \
ME = buf[14]; \
MF = buf[15]; \
} while (0)
#define INPUT_BLOCK_ADD \
do { \
B0 = _mm_add_epi32( B0, M0 );\
B1 = _mm_add_epi32( B1, M1 );\
B2 = _mm_add_epi32( B2, M2 );\
B3 = _mm_add_epi32( B3, M3 );\
B4 = _mm_add_epi32( B4, M4 );\
B5 = _mm_add_epi32( B5, M5 );\
B6 = _mm_add_epi32( B6, M6 );\
B7 = _mm_add_epi32( B7, M7 );\
B8 = _mm_add_epi32( B8, M8 );\
B9 = _mm_add_epi32( B9, M9 );\
BA = _mm_add_epi32( BA, MA );\
BB = _mm_add_epi32( BB, MB );\
BC = _mm_add_epi32( BC, MC );\
BD = _mm_add_epi32( BD, MD );\
BE = _mm_add_epi32( BE, ME );\
BF = _mm_add_epi32( BF, MF );\
} while (0)
#define INPUT_BLOCK_SUB \
do { \
C0 = _mm_sub_epi32( C0, M0 ); \
C1 = _mm_sub_epi32( C1, M1 ); \
C2 = _mm_sub_epi32( C2, M2 ); \
C3 = _mm_sub_epi32( C3, M3 ); \
C4 = _mm_sub_epi32( C4, M4 ); \
C5 = _mm_sub_epi32( C5, M5 ); \
C6 = _mm_sub_epi32( C6, M6 ); \
C7 = _mm_sub_epi32( C7, M7 ); \
C8 = _mm_sub_epi32( C8, M8 ); \
C9 = _mm_sub_epi32( C9, M9 ); \
CA = _mm_sub_epi32( CA, MA ); \
CB = _mm_sub_epi32( CB, MB ); \
CC = _mm_sub_epi32( CC, MC ); \
CD = _mm_sub_epi32( CD, MD ); \
CE = _mm_sub_epi32( CE, ME ); \
CF = _mm_sub_epi32( CF, MF ); \
} while (0)
#define XOR_W \
do { \
A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
} while (0)
/*
#define SWAP(v1, v2) do { \
sph_u32 tmp = (v1); \
(v1) = (v2); \
(v2) = tmp; \
} while (0)
*/
#define SWAP_BC \
do { \
mm_swap_128( B0, C0 ); \
mm_swap_128( B1, C1 ); \
mm_swap_128( B2, C2 ); \
mm_swap_128( B3, C3 ); \
mm_swap_128( B4, C4 ); \
mm_swap_128( B5, C5 ); \
mm_swap_128( B6, C6 ); \
mm_swap_128( B7, C7 ); \
mm_swap_128( B8, C8 ); \
mm_swap_128( B9, C9 ); \
mm_swap_128( BA, CA ); \
mm_swap_128( BB, CB ); \
mm_swap_128( BC, CC ); \
mm_swap_128( BD, CD ); \
mm_swap_128( BE, CE ); \
mm_swap_128( BF, CF ); \
} while (0)
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
_mm_andnot_si128( xb3, xb2 ), \
_mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
_mm_mullo_epi32( mm_rotl_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
) ), _mm_set1_epi32(3UL) ) ) ) ); \
xb0 = mm_not( _mm_xor_si128( xa0, mm_rotl_32( xb0, 1 ) ) ); \
} while (0)
#define PERM_STEP_0 do { \
PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_1 do { \
PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_2 do { \
PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
} while (0)
#define APPLY_P \
do { \
B0 = mm_rotr_32( B0, 15 ); \
B1 = mm_rotr_32( B1, 15 ); \
B2 = mm_rotr_32( B2, 15 ); \
B3 = mm_rotr_32( B3, 15 ); \
B4 = mm_rotr_32( B4, 15 ); \
B5 = mm_rotr_32( B5, 15 ); \
B6 = mm_rotr_32( B6, 15 ); \
B7 = mm_rotr_32( B7, 15 ); \
B8 = mm_rotr_32( B8, 15 ); \
B9 = mm_rotr_32( B9, 15 ); \
BA = mm_rotr_32( BA, 15 ); \
BB = mm_rotr_32( BB, 15 ); \
BC = mm_rotr_32( BC, 15 ); \
BD = mm_rotr_32( BD, 15 ); \
BE = mm_rotr_32( BE, 15 ); \
BF = mm_rotr_32( BF, 15 ); \
PERM_STEP_0; \
PERM_STEP_1; \
PERM_STEP_2; \
A0B = _mm_add_epi32( A0B, C6 ); \
A0A = _mm_add_epi32( A0A, C5 ); \
A09 = _mm_add_epi32( A09, C4 ); \
A08 = _mm_add_epi32( A08, C3 ); \
A07 = _mm_add_epi32( A07, C2 ); \
A06 = _mm_add_epi32( A06, C1 ); \
A05 = _mm_add_epi32( A05, C0 ); \
A04 = _mm_add_epi32( A04, CF ); \
A03 = _mm_add_epi32( A03, CE ); \
A02 = _mm_add_epi32( A02, CD ); \
A01 = _mm_add_epi32( A01, CC ); \
A00 = _mm_add_epi32( A00, CB ); \
A0B = _mm_add_epi32( A0B, CA ); \
A0A = _mm_add_epi32( A0A, C9 ); \
A09 = _mm_add_epi32( A09, C8 ); \
A08 = _mm_add_epi32( A08, C7 ); \
A07 = _mm_add_epi32( A07, C6 ); \
A06 = _mm_add_epi32( A06, C5 ); \
A05 = _mm_add_epi32( A05, C4 ); \
A04 = _mm_add_epi32( A04, C3 ); \
A03 = _mm_add_epi32( A03, C2 ); \
A02 = _mm_add_epi32( A02, C1 ); \
A01 = _mm_add_epi32( A01, C0 ); \
A00 = _mm_add_epi32( A00, CF ); \
A0B = _mm_add_epi32( A0B, CE ); \
A0A = _mm_add_epi32( A0A, CD ); \
A09 = _mm_add_epi32( A09, CC ); \
A08 = _mm_add_epi32( A08, CB ); \
A07 = _mm_add_epi32( A07, CA ); \
A06 = _mm_add_epi32( A06, C9 ); \
A05 = _mm_add_epi32( A05, C8 ); \
A04 = _mm_add_epi32( A04, C7 ); \
A03 = _mm_add_epi32( A03, C6 ); \
A02 = _mm_add_epi32( A02, C5 ); \
A01 = _mm_add_epi32( A01, C4 ); \
A00 = _mm_add_epi32( A00, C3 ); \
} while (0)
#define INCR_W do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
} while (0)
static const sph_u32 A_init_256[] = {
C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191),
C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C),
C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A)
};
static const sph_u32 B_init_256[] = {
C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F),
C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002),
C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890),
C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5)
};
static const sph_u32 C_init_256[] = {
C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55),
C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433),
C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F),
C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60)
};
static const sph_u32 A_init_512[] = {
C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
};
static const sph_u32 B_init_512[] = {
C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
};
static const sph_u32 C_init_512[] = {
C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
};
static void
shabal_4way_init( void *cc, unsigned size )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
int i;
if ( size == 512 )
{
for ( i = 0; i < 12; i++ )
sc->A[i] = _mm_set1_epi32( A_init_512[i] );
for ( i = 0; i < 16; i++ )
{
sc->B[i] = _mm_set1_epi32( B_init_512[i] );
sc->C[i] = _mm_set1_epi32( C_init_512[i] );
}
}
else
{
for ( i = 0; i < 12; i++ )
sc->A[i] = _mm_set1_epi32( A_init_256[i] );
for ( i = 0; i < 16; i++ )
{
sc->B[i] = _mm_set1_epi32( B_init_256[i] );
sc->C[i] = _mm_set1_epi32( C_init_256[i] );
}
}
sc->Wlow = 1;
sc->Whigh = 0;
sc->ptr = 0;
}
static void
shabal_4way_core( void *cc, const unsigned char *data, size_t len )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
__m128i *buf;
__m128i *vdata = (__m128i*)data;
const int buf_size = 64;
size_t ptr;
DECL_STATE
buf = sc->buf;
ptr = sc->ptr;
if ( len < (buf_size - ptr ) )
{
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += clen>>2;
len -= clen;
if ( ptr == buf_size )
{
DECODE_BLOCK;
INPUT_BLOCK_ADD;
XOR_W;
APPLY_P;
INPUT_BLOCK_SUB;
SWAP_BC;
INCR_W;
ptr = 0;
}
}
WRITE_STATE(sc);
sc->ptr = ptr;
}
static void
shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
__m128i *buf;
const int buf_size = 64;
size_t ptr;
int i;
unsigned z, zz;
DECL_STATE
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
buf[ptr>>2] = _mm_set1_epi32( zz );
memset_zero_128( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
READ_STATE(sc);
DECODE_BLOCK;
INPUT_BLOCK_ADD;
XOR_W;
APPLY_P;
for ( i = 0; i < 3; i ++ )
{
SWAP_BC;
XOR_W;
APPLY_P;
}
__m128i *d = (__m128i*)dst;
if ( size_words == 16 ) // 512
{
d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;
d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7;
d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB;
d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF;
}
else // 256
{
d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB;
d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF;
}
}
void
shabal256_4way_init( void *cc )
{
shabal_4way_init(cc, 256);
}
void
shabal256_4way( void *cc, const void *data, size_t len )
{
shabal_4way_core( cc, data, len );
}
void
shabal256_4way_close( void *cc, void *dst )
{
shabal_4way_close(cc, 0, 0, dst, 8);
}
void
shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst )
{
shabal_4way_close(cc, ub, n, dst, 8);
}
void
shabal512_4way_init(void *cc)
{
shabal_4way_init(cc, 512);
}
void
shabal512_4way(void *cc, const void *data, size_t len)
{
shabal_4way_core(cc, data, len);
}
void
shabal512_4way_close(void *cc, void *dst)
{
shabal_4way_close(cc, 0, 0, dst, 16);
}
void
shabal512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_4way_close(cc, ub, n, dst, 16);
}
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,82 @@
/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */
/**
* Shabal interface. Shabal is a family of functions which differ by
* their output size; this implementation defines Shabal for output
* sizes 192, 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_shabal.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SHABAL_HASH_4WAY_H__
#define SHABAL_HASH_4WAY_H__ 1
#ifdef __AVX2__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#ifdef __cplusplus
extern "C"{
#endif
#define SPH_SIZE_shabal256 256
#define SPH_SIZE_shabal512 512
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i A[12], B[16], C[16];
sph_u32 Whigh, Wlow;
size_t ptr;
} shabal_4way_context;
typedef shabal_4way_context shabal256_4way_context;
typedef shabal_4way_context shabal512_4way_context;
void shabal256_4way_init( void *cc );
void shabal256_4way( void *cc, const void *data, size_t len );
void shabal256_4way_close( void *cc, void *dst );
void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_4way_init( void *cc );
void shabal512_4way( void *cc, const void *data, size_t len );
void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#ifdef __cplusplus
}
#endif
#endif
#endif

View File

@@ -0,0 +1,670 @@
/* $Id: shavite.c 227 2010-06-16 17:28:38Z tp $ */
/*
* SHAvite-3 implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stdio.h>
#include <stddef.h>
#include <string.h>
#ifdef __AES__
#include "sph_shavite.h"
#include "avxdefs.h"
#ifdef __cplusplus
extern "C"{
#endif
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHAVITE
#define SPH_SMALL_FOOTPRINT_SHAVITE 1
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
#define C32 SPH_C32
/*
* As of round 2 of the SHA-3 competition, the published reference
* implementation and test vectors are wrong, because they use
* big-endian AES tables while the internal decoding uses little-endian.
* The code below follows the specification. To turn it into a code
* which follows the reference implementation (the one called "BugFix"
* on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out
* the code below (from the '#define AES_BIG_ENDIAN...' to the definition
* of the AES_ROUND_NOKEY macro) and replace it with the version which
* is commented out afterwards.
*/
#define AES_BIG_ENDIAN 0
#include "algo/sha/aes_helper.c"
static const sph_u32 IV512[] = {
C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC),
C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC),
C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47),
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
};
#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \
sph_u32 t0 = (x0); \
sph_u32 t1 = (x1); \
sph_u32 t2 = (x2); \
sph_u32 t3 = (x3); \
AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \
} while (0)
#define KEY_EXPAND_ELT(k0, k1, k2, k3) do { \
sph_u32 kt; \
AES_ROUND_NOKEY(k1, k2, k3, k0); \
kt = (k0); \
(k0) = (k1); \
(k1) = (k2); \
(k2) = (k3); \
(k3) = kt; \
} while (0)
#if SPH_SMALL_FOOTPRINT_SHAVITE
/*
* This function assumes that "msg" is aligned for 32-bit access.
*/
static void
c512(sph_shavite_big_context *sc, const void *msg)
{
sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
sph_u32 p8, p9, pA, pB, pC, pD, pE, pF;
sph_u32 rk[448];
size_t u;
int r, s;
#if SPH_LITTLE_ENDIAN
memcpy(rk, msg, 128);
#else
for (u = 0; u < 32; u += 4) {
rk[u + 0] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 0);
rk[u + 1] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 4);
rk[u + 2] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 8);
rk[u + 3] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 12);
}
#endif
u = 32;
for (;;) {
for (s = 0; s < 4; s ++) {
sph_u32 x0, x1, x2, x3;
x0 = rk[u - 31];
x1 = rk[u - 30];
x2 = rk[u - 29];
x3 = rk[u - 32];
AES_ROUND_NOKEY(x0, x1, x2, x3);
rk[u + 0] = x0 ^ rk[u - 4];
rk[u + 1] = x1 ^ rk[u - 3];
rk[u + 2] = x2 ^ rk[u - 2];
rk[u + 3] = x3 ^ rk[u - 1];
if (u == 32) {
rk[ 32] ^= sc->count0;
rk[ 33] ^= sc->count1;
rk[ 34] ^= sc->count2;
rk[ 35] ^= SPH_T32(~sc->count3);
} else if (u == 440) {
rk[440] ^= sc->count1;
rk[441] ^= sc->count0;
rk[442] ^= sc->count3;
rk[443] ^= SPH_T32(~sc->count2);
}
u += 4;
x0 = rk[u - 31];
x1 = rk[u - 30];
x2 = rk[u - 29];
x3 = rk[u - 32];
AES_ROUND_NOKEY(x0, x1, x2, x3);
rk[u + 0] = x0 ^ rk[u - 4];
rk[u + 1] = x1 ^ rk[u - 3];
rk[u + 2] = x2 ^ rk[u - 2];
rk[u + 3] = x3 ^ rk[u - 1];
if (u == 164) {
rk[164] ^= sc->count3;
rk[165] ^= sc->count2;
rk[166] ^= sc->count1;
rk[167] ^= SPH_T32(~sc->count0);
} else if (u == 316) {
rk[316] ^= sc->count2;
rk[317] ^= sc->count3;
rk[318] ^= sc->count0;
rk[319] ^= SPH_T32(~sc->count1);
}
u += 4;
}
if (u == 448)
break;
for (s = 0; s < 8; s ++) {
rk[u + 0] = rk[u - 32] ^ rk[u - 7];
rk[u + 1] = rk[u - 31] ^ rk[u - 6];
rk[u + 2] = rk[u - 30] ^ rk[u - 5];
rk[u + 3] = rk[u - 29] ^ rk[u - 4];
u += 4;
}
}
p0 = sc->h[0x0];
p1 = sc->h[0x1];
p2 = sc->h[0x2];
p3 = sc->h[0x3];
p4 = sc->h[0x4];
p5 = sc->h[0x5];
p6 = sc->h[0x6];
p7 = sc->h[0x7];
p8 = sc->h[0x8];
p9 = sc->h[0x9];
pA = sc->h[0xA];
pB = sc->h[0xB];
pC = sc->h[0xC];
pD = sc->h[0xD];
pE = sc->h[0xE];
pF = sc->h[0xF];
u = 0;
for (r = 0; r < 14; r ++) {
#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3) do { \
sph_u32 x0, x1, x2, x3; \
x0 = r0 ^ rk[u ++]; \
x1 = r1 ^ rk[u ++]; \
x2 = r2 ^ rk[u ++]; \
x3 = r3 ^ rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
x0 ^= rk[u ++]; \
x1 ^= rk[u ++]; \
x2 ^= rk[u ++]; \
x3 ^= rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
x0 ^= rk[u ++]; \
x1 ^= rk[u ++]; \
x2 ^= rk[u ++]; \
x3 ^= rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
x0 ^= rk[u ++]; \
x1 ^= rk[u ++]; \
x2 ^= rk[u ++]; \
x3 ^= rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
l0 ^= x0; \
l1 ^= x1; \
l2 ^= x2; \
l3 ^= x3; \
} while (0)
#define WROT(a, b, c, d) do { \
sph_u32 t = d; \
d = c; \
c = b; \
b = a; \
a = t; \
} while (0)
C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7);
C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF);
WROT(p0, p4, p8, pC);
WROT(p1, p5, p9, pD);
WROT(p2, p6, pA, pE);
WROT(p3, p7, pB, pF);
#undef C512_ELT
#undef WROT
}
sc->h[0x0] ^= p0;
sc->h[0x1] ^= p1;
sc->h[0x2] ^= p2;
sc->h[0x3] ^= p3;
sc->h[0x4] ^= p4;
sc->h[0x5] ^= p5;
sc->h[0x6] ^= p6;
sc->h[0x7] ^= p7;
sc->h[0x8] ^= p8;
sc->h[0x9] ^= p9;
sc->h[0xA] ^= pA;
sc->h[0xB] ^= pB;
sc->h[0xC] ^= pC;
sc->h[0xD] ^= pD;
sc->h[0xE] ^= pE;
sc->h[0xF] ^= pF;
}
#else
static void
c512( sph_shavite_big_context *sc, const void *msg )
{
__m128i p0, p1, p2, p3, x;
__m128i k00, k01, k02, k03, k10, k11, k12, k13;
__m128i *m = (__m128i*)msg;
__m128i *h = (__m128i*)sc->h;
int r;
p0 = h[0];
p1 = h[1];
p2 = h[2];
p3 = h[3];
// round
k00 = m[0];
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = m[1];
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = m[2];
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = m[3];
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p0 = _mm_xor_si128( p0, x );
k10 = m[4];
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = m[5];
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = m[6];
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = m[7];
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p2 = _mm_xor_si128( p2, x );
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
k00 = _mm_xor_si128( k00, k13 );
if ( r == 0 )
k00 = _mm_xor_si128( k00, _mm_set_epi32(
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
k01 = _mm_xor_si128( k01, k00 );
if ( r == 1 )
k01 = _mm_xor_si128( k01, _mm_set_epi32(
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
k13 = _mm_xor_si128( k13, k12 );
if ( r == 2 )
k13 = _mm_xor_si128( k13, _mm_set_epi32(
~sc->count1, sc->count0, sc->count3, sc->count2 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p1 = _mm_xor_si128( p1, x );
// round 2, 6, 10
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p0 = _mm_xor_si128( p0, x );
// round 3, 7, 11
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p1 = _mm_xor_si128( p1, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p3 = _mm_xor_si128( p3, x );
// round 4, 8, 12
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p2 = _mm_xor_si128( p2, x );
}
// round 13
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p1 = _mm_xor_si128( p1, x );
h[0] = _mm_xor_si128( h[0], p2 );
h[1] = _mm_xor_si128( h[1], p3 );
h[2] = _mm_xor_si128( h[2], p0 );
h[3] = _mm_xor_si128( h[3], p1 );
}
#endif
static void
shavite_big_aesni_init( sph_shavite_big_context *sc, const sph_u32 *iv )
{
memcpy( sc->h, iv, sizeof sc->h );
sc->ptr = 0;
sc->count0 = 0;
sc->count1 = 0;
sc->count2 = 0;
sc->count3 = 0;
}
static void
shavite_big_aesni_core( sph_shavite_big_context *sc, const void *data,
size_t len )
{
unsigned char *buf;
size_t ptr;
buf = sc->buf;
ptr = sc->ptr;
while (len > 0) {
size_t clen;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
data = (const unsigned char *)data + clen;
ptr += clen;
len -= clen;
if (ptr == sizeof sc->buf) {
if ((sc->count0 = SPH_T32(sc->count0 + 1024)) == 0) {
sc->count1 = SPH_T32(sc->count1 + 1);
if (sc->count1 == 0) {
sc->count2 = SPH_T32(sc->count2 + 1);
if (sc->count2 == 0) {
sc->count3 = SPH_T32(
sc->count3 + 1);
}
}
}
c512(sc, buf);
ptr = 0;
}
}
sc->ptr = ptr;
}
static void
shavite_big_aesni_close( sph_shavite_big_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
unsigned char *buf;
size_t ptr, u;
unsigned z;
sph_u32 count0, count1, count2, count3;
buf = sc->buf;
ptr = sc->ptr;
count0 = (sc->count0 += SPH_T32(ptr << 3) + n);
count1 = sc->count1;
count2 = sc->count2;
count3 = sc->count3;
z = 0x80 >> n;
z = ((ub & -z) | z) & 0xFF;
if (ptr == 0 && n == 0) {
buf[0] = 0x80;
memset(buf + 1, 0, 109);
sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
} else if (ptr < 110) {
buf[ptr ++] = z;
memset(buf + ptr, 0, 110 - ptr);
} else {
buf[ptr ++] = z;
memset(buf + ptr, 0, 128 - ptr);
c512(sc, buf);
memset(buf, 0, 110);
sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
}
sph_enc32le(buf + 110, count0);
sph_enc32le(buf + 114, count1);
sph_enc32le(buf + 118, count2);
sph_enc32le(buf + 122, count3);
buf[126] = (unsigned char) (out_size_w32 << 5);
buf[127] = (unsigned char) (out_size_w32 >> 3);
c512(sc, buf);
for (u = 0; u < out_size_w32; u ++)
sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]);
}
void
sph_shavite512_aesni_init(void *cc)
{
shavite_big_aesni_init(cc, IV512);
}
void
sph_shavite512_aesni(void *cc, const void *data, size_t len)
{
shavite_big_aesni_core(cc, data, len);
}
void
sph_shavite512_aesni_close(void *cc, void *dst)
{
shavite_big_aesni_close(cc, 0, 0, dst, 16);
}
void
sph_shavite512_aesni_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst)
{
shavite_big_aesni_close(cc, ub, n, dst, 16);
}
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1731,21 +1731,21 @@ sph_shavite384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
/* see sph_shavite.h */
void
sph_shavite512_init(void *cc)
sph_shavite512_sw_init(void *cc)
{
shavite_big_init(cc, IV512);
}
/* see sph_shavite.h */
void
sph_shavite512(void *cc, const void *data, size_t len)
sph_shavite512_sw(void *cc, const void *data, size_t len)
{
shavite_big_core(cc, data, len);
}
/* see sph_shavite.h */
void
sph_shavite512_close(void *cc, void *dst)
sph_shavite512_sw_close(void *cc, void *dst)
{
shavite_big_close(cc, 0, 0, dst, 16);
// shavite_big_init(cc, IV512);
@@ -1753,7 +1753,7 @@ sph_shavite512_close(void *cc, void *dst)
/* see sph_shavite.h */
void
sph_shavite512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
sph_shavite512_sw_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shavite_big_close(cc, ub, n, dst, 16);
// shavite_big_init(cc, IV512);

View File

@@ -77,9 +77,9 @@ extern "C"{
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
unsigned char buf[64] __attribute__ ((aligned (64)));
sph_u32 h[8] __attribute__ ((aligned (32)));
size_t ptr;
sph_u32 h[8];
sph_u32 count0, count1;
#endif
} sph_shavite_small_context;
@@ -108,9 +108,9 @@ typedef sph_shavite_small_context sph_shavite256_context;
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[128]; /* first field, for alignment */
unsigned char buf[128] __attribute__ ((aligned (64)));
sph_u32 h[16] __attribute__ ((aligned (32)));;
size_t ptr;
sph_u32 h[16];
sph_u32 count0, count1, count2, count3;
#endif
} sph_shavite_big_context;
@@ -262,51 +262,37 @@ void sph_shavite384_close(void *cc, void *dst);
void sph_shavite384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a SHAvite-512 context. This process performs no memory allocation.
*
* @param cc the SHAvite-512 context (pointer to a
* <code>sph_shavite512_context</code>)
*/
void sph_shavite512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHAvite-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shavite512(void *cc, const void *data, size_t len);
/**
* Terminate the current SHAvite-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accomodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHAvite-512 context
* @param dst the destination buffer
*/
void sph_shavite512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accomodate the result (64 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 downto 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHAvite-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shavite512_addbits_and_close(
// Always define sw but only define aesni when available
// Define fptrs for aesni or sw, not both.
void sph_shavite512_sw_init(void *cc);
void sph_shavite512_sw(void *cc, const void *data, size_t len);
void sph_shavite512_sw_close(void *cc, void *dst);
void sph_shavite512_sw_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#ifdef __AES__
void sph_shavite512_aesni_init(void *cc);
void sph_shavite512_aesni(void *cc, const void *data, size_t len);
void sph_shavite512_aesni_close(void *cc, void *dst);
void sph_shavite512_aesni_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#define sph_shavite512_init sph_shavite512_aesni_init
#define sph_shavite512 sph_shavite512_aesni
#define sph_shavite512_close sph_shavite512_aesni_close
#define sph_shavite512_addbits_and_close \
sph_shavite512_aesni_addbits_and_close
#else
#define sph_shavite512_init sph_shavite512_sw_init
#define sph_shavite512 sph_shavite512_sw
#define sph_shavite512_close sph_shavite512_sw_close
#define sph_shavite512_addbits_and_close \
sph_shavite512_sw_addbits_and_close
#endif
#ifdef __cplusplus
}
#endif

View File

@@ -6,12 +6,11 @@ int64_t skein_get_max64() { return 0x7ffffLL; }
bool register_skein_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT| AVX2_OPT | SHA_OPT;
gate->optimizations = FOUR_WAY_OPT | SHA_OPT;
#if defined (SKEIN_4WAY)
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#else
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif

View File

@@ -342,17 +342,6 @@ do { \
do { \
sph_u64 t0, t1, t2; \
__m256i h8; \
/* can LE be assumed? \
dec64le does nothing when SPH_LITTLE endian is set, as it is. \
__m256i m0 = _mm256_dec64le( buf ); \
__m256i m1 = _mm256_dec64le( buf + 8*4 ); \
__m256i m2 = _mm256_dec64le( buf + 16*4 ); \
__m256i m3 = _mm256_dec64le( buf + 24*4 ); \
__m256i m4 = _mm256_dec64le( buf + 32*4 ); \
__m256i m5 = _mm256_dec64le( buf + 40*4 ); \
__m256i m6 = _mm256_dec64le( buf + 48*4 ); \
__m256i m7 = _mm256_dec64le( buf + 56*4 ); \
*/ \
__m256i m0 = buf[0]; \
__m256i m1 = buf[1]; \
__m256i m2 = buf[2]; \

View File

@@ -39,7 +39,9 @@
*/
#ifndef __SKEIN_HASH_4WAY_H__
#define __SKEIN_HASH_4WAY_H__
#define __SKEIN_HASH_4WAY_H__ 1
#ifdef __AVX2__
#ifdef __cplusplus
extern "C"{
@@ -53,14 +55,15 @@ extern "C"{
#define SPH_SIZE_skein256 256
#define SPH_SIZE_skein512 512
#ifdef __AVX2__
typedef struct {
__m256i buf[8] __attribute__ ((aligned (32)));
__m256i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
sph_u64 bcount;
} skein512_4way_context;
} sph_skein_4way_big_context;
typedef sph_skein_4way_big_context skein512_4way_context;
typedef sph_skein_4way_big_context skein256_4way_context;
void skein512_4way_init(void *cc);
void skein512_4way(void *cc, const void *data, size_t len);
@@ -68,26 +71,15 @@ void skein512_4way_close(void *cc, void *dst);
//void sph_skein512_addbits_and_close(
// void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __AVX__
typedef struct {
__m128i buf[8] __attribute__ ((aligned (32)));
__m128i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
sph_u64 bcount;
} skein256_4way_context;
void skein256_4way_init(void *cc);
void skein256_4way(void *cc, const void *data, size_t len);
void skein256_4way_close(void *cc, void *dst);
//void sph_skein256_addbits_and_close(
// void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif
#endif

231
algo/sm3/sm3-hash-4way.c Normal file
View File

@@ -0,0 +1,231 @@
/* ====================================================================
* Copyright (c) 2014 - 2017 The GmSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the GmSSL Project.
* (http://gmssl.org/)"
*
* 4. The name "GmSSL Project" must not be used to endorse or promote
* products derived from this software without prior written
* permission. For written permission, please contact
* guanzhi1980@gmail.com.
*
* 5. Products derived from this software may not be called "GmSSL"
* nor may "GmSSL" appear in their names without prior written
* permission of the GmSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the GmSSL Project
* (http://gmssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*/
#include <string.h>
#include "sm3-hash-4way.h"
#ifdef __AVX__
void sm3_4way_init( sm3_4way_ctx_t *ctx )
{
ctx->digest[0] = _mm_set1_epi32( 0x7380166F );
ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 );
ctx->digest[2] = _mm_set1_epi32( 0x172442D7 );
ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 );
ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC );
ctx->digest[5] = _mm_set1_epi32( 0x163138AA );
ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D );
ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E );
ctx->nblocks = 0;
ctx->num = 0;
}
void sm3_4way( void *cc, const void *data, size_t len )
{
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
__m128i *block = (__m128i*)ctx->block;
__m128i *vdata = (__m128i*)data;
if ( ctx->num )
{
unsigned int left = SM3_BLOCK_SIZE - ctx->num;
if ( len < left )
{
memcpy_128( block + (ctx->num >> 2), vdata , len>>2 );
ctx->num += len;
return;
}
else
{
memcpy_128( block + (ctx->num >> 2), vdata , left>>2 );
sm3_4way_compress( ctx->digest, block );
ctx->nblocks++;
vdata += left>>2;
len -= left;
}
}
while ( len >= SM3_BLOCK_SIZE )
{
sm3_4way_compress( ctx->digest, vdata );
ctx->nblocks++;
vdata += SM3_BLOCK_SIZE>>2;
len -= SM3_BLOCK_SIZE;
}
ctx->num = len;
if ( len )
memcpy_128( block, vdata, len>>2 );
}
void sm3_4way_close( void *cc, void *dst )
{
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
__m128i *hash = (__m128i*)dst;
__m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
__m128i *block = (__m128i*)ctx->block;
int i;
block[ctx->num] = _mm_set1_epi32( 0x80 );
if ( ctx->num + 8 <= SM3_BLOCK_SIZE )
{
memset_zero_128( block + (ctx->num >> 2) + 1,
( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
}
else
{
memset_zero_128( block + (ctx->num >> 2) + 1,
( SM3_BLOCK_SIZE - (ctx->num >> 2) - 1 ) );
sm3_4way_compress( ctx->digest, block );
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
}
count[0] = mm_byteswap_32(
_mm_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm_byteswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
sm3_4way_compress( ctx->digest, block );
for ( i = 0; i < 8 ; i++ )
hash[i] = mm_byteswap_32( ctx->digest[i] );
}
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 9 ), \
mm_rotl_32( x, 17 ) ) )
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 15 ), \
mm_rotl_32( x, 23 ) ) )
#define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) )
#define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \
_mm_and_si128( x, z ) ), \
_mm_and_si128( y, z ) )
#define GG0(x,y,z) FF0(x,y,z)
#define GG1(x,y,z) _mm_or_si128( _mm_and_si128( x, y ), \
_mm_andnot_si128( x, z ) )
void sm3_4way_compress( __m128i *digest, __m128i *block )
{
__m128i W[68], W1[64];
__m128i A = digest[ 0 ];
__m128i B = digest[ 1 ];
__m128i C = digest[ 2 ];
__m128i D = digest[ 3 ];
__m128i E = digest[ 4 ];
__m128i F = digest[ 5 ];
__m128i G = digest[ 6 ];
__m128i H = digest[ 7 ];
__m128i SS1, SS2, TT1, TT2, T;
int j;
for ( j = 0; j < 16; j++ )
W[j] = mm_byteswap_32( block[j] );
for ( j = 16; j < 68; j++ )
W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
W[ j-9 ] ),
mm_rotl_32( W[ j-3 ], 15 ) ) ),
_mm_xor_si128( mm_rotl_32( W[ j-13 ], 7 ),
W[ j-6 ] ) );
for( j = 0; j < 64; j++ )
W1[j] = _mm_xor_si128( W[j], W[j+4] );
T = _mm_set1_epi32( 0x79CC4519UL );
for( j =0; j < 16; j++ )
{
SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ),
mm_rotl_32( T, j ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm_rotl_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm_rotl_32( F, 19 );
F = E;
E = P0( TT2 );
}
T = _mm_set1_epi32( 0x7A879D8AUL );
for( j =16; j < 64; j++ )
{
SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ),
mm_rotl_32( T, j&31 ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm_rotl_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm_rotl_32( F, 19 );
F = E;
E = P0( TT2 );
}
digest[0] = _mm_xor_si128( digest[0], A );
digest[1] = _mm_xor_si128( digest[1], B );
digest[2] = _mm_xor_si128( digest[2], C );
digest[3] = _mm_xor_si128( digest[3], D );
digest[4] = _mm_xor_si128( digest[4], E );
digest[5] = _mm_xor_si128( digest[5], F );
digest[6] = _mm_xor_si128( digest[6], G );
digest[7] = _mm_xor_si128( digest[7], H );
}
#endif

89
algo/sm3/sm3-hash-4way.h Normal file
View File

@@ -0,0 +1,89 @@
/* ====================================================================
* Copyright (c) 2014 - 2016 The GmSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the GmSSL Project.
* (http://gmssl.org/)"
*
* 4. The name "GmSSL Project" must not be used to endorse or promote
* products derived from this software without prior written
* permission. For written permission, please contact
* guanzhi1980@gmail.com.
*
* 5. Products derived from this software may not be called "GmSSL"
* nor may "GmSSL" appear in their names without prior written
* permission of the GmSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the GmSSL Project
* (http://gmssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*/
#ifndef SPH_SM3_HASH_4WAY_H
#define SPH_SM3_HASH_4WAY_H
#define SM3_DIGEST_LENGTH 32
#define SM3_BLOCK_SIZE 64
#define SM3_CBLOCK (SM3_BLOCK_SIZE)
#define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH)
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "avxdefs.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
__m128i block[16] __attribute__ ((aligned (64)));
__m128i digest[8];
uint32_t nblocks;
uint32_t num;
} sm3_4way_ctx_t;
void sm3_4way_init( sm3_4way_ctx_t *ctx );
//void sm3_4way_update( sm3_4way_ctx_t *ctx, const unsigned char* data,
// size_t data_len );
//void sm3_4way_final( sm3_4way_ctx_t *ctx,
// unsigned char digest[SM3_DIGEST_LENGTH] );
void sm3_4way_compress( __m128i *digest, __m128i *block );
void sm3_4way(void *cc, const void *data, size_t len);
void sm3_4way_close(void *cc, void *dst);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -189,7 +189,7 @@ void sm3_compress(uint32_t digest[8], const unsigned char block[64])
for(j =16; j < 64; j++) {
T[j] = 0x7A879D8A;
SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7);
SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j&31)), 7);
SS2 = SS1 ^ ROTATELEFT(A,12);
TT1 = FF1(A,B,C) + D + SS2 + W1[j];
TT2 = GG1(E,F,G) + H + SS1 + W[j];

View File

@@ -252,8 +252,8 @@ SPH_XCAT(HASH, _addbits_and_close)(void *cc,
current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
#endif
uint64_t *b= (uint64_t*)sc->buf;
uint64_t *s= (uint64_t*)sc->state;
//uint64_t *b= (uint64_t*)sc->buf;
//uint64_t *s= (uint64_t*)sc->state;
// printf("Sptr 1= %u\n",current);
// printf("SBuf %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] );
// printf("SBuf %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] );

View File

@@ -3468,9 +3468,10 @@ sph_ ## name ## _close(void *cc, void *dst) \
for (i = 0; i < 8; i ++) \
sph_enc64le((unsigned char *)dst + 8 * i, sc->state[i]); \
}
// sph_ ## name ## _init(cc); \
//}
/*
sph_ ## name ## _init(cc); \
}
*/
MAKE_CLOSE(whirlpool)
MAKE_CLOSE(whirlpool0)
MAKE_CLOSE(whirlpool1)

View File

@@ -1,4 +1,7 @@
#include "whirlpool-gate.h"
#if defined(__AVX2__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -6,8 +9,6 @@
#include "sph_whirlpool.h"
#include "whirlpool-hash-4way.h"
#if defined(__AVX2__)
static __thread whirlpool_4way_context whirl_mid;
void whirlpool_hash_4way( void *state, const void *input )
@@ -50,7 +51,7 @@ void whirlpool_hash_4way( void *state, const void *input )
}
int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
unsigned long *hashes_done )
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -67,8 +68,8 @@ int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
// if (opt_benchmark)
// ((uint32_t*)ptarget)[7] = 0x0000ff;
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);

View File

@@ -2,14 +2,16 @@
bool register_whirlpool_algo( algo_gate_t* gate )
{
//#if defined (WHIRLPOOL_4WAY)
// gate->scanhash = (void*)&scanhash_whirlpool_4way;
// gate->hash = (void*)&whirlpool_hash_4way;
//#else
#if defined (WHIRLPOOL_4WAY)
four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_whirlpool_4way;
gate->hash = (void*)&whirlpool_hash_4way;
#else
gate->scanhash = (void*)&scanhash_whirlpool;
gate->hash = (void*)&whirlpool_hash;
init_whirlpool_ctx();
//#endif
#endif
return true;
};

View File

@@ -4,21 +4,25 @@
#include "algo-gate-api.h"
#include <stdint.h>
/*
#if defined(FOUR_WAY) && defined(__AVX2__)
#define WHIRLPOOL_4WAY
#endif
*/
//#if defined (WHIRLPOOL_4WAY)
#if defined (WHIRLPOOL_4WAY)
//void whirlpool_hash_4way(void *state, const void *input);
void whirlpool_hash_4way(void *state, const void *input);
//int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done );
//#endif
int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#else
void whirlpool_hash( void *state, const void *input );
int scanhash_whirlpool( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_whirlpool_ctx();
#endif
#endif

View File

@@ -3345,8 +3345,12 @@ do { \
#define READ_STATE MUL8(READ_STATE_W)
#define ROUND0 MUL8(ROUND0_W)
#define UPDATE_STATE MUL8(UPDATE_STATE_W)
/*
#define BYTE(x, n) \
_mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
*/
#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF)
// A very complex, but structured, expression with a mix of scalar
// and vector operations to retrieve specific 64 bit constants from
@@ -3357,21 +3361,51 @@ do { \
// Extract 64 bit vector elements from "in" representing offsets. Unmask the
// low byte of each and scale for use as vector indexes.
// Pack the data in a vector and return it.
/*
#define t_row( inv, row ) \
_mm256_and_si256( \
_mm256_srli_epi64( inv, row << 3 ), _mm256_set1_epi64x( 0xFF ) )
// Extract vector element from "lane" of vector "in[row]" and use it to index
// scalar array of constants "table" and return referenced 64 bit entry.
#define t_lane( table, inv, row, lane ) \
table[ _mm256_extract_epi64( t_row( inv, row ), lane ) ]
*/
// Build a vector from elements of non-contiguous 64 bit data extracted from
// scalar "table".
// reference scalar version 1480 kH/s
/*
// version 1, extract with gather
// 955 kH/s
#define t_lane( inv, row, lane ) \
BYTE( _mm256_extract_epi64( inv, lane ), row ) \
#define t_vec( table, inv, row ) \
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
t_lane( table, inv, row, 0 ) )
_mm256_i32gather_epi64( table, _mm_set_epi32( t_lane( inv, row, 3 ), \
t_lane( inv, row, 2 ), t_lane( inv, row, 1 ), \
t_lane( inv, row, 0) ), 1 )
*/
/*
// version 2, extract with set
// 1100 kH/s
#define t_lane( table, inv, row, lane ) \
table[ BYTE( _mm256_extract_epi64( inv, lane ), row ) ] \
#define t_vec( table, inv, row ) \
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
t_lane( table, inv, row, 0 ) )
*/
// version 3, vector indexing with set
// 1105 kH/s
#define t_lane( table, inv, row, lane ) \
table[ BYTE( inv[ lane ], row ) ] \
#define t_vec( table, inv, row ) \
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
t_lane( table, inv, row, 0 ) )
#if SPH_SMALL_FOOTPRINT_WHIRLPOOL

252
algo/x11/c11-4way.c Normal file
View File

@@ -0,0 +1,252 @@
#include "cpuminer-config.h"
#include "c11-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} c11_4way_ctx_holder;
c11_4way_ctx_holder c11_4way_ctx;
void init_c11_4way_ctx()
{
blake512_4way_init( &c11_4way_ctx.blake );
bmw512_4way_init( &c11_4way_ctx.bmw );
init_groestl( &c11_4way_ctx.groestl, 64 );
skein512_4way_init( &c11_4way_ctx.skein );
jh512_4way_init( &c11_4way_ctx.jh );
keccak512_4way_init( &c11_4way_ctx.keccak );
init_luffa( &c11_4way_ctx.luffa, 512 );
cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_4way_ctx.shavite );
init_sd( &c11_4way_ctx.simd, 512 );
init_echo( &c11_4way_ctx.echo, 512 );
}
void c11_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
c11_4way_ctx_holder ctx;
memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
// 1 Blake 4way
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 5 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// 6 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
c11_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

18
algo/x11/c11-gate.c Normal file
View File

@@ -0,0 +1,18 @@
#include "c11-gate.h"
bool register_c11_algo( algo_gate_t* gate )
{
#if defined (C11_4WAY)
init_c11_4way_ctx();
gate->scanhash = (void*)&scanhash_c11_4way;
gate->hash = (void*)&c11_4way_hash;
#else
init_c11_ctx();
gate->scanhash = (void*)&scanhash_c11;
gate->hash = (void*)&c11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

32
algo/x11/c11-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef C11_GATE_H__
#define C11_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define C11_4WAY
#endif
bool register_c11_algo( algo_gate_t* gate );
#if defined(C11_4WAY)
void c11_4way_hash( void *state, const void *input );
int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_c11_4way_ctx();
#endif
void c11_hash( void *state, const void *input );
int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_c11_ctx();
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "c11-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -64,7 +64,7 @@ void init_c11_ctx()
#endif
}
void c11hash( void *output, const void *input )
void c11_hash( void *output, const void *input )
{
unsigned char hash[128] _ALIGN(64); // uint32_t hashA[16], hashB[16];
// uint32_t _ALIGN(64) hash[16];
@@ -157,12 +157,13 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
do
{
be32enc( &endiandata[19], nonce );
c11hash( hash, endiandata );
c11_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
work_set_target_ratio( work, hash );
return 1;
}
nonce++;
} while ( nonce < max_nonce && !(*restart) );
@@ -171,13 +172,3 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
bool register_c11_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_c11_ctx();
gate->scanhash = (void*)&scanhash_c11;
gate->hash = (void*)&c11hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

274
algo/x11/timetravel-4way.c Normal file
View File

@@ -0,0 +1,274 @@
#include "timetravel-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
} tt8_4way_ctx_holder;
tt8_4way_ctx_holder tt8_4way_ctx __attribute__ ((aligned (64)));
void init_tt8_4way_ctx()
{
blake512_4way_init( &tt8_4way_ctx.blake );
bmw512_4way_init( &tt8_4way_ctx.bmw );
init_groestl( &tt8_4way_ctx.groestl, 64 );
skein512_4way_init( &tt8_4way_ctx.skein );
jh512_4way_init( &tt8_4way_ctx.jh );
keccak512_4way_init( &tt8_4way_ctx.keccak );
init_luffa( &tt8_4way_ctx.luffa, 512 );
cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 );
};
void timetravel_4way_hash(void *output, const void *input)
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
uint64_t *vhashA, *vhashB;
tt8_4way_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64;
int i;
memcpy( &ctx, &tt8_4way_ctx, sizeof(tt8_4way_ctx) );
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
{
if (i == 0)
{
dataLen = 80;
vhashA = (uint64_t*)input;
vhashB = vhashX;
}
else
{
dataLen = 64;
if ( i % 2 == 0 )
{
vhashA = vhashY;
vhashB = vhashX;
}
else
{
vhashA = vhashX;
vhashB = vhashY;
}
}
switch ( permutation[i] )
{
case 0:
blake512_4way( &ctx.blake, vhashA, dataLen );
blake512_4way_close( &ctx.blake, vhashB );
if ( i == 7 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhashA, dataLen );
bmw512_4way_close( &ctx.bmw, vhashB );
if ( i == 7 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 2:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, dataLen<<3 );
if ( i != 7 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
if ( i == 7 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhashA, dataLen );
jh512_4way_close( &ctx.jh, vhashB );
if ( i == 7 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhashA, dataLen );
keccak512_4way_close( &ctx.keccak, vhashB );
if ( i == 7 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence *)hash0, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
if ( i != 7 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 7:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*)hash0, dataLen );
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*)hash1, dataLen );
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*)hash2, dataLen );
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*)hash3, dataLen );
if ( i != 7 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
default:
applog(LOG_ERR,"SWERR: timetravel invalid permutation");
break;
}
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
volatile uint8_t *restart = &(work_restart[thr_id].restart);
int i;
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime )
{
const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP )
% TT8_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
permutation[i] = i;
for ( i = 0; i < steps; i++ )
tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT );
s_ntime = timestamp;
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
timetravel_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

View File

@@ -0,0 +1,78 @@
#include "timetravel-gate.h"
void tt8_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel_algo( algo_gate_t* gate )
{
#ifdef TIMETRAVEL_4WAY
init_tt8_4way_ctx();
gate->scanhash = (void*)&scanhash_timetravel_4way;
gate->hash = (void*)&timetravel_4way_hash;
#else
init_tt8_ctx();
gate->scanhash = (void*)&scanhash_timetravel;
gate->hash = (void*)&timetravel_hash;
#endif
gate->set_target = (void*)&tt8_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};
inline void tt_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt_swap( pbegin, pend );
pbegin++;
}
}
void tt8_next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}

View File

@@ -0,0 +1,40 @@
#ifndef TIMETRAVEL_GATE_H__
#define TIMETRAVEL_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define TIMETRAVEL_4WAY
#endif
// Machinecoin Genesis Timestamp
#define TT8_FUNC_BASE_TIMESTAMP 1389040865
#define TT8_FUNC_COUNT 8
#define TT8_FUNC_COUNT_PERMUTATIONS 40320
void tt8_next_permutation( int *pbegin, int *pend );
bool register_timetravel_algo( algo_gate_t* gate );
#if defined(TIMETRAVEL_4WAY)
void timetravel_4way_hash( void *state, const void *input );
int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_tt8_4way_ctx();
#endif
void timetravel_hash( void *state, const void *input );
int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_tt8_ctx();
#endif

View File

@@ -1,11 +1,9 @@
#include "algo-gate-api.h"
#include "timetravel-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "avxdefs.h"
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
@@ -13,75 +11,14 @@
#include "algo/skein/sph_skein.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h"
#else
#include "algo/groestl/aes_ni/hash-groestl.h"
#endif
// Machinecoin Genesis Timestamp
#define HASH_FUNC_BASE_TIMESTAMP 1389040865
#define HASH_FUNC_COUNT 8
#define HASH_FUNC_COUNT_PERMUTATIONS 40320
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[HASH_FUNC_COUNT] = { 0 };
inline void tt_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt_swap( pbegin, pend );
pbegin++;
}
}
static void next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}
static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
typedef struct {
sph_blake512_context blake;
@@ -101,7 +38,7 @@ typedef struct {
tt_ctx_holder tt_ctx __attribute__ ((aligned (64)));
__thread tt_ctx_holder tt_mid __attribute__ ((aligned (64)));
void init_tt_ctx()
void init_tt8_ctx()
{
sph_blake512_init( &tt_ctx.blake );
sph_bmw512_init( &tt_ctx.bmw );
@@ -119,7 +56,7 @@ void init_tt_ctx()
void timetravel_hash(void *output, const void *input)
{
uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t hash[ 16 * TT8_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t *hashA, *hashB;
tt_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64;
@@ -130,7 +67,7 @@ void timetravel_hash(void *output, const void *input)
memcpy( &ctx, &tt_ctx, sizeof(tt_ctx) );
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
{
if (i == 0)
{
@@ -270,7 +207,7 @@ void timetravel_hash(void *output, const void *input)
}
}
memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32);
memcpy(output, &hash[16 * (TT8_FUNC_COUNT - 1)], 32);
}
int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
@@ -296,12 +233,12 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime )
{
const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
% HASH_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP )
% TT8_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
permutation[i] = i;
for ( i = 0; i < steps; i++ )
next_permutation( permutation, permutation + HASH_FUNC_COUNT );
tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT );
s_ntime = timestamp;
// do midstate precalc for first function
@@ -359,6 +296,7 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
work_set_target_ratio( work, hash );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio( work, hash );
return 1;
}
nonce++;
@@ -370,19 +308,4 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
void timetravel_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_tt_ctx();
gate->scanhash = (void*)&scanhash_timetravel;
gate->hash = (void*)&timetravel_hash;
gate->set_target = (void*)&timetravel_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

View File

@@ -0,0 +1,316 @@
#include "timetravel10-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
} tt10_4way_ctx_holder;
tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64)));
void init_tt10_4way_ctx()
{
blake512_4way_init( &tt10_4way_ctx.blake );
bmw512_4way_init( &tt10_4way_ctx.bmw );
init_groestl( &tt10_4way_ctx.groestl, 64 );
skein512_4way_init( &tt10_4way_ctx.skein );
jh512_4way_init( &tt10_4way_ctx.jh );
keccak512_4way_init( &tt10_4way_ctx.keccak );
init_luffa( &tt10_4way_ctx.luffa, 512 );
cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &tt10_4way_ctx.shavite );
init_sd( &tt10_4way_ctx.simd, 512 );
};
void timetravel10_4way_hash(void *output, const void *input)
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
uint64_t *vhashA, *vhashB;
tt10_4way_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64;
int i;
memcpy( &ctx, &tt10_4way_ctx, sizeof(tt10_4way_ctx) );
for ( i = 0; i < TT10_FUNC_COUNT; i++ )
{
if (i == 0)
{
dataLen = 80;
vhashA = (uint64_t*)input;
vhashB = vhashX;
}
else
{
dataLen = 64;
if ( i % 2 == 0 )
{
vhashA = vhashY;
vhashB = vhashX;
}
else
{
vhashA = vhashX;
vhashB = vhashY;
}
}
switch ( permutation[i] )
{
case 0:
blake512_4way( &ctx.blake, vhashA, dataLen );
blake512_4way_close( &ctx.blake, vhashB );
if ( i == 9 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhashA, dataLen );
bmw512_4way_close( &ctx.bmw, vhashB );
if ( i == 9 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 2:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, dataLen<<3 );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
if ( i == 9 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhashA, dataLen );
jh512_4way_close( &ctx.jh, vhashB );
if ( i == 9 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhashA, dataLen );
keccak512_4way_close( &ctx.keccak, vhashB );
if ( i == 9 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence *)hash0, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 7:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*)hash0, dataLen );
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*)hash1, dataLen );
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*)hash2, dataLen );
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*)hash3, dataLen );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 8:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
default:
applog(LOG_ERR,"SWERR: timetravel invalid permutation");
break;
}
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
int scanhash_timetravel10_4way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
volatile uint8_t *restart = &(work_restart[thr_id].restart);
int i;
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime )
{
const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP )
% TT10_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < TT10_FUNC_COUNT; i++ )
permutation[i] = i;
for ( i = 0; i < steps; i++ )
tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT );
s_ntime = timestamp;
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
timetravel10_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

View File

@@ -0,0 +1,78 @@
#include "timetravel10-gate.h"
void tt10_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel10_algo( algo_gate_t* gate )
{
#ifdef TIMETRAVEL10_4WAY
init_tt10_4way_ctx();
gate->scanhash = (void*)&scanhash_timetravel10_4way;
gate->hash = (void*)&timetravel10_4way_hash;
#else
init_tt10_ctx();
gate->scanhash = (void*)&scanhash_timetravel10;
gate->hash = (void*)&timetravel10_hash;
#endif
gate->set_target = (void*)&tt10_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};
inline void tt10_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt10_swap( pbegin, pend );
pbegin++;
}
}
void tt10_next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt10_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}

View File

@@ -0,0 +1,39 @@
#ifndef TIMETRAVEL10_GATE_H__
#define TIMETRAVEL10_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define TIMETRAVEL10_4WAY
#endif
// BitCore Genesis Timestamp
#define TT10_FUNC_BASE_TIMESTAMP 1492973331U
#define TT10_FUNC_COUNT 10
#define TT10_FUNC_COUNT_PERMUTATIONS 40320
void tt10_next_permutation( int *pbegin, int *pend );
bool register_timetravel10_algo( algo_gate_t* gate );
#if defined(TIMETRAVEL10_4WAY)
void timetravel10_4way_hash( void *state, const void *input );
int scanhash_timetravel10_4way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done );
void init_tt10_4way_ctx();
#endif
void timetravel10_hash( void *state, const void *input );
int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_tt10_ctx();
#endif

View File

@@ -1,11 +1,8 @@
#include "algo-gate-api.h"
#include "timetravel10-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "avxdefs.h"
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
@@ -22,68 +19,8 @@
#include "algo/groestl/aes_ni/hash-groestl.h"
#endif
// BitCore Genesis Timestamp
#define HASH_FUNC_BASE_TIMESTAMP 1492973331U
#define HASH_FUNC_COUNT 10
#define HASH_FUNC_COUNT_PERMUTATIONS 40320
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[HASH_FUNC_COUNT] = { 0 };
inline void tt10_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt10_swap( pbegin, pend );
pbegin++;
}
}
static void next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt10_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
typedef struct {
sph_blake512_context blake;
@@ -125,7 +62,7 @@ void init_tt10_ctx()
void timetravel10_hash(void *output, const void *input)
{
uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t hash[ 16 * TT10_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t *hashA, *hashB;
tt10_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64;
@@ -136,7 +73,7 @@ void timetravel10_hash(void *output, const void *input)
memcpy( &ctx, &tt10_ctx, sizeof(tt10_ctx) );
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
for ( i = 0; i < TT10_FUNC_COUNT; i++ )
{
if (i == 0)
{
@@ -302,7 +239,7 @@ void timetravel10_hash(void *output, const void *input)
}
}
memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32);
memcpy(output, &hash[16 * (TT10_FUNC_COUNT - 1)], 32);
}
int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
@@ -328,12 +265,12 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime )
{
const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
% HASH_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP )
% TT10_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < TT10_FUNC_COUNT; i++ )
permutation[i] = i;
for ( i = 0; i < steps; i++ )
next_permutation( permutation, permutation + HASH_FUNC_COUNT );
tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT );
s_ntime = timestamp;
// do midstate precalc for first function
@@ -398,6 +335,7 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
{
work_set_target_ratio( work, hash );
pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce;
return 1;
}
@@ -409,20 +347,3 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
void timetravel10_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel10_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_tt10_ctx();
gate->scanhash = (void*)&scanhash_timetravel10;
gate->hash = (void*)&timetravel10_hash;
gate->set_target = (void*)&timetravel10_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

View File

@@ -10,8 +10,14 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/echo/aes_ni/hash_api.h"
//hashState_echo tribus_4way_ctx __attribute__ ((aligned (64)));
static __thread jh512_4way_context ctx_mid;
/*
void init_tribus_4way_ctx()
{
init_echo( &tribus_4way_ctx, 512 );
}
*/
void tribus_hash_4way(void *state, const void *input)
{
uint64_t hash0[8] __attribute__ ((aligned (64)));

View File

@@ -1,22 +1,11 @@
#include "tribus-gate.h"
/*
bool tribus_thread_init()
{
sph_jh512_init( &tribus_ctx.jh );
sph_keccak512_init( &tribus_ctx.keccak );
#ifdef NO_AES_NI
sph_echo512_init( &tribus_ctx.echo );
#else
init_echo( &tribus_ctx.echo, 512 );
#endif
return true;
}
*/
bool register_tribus_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x1ffff;
#if defined (TRIBUS_4WAY)
// init_tribus_4way_ctx();
gate->scanhash = (void*)&scanhash_tribus_4way;
gate->hash = (void*)&tribus_hash_4way;
#else

View File

@@ -4,12 +4,14 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
#if defined(HASH_4WAY) && defined(__AES__)
#define TRIBUS_4WAY
#endif
#if defined(TRIBUS_4WAY)
//void init_tribus_4way_ctx();
void tribus_hash_4way( void *state, const void *input );
int scanhash_tribus_4way( int thr_id, struct work *work, uint32_t max_nonce,

252
algo/x11/x11-4way.c Normal file
View File

@@ -0,0 +1,252 @@
#include "cpuminer-config.h"
#include "x11-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} x11_4way_ctx_holder;
x11_4way_ctx_holder x11_4way_ctx;
void init_x11_4way_ctx()
{
blake512_4way_init( &x11_4way_ctx.blake );
bmw512_4way_init( &x11_4way_ctx.bmw );
init_groestl( &x11_4way_ctx.groestl, 64 );
skein512_4way_init( &x11_4way_ctx.skein );
jh512_4way_init( &x11_4way_ctx.jh );
keccak512_4way_init( &x11_4way_ctx.keccak );
init_luffa( &x11_4way_ctx.luffa, 512 );
cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_4way_ctx.shavite );
init_sd( &x11_4way_ctx.simd, 512 );
init_echo( &x11_4way_ctx.echo, 512 );
}
void x11_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x11_4way_ctx_holder ctx;
memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) );
// 1 Blake 4way
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x11_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

18
algo/x11/x11-gate.c Normal file
View File

@@ -0,0 +1,18 @@
#include "x11-gate.h"
bool register_x11_algo( algo_gate_t* gate )
{
#if defined (X11_4WAY)
init_x11_4way_ctx();
gate->scanhash = (void*)&scanhash_x11_4way;
gate->hash = (void*)&x11_4way_hash;
#else
init_x11_ctx();
gate->scanhash = (void*)&scanhash_x11;
gate->hash = (void*)&x11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

32
algo/x11/x11-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef X11_GATE_H__
#define X11_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X11_4WAY
#endif
bool register_x11_algo( algo_gate_t* gate );
#if defined(X11_4WAY)
void x11_4way_hash( void *state, const void *input );
int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11_4way_ctx();
#endif
void x11_hash( void *state, const void *input );
int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11_ctx();
#endif

View File

@@ -1,5 +1,5 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#include "x11-gate.h"
#include <string.h>
#include <stdint.h>
@@ -61,7 +61,7 @@ void init_x11_ctx()
#endif
}
static void x11_hash( void *state, const void *input )
void x11_hash( void *state, const void *input )
{
unsigned char hash[128] __attribute__ ((aligned (32)));
unsigned char hashbuf[128] __attribute__ ((aligned (16)));
@@ -179,6 +179,7 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
if ( fulltest( hash64, ptarget ) )
{
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
}
@@ -189,14 +190,3 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
bool register_x11_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_x11_ctx();
gate->scanhash = (void*)&scanhash_x11;
gate->hash = (void*)&x11_hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

340
algo/x11/x11evo-4way.c Normal file
View File

@@ -0,0 +1,340 @@
#include "cpuminer-config.h"
#include "x11evo-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <string.h>
#include <stdint.h>
#include <compat/portable_endian.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} x11evo_4way_ctx_holder;
static x11evo_4way_ctx_holder x11evo_4way_ctx __attribute__ ((aligned (64)));
void init_x11evo_4way_ctx()
{
blake512_4way_init( &x11evo_4way_ctx.blake );
bmw512_4way_init( &x11evo_4way_ctx.bmw );
init_groestl( &x11evo_4way_ctx.groestl, 64 );
skein512_4way_init( &x11evo_4way_ctx.skein );
jh512_4way_init( &x11evo_4way_ctx.jh );
keccak512_4way_init( &x11evo_4way_ctx.keccak );
init_luffa( &x11evo_4way_ctx.luffa, 512 );
cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11evo_4way_ctx.shavite );
init_sd( &x11evo_4way_ctx.simd, 512 );
init_echo( &x11evo_4way_ctx.echo, 512 );
}
static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
static __thread uint32_t s_ntime = UINT32_MAX;
void x11evo_4way_hash( void *state, const void *input )
{
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
x11evo_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &x11evo_4way_ctx, sizeof(x11evo_4way_ctx) );
if ( s_seq == -1 )
{
uint32_t *data = (uint32_t*) input;
const uint32_t ntime = data[17];
evo_twisted_code( ntime, hashOrder );
}
int i;
int len = strlen( hashOrder );
for ( i = 0; i < len; i++ )
{
char elem = hashOrder[i];
uint8_t idx;
if ( elem >= 'A' )
idx = elem - 'A' + 10;
else
idx = elem - '0';
// int size = 64;
switch ( idx )
{
case 0:
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 2:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 7:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*) hash1, 64 );
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*) hash2, 64 );
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*) hash3, 64 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 8:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 10:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
}
}
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
//static const uint32_t diff1targ = 0x0000ffff;
int scanhash_x11evo_4way( int thr_id, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
swab32_array( endiandata, pdata, 20 );
int ntime = endiandata[17];
if ( ntime != s_ntime || s_seq == -1 )
{
evo_twisted_code( ntime, hashOrder );
s_ntime = ntime;
}
uint32_t hmask = 0xFFFFFFFF;
if ( Htarg > 0 )
{
if ( Htarg <= 0xF )
hmask = 0xFFFFFFF0;
else if ( Htarg <= 0xFF )
hmask = 0xFFFFFF00;
else if ( Htarg <= 0xFFF )
hmask = 0xFFFF000;
else if ( Htarg <= 0xFFFF )
hmask = 0xFFFF000;
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x11evo_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & hmask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & hmask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & hmask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & hmask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

95
algo/x11/x11evo-gate.c Normal file
View File

@@ -0,0 +1,95 @@
#include "x11evo-gate.h"
int s_seq = -1;
static inline int getCurrentAlgoSeq( uint32_t current_time )
{
// change once per day
return (int) (current_time - X11EVO_INITIAL_DATE) / (60 * 60 * 24);
}
// swap_vars doesn't work here
void evo_swap( uint8_t *a, uint8_t *b )
{
uint8_t __tmp = *a;
*a = *b;
*b = __tmp;
}
void initPerm( uint8_t n[], uint8_t count )
{
int i;
for ( i = 0; i<count; i++ )
n[i] = i;
}
int nextPerm( uint8_t n[], uint32_t count )
{
uint32_t tail = 0, i = 0, j = 0;
if (unlikely( count <= 1 ))
return 0;
for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
tail = i;
if ( tail > 0 )
for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
evo_swap( &n[tail - 1], &n[j] );
for ( i = tail, j = count - 1; i<j; i++, j-- )
evo_swap( &n[i], &n[j] );
return ( tail != 0 );
}
void getAlgoString( char *str, uint32_t count )
{
uint8_t algoList[X11EVO_FUNC_COUNT];
char *sptr;
int j;
int k;
initPerm( algoList, X11EVO_FUNC_COUNT );
for ( k = 0; k < count; k++ )
nextPerm( algoList, X11EVO_FUNC_COUNT );
sptr = str;
for ( j = 0; j < X11EVO_FUNC_COUNT; j++ )
{
if ( algoList[j] >= 10 )
sprintf( sptr, "%c", 'A' + (algoList[j] - 10) );
else
sprintf( sptr, "%u", algoList[j] );
sptr++;
}
*sptr = 0;
//applog(LOG_DEBUG, "nextPerm %s", str);
}
void evo_twisted_code( uint32_t ntime, char *permstr )
{
int seq = getCurrentAlgoSeq( ntime );
if ( s_seq != seq )
{
getAlgoString( permstr, seq );
s_seq = seq;
}
}
bool register_x11evo_algo( algo_gate_t* gate )
{
#if defined (X11EVO_4WAY)
init_x11evo_4way_ctx();
gate->scanhash = (void*)&scanhash_x11evo_4way;
gate->hash = (void*)&x11evo_4way_hash;
#else
init_x11evo_ctx();
gate->scanhash = (void*)&scanhash_x11evo;
gate->hash = (void*)&x11evo_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};

39
algo/x11/x11evo-gate.h Normal file
View File

@@ -0,0 +1,39 @@
#ifndef X11EVO_GATE_H__
#define X11EVO_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X11EVO_4WAY
#endif
#define X11EVO_INITIAL_DATE 1462060800
#define X11EVO_FUNC_COUNT 11
extern int s_seq;
bool register_x11evo_algo( algo_gate_t* gate );
#if defined(X11EVO_4WAY)
void x11evo_4way_hash( void *state, const void *input );
int scanhash_x11evo_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11evo_4way_ctx();
#endif
void x11evo_hash( void *state, const void *input );
int scanhash_x11evo( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11evo_ctx();
void evo_twisted_code( uint32_t ntime, char *permstr );
#endif

View File

@@ -1,5 +1,5 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#include "x11evo-gate.h"
#include <string.h>
#include <stdint.h>
@@ -26,9 +26,6 @@
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#define INITIAL_DATE 1462060800
#define HASH_FUNC_COUNT 11
typedef struct {
#ifdef NO_AES_NI
sph_groestl512_context groestl;
@@ -70,94 +67,10 @@ void init_x11evo_ctx()
sph_shavite512_init( &x11evo_ctx.shavite );
}
/*
uint32_t getCurrentAlgoSeq(uint32_t current_time, uint32_t base_time)
{
return (current_time - base_time) / (60 * 60 * 24);
}
*/
static inline int getCurrentAlgoSeq( uint32_t current_time )
{
// change once per day
return (int) (current_time - INITIAL_DATE) / (60 * 60 * 24);
}
// swap_vars doesn't work here
void evo_swap( uint8_t *a, uint8_t *b )
{
uint8_t __tmp = *a;
*a = *b;
*b = __tmp;
}
void initPerm( uint8_t n[], uint8_t count )
{
int i;
for ( i = 0; i<count; i++ )
n[i] = i;
}
int nextPerm( uint8_t n[], uint32_t count )
{
uint32_t tail = 0, i = 0, j = 0;
if (unlikely( count <= 1 ))
return 0;
for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
tail = i;
if ( tail > 0 )
for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
evo_swap( &n[tail - 1], &n[j] );
for ( i = tail, j = count - 1; i<j; i++, j-- )
evo_swap( &n[i], &n[j] );
return ( tail != 0 );
}
void getAlgoString( char *str, uint32_t count )
{
uint8_t algoList[HASH_FUNC_COUNT];
char *sptr;
int j;
int k;
initPerm( algoList, HASH_FUNC_COUNT );
for ( k = 0; k < count; k++ )
nextPerm( algoList, HASH_FUNC_COUNT );
sptr = str;
for ( j = 0; j < HASH_FUNC_COUNT; j++ )
{
if ( algoList[j] >= 10 )
sprintf( sptr, "%c", 'A' + (algoList[j] - 10) );
else
sprintf( sptr, "%u", algoList[j] );
sptr++;
}
*sptr = 0;
//applog(LOG_DEBUG, "nextPerm %s", str);
}
static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
static __thread uint32_t s_ntime = UINT32_MAX;
static int s_seq = -1;
static void evo_twisted_code(uint32_t ntime, char *permstr)
{
int seq = getCurrentAlgoSeq(ntime);
if (s_seq != seq)
{
getAlgoString(permstr, seq);
s_seq = seq;
}
}
static inline void x11evo_hash( void *state, const void *input )
void x11evo_hash( void *state, const void *input )
{
uint32_t hash[16] __attribute__ ((aligned (64)));
x11evo_ctx_holder ctx __attribute__ ((aligned (64)));
@@ -242,10 +155,10 @@ static inline void x11evo_hash( void *state, const void *input )
memcpy( state, hash, 32 );
}
static const uint32_t diff1targ = 0x0000ffff;
//static const uint32_t diff1targ = 0x0000ffff;
int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
unsigned long *hashes_done )
uint64_t *hashes_done )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
@@ -274,19 +187,20 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
else if ( Htarg <= 0xFFF )
hmask = 0xFFFF000;
else if ( Htarg <= 0xFFFF )
hmask = 0xFFFF000;
hmask = 0xFFFF000;
}
do
{
pdata[19] = ++n;
be32enc( &endiandata[19], n );
x11evo_hash( hash64, &endiandata );
x11evo_hash( hash64, endiandata );
if ( ( hash64[7] & hmask ) == 0 )
{
if ( fulltest( hash64, ptarget ) )
{
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
}
@@ -296,13 +210,3 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
bool register_x11evo_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_x11evo;
gate->hash = (void*)&x11evo_hash;
init_x11evo_ctx();
return true;
};

259
algo/x11/x11gost-4way.c Normal file
View File

@@ -0,0 +1,259 @@
#include "cpuminer-config.h"
#include "x11gost-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/gost/sph_gost.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
sph_gost512_context gost;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} x11gost_4way_ctx_holder;
x11gost_4way_ctx_holder x11gost_4way_ctx;
void init_x11gost_4way_ctx()
{
blake512_4way_init( &x11gost_4way_ctx.blake );
bmw512_4way_init( &x11gost_4way_ctx.bmw );
init_groestl( &x11gost_4way_ctx.groestl, 64 );
skein512_4way_init( &x11gost_4way_ctx.skein );
jh512_4way_init( &x11gost_4way_ctx.jh );
keccak512_4way_init( &x11gost_4way_ctx.keccak );
sph_gost512_init( &x11gost_4way_ctx.gost );
init_luffa( &x11gost_4way_ctx.luffa, 512 );
cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11gost_4way_ctx.shavite );
init_sd( &x11gost_4way_ctx.simd, 512 );
init_echo( &x11gost_4way_ctx.echo, 512 );
}
void x11gost_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x11gost_4way_ctx_holder ctx;
memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
sph_gost512( &ctx.gost, hash2, 64 );
sph_gost512_close( &ctx.gost, hash2 );
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
sph_gost512( &ctx.gost, hash3, 64 );
sph_gost512_close( &ctx.gost, hash3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x11gost_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

18
algo/x11/x11gost-gate.c Normal file
View File

@@ -0,0 +1,18 @@
#include "x11gost-gate.h"
bool register_x11gost_algo( algo_gate_t* gate )
{
#if defined (X11GOST_4WAY)
init_x11gost_4way_ctx();
gate->scanhash = (void*)&scanhash_x11gost_4way;
gate->hash = (void*)&x11gost_4way_hash;
#else
init_x11gost_ctx();
gate->scanhash = (void*)&scanhash_x11gost;
gate->hash = (void*)&x11gost_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

32
algo/x11/x11gost-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef X11GOST_GATE_H__
#define X11GOST_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X11GOST_4WAY
#endif
bool register_x11gost_algo( algo_gate_t* gate );
#if defined(X11GOST_4WAY)
void x11gost_4way_hash( void *state, const void *input );
int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11gost_4way_ctx();
#endif
void x11gost_hash( void *state, const void *input );
int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11gost_ctx();
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x11gost-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -37,28 +37,28 @@ typedef struct {
hashState_echo echo;
hashState_groestl groestl;
#endif
} sib_ctx_holder;
} x11gost_ctx_holder;
sib_ctx_holder sib_ctx;
x11gost_ctx_holder x11gost_ctx;
void init_sib_ctx()
void init_x11gost_ctx()
{
sph_gost512_init(&sib_ctx.gost);
sph_shavite512_init(&sib_ctx.shavite);
init_luffa( &sib_ctx.luffa, 512 );
cubehashInit( &sib_ctx.cube, 512, 16, 32 );
init_sd( &sib_ctx.simd, 512 );
sph_gost512_init( &x11gost_ctx.gost );
sph_shavite512_init( &x11gost_ctx.shavite );
init_luffa( &x11gost_ctx.luffa, 512 );
cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
init_sd( &x11gost_ctx.simd, 512 );
#ifdef NO_AES_NI
sph_groestl512_init( &sib_ctx.groestl );
sph_echo512_init( &sib_ctx.echo );
sph_groestl512_init( &x11gost_ctx.groestl );
sph_echo512_init( &x11gost_ctx.echo );
#else
init_echo( &sib_ctx.echo, 512 );
init_groestl( &sib_ctx.groestl, 64 );
init_echo( &x11gost_ctx.echo, 512 );
init_groestl( &x11gost_ctx.groestl, 64 );
#endif
}
void sibhash(void *output, const void *input)
void x11gost_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (64)));
#define hashA hash
@@ -69,8 +69,8 @@ void sibhash(void *output, const void *input)
sph_u64 hashctA;
sph_u64 hashctB;
sib_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sib_ctx, sizeof(sib_ctx) );
x11gost_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &x11gost_ctx, sizeof(x11gost_ctx) );
DECL_BLK;
BLK_I;
@@ -135,8 +135,8 @@ void sibhash(void *output, const void *input)
memcpy(output, hashA, 32);
}
int scanhash_sib(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -156,11 +156,12 @@ int scanhash_sib(int thr_id, struct work *work,
do {
uint32_t hash[8];
be32enc(&endiandata[19], nonce);
sibhash(hash, endiandata);
x11gost_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio( work, hash );
return 1;
}
nonce++;
@@ -172,12 +173,3 @@ int scanhash_sib(int thr_id, struct work *work,
return 0;
}
bool register_sib_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_sib_ctx();
gate->scanhash = (void*)&scanhash_sib;
gate->hash = (void*)&sibhash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}

186
algo/x13/phi1612-4way.c Normal file
View File

@@ -0,0 +1,186 @@
#include "x13-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/gost/sph_gost.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
cubehashParam cube;
sph_fugue512_context fugue;
sph_gost512_context gost;
hashState_echo echo;
} phi1612_4way_ctx_holder;
phi1612_4way_ctx_holder phi1612_4way_ctx __attribute__ ((aligned (64)));
void init_phi1612_4way_ctx()
{
skein512_4way_init( &phi1612_4way_ctx.skein );
jh512_4way_init( &phi1612_4way_ctx.jh );
cubehashInit( &phi1612_4way_ctx.cube, 512, 16, 32 );
sph_fugue512_init( &phi1612_4way_ctx.fugue );
sph_gost512_init( &phi1612_4way_ctx.gost );
init_echo( &phi1612_4way_ctx.echo, 512 );
};
void phi1612_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
phi1612_4way_ctx_holder ctx;
memcpy( &ctx, &phi1612_4way_ctx, sizeof(phi1612_4way_ctx) );
// Skein parallel 4way
skein512_4way( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
// JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// Gost
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash2, 64 );
sph_gost512_close( &ctx.gost, hash2 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash3, 64 );
sph_gost512_close( &ctx.gost, hash3 );
// Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_phi1612_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t _ALIGN(64) endiandata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0cff;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
phi1612_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

18
algo/x13/phi1612-gate.c Normal file
View File

@@ -0,0 +1,18 @@
#include "phi1612-gate.h"
bool register_phi1612_algo( algo_gate_t* gate )
{
#if defined(PHI1612_4WAY)
init_phi1612_4way_ctx();
gate->scanhash = (void*)&scanhash_phi1612_4way;
gate->hash = (void*)&phi1612_4way_hash;
#else
init_phi1612_ctx();
gate->scanhash = (void*)&scanhash_phi1612;
gate->hash = (void*)&phi1612_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

Some files were not shown because too many files have changed in this diff Show More