mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.7.8
This commit is contained in:
29
Makefile.am
29
Makefile.am
@@ -79,7 +79,6 @@ cpuminer_SOURCES = \
|
||||
algo/heavy/sph_hefty1.c \
|
||||
algo/heavy/heavy.c \
|
||||
algo/heavy/bastion.c \
|
||||
algo/hmq1725.c \
|
||||
algo/hodl/aes.c \
|
||||
algo/hodl/hodl-gate.c \
|
||||
algo/hodl/hodl-wolf.c \
|
||||
@@ -110,7 +109,7 @@ cpuminer_SOURCES = \
|
||||
algo/lyra2/lyra2z330.c \
|
||||
algo/lyra2/lyra2h.c \
|
||||
algo/m7m.c \
|
||||
algo/neoscrypt.c \
|
||||
algo/neoscrypt/neoscrypt.c \
|
||||
algo/nist5/nist5-gate.c \
|
||||
algo/nist5/nist5-4way.c \
|
||||
algo/nist5/nist5.c \
|
||||
@@ -159,16 +158,36 @@ cpuminer_SOURCES = \
|
||||
algo/whirlpool/whirlpoolx.c \
|
||||
algo/x11/x11-gate.c \
|
||||
algo/x11/x11.c \
|
||||
algo/x11/x11evo.c \
|
||||
algo/x11/x11-4way.c \
|
||||
algo/x11/x11gost-gate.c \
|
||||
algo/x11/x11gost.c \
|
||||
algo/x11/x11gost-4way.c \
|
||||
algo/x11/c11-gate.c \
|
||||
algo/x11/c11.c \
|
||||
algo/x11/phi1612.c \
|
||||
algo/x11/c11-4way.c \
|
||||
algo/x11/x11evo.c \
|
||||
algo/x13/x13-gate.c \
|
||||
algo/x13/x13.c \
|
||||
algo/x13/x13-4way.c \
|
||||
algo/x13/x13sm3-gate.c \
|
||||
algo/x13/x13sm3.c \
|
||||
algo/x13/x13sm3-4way.c \
|
||||
algo/x13/phi1612-gate.c \
|
||||
algo/x13/phi1612.c \
|
||||
algo/x13/phi1612-4way.c \
|
||||
algo/x14/x14-gate.c \
|
||||
algo/x14/x14.c \
|
||||
algo/x14/x14-4way.c \
|
||||
algo/x15/x15-gate.c \
|
||||
algo/x15/x15.c \
|
||||
algo/x15/x15-4way.c \
|
||||
algo/x17/x17-gate.c \
|
||||
algo/x17/x17.c \
|
||||
algo/xevan.c \
|
||||
algo/x17/x17-4way.c \
|
||||
algo/x17/xevan-gate.c \
|
||||
algo/x17/xevan.c \
|
||||
algo/x17/xevan-4way.c \
|
||||
algo/x17/hmq1725.c \
|
||||
algo/yescrypt/yescrypt.c \
|
||||
algo/yescrypt/sha256_Y.c\
|
||||
algo/yescrypt/yescrypt-simd.c\
|
||||
|
@@ -96,13 +96,16 @@ algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
|
||||
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
|
||||
performance.
|
||||
|
||||
ARM CPUs are not supported.
|
||||
|
||||
2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
|
||||
Centos are known to work and have all dependencies in their repositories.
|
||||
Others may work but may require more effort.
|
||||
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
|
||||
|
||||
3. Stratum pool, cpuminer-opt only supports stratum minning. Some algos
|
||||
may work wallet mining but there are no guarantees.
|
||||
MacOS, OSx is not supported.
|
||||
|
||||
3. Stratum pool. Some algos may work wallet mining using getwork.
|
||||
|
||||
Errata
|
||||
------
|
||||
|
20
README.txt
20
README.txt
@@ -17,17 +17,21 @@ supported by cpuminer-opt due to an incompatible implementation of SSE2 on
|
||||
these CPUs. Some algos may crash the miner with an invalid instruction.
|
||||
Users are recommended to use an unoptimized miner such as cpuminer-multi.
|
||||
|
||||
Exe name Compile opts Arch name
|
||||
Exe name Compile flags Arch name
|
||||
|
||||
cpuminer-sse2.exe -march=core2 Core2
|
||||
cpuminer-sse42.exe -march=corei7 Nehalem
|
||||
cpuminer-aes-sse42.exe -maes -msse4.2" Westmere
|
||||
cpuminer-aes-avx.exe -march=corei7-avx" Sandybridge, Ivybridge
|
||||
cpuminer-aes-avx2.exe "-march=core-avx2" Haswell, Broadwell, Skylake, Kabylake
|
||||
cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY"
|
||||
cpuminer-sse2.exe "-march=core2" Core2
|
||||
cpuminer-sse42.exe "-march=corei7" Nehalem
|
||||
cpuminer-aes-sse42.exe "-maes -msse4.2" Westmere
|
||||
cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
|
||||
cpuminer-avx2.exe "-march=core-avx2" Haswell...
|
||||
cpuminer-avx-sha "-march=corei7-avx -msha" Ryzen...
|
||||
cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY" same as avx2
|
||||
cpuminer-4way-sha.exe "-march=core-avx2 -msha -DFOUR_WAY" same as avx2-sha
|
||||
|
||||
4way requires a CPU with AES and AVX2. It is still under development and
|
||||
only a few algos are supported. See change log in RELEASE_NOTES in source
|
||||
package for supported algos.
|
||||
|
||||
There is no binary support available for SHA on AMD Ryzen CPUs.
|
||||
Ryzen CPus perform better with AVX than AVX2 therefore an avx-sha build
|
||||
is provided. Four way still uses AVX2.
|
||||
|
||||
|
@@ -27,8 +27,9 @@ Compile Instructions
|
||||
|
||||
Requirements:
|
||||
|
||||
Intel Core2 or newer, or AMD Steamroller or newer CPU.
|
||||
64 bit Linux or Windows operating system.
|
||||
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
|
||||
supported.
|
||||
64 bit Linux or Windows operating system. Apple is not supported.
|
||||
|
||||
Building on linux prerequisites:
|
||||
|
||||
@@ -164,6 +165,10 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.7.8
|
||||
|
||||
Partial 4way optimization for most X algos including c11, xevan, phi, hsr
|
||||
|
||||
v3.7.7
|
||||
|
||||
Fixed regression caused by 64 CPU support.
|
||||
@@ -182,7 +187,7 @@ New algo keccakc for Creative coin with 4way optimizations
|
||||
Rewrote some AVX/AVX2 code for more consistent implementation and some
|
||||
optimizing.
|
||||
|
||||
Enhanced capabilities check to support 4way, mor eprecise reporting of
|
||||
Enhanced capabilities check to support 4way, more precise reporting of
|
||||
features (not all algos use SSE2), and better error messages when using
|
||||
an incompatible pre-built version (Windows users).
|
||||
|
||||
|
@@ -211,7 +211,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
|
||||
case ALGO_X11: register_x11_algo ( gate ); break;
|
||||
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
|
||||
case ALGO_X11GOST: register_sib_algo ( gate ); break;
|
||||
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
|
||||
case ALGO_X13: register_x13_algo ( gate ); break;
|
||||
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
|
||||
case ALGO_X14: register_x14_algo ( gate ); break;
|
||||
|
@@ -849,9 +849,9 @@ blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
|
||||
{
|
||||
int i;
|
||||
for ( i = 0; i < 8; i++ )
|
||||
sc->H[i] = _mm_set_epi32( iv[i], iv[i], iv[i], iv[i] );
|
||||
sc->H[i] = _mm_set1_epi32( iv[i] );
|
||||
for ( i = 0; i < 4; i++ )
|
||||
sc->S[i] = _mm_set_epi32( salt[i], salt[i], salt[i], salt[i] );
|
||||
sc->S[i] = _mm_set1_epi32( salt[i] );
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
@@ -941,10 +941,9 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
// memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
|
||||
if (out_size_w32 == 8)
|
||||
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
|
||||
_mm_set_epi32( 0x010000000, 0x01000000,
|
||||
0x010000000, 0x01000000 ) );
|
||||
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
|
||||
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
|
||||
_mm_set1_epi32( 0x010000000 ) );
|
||||
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
|
||||
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
|
||||
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
|
||||
}
|
||||
else
|
||||
@@ -955,10 +954,9 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
sc->T1 = SPH_C32(0xFFFFFFFF);
|
||||
memset_zero_128( u.buf, 56>>2 );
|
||||
if (out_size_w32 == 8)
|
||||
u.buf[52>>2] = _mm_set_epi32( 0x010000000, 0x01000000,
|
||||
0x010000000, 0x01000000 );
|
||||
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
|
||||
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
|
||||
u.buf[52>>2] = _mm_set1_epi32( 0x010000000 );
|
||||
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
|
||||
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
|
||||
blake32_4way( sc, u.buf, 64 );
|
||||
}
|
||||
out = (__m128i*)dst;
|
||||
|
@@ -4,7 +4,7 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
|
||||
#if defined(HASH_4WAY) && defined(__AES__)
|
||||
#define NIST5_4WAY
|
||||
#endif
|
||||
|
||||
|
@@ -36,15 +36,15 @@ void sha256t_hash(void* output, const void* input, uint32_t len)
|
||||
memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );
|
||||
|
||||
SHA256_Update( &ctx_sha256, input + midlen, tail );
|
||||
SHA256_Final( hashA, &ctx_sha256 );
|
||||
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
|
||||
|
||||
memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
|
||||
SHA256_Update( &ctx_sha256, hashA, 32 );
|
||||
SHA256_Final( hashA, &ctx_sha256 );
|
||||
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
|
||||
|
||||
memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
|
||||
SHA256_Update( &ctx_sha256, hashA, 32 );
|
||||
SHA256_Final( hashA, &ctx_sha256 );
|
||||
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
|
||||
#else
|
||||
sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );
|
||||
|
@@ -267,9 +267,6 @@ c512(sph_shavite_big_context *sc, const void *msg)
|
||||
|
||||
#else
|
||||
|
||||
/*
|
||||
* This function assumes that "msg" is aligned for 32-bit access.
|
||||
*/
|
||||
static void
|
||||
c512( sph_shavite_big_context *sc, const void *msg )
|
||||
{
|
||||
@@ -379,36 +376,36 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
|
||||
// round 2, 6, 10
|
||||
|
||||
k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
|
||||
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
|
||||
x = _mm_xor_si128( p3, k00 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
|
||||
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
|
||||
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
|
||||
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
p2 = _mm_xor_si128( p2, x );
|
||||
k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
|
||||
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
|
||||
x = _mm_xor_si128( p1, k10 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
|
||||
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
|
||||
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
||||
k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
|
||||
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p0 = _mm_xor_si128( p0, x );
|
||||
@@ -461,36 +458,36 @@ c512( sph_shavite_big_context *sc, const void *msg )
|
||||
|
||||
// round 4, 8, 12
|
||||
|
||||
k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
|
||||
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( p1, k00 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
|
||||
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k01 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
|
||||
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k02 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
|
||||
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k03 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
p0 = _mm_xor_si128( p0, x );
|
||||
k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
|
||||
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( p3, k10 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
|
||||
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k11 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
|
||||
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k12 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
|
||||
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
|
||||
|
||||
x = _mm_xor_si128( x, k13 );
|
||||
x = _mm_aesenc_si128( x, mm_zero );
|
||||
|
@@ -6,12 +6,11 @@ int64_t skein_get_max64() { return 0x7ffffLL; }
|
||||
|
||||
bool register_skein_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX_OPT| AVX2_OPT | SHA_OPT;
|
||||
gate->optimizations = FOUR_WAY_OPT | SHA_OPT;
|
||||
#if defined (SKEIN_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_skein_4way;
|
||||
gate->hash = (void*)&skeinhash_4way;
|
||||
#else
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_skein;
|
||||
gate->hash = (void*)&skeinhash;
|
||||
#endif
|
||||
|
@@ -10,8 +10,14 @@
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
//hashState_echo tribus_4way_ctx __attribute__ ((aligned (64)));
|
||||
static __thread jh512_4way_context ctx_mid;
|
||||
|
||||
/*
|
||||
void init_tribus_4way_ctx()
|
||||
{
|
||||
init_echo( &tribus_4way_ctx, 512 );
|
||||
}
|
||||
*/
|
||||
void tribus_hash_4way(void *state, const void *input)
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
|
@@ -1,22 +1,11 @@
|
||||
#include "tribus-gate.h"
|
||||
/*
|
||||
bool tribus_thread_init()
|
||||
{
|
||||
sph_jh512_init( &tribus_ctx.jh );
|
||||
sph_keccak512_init( &tribus_ctx.keccak );
|
||||
#ifdef NO_AES_NI
|
||||
sph_echo512_init( &tribus_ctx.echo );
|
||||
#else
|
||||
init_echo( &tribus_ctx.echo, 512 );
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
||||
bool register_tribus_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0x1ffff;
|
||||
#if defined (TRIBUS_4WAY)
|
||||
// init_tribus_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_tribus_4way;
|
||||
gate->hash = (void*)&tribus_hash_4way;
|
||||
#else
|
||||
|
@@ -4,12 +4,14 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
|
||||
#if defined(HASH_4WAY) && defined(__AES__)
|
||||
#define TRIBUS_4WAY
|
||||
#endif
|
||||
|
||||
#if defined(TRIBUS_4WAY)
|
||||
|
||||
//void init_tribus_4way_ctx();
|
||||
|
||||
void tribus_hash_4way( void *state, const void *input );
|
||||
|
||||
int scanhash_tribus_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
|
@@ -4,6 +4,7 @@ bool register_whirlpool_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (WHIRLPOOL_4WAY)
|
||||
four_way_not_tested();
|
||||
gate->optimizations = FOUR_WAY_OPT;
|
||||
gate->scanhash = (void*)&scanhash_whirlpool_4way;
|
||||
gate->hash = (void*)&whirlpool_hash_4way;
|
||||
#else
|
||||
|
@@ -4,9 +4,11 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
/*
|
||||
#if defined(FOUR_WAY) && defined(__AVX2__)
|
||||
#define WHIRLPOOL_4WAY
|
||||
#endif
|
||||
*/
|
||||
|
||||
#if defined (WHIRLPOOL_4WAY)
|
||||
|
||||
|
@@ -3345,8 +3345,10 @@ do { \
|
||||
#define READ_STATE MUL8(READ_STATE_W)
|
||||
#define ROUND0 MUL8(ROUND0_W)
|
||||
#define UPDATE_STATE MUL8(UPDATE_STATE_W)
|
||||
#define BYTE(x, n) \
|
||||
_mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
|
||||
//#define BYTE(x, n) \
|
||||
// _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
|
||||
#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF)
|
||||
|
||||
|
||||
// A very complex, but structured, expression with a mix of scalar
|
||||
// and vector operations to retrieve specific 64 bit constants from
|
||||
@@ -3357,23 +3359,51 @@ do { \
|
||||
// Extract 64 bit vector elements from "in" representing offsets. Unmask the
|
||||
// low byte of each and scale for use as vector indexes.
|
||||
// Pack the data in a vector and return it.
|
||||
|
||||
/*
|
||||
#define t_row( inv, row ) \
|
||||
_mm256_and_si256( \
|
||||
_mm256_srli_epi64( inv, row << 3 ), _mm256_set1_epi64x( 0xFF ) )
|
||||
|
||||
// Extract vector element from "lane" of vector "in[row]" and use it to index
|
||||
// scalar array of constants "table" and return referenced 64 bit entry.
|
||||
#define t_lane( table, inv, row, lane ) \
|
||||
table[ _mm256_extract_epi64( t_row( inv, row ), lane ) ]
|
||||
// table[ t_rwo( inv, row )[ lane ] ];
|
||||
|
||||
*/
|
||||
|
||||
// Build a vector from elements of non-contiguous 64 bit data extracted from
|
||||
// scalar "table".
|
||||
// reference scalar version 1480 kH/s
|
||||
/*
|
||||
// version 1, extract with gather
|
||||
// 955 kH/s
|
||||
#define t_lane( inv, row, lane ) \
|
||||
BYTE( _mm256_extract_epi64( inv, lane ), row ) \
|
||||
|
||||
|
||||
#define t_vec( table, inv, row ) \
|
||||
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
|
||||
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
|
||||
t_lane( table, inv, row, 0 ) )
|
||||
_mm256_i32gather_epi64( table, _mm_set_epi32( t_lane( inv, row, 3 ), \
|
||||
t_lane( inv, row, 2 ), t_lane( inv, row, 1 ), \
|
||||
t_lane( inv, row, 0) ), 1 )
|
||||
*/
|
||||
/*
|
||||
// version 2, extract with set
|
||||
// 1100 kH/s
|
||||
#define t_lane( table, inv, row, lane ) \
|
||||
table[ BYTE( _mm256_extract_epi64( inv, lane ), row ) ] \
|
||||
|
||||
#define t_vec( table, inv, row ) \
|
||||
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
|
||||
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
|
||||
t_lane( table, inv, row, 0 ) )
|
||||
*/
|
||||
|
||||
// version 3, vector indexing with set
|
||||
// 1105 kH/s
|
||||
#define t_lane( table, inv, row, lane ) \
|
||||
table[ BYTE( inv[ lane ], row ) ] \
|
||||
|
||||
#define t_vec( table, inv, row ) \
|
||||
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
|
||||
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
|
||||
t_lane( table, inv, row, 0 ) )
|
||||
|
||||
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT_WHIRLPOOL
|
||||
|
||||
|
261
algo/x11/c11-4way.c
Normal file
261
algo/x11/c11-4way.c
Normal file
@@ -0,0 +1,261 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "c11-gate.h"
|
||||
|
||||
#if defined (__AVX2__) && defined (__AES__)
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
hashState_echo echo;
|
||||
} c11_4way_ctx_holder;
|
||||
|
||||
c11_4way_ctx_holder c11_4way_ctx;
|
||||
|
||||
void init_c11_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &c11_4way_ctx.blake );
|
||||
sph_bmw512_init( &c11_4way_ctx.bmw );
|
||||
init_groestl( &c11_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &c11_4way_ctx.skein );
|
||||
jh512_4way_init( &c11_4way_ctx.jh );
|
||||
keccak512_4way_init( &c11_4way_ctx.keccak );
|
||||
init_luffa( &c11_4way_ctx.luffa, 512 );
|
||||
cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &c11_4way_ctx.shavite );
|
||||
init_sd( &c11_4way_ctx.simd, 512 );
|
||||
init_echo( &c11_4way_ctx.echo, 512 );
|
||||
}
|
||||
|
||||
void c11_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
c11_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
|
||||
|
||||
// 1 Blake 4way
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
// 4way
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
// 4 JH
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
// 5 Keccak
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// 6 Skein
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 7 Luffa
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence*)hash0, 64 );
|
||||
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, 64 );
|
||||
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, 64 );
|
||||
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, 64 );
|
||||
|
||||
// 8 Cubehash
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
// 9 Shavite
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
// 10 Simd
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, 512 );
|
||||
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash1,
|
||||
(const BitSequence *)hash1, 512 );
|
||||
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash2,
|
||||
(const BitSequence *)hash2, 512 );
|
||||
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash3,
|
||||
(const BitSequence *)hash3, 512 );
|
||||
|
||||
// 11 Echo
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
0xFFF, 0xFFFF, 0x10000000 };
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// big endian encode 0..18 uint32_t, 64 bits at a time
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
for (int m=0; m < 6; m++)
|
||||
if (Htarg <= htmax[m])
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
c11_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce )
|
||||
&& !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
|
18
algo/x11/c11-gate.c
Normal file
18
algo/x11/c11-gate.c
Normal file
@@ -0,0 +1,18 @@
|
||||
#include "c11-gate.h"
|
||||
|
||||
bool register_c11_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (C11_4WAY)
|
||||
init_c11_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_c11_4way;
|
||||
gate->hash = (void*)&c11_4way_hash;
|
||||
#else
|
||||
init_c11_ctx();
|
||||
gate->scanhash = (void*)&scanhash_c11;
|
||||
gate->hash = (void*)&c11_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
32
algo/x11/c11-gate.h
Normal file
32
algo/x11/c11-gate.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef C11_GATE_H__
|
||||
#define C11_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(HASH_4WAY) && defined(__AES__)
|
||||
#define C11_4WAY
|
||||
#endif
|
||||
|
||||
bool register_c11_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(C11_4WAY)
|
||||
|
||||
void c11_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_c11_4way_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
void c11_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_c11_ctx();
|
||||
|
||||
#endif
|
||||
|
@@ -1,4 +1,4 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "c11-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
@@ -64,7 +64,7 @@ void init_c11_ctx()
|
||||
#endif
|
||||
}
|
||||
|
||||
void c11hash( void *output, const void *input )
|
||||
void c11_hash( void *output, const void *input )
|
||||
{
|
||||
unsigned char hash[128] _ALIGN(64); // uint32_t hashA[16], hashB[16];
|
||||
// uint32_t _ALIGN(64) hash[16];
|
||||
@@ -157,7 +157,7 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
do
|
||||
{
|
||||
be32enc( &endiandata[19], nonce );
|
||||
c11hash( hash, endiandata );
|
||||
c11_hash( hash, endiandata );
|
||||
if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
@@ -171,13 +171,3 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_c11_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
init_c11_ctx();
|
||||
gate->scanhash = (void*)&scanhash_c11;
|
||||
gate->hash = (void*)&c11hash;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
261
algo/x11/x11-4way.c
Normal file
261
algo/x11/x11-4way.c
Normal file
@@ -0,0 +1,261 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "x11-gate.h"
|
||||
|
||||
#if defined (__AVX2__) && defined (__AES__)
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
hashState_echo echo;
|
||||
} x11_4way_ctx_holder;
|
||||
|
||||
x11_4way_ctx_holder x11_4way_ctx;
|
||||
|
||||
void init_x11_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x11_4way_ctx.blake );
|
||||
sph_bmw512_init( &x11_4way_ctx.bmw );
|
||||
init_groestl( &x11_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x11_4way_ctx.skein );
|
||||
jh512_4way_init( &x11_4way_ctx.jh );
|
||||
keccak512_4way_init( &x11_4way_ctx.keccak );
|
||||
init_luffa( &x11_4way_ctx.luffa, 512 );
|
||||
cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &x11_4way_ctx.shavite );
|
||||
init_sd( &x11_4way_ctx.simd, 512 );
|
||||
init_echo( &x11_4way_ctx.echo, 512 );
|
||||
}
|
||||
|
||||
void x11_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
x11_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) );
|
||||
|
||||
// 1 Blake 4way
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
// 4way
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
// 4 Skein
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
// 5 JH
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
// 6 Keccak
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 7 Luffa
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence*)hash0, 64 );
|
||||
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, 64 );
|
||||
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, 64 );
|
||||
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, 64 );
|
||||
|
||||
// 8 Cubehash
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
// 9 Shavite
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
// 10 Simd
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, 512 );
|
||||
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash1,
|
||||
(const BitSequence *)hash1, 512 );
|
||||
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash2,
|
||||
(const BitSequence *)hash2, 512 );
|
||||
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash3,
|
||||
(const BitSequence *)hash3, 512 );
|
||||
|
||||
// 11 Echo
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
0xFFF, 0xFFFF, 0x10000000 };
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// big endian encode 0..18 uint32_t, 64 bits at a time
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
for (int m=0; m < 6; m++)
|
||||
if (Htarg <= htmax[m])
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
x11_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce )
|
||||
&& !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
|
@@ -5,13 +5,13 @@ bool register_x11_algo( algo_gate_t* gate )
|
||||
#if defined (X11_4WAY)
|
||||
init_x11_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x11_4way;
|
||||
gate->hash = (void*)&x11_hash_4way;
|
||||
gate->hash = (void*)&x11_4way_hash;
|
||||
#else
|
||||
init_x11_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x11;
|
||||
gate->hash = (void*)&x11_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
@@ -4,19 +4,21 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
//#if defined(HASH_4WAY) && !defined(NO_AES_NI)
|
||||
// #define X11_4WAY
|
||||
//#endif
|
||||
#if defined(HASH_4WAY) && defined(__AES__)
|
||||
#define X11_4WAY
|
||||
#endif
|
||||
|
||||
bool register_x11_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(X11_4WAY)
|
||||
|
||||
void x11_hash_4way( void *state, const void *input );
|
||||
void x11_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_x11_4way_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
void x11_hash( void *state, const void *input );
|
||||
|
268
algo/x11/x11gost-4way.c
Normal file
268
algo/x11/x11gost-4way.c
Normal file
@@ -0,0 +1,268 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "x11gost-gate.h"
|
||||
|
||||
#if defined (__AVX2__) && defined (__AES__)
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
sph_gost512_context gost;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
hashState_echo echo;
|
||||
} x11gost_4way_ctx_holder;
|
||||
|
||||
x11gost_4way_ctx_holder x11gost_4way_ctx;
|
||||
|
||||
void init_x11gost_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x11gost_4way_ctx.blake );
|
||||
sph_bmw512_init( &x11gost_4way_ctx.bmw );
|
||||
init_groestl( &x11gost_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x11gost_4way_ctx.skein );
|
||||
jh512_4way_init( &x11gost_4way_ctx.jh );
|
||||
keccak512_4way_init( &x11gost_4way_ctx.keccak );
|
||||
sph_gost512_init( &x11gost_4way_ctx.gost );
|
||||
init_luffa( &x11gost_4way_ctx.luffa, 512 );
|
||||
cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &x11gost_4way_ctx.shavite );
|
||||
init_sd( &x11gost_4way_ctx.simd, 512 );
|
||||
init_echo( &x11gost_4way_ctx.echo, 512 );
|
||||
}
|
||||
|
||||
void x11gost_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
x11gost_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );
|
||||
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
// 4way
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_gost512( &ctx.gost, hash0, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash0 );
|
||||
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash1, 64 );
|
||||
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512_close( &ctx.gost, hash1 );
|
||||
sph_gost512( &ctx.gost, hash2, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash2 );
|
||||
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash3, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash3 );
|
||||
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence*)hash0, 64 );
|
||||
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, 64 );
|
||||
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, 64 );
|
||||
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, 64 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, 512 );
|
||||
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash1,
|
||||
(const BitSequence *)hash1, 512 );
|
||||
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash2,
|
||||
(const BitSequence *)hash2, 512 );
|
||||
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash3,
|
||||
(const BitSequence *)hash3, 512 );
|
||||
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
0xFFF, 0xFFFF, 0x10000000 };
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// big endian encode 0..18 uint32_t, 64 bits at a time
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
for (int m=0; m < 6; m++)
|
||||
if (Htarg <= htmax[m])
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
x11gost_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce )
|
||||
&& !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
|
18
algo/x11/x11gost-gate.c
Normal file
18
algo/x11/x11gost-gate.c
Normal file
@@ -0,0 +1,18 @@
|
||||
#include "x11gost-gate.h"
|
||||
|
||||
bool register_x11gost_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X11GOST_4WAY)
|
||||
init_x11gost_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x11gost_4way;
|
||||
gate->hash = (void*)&x11gost_4way_hash;
|
||||
#else
|
||||
init_x11gost_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x11gost;
|
||||
gate->hash = (void*)&x11gost_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
32
algo/x11/x11gost-gate.h
Normal file
32
algo/x11/x11gost-gate.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef X11GOST_GATE_H__
|
||||
#define X11GOST_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(HASH_4WAY) && defined(__AES__)
|
||||
#define X11GOST_4WAY
|
||||
#endif
|
||||
|
||||
bool register_x11gost_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(X11GOST_4WAY)
|
||||
|
||||
void x11gost_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_x11gost_4way_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
void x11gost_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_x11gost_ctx();
|
||||
|
||||
#endif
|
||||
|
@@ -1,4 +1,4 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "x11gost-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
@@ -37,28 +37,28 @@ typedef struct {
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
#endif
|
||||
} sib_ctx_holder;
|
||||
} x11gost_ctx_holder;
|
||||
|
||||
sib_ctx_holder sib_ctx;
|
||||
x11gost_ctx_holder x11gost_ctx;
|
||||
|
||||
void init_sib_ctx()
|
||||
void init_x11gost_ctx()
|
||||
{
|
||||
sph_gost512_init(&sib_ctx.gost);
|
||||
sph_shavite512_init(&sib_ctx.shavite);
|
||||
init_luffa( &sib_ctx.luffa, 512 );
|
||||
cubehashInit( &sib_ctx.cube, 512, 16, 32 );
|
||||
init_sd( &sib_ctx.simd, 512 );
|
||||
sph_gost512_init( &x11gost_ctx.gost );
|
||||
sph_shavite512_init( &x11gost_ctx.shavite );
|
||||
init_luffa( &x11gost_ctx.luffa, 512 );
|
||||
cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
|
||||
init_sd( &x11gost_ctx.simd, 512 );
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init( &sib_ctx.groestl );
|
||||
sph_echo512_init( &sib_ctx.echo );
|
||||
sph_groestl512_init( &x11gost_ctx.groestl );
|
||||
sph_echo512_init( &x11gost_ctx.echo );
|
||||
#else
|
||||
init_echo( &sib_ctx.echo, 512 );
|
||||
init_groestl( &sib_ctx.groestl, 64 );
|
||||
init_echo( &x11gost_ctx.echo, 512 );
|
||||
init_groestl( &x11gost_ctx.groestl, 64 );
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void sibhash(void *output, const void *input)
|
||||
void x11gost_hash(void *output, const void *input)
|
||||
{
|
||||
unsigned char hash[128] __attribute__ ((aligned (64)));
|
||||
#define hashA hash
|
||||
@@ -69,8 +69,8 @@ void sibhash(void *output, const void *input)
|
||||
sph_u64 hashctA;
|
||||
sph_u64 hashctB;
|
||||
|
||||
sib_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &sib_ctx, sizeof(sib_ctx) );
|
||||
x11gost_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &x11gost_ctx, sizeof(x11gost_ctx) );
|
||||
|
||||
DECL_BLK;
|
||||
BLK_I;
|
||||
@@ -135,8 +135,8 @@ void sibhash(void *output, const void *input)
|
||||
memcpy(output, hashA, 32);
|
||||
}
|
||||
|
||||
int scanhash_sib(int thr_id, struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done)
|
||||
int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done)
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
@@ -156,7 +156,7 @@ int scanhash_sib(int thr_id, struct work *work,
|
||||
do {
|
||||
uint32_t hash[8];
|
||||
be32enc(&endiandata[19], nonce);
|
||||
sibhash(hash, endiandata);
|
||||
x11gost_hash(hash, endiandata);
|
||||
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
pdata[19] = nonce;
|
||||
@@ -172,12 +172,3 @@ int scanhash_sib(int thr_id, struct work *work,
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_sib_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
init_sib_ctx();
|
||||
gate->scanhash = (void*)&scanhash_sib;
|
||||
gate->hash = (void*)&sibhash;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
}
|
||||
|
186
algo/x13/phi1612-4way.c
Normal file
186
algo/x13/phi1612-4way.c
Normal file
@@ -0,0 +1,186 @@
|
||||
#include "x13-gate.h"
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
typedef struct {
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
cubehashParam cube;
|
||||
sph_fugue512_context fugue;
|
||||
sph_gost512_context gost;
|
||||
hashState_echo echo;
|
||||
} phi1612_4way_ctx_holder;
|
||||
|
||||
phi1612_4way_ctx_holder phi1612_4way_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void init_phi1612_4way_ctx()
|
||||
{
|
||||
skein512_4way_init( &phi1612_4way_ctx.skein );
|
||||
jh512_4way_init( &phi1612_4way_ctx.jh );
|
||||
cubehashInit( &phi1612_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_fugue512_init( &phi1612_4way_ctx.fugue );
|
||||
sph_gost512_init( &phi1612_4way_ctx.gost );
|
||||
init_echo( &phi1612_4way_ctx.echo, 512 );
|
||||
};
|
||||
|
||||
void phi1612_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
phi1612_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &phi1612_4way_ctx, sizeof(phi1612_4way_ctx) );
|
||||
|
||||
// Skein parallel 4way
|
||||
skein512_4way( &ctx.skein, input, 80 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
// JH
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
// Serial to the end
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// Cubehash
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
// Fugue
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
// Gost
|
||||
sph_gost512( &ctx.gost, hash0, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash0 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash1, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash1 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash2, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash2 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash3, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash3 );
|
||||
|
||||
// Echo
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_phi1612_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
|
||||
if ( opt_benchmark )
|
||||
( (uint32_t*)ptarget )[7] = 0x0cff;
|
||||
|
||||
for ( int k = 0; k < 19; k++ )
|
||||
be32enc( &endiandata[k], pdata[k] );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
do {
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
phi1612_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce )
|
||||
&& !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
|
18
algo/x13/phi1612-gate.c
Normal file
18
algo/x13/phi1612-gate.c
Normal file
@@ -0,0 +1,18 @@
|
||||
#include "phi1612-gate.h"
|
||||
|
||||
// Register phi1612 with the algo gate, selecting the 4-way AVX2/AES
// implementation when compiled in, otherwise the scalar path.
// Always returns true.
bool register_phi1612_algo( algo_gate_t* gate )
{
#if defined(PHI1612_4WAY)
  init_phi1612_4way_ctx();
  gate->scanhash = (void*)&scanhash_phi1612_4way;
  gate->hash     = (void*)&phi1612_4way_hash;
#else
  init_phi1612_ctx();
  gate->scanhash = (void*)&scanhash_phi1612;
  gate->hash     = (void*)&phi1612_hash;
#endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
}
|
||||
|
32
algo/x13/phi1612-gate.h
Normal file
32
algo/x13/phi1612-gate.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef PHI1612_GATE_H__
#define PHI1612_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

// The 4-way implementation requires both the 4-way hash primitives
// and AES-NI support.
#if defined(HASH_4WAY) && defined(__AES__)
  #define PHI1612_4WAY
#endif

// Wire the phi1612 entry points into an algo gate.
bool register_phi1612_algo( algo_gate_t* gate );

#if defined(PHI1612_4WAY)

// 4-lane AVX2/AES implementation.
void phi1612_4way_hash( void *state, const void *input );

int scanhash_phi1612_4way( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done );

void init_phi1612_4way_ctx();

#endif

// Scalar reference implementation, always available.
void phi1612_hash( void *state, const void *input );

int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done );

void init_phi1612_ctx();

#endif
|
||||
|
@@ -1,4 +1,4 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "phi1612-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
@@ -33,7 +33,7 @@ phi_ctx_holder phi_ctx;
|
||||
static __thread sph_skein512_context phi_skein_mid
|
||||
__attribute__ ((aligned (64)));
|
||||
|
||||
void init_phi_ctx()
|
||||
void init_phi1612_ctx()
|
||||
{
|
||||
sph_skein512_init( &phi_ctx.skein );
|
||||
sph_jh512_init( &phi_ctx.jh );
|
||||
@@ -53,7 +53,7 @@ void phi_skein_midstate( const void* input )
|
||||
sph_skein512( &phi_skein_mid, input, 64 );
|
||||
}
|
||||
|
||||
void phi1612hash(void *output, const void *input)
|
||||
void phi1612_hash(void *output, const void *input)
|
||||
{
|
||||
unsigned char hash[128] __attribute__ ((aligned (64)));
|
||||
phi_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
@@ -112,7 +112,7 @@ int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
do {
|
||||
uint32_t hash[8];
|
||||
be32enc(&endiandata[19], nonce);
|
||||
phi1612hash(hash, endiandata);
|
||||
phi1612_hash(hash, endiandata);
|
||||
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
pdata[19] = nonce;
|
||||
@@ -128,12 +128,3 @@ int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_phi1612_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
init_phi_ctx();
|
||||
gate->scanhash = (void*)&scanhash_phi1612;
|
||||
gate->hash = (void*)&phi1612hash;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
}
|
293
algo/x13/x13-4way.c
Normal file
293
algo/x13/x13-4way.c
Normal file
@@ -0,0 +1,293 @@
|
||||
#include "x13-gate.h"
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
hashState_echo echo;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
} x13_4way_ctx_holder;
|
||||
|
||||
x13_4way_ctx_holder x13_4way_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void init_x13_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x13_4way_ctx.blake );
|
||||
sph_bmw512_init( &x13_4way_ctx.bmw );
|
||||
init_groestl( &x13_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x13_4way_ctx.skein );
|
||||
jh512_4way_init( &x13_4way_ctx.jh );
|
||||
keccak512_4way_init( &x13_4way_ctx.keccak );
|
||||
init_luffa( &x13_4way_ctx.luffa, 512 );
|
||||
cubehashInit( &x13_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &x13_4way_ctx.shavite );
|
||||
init_sd( &x13_4way_ctx.simd, 512 );
|
||||
init_echo( &x13_4way_ctx.echo, 512 );
|
||||
sph_hamsi512_init( &x13_4way_ctx.hamsi );
|
||||
sph_fugue512_init( &x13_4way_ctx.fugue );
|
||||
};
|
||||
|
||||
void x13_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
x13_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &x13_4way_ctx, sizeof(x13_4way_ctx) );
|
||||
|
||||
// 1 Blake
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
// Parallel 4way
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
// 4 Skein
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
// 5 JH
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
// 6 Keccak
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// Serial to the end
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 7 Luffa
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence*)hash0, 64 );
|
||||
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, 64 );
|
||||
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, 64 );
|
||||
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, 64 );
|
||||
|
||||
// 8 Cubehash
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
// 9 Shavite
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &x13_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &x13_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &x13_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
// 10 Simd
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, 512 );
|
||||
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash1,
|
||||
(const BitSequence *)hash1, 512 );
|
||||
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash2,
|
||||
(const BitSequence *)hash2, 512 );
|
||||
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash3,
|
||||
(const BitSequence *)hash3, 512 );
|
||||
|
||||
// 11 Echo
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
// 12 Hamsi
|
||||
sph_hamsi512( &ctx.hamsi, hash0, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash0 );
|
||||
memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, hash1, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash1 );
|
||||
memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, hash2, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash2 );
|
||||
memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, hash3, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash3 );
|
||||
|
||||
// 13 Fugue
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_x13_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
0xFFF, 0xFFFF, 0x10000000 };
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// big endian encode 0..18 uint32_t, 64 bits at a time
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
for ( int m=0; m < 6; m++ )
|
||||
if ( Htarg <= htmax[m] )
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
x13_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce )
|
||||
&& !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
|
18
algo/x13/x13-gate.c
Normal file
18
algo/x13/x13-gate.c
Normal file
@@ -0,0 +1,18 @@
|
||||
#include "x13-gate.h"
|
||||
|
||||
// Register x13 with the algo gate, selecting the 4-way AVX2/AES
// implementation when compiled in, otherwise the scalar path.
// Always returns true.
bool register_x13_algo( algo_gate_t* gate )
{
#if defined (X13_4WAY)
  init_x13_4way_ctx();
  gate->scanhash = (void*)&scanhash_x13_4way;
  gate->hash     = (void*)&x13_4way_hash;
#else
  init_x13_ctx();
  gate->scanhash = (void*)&scanhash_x13;
  gate->hash     = (void*)&x13hash;
#endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
}
|
||||
|
32
algo/x13/x13-gate.h
Normal file
32
algo/x13/x13-gate.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef X13_GATE_H__
#define X13_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

// The 4-way implementation requires both the 4-way hash primitives
// and AES-NI support.
#if defined(HASH_4WAY) && defined(__AES__)
  #define X13_4WAY
#endif

// Wire the x13 entry points into an algo gate.
bool register_x13_algo( algo_gate_t* gate );

#if defined(X13_4WAY)

// 4-lane AVX2/AES implementation.
void x13_4way_hash( void *state, const void *input );

int scanhash_x13_4way( int thr_id, struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done );

void init_x13_4way_ctx();

#endif

// Scalar reference implementation, always available.
void x13hash( void *state, const void *input );

int scanhash_x13( int thr_id, struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done );

void init_x13_ctx();

#endif
|
||||
|
@@ -1,4 +1,4 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "x13-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
@@ -68,7 +68,7 @@ void init_x13_ctx()
|
||||
sph_fugue512_init( &x13_ctx.fugue );
|
||||
};
|
||||
|
||||
static void x13hash(void *output, const void *input)
|
||||
void x13hash(void *output, const void *input)
|
||||
{
|
||||
unsigned char hash[128] __attribute__ ((aligned (32)));
|
||||
#define hashB hash+64
|
||||
@@ -249,15 +249,3 @@ int scanhash_x13(int thr_id, struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
bool register_x13_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
init_x13_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x13;
|
||||
gate->hash = (void*)&x13hash;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
328
algo/x13/x13sm3-4way.c
Normal file
328
algo/x13/x13sm3-4way.c
Normal file
@@ -0,0 +1,328 @@
|
||||
#include "x13sm3-gate.h"
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/sm3/sph_sm3.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
hashState_echo echo;
|
||||
sm3_ctx_t sm3;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
} x13sm3_4way_ctx_holder;
|
||||
|
||||
x13sm3_4way_ctx_holder x13sm3_4way_ctx __attribute__ ((aligned (64)));
|
||||
static __thread blake512_4way_context x13sm3_ctx_mid;
|
||||
|
||||
void init_x13sm3_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x13sm3_4way_ctx.blake );
|
||||
sph_bmw512_init( &x13sm3_4way_ctx.bmw );
|
||||
init_groestl( &x13sm3_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x13sm3_4way_ctx.skein );
|
||||
jh512_4way_init( &x13sm3_4way_ctx.jh );
|
||||
keccak512_4way_init( &x13sm3_4way_ctx.keccak );
|
||||
init_luffa( &x13sm3_4way_ctx.luffa, 512 );
|
||||
cubehashInit( &x13sm3_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &x13sm3_4way_ctx.shavite );
|
||||
init_sd( &x13sm3_4way_ctx.simd, 512 );
|
||||
init_echo( &x13sm3_4way_ctx.echo, 512 );
|
||||
sm3_init( &x13sm3_4way_ctx.sm3 );
|
||||
sph_hamsi512_init( &x13sm3_4way_ctx.hamsi );
|
||||
sph_fugue512_init( &x13sm3_4way_ctx.fugue );
|
||||
};
|
||||
|
||||
void x13sm3_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
x13sm3_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &x13sm3_4way_ctx, sizeof(x13sm3_4way_ctx) );
|
||||
|
||||
// Blake
|
||||
memcpy( &ctx.blake, &x13sm3_ctx_mid, sizeof(x13sm3_ctx_mid) );
|
||||
blake512_4way( &ctx.blake, input + (64<<2), 16 );
|
||||
|
||||
// blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
// Parallel 4way
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
// Skein
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
// JH
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
// Keccak
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// Serial to the end
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// Luffa
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence*)hash0, 64 );
|
||||
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, 64 );
|
||||
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, 64 );
|
||||
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, 64 );
|
||||
|
||||
// Cubehash
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
// Shavite
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
// Simd
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, 512 );
|
||||
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash1,
|
||||
(const BitSequence *)hash1, 512 );
|
||||
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash2,
|
||||
(const BitSequence *)hash2, 512 );
|
||||
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash3,
|
||||
(const BitSequence *)hash3, 512 );
|
||||
|
||||
// Echo
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
// SM3
|
||||
uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
|
||||
memset( sm3_hash0, 0, sizeof sm3_hash0 );
|
||||
uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
|
||||
memset( sm3_hash1, 0, sizeof sm3_hash1 );
|
||||
uint32_t sm3_hash2[32] __attribute__ ((aligned (32)));
|
||||
memset( sm3_hash2, 0, sizeof sm3_hash2 );
|
||||
uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
|
||||
memset( sm3_hash3, 0, sizeof sm3_hash3 );
|
||||
|
||||
sph_sm3( &ctx.sm3, hash0, 64 );
|
||||
sph_sm3_close( &ctx.sm3, sm3_hash0 );
|
||||
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
|
||||
sph_sm3( &ctx.sm3, hash1, 64 );
|
||||
sph_sm3_close( &ctx.sm3, sm3_hash1 );
|
||||
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
|
||||
sph_sm3( &ctx.sm3, hash2, 64 );
|
||||
sph_sm3_close( &ctx.sm3, sm3_hash2 );
|
||||
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
|
||||
sph_sm3( &ctx.sm3, hash3, 64 );
|
||||
sph_sm3_close( &ctx.sm3, sm3_hash3 );
|
||||
|
||||
// Hamsi
|
||||
sph_hamsi512( &ctx.hamsi, sm3_hash0, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash0 );
|
||||
memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, sm3_hash1, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash1 );
|
||||
memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, sm3_hash2, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash2 );
|
||||
memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, sm3_hash3, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash3 );
|
||||
|
||||
// Fugue
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
}
|
||||
|
||||
// Scan nonces four at a time with the 4-way x13sm3 hash.
// Returns the number of lanes whose hash met the target; winning nonces
// are reported through work->nonces with the matching work->nfound flags.
int scanhash_x13sm3_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done )
{
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
     bool *found = work->nfound;
     int num_found = 0;
     // The nonce is header word 19, the high half of 64-bit word 9.  In the
     // 4x64 interleaved buffer lane L's copy of 64-bit word W sits at 64-bit
     // offset W*4 + L, hence uint32_t offsets 73/75/77/79 for lanes 0..3.
     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
     uint32_t *noncep1 = vdata + 75;
     uint32_t *noncep2 = vdata + 77;
     uint32_t *noncep3 = vdata + 79;
     const uint32_t Htarg = ptarget[7];
     // Cheap pre-screen: choose the widest mask that can't reject a hash the
     // full target test would accept for this difficulty.
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000 };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0 };

     // big endian encode 0..18 uint32_t, 64 bits at a time
     swab32_array( endiandata, pdata, 20 );

     uint64_t *edata = (uint64_t*)endiandata;
     // Replicate the 80 byte header into all four lanes.
     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

     // Precompute the Blake midstate over the first 64 header bytes; only the
     // last 16 bytes (which carry the nonce) change between iterations.
     blake512_4way_init( &x13sm3_ctx_mid );
     blake512_4way( &x13sm3_ctx_mid, vdata, 64 );

     for ( int m=0; m < 6; m++ )
       if ( Htarg <= htmax[m] )
       {
         uint32_t mask = masks[m];
         do
         {
            found[0] = found[1] = found[2] = found[3] = false;
            be32enc( noncep0, n );
            be32enc( noncep1, n+1 );
            be32enc( noncep2, n+2 );
            be32enc( noncep3, n+3 );

            x13sm3_4way_hash( hash, vdata );
            pdata[19] = n;

            // Per-lane result check: mask test first, full test only for
            // candidates that pass it.
            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
            {
               found[0] = true;
               num_found++;
               nonces[0] = n;
               work_set_target_ratio( work, hash );
            }
            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
            {
               found[1] = true;
               num_found++;
               nonces[1] = n+1;
               work_set_target_ratio( work, hash+8 );
            }
            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
            {
               found[2] = true;
               num_found++;
               nonces[2] = n+2;
               work_set_target_ratio( work, hash+16 );
            }
            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
            {
               found[3] = true;
               num_found++;
               nonces[3] = n+3;
               work_set_target_ratio( work, hash+24 );
            }
            n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
                   && !work_restart[thr_id].restart );
         break;
       }

     *hashes_done = n - first_nonce + 1;
     return num_found;
}
|
||||
|
||||
#endif
|
18
algo/x13/x13sm3-gate.c
Normal file
18
algo/x13/x13sm3-gate.c
Normal file
@@ -0,0 +1,18 @@
|
||||
#include "x13sm3-gate.h"
|
||||
|
||||
// Register the x13sm3 (hsr) algorithm with the gate: select the 4-way
// AVX2/AES implementation when available, otherwise the scalar path.
// Always returns true.
bool register_x13sm3_algo( algo_gate_t* gate )
{
#if defined (X13SM3_4WAY)
  init_x13sm3_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x13sm3_4way;
  gate->hash      = (void*)&x13sm3_4way_hash;
#else
  init_x13sm3_ctx();
  gate->scanhash  = (void*)&scanhash_x13sm3;
  gate->hash      = (void*)&x13sm3_hash;
#endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
}
// NOTE: the stray ';' after the closing brace was removed -- a semicolon at
// file scope after a function definition is not valid ISO C.
|
||||
|
32
algo/x13/x13sm3-gate.h
Normal file
32
algo/x13/x13sm3-gate.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef X13SM3_GATE_H__
#define X13SM3_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

// The 4-way path needs both generic 4-way hash support and AES-NI.
#if defined(HASH_4WAY) && defined(__AES__)
  #define X13SM3_4WAY
#endif

// Install the x13sm3 implementation (4-way or scalar) into the gate.
bool register_x13sm3_algo( algo_gate_t* gate );

#if defined(X13SM3_4WAY)

// Hash 4 interleaved 80-byte headers; writes 4 x 32-byte digests to state.
void x13sm3_4way_hash( void *state, const void *input );

int scanhash_x13sm3_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done );

// One-time init of the shared 4-way context master copy.
void init_x13sm3_4way_ctx();

#endif

// Scalar reference implementation.
void x13sm3_hash( void *state, const void *input );

int scanhash_x13sm3( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done );

void init_x13sm3_ctx();

#endif
|
||||
|
@@ -1,4 +1,4 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "x13sm3-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
@@ -49,7 +49,7 @@ typedef struct {
|
||||
|
||||
hsr_ctx_holder hsr_ctx;
|
||||
|
||||
void init_hsr_ctx()
|
||||
void init_x13sm3_ctx()
|
||||
{
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init(&hsr_ctx.groestl);
|
||||
@@ -67,7 +67,7 @@ void init_hsr_ctx()
|
||||
sph_fugue512_init(&hsr_ctx.fugue);
|
||||
};
|
||||
|
||||
static void x13sm3hash(void *output, const void *input)
|
||||
void x13sm3_hash(void *output, const void *input)
|
||||
{
|
||||
unsigned char hash[128] __attribute__ ((aligned (32)));
|
||||
|
||||
@@ -213,7 +213,7 @@ int scanhash_x13sm3( int thr_id, struct work *work,
|
||||
do {
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
x13sm3hash(hash64, endiandata);
|
||||
x13sm3_hash(hash64, endiandata);
|
||||
#ifndef DEBUG_ALGO
|
||||
if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
@@ -240,13 +240,3 @@ int scanhash_x13sm3( int thr_id, struct work *work,
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_x13sm3_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
init_hsr_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x13sm3;
|
||||
gate->hash = (void*)&x13sm3hash;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
310
algo/x14/x14-4way.c
Normal file
310
algo/x14/x14-4way.c
Normal file
@@ -0,0 +1,310 @@
|
||||
#include "x14-gate.h"
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
|
||||
// Master copy of all 14 hash contexts used by x14.  Blake, Skein, JH and
// Keccak are 4-way SIMD contexts; the rest are per-lane scalar contexts
// that get restored from this master copy between lanes.
typedef struct {
    blake512_4way_context   blake;
    sph_bmw512_context      bmw;
    hashState_groestl       groestl;
    skein512_4way_context   skein;
    jh512_4way_context      jh;
    keccak512_4way_context  keccak;
    hashState_luffa         luffa;
    cubehashParam           cube;
    sph_shavite512_context  shavite;
    hashState_sd            simd;
    hashState_echo          echo;
    sph_hamsi512_context    hamsi;
    sph_fugue512_context    fugue;
    sph_shabal512_context   shabal;
} x14_4way_ctx_holder;

x14_4way_ctx_holder x14_4way_ctx __attribute__ ((aligned (64)));

// Initialize the master context copy once at registration time; per-hash
// calls memcpy from it instead of re-running the init functions.
void init_x14_4way_ctx()
{
     blake512_4way_init( &x14_4way_ctx.blake );
     sph_bmw512_init( &x14_4way_ctx.bmw );
     init_groestl( &x14_4way_ctx.groestl, 64 );
     skein512_4way_init( &x14_4way_ctx.skein );
     jh512_4way_init( &x14_4way_ctx.jh );
     keccak512_4way_init( &x14_4way_ctx.keccak );
     init_luffa( &x14_4way_ctx.luffa, 512 );
     cubehashInit( &x14_4way_ctx.cube, 512, 16, 32 );
     sph_shavite512_init( &x14_4way_ctx.shavite );
     init_sd( &x14_4way_ctx.simd, 512 );
     init_echo( &x14_4way_ctx.echo, 512 );
     sph_hamsi512_init( &x14_4way_ctx.hamsi );
     sph_fugue512_init( &x14_4way_ctx.fugue );
     sph_shabal512_init( &x14_4way_ctx.shabal );
};
|
||||
|
||||
void x14_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
x14_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) );
|
||||
|
||||
// 1 Blake
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
// Parallel 4way
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
// 4 Skein
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
// 5 JH
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
// 6 Keccak
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// Serial to the end
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 7 Luffa
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence*)hash0, 64 );
|
||||
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, 64 );
|
||||
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, 64 );
|
||||
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, 64 );
|
||||
|
||||
// 8 Cubehash
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
// 9 Shavite
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
// 10 Simd
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, 512 );
|
||||
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash1,
|
||||
(const BitSequence *)hash1, 512 );
|
||||
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash2,
|
||||
(const BitSequence *)hash2, 512 );
|
||||
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash3,
|
||||
(const BitSequence *)hash3, 512 );
|
||||
|
||||
// 11 Echo
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
// 12 Hamsi
|
||||
sph_hamsi512( &ctx.hamsi, hash0, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash0 );
|
||||
memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, hash1, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash1 );
|
||||
memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, hash2, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash2 );
|
||||
memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, hash3, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash3 );
|
||||
|
||||
// 13 Fugue
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
// 14 Shabal
|
||||
sph_shabal512( &ctx.shabal, hash0, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash1, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash2, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash2 );
|
||||
memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash3, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash3 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
}
|
||||
|
||||
// Scan nonces four at a time with the 4-way x14 hash.
// Returns the number of lanes whose hash met the target; winning nonces
// are reported through work->nonces with the matching work->nfound flags.
int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done )
{
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
     bool *found = work->nfound;
     int num_found = 0;
     // The nonce is header word 19, the high half of 64-bit word 9; in the
     // 4x64 interleaved buffer the four lane copies are at uint32_t offsets
     // 73/75/77/79.
     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
     uint32_t *noncep1 = vdata + 75;
     uint32_t *noncep2 = vdata + 77;
     uint32_t *noncep3 = vdata + 79;
     const uint32_t Htarg = ptarget[7];
     // Cheap pre-screen masks keyed to the current target difficulty.
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000 };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0 };

     // big endian encode 0..18 uint32_t, 64 bits at a time
     swab32_array( endiandata, pdata, 20 );

     uint64_t *edata = (uint64_t*)endiandata;
     // Replicate the 80 byte header into all four lanes.
     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

     for ( int m=0; m < 6; m++ )
       if ( Htarg <= htmax[m] )
       {
         uint32_t mask = masks[m];
         do
         {
            found[0] = found[1] = found[2] = found[3] = false;
            be32enc( noncep0, n );
            be32enc( noncep1, n+1 );
            be32enc( noncep2, n+2 );
            be32enc( noncep3, n+3 );

            x14_4way_hash( hash, vdata );
            pdata[19] = n;

            // Per-lane result check: mask test first, full test only for
            // candidates that pass it.
            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
            {
               found[0] = true;
               num_found++;
               nonces[0] = n;
               work_set_target_ratio( work, hash );
            }
            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
            {
               found[1] = true;
               num_found++;
               nonces[1] = n+1;
               work_set_target_ratio( work, hash+8 );
            }
            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
            {
               found[2] = true;
               num_found++;
               nonces[2] = n+2;
               work_set_target_ratio( work, hash+16 );
            }
            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
            {
               found[3] = true;
               num_found++;
               nonces[3] = n+3;
               work_set_target_ratio( work, hash+24 );
            }
            n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
                   && !work_restart[thr_id].restart );
         break;
       }

     *hashes_done = n - first_nonce + 1;
     return num_found;
}
|
||||
|
||||
#endif
|
18
algo/x14/x14-gate.c
Normal file
18
algo/x14/x14-gate.c
Normal file
@@ -0,0 +1,18 @@
|
||||
#include "x14-gate.h"
|
||||
|
||||
// Register the x14 algorithm with the gate: select the 4-way AVX2/AES
// implementation when available, otherwise the scalar path.
// Always returns true.
bool register_x14_algo( algo_gate_t* gate )
{
#if defined (X14_4WAY)
  init_x14_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x14_4way;
  gate->hash      = (void*)&x14_4way_hash;
#else
  init_x14_ctx();
  gate->scanhash  = (void*)&scanhash_x14;
  gate->hash      = (void*)&x14hash;
#endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
}
// NOTE: the stray ';' after the closing brace was removed -- a semicolon at
// file scope after a function definition is not valid ISO C.
|
||||
|
32
algo/x14/x14-gate.h
Normal file
32
algo/x14/x14-gate.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef X14_GATE_H__
#define X14_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

// The 4-way path needs both generic 4-way hash support and AES-NI.
#if defined(HASH_4WAY) && defined(__AES__)
  #define X14_4WAY
#endif

// Install the x14 implementation (4-way or scalar) into the gate.
bool register_x14_algo( algo_gate_t* gate );

#if defined(X14_4WAY)

// Hash 4 interleaved 80-byte headers; writes 4 x 32-byte digests to state.
void x14_4way_hash( void *state, const void *input );

int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done );

// One-time init of the shared 4-way context master copy.
void init_x14_4way_ctx();

#endif

// Scalar reference implementation.
void x14hash( void *state, const void *input );

int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done );

void init_x14_ctx();

#endif
|
||||
|
@@ -1,4 +1,4 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "x14-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
@@ -72,7 +72,7 @@ void init_x14_ctx()
|
||||
sph_shabal512_init(&x14_ctx.shabal);
|
||||
};
|
||||
|
||||
static void x14hash(void *output, const void *input)
|
||||
void x14hash(void *output, const void *input)
|
||||
{
|
||||
unsigned char hash[128] __attribute__ ((aligned (32)));
|
||||
#define hashB hash+64
|
||||
@@ -248,14 +248,3 @@ int scanhash_x14(int thr_id, struct work *work,
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_x14_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
init_x14_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x14;
|
||||
gate->hash = (void*)&x14hash;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
329
algo/x15/x15-4way.c
Normal file
329
algo/x15/x15-4way.c
Normal file
@@ -0,0 +1,329 @@
|
||||
#include "x15-gate.h"
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
|
||||
// Master copy of all 15 hash contexts used by x15 (x14 plus Whirlpool).
// Blake, Skein, JH and Keccak are 4-way SIMD contexts; the rest are
// per-lane scalar contexts restored from this master copy between lanes.
typedef struct {
    blake512_4way_context   blake;
    sph_bmw512_context      bmw;
    hashState_groestl       groestl;
    skein512_4way_context   skein;
    jh512_4way_context      jh;
    keccak512_4way_context  keccak;
    hashState_luffa         luffa;
    cubehashParam           cube;
    sph_shavite512_context  shavite;
    hashState_sd            simd;
    hashState_echo          echo;
    sph_hamsi512_context    hamsi;
    sph_fugue512_context    fugue;
    sph_shabal512_context   shabal;
    sph_whirlpool_context   whirlpool;
} x15_4way_ctx_holder;

x15_4way_ctx_holder x15_4way_ctx __attribute__ ((aligned (64)));

// Initialize the master context copy once at registration time; per-hash
// calls memcpy from it instead of re-running the init functions.
void init_x15_4way_ctx()
{
     blake512_4way_init( &x15_4way_ctx.blake );
     sph_bmw512_init( &x15_4way_ctx.bmw );
     init_groestl( &x15_4way_ctx.groestl, 64 );
     skein512_4way_init( &x15_4way_ctx.skein );
     jh512_4way_init( &x15_4way_ctx.jh );
     keccak512_4way_init( &x15_4way_ctx.keccak );
     init_luffa( &x15_4way_ctx.luffa, 512 );
     cubehashInit( &x15_4way_ctx.cube, 512, 16, 32 );
     sph_shavite512_init( &x15_4way_ctx.shavite );
     init_sd( &x15_4way_ctx.simd, 512 );
     init_echo( &x15_4way_ctx.echo, 512 );
     sph_hamsi512_init( &x15_4way_ctx.hamsi );
     sph_fugue512_init( &x15_4way_ctx.fugue );
     sph_shabal512_init( &x15_4way_ctx.shabal );
     sph_whirlpool_init( &x15_4way_ctx.whirlpool );
};
|
||||
|
||||
// X15 hash, 4 lanes in parallel.  Blake, Skein, JH and Keccak run 4-way
// interleaved (AVX2); the remaining algorithms have no 4-way implementation
// and run serially per lane, restoring each one-shot context from the
// master copy (x15_4way_ctx) between lanes.  Each lane's final 32-byte
// digest is written to state at 32-byte stride.
void x15_4way_hash( void *state, const void *input )
{
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     x15_4way_ctx_holder ctx;
     memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) );

     // 1 Blake
     blake512_4way( &ctx.blake, input, 80 );
     blake512_4way_close( &ctx.blake, vhash );

     // Serial
     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     // 2 Bmw
     sph_bmw512( &ctx.bmw, hash0, 64 );
     sph_bmw512_close( &ctx.bmw, hash0 );
     memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) );
     sph_bmw512( &ctx.bmw, hash1, 64 );
     sph_bmw512_close( &ctx.bmw, hash1 );
     memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) );
     sph_bmw512( &ctx.bmw, hash2, 64 );
     sph_bmw512_close( &ctx.bmw, hash2 );
     memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) );
     sph_bmw512( &ctx.bmw, hash3, 64 );
     sph_bmw512_close( &ctx.bmw, hash3 );

     // 3 Groestl
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
     memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
     memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
     memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );

     // Parallel 4way
     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

     // 4 Skein
     skein512_4way( &ctx.skein, vhash, 64 );
     skein512_4way_close( &ctx.skein, vhash );

     // 5 JH
     jh512_4way( &ctx.jh, vhash, 64 );
     jh512_4way_close( &ctx.jh, vhash );

     // 6 Keccak
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );

     // Serial to the end
     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     // 7 Luffa
     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
                             (const BitSequence*)hash0, 64 );
     memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
                             (const BitSequence*)hash1, 64 );
     memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
                             (const BitSequence*)hash2, 64 );
     memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
                             (const BitSequence*)hash3, 64 );

     // 8 Cubehash
     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
     memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) );
     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
     memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) );
     cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
     memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) );
     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );

     // 9 Shavite
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
     memcpy( &ctx.shavite, &x15_4way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash1, 64 );
     sph_shavite512_close( &ctx.shavite, hash1 );
     memcpy( &ctx.shavite, &x15_4way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash2, 64 );
     sph_shavite512_close( &ctx.shavite, hash2 );
     memcpy( &ctx.shavite, &x15_4way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );

     // 10 Simd
     update_final_sd( &ctx.simd, (BitSequence *)hash0,
                      (const BitSequence *)hash0, 512 );
     memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
     update_final_sd( &ctx.simd, (BitSequence *)hash1,
                      (const BitSequence *)hash1, 512 );
     memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
     update_final_sd( &ctx.simd, (BitSequence *)hash2,
                      (const BitSequence *)hash2, 512 );
     memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
     update_final_sd( &ctx.simd, (BitSequence *)hash3,
                      (const BitSequence *)hash3, 512 );

     // 11 Echo
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                        (const BitSequence *) hash0, 512 );
     memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                        (const BitSequence *) hash1, 512 );
     memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                        (const BitSequence *) hash2, 512 );
     memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                        (const BitSequence *) hash3, 512 );

     // 12 Hamsi
     sph_hamsi512( &ctx.hamsi, hash0, 64 );
     sph_hamsi512_close( &ctx.hamsi, hash0 );
     memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
     sph_hamsi512( &ctx.hamsi, hash1, 64 );
     sph_hamsi512_close( &ctx.hamsi, hash1 );
     memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
     sph_hamsi512( &ctx.hamsi, hash2, 64 );
     sph_hamsi512_close( &ctx.hamsi, hash2 );
     memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
     sph_hamsi512( &ctx.hamsi, hash3, 64 );
     sph_hamsi512_close( &ctx.hamsi, hash3 );

     // 13 Fugue
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
     memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash1, 64 );
     sph_fugue512_close( &ctx.fugue, hash1 );
     memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash2, 64 );
     sph_fugue512_close( &ctx.fugue, hash2 );
     memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );

     // 14 Shabal
     sph_shabal512( &ctx.shabal, hash0, 64 );
     sph_shabal512_close( &ctx.shabal, hash0 );
     memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) );
     sph_shabal512( &ctx.shabal, hash1, 64 );
     sph_shabal512_close( &ctx.shabal, hash1 );
     memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) );
     sph_shabal512( &ctx.shabal, hash2, 64 );
     sph_shabal512_close( &ctx.shabal, hash2 );
     memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) );
     sph_shabal512( &ctx.shabal, hash3, 64 );
     sph_shabal512_close( &ctx.shabal, hash3 );

     // 15 Whirlpool
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash0 );
     memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash1, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash1 );
     memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash2, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash2 );
     memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash3 );

     // Emit the first 32 bytes of each lane's final digest.
     memcpy( state,    hash0, 32 );
     memcpy( state+32, hash1, 32 );
     memcpy( state+64, hash2, 32 );
     memcpy( state+96, hash3, 32 );
}
|
||||
|
||||
// Scan nonces four at a time with the 4-way x15 hash.
// Returns the number of lanes whose hash met the target; winning nonces
// are reported through work->nonces with the matching work->nfound flags.
int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done )
{
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t *nonces = work->nonces;
     bool *found = work->nfound;
     int num_found = 0;
     // The nonce is header word 19, the high half of 64-bit word 9; in the
     // 4x64 interleaved buffer the four lane copies are at uint32_t offsets
     // 73/75/77/79.
     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
     uint32_t *noncep1 = vdata + 75;
     uint32_t *noncep2 = vdata + 77;
     uint32_t *noncep3 = vdata + 79;
     const uint32_t Htarg = ptarget[7];
     // Cheap pre-screen masks keyed to the current target difficulty.
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000 };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0 };

     // big endian encode 0..18 uint32_t, 64 bits at a time
     swab32_array( endiandata, pdata, 20 );

     uint64_t *edata = (uint64_t*)endiandata;
     // Replicate the 80 byte header into all four lanes.
     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

     for ( int m=0; m < 6; m++ )
       if ( Htarg <= htmax[m] )
       {
         uint32_t mask = masks[m];
         do
         {
            found[0] = found[1] = found[2] = found[3] = false;
            be32enc( noncep0, n );
            be32enc( noncep1, n+1 );
            be32enc( noncep2, n+2 );
            be32enc( noncep3, n+3 );

            x15_4way_hash( hash, vdata );
            pdata[19] = n;

            // Per-lane result check: mask test first, full test only for
            // candidates that pass it.
            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
            {
               found[0] = true;
               num_found++;
               nonces[0] = n;
               work_set_target_ratio( work, hash );
            }
            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
            {
               found[1] = true;
               num_found++;
               nonces[1] = n+1;
               work_set_target_ratio( work, hash+8 );
            }
            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
            {
               found[2] = true;
               num_found++;
               nonces[2] = n+2;
               work_set_target_ratio( work, hash+16 );
            }
            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
            {
               found[3] = true;
               num_found++;
               nonces[3] = n+3;
               work_set_target_ratio( work, hash+24 );
            }
            n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
                   && !work_restart[thr_id].restart );
         break;
       }

     *hashes_done = n - first_nonce + 1;
     return num_found;
}
|
||||
|
||||
#endif
|
17
algo/x15/x15-gate.c
Normal file
17
algo/x15/x15-gate.c
Normal file
@@ -0,0 +1,17 @@
|
||||
#include "x15-gate.h"
|
||||
|
||||
bool register_x15_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X15_4WAY)
|
||||
init_x15_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x15_4way;
|
||||
gate->hash = (void*)&x15_4way_hash;
|
||||
#else
|
||||
init_x15_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x15;
|
||||
gate->hash = (void*)&x15hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
return true;
|
||||
};
|
||||
|
32
algo/x15/x15-gate.h
Normal file
32
algo/x15/x15-gate.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef X15_GATE_H__
|
||||
#define X15_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(HASH_4WAY) && defined(__AES__)
|
||||
#define X15_4WAY
|
||||
#endif
|
||||
|
||||
bool register_x15_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(X15_4WAY)
|
||||
|
||||
void x15_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_x15_4way_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
void x15hash( void *state, const void *input );
|
||||
|
||||
int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_x15_ctx();
|
||||
|
||||
#endif
|
||||
|
@@ -1,4 +1,4 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "x15-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
@@ -74,7 +74,7 @@ void init_x15_ctx()
|
||||
sph_whirlpool_init( &x15_ctx.whirlpool );
|
||||
};
|
||||
|
||||
static void x15hash(void *output, const void *input)
|
||||
void x15hash(void *output, const void *input)
|
||||
{
|
||||
unsigned char hash[128] __attribute__ ((aligned (32)));
|
||||
#define hashB hash+64
|
||||
@@ -260,13 +260,3 @@ int scanhash_x15(int thr_id, struct work *work,
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_x15_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
init_x15_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x15;
|
||||
gate->hash = (void*)&x15hash;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
364
algo/x17/x17-4way.c
Normal file
364
algo/x17/x17-4way.c
Normal file
@@ -0,0 +1,364 @@
|
||||
#include "x17-gate.h"
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include <openssl/sha.h>
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
hashState_echo echo;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
sph_shabal512_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
SHA512_CTX sha512;
|
||||
sph_haval256_5_context haval;
|
||||
} x17_4way_ctx_holder;
|
||||
|
||||
x17_4way_ctx_holder x17_4way_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void init_x17_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x17_4way_ctx.blake );
|
||||
sph_bmw512_init( &x17_4way_ctx.bmw );
|
||||
init_groestl( &x17_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x17_4way_ctx.skein );
|
||||
jh512_4way_init( &x17_4way_ctx.jh );
|
||||
keccak512_4way_init( &x17_4way_ctx.keccak );
|
||||
init_luffa( &x17_4way_ctx.luffa, 512 );
|
||||
cubehashInit( &x17_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &x17_4way_ctx.shavite );
|
||||
init_sd( &x17_4way_ctx.simd, 512 );
|
||||
init_echo( &x17_4way_ctx.echo, 512 );
|
||||
sph_hamsi512_init( &x17_4way_ctx.hamsi );
|
||||
sph_fugue512_init( &x17_4way_ctx.fugue );
|
||||
sph_shabal512_init( &x17_4way_ctx.shabal );
|
||||
sph_whirlpool_init( &x17_4way_ctx.whirlpool );
|
||||
SHA512_Init( &x17_4way_ctx.sha512 );
|
||||
sph_haval256_5_init( &x17_4way_ctx.haval );
|
||||
};
|
||||
|
||||
void x17_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
x17_4way_ctx_holder ctx;
|
||||
memcpy( &ctx, &x17_4way_ctx, sizeof(x17_4way_ctx) );
|
||||
|
||||
// 1 Blake
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
// Parallel 4way
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
// 4 Skein
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
// 5 JH
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
// 6 Keccak
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// Serial to the end
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 7 Luffa
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence*)hash0, 64 );
|
||||
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, 64 );
|
||||
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, 64 );
|
||||
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, 64 );
|
||||
|
||||
// 8 Cubehash
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
// 9 Shavite
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
// 10 Simd
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, 512 );
|
||||
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash1,
|
||||
(const BitSequence *)hash1, 512 );
|
||||
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash2,
|
||||
(const BitSequence *)hash2, 512 );
|
||||
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash3,
|
||||
(const BitSequence *)hash3, 512 );
|
||||
|
||||
// 11 Echo
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
// 12 Hamsi
|
||||
sph_hamsi512( &ctx.hamsi, hash0, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash0 );
|
||||
memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, hash1, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash1 );
|
||||
memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, hash2, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash2 );
|
||||
memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, hash3, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash3 );
|
||||
|
||||
// 13 Fugue
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
// 14 Shabal
|
||||
sph_shabal512( &ctx.shabal, hash0, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash1, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash2, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash2 );
|
||||
memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash3, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash3 );
|
||||
|
||||
// 15 Whirlpool
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
|
||||
sizeof(sph_whirlpool_context) );
|
||||
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash1 );
|
||||
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
|
||||
sizeof(sph_whirlpool_context) );
|
||||
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash2 );
|
||||
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
|
||||
sizeof(sph_whirlpool_context) );
|
||||
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash3 );
|
||||
|
||||
// 16 SHA512
|
||||
SHA512_Update( &ctx.sha512, hash0, 64 );
|
||||
SHA512_Final( (unsigned char*)hash0, &ctx.sha512 );
|
||||
memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) );
|
||||
SHA512_Update( &ctx.sha512, hash1, 64 );
|
||||
SHA512_Final( (unsigned char*)hash1, &ctx.sha512 );
|
||||
memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) );
|
||||
SHA512_Update( &ctx.sha512, hash2, 64 );
|
||||
SHA512_Final( (unsigned char*)hash2, &ctx.sha512 );
|
||||
memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) );
|
||||
SHA512_Update( &ctx.sha512, hash3, 64 );
|
||||
SHA512_Final( (unsigned char*)hash3, &ctx.sha512 );
|
||||
|
||||
// 17 Haval
|
||||
sph_haval256_5( &ctx.haval, (const void*)hash0, 64 );
|
||||
sph_haval256_5_close( &ctx.haval, hash0 );
|
||||
memcpy( &ctx.haval, &x17_4way_ctx.haval,
|
||||
sizeof(sph_haval256_5_context) );
|
||||
sph_haval256_5( &ctx.haval, (const void*)hash1, 64 );
|
||||
sph_haval256_5_close( &ctx.haval, hash1 );
|
||||
memcpy( &ctx.haval, &x17_4way_ctx.haval,
|
||||
sizeof(sph_haval256_5_context) );
|
||||
sph_haval256_5( &ctx.haval, (const void*)hash2, 64 );
|
||||
sph_haval256_5_close( &ctx.haval, hash2 );
|
||||
memcpy( &ctx.haval, &x17_4way_ctx.haval,
|
||||
sizeof(sph_haval256_5_context) );
|
||||
sph_haval256_5( &ctx.haval, (const void*)hash3, 64 );
|
||||
sph_haval256_5_close( &ctx.haval, hash3 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
0xFFF, 0xFFFF, 0x10000000 };
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
// big endian encode 0..18 uint32_t, 64 bits at a time
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
for ( int m=0; m < 6; m++ )
|
||||
if ( Htarg <= htmax[m] )
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
x17_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce )
|
||||
&& !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
|
17
algo/x17/x17-gate.c
Normal file
17
algo/x17/x17-gate.c
Normal file
@@ -0,0 +1,17 @@
|
||||
#include "x17-gate.h"
|
||||
|
||||
bool register_x17_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X17_4WAY)
|
||||
init_x17_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x17_4way;
|
||||
gate->hash = (void*)&x17_4way_hash;
|
||||
#else
|
||||
init_x17_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x17;
|
||||
gate->hash = (void*)&x17_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
return true;
|
||||
};
|
||||
|
32
algo/x17/x17-gate.h
Normal file
32
algo/x17/x17-gate.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef X17_GATE_H__
|
||||
#define X17_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(HASH_4WAY) && defined(__AES__)
|
||||
#define X17_4WAY
|
||||
#endif
|
||||
|
||||
bool register_x17_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(X17_4WAY)
|
||||
|
||||
void x17_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_x17_4way_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
void x17_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
void init_x17_ctx();
|
||||
|
||||
#endif
|
||||
|
@@ -1,4 +1,4 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "x17-gate.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
@@ -86,7 +86,7 @@ void init_x17_ctx()
|
||||
sph_haval256_5_init(&x17_ctx.haval);
|
||||
};
|
||||
|
||||
static void x17hash(void *output, const void *input)
|
||||
void x17_hash(void *output, const void *input)
|
||||
{
|
||||
unsigned char hash[128] __attribute__ ((aligned (64)));
|
||||
#define hashB hash+64
|
||||
@@ -248,7 +248,7 @@ int scanhash_x17(int thr_id, struct work *work,
|
||||
do {
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
x17hash(hash64, endiandata);
|
||||
x17_hash(hash64, endiandata);
|
||||
#ifndef DEBUG_ALGO
|
||||
if (!(hash64[7] & mask))
|
||||
{
|
||||
@@ -281,7 +281,7 @@ int scanhash_x17(int thr_id, struct work *work,
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
bool register_x17_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
@@ -290,4 +290,4 @@ bool register_x17_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&x17hash;
|
||||
return true;
|
||||
};
|
||||
|
||||
*/
|
||||
|
556
algo/x17/xevan-4way.c
Normal file
556
algo/x17/xevan-4way.c
Normal file
@@ -0,0 +1,556 @@
|
||||
#include "xevan-gate.h"
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include <openssl/sha.h>
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
hashState_echo echo;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
sph_shabal512_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
SHA512_CTX sha512;
|
||||
sph_haval256_5_context haval;
|
||||
} xevan_4way_ctx_holder;
|
||||
|
||||
xevan_4way_ctx_holder xevan_4way_ctx __attribute__ ((aligned (64)));
|
||||
static __thread blake512_4way_context xevan_blake_4way_mid
|
||||
__attribute__ ((aligned (64)));
|
||||
|
||||
void init_xevan_4way_ctx()
|
||||
{
|
||||
blake512_4way_init(&xevan_4way_ctx.blake);
|
||||
sph_bmw512_init(&xevan_4way_ctx.bmw);
|
||||
init_groestl( &xevan_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init(&xevan_4way_ctx.skein);
|
||||
jh512_4way_init(&xevan_4way_ctx.jh);
|
||||
keccak512_4way_init(&xevan_4way_ctx.keccak);
|
||||
init_luffa( &xevan_4way_ctx.luffa, 512 );
|
||||
cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &xevan_4way_ctx.shavite );
|
||||
init_sd( &xevan_4way_ctx.simd, 512 );
|
||||
init_echo( &xevan_4way_ctx.echo, 512 );
|
||||
sph_hamsi512_init( &xevan_4way_ctx.hamsi );
|
||||
sph_fugue512_init( &xevan_4way_ctx.fugue );
|
||||
sph_shabal512_init( &xevan_4way_ctx.shabal );
|
||||
sph_whirlpool_init( &xevan_4way_ctx.whirlpool );
|
||||
SHA512_Init( &xevan_4way_ctx.sha512 );
|
||||
sph_haval256_5_init( &xevan_4way_ctx.haval );
|
||||
};
|
||||
|
||||
void xevan_4way_blake512_midstate( const void* input )
|
||||
{
|
||||
memcpy( &xevan_blake_4way_mid, &xevan_4way_ctx.blake,
|
||||
sizeof(xevan_blake_4way_mid) );
|
||||
blake512_4way( &xevan_blake_4way_mid, input, 64 );
|
||||
}
|
||||
|
||||
void xevan_4way_hash( void *output, const void *input )
|
||||
{
|
||||
uint64_t hash0[16] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[16] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[16] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[16] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[16<<2] __attribute__ ((aligned (64)));
|
||||
const int dataLen = 128;
|
||||
const int midlen = 64; // bytes
|
||||
const int tail = 80 - midlen; // 16
|
||||
xevan_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
|
||||
|
||||
memcpy( &ctx.blake, &xevan_blake_4way_mid,
|
||||
sizeof(xevan_blake_4way_mid) );
|
||||
blake512_4way( &ctx.blake, input + (midlen<<2), tail );
|
||||
blake512_4way_close(&ctx.blake, vhash);
|
||||
|
||||
memset( &vhash[8<<2], 0, 64<<2 );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
sph_bmw512( &ctx.bmw, hash0, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
|
||||
dataLen<<3 );
|
||||
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
|
||||
dataLen<<3 );
|
||||
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
|
||||
dataLen<<3 );
|
||||
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
|
||||
dataLen<<3 );
|
||||
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
|
||||
skein512_4way( &ctx.skein, vhash, dataLen );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
jh512_4way( &ctx.jh, vhash, dataLen );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
keccak512_4way( &ctx.keccak, vhash, dataLen );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence*)hash0, dataLen );
|
||||
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, dataLen );
|
||||
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, dataLen );
|
||||
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, dataLen );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
|
||||
dataLen );
|
||||
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
|
||||
dataLen );
|
||||
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
|
||||
dataLen );
|
||||
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
|
||||
dataLen );
|
||||
|
||||
sph_shavite512( &ctx.shavite, hash0, dataLen );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, dataLen );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, dataLen );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, dataLen );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, dataLen<<3 );
|
||||
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash1,
|
||||
(const BitSequence *)hash1, dataLen<<3 );
|
||||
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash2,
|
||||
(const BitSequence *)hash2, dataLen<<3 );
|
||||
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash3,
|
||||
(const BitSequence *)hash3, dataLen<<3 );
|
||||
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, dataLen<<3 );
|
||||
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, dataLen<<3 );
|
||||
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, dataLen<<3 );
|
||||
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, dataLen<<3 );
|
||||
|
||||
sph_hamsi512( &ctx.hamsi, hash0, dataLen );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash0 );
|
||||
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, hash1, dataLen );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash1 );
|
||||
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, hash2, dataLen );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash2 );
|
||||
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, hash3, dataLen );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash3 );
|
||||
|
||||
sph_fugue512( &ctx.fugue, hash0, dataLen );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash1, dataLen );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash2, dataLen );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash3, dataLen );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
sph_shabal512( &ctx.shabal, hash0, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash1, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash2, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash2 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash3, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash3 );
|
||||
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
|
||||
sizeof(sph_whirlpool_context) );
|
||||
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash1 );
|
||||
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
|
||||
sizeof(sph_whirlpool_context) );
|
||||
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash2 );
|
||||
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
|
||||
sizeof(sph_whirlpool_context) );
|
||||
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash3 );
|
||||
|
||||
SHA512_Update( &ctx.sha512, hash0, dataLen );
|
||||
SHA512_Final( (unsigned char*)hash0, &ctx.sha512 );
|
||||
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
|
||||
SHA512_Update( &ctx.sha512, hash1, dataLen );
|
||||
SHA512_Final( (unsigned char*)hash1, &ctx.sha512 );
|
||||
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
|
||||
SHA512_Update( &ctx.sha512, hash2, dataLen );
|
||||
SHA512_Final( (unsigned char*)hash2, &ctx.sha512 );
|
||||
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
|
||||
SHA512_Update( &ctx.sha512, hash3, dataLen );
|
||||
SHA512_Final( (unsigned char*)hash3, &ctx.sha512 );
|
||||
|
||||
sph_haval256_5( &ctx.haval, (const void*)hash0, dataLen );
|
||||
sph_haval256_5_close( &ctx.haval, hash0 );
|
||||
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
|
||||
sizeof(sph_haval256_5_context) );
|
||||
sph_haval256_5( &ctx.haval, (const void*)hash1, dataLen );
|
||||
sph_haval256_5_close( &ctx.haval, hash1 );
|
||||
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
|
||||
sizeof(sph_haval256_5_context) );
|
||||
sph_haval256_5( &ctx.haval, (const void*)hash2, dataLen );
|
||||
sph_haval256_5_close( &ctx.haval, hash2 );
|
||||
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
|
||||
sizeof(sph_haval256_5_context) );
|
||||
sph_haval256_5( &ctx.haval, (const void*)hash3, dataLen );
|
||||
sph_haval256_5_close( &ctx.haval, hash3 );
|
||||
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 );
|
||||
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
|
||||
|
||||
blake512_4way( &ctx.blake, vhash, dataLen );
|
||||
blake512_4way_close(&ctx.blake, vhash);
|
||||
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
sph_bmw512( &ctx.bmw, hash0, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
|
||||
dataLen<<3 );
|
||||
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
|
||||
dataLen<<3 );
|
||||
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
|
||||
dataLen<<3 );
|
||||
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
|
||||
dataLen<<3 );
|
||||
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
|
||||
skein512_4way( &ctx.skein, vhash, dataLen );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
jh512_4way( &ctx.jh, vhash, dataLen );
|
||||
jh512_4way_close( &ctx.jh, vhash );
|
||||
|
||||
keccak512_4way( &ctx.keccak, vhash, dataLen );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence*)hash0, dataLen );
|
||||
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, dataLen );
|
||||
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, dataLen );
|
||||
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, dataLen );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
|
||||
dataLen );
|
||||
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
|
||||
dataLen );
|
||||
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
|
||||
dataLen );
|
||||
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
|
||||
dataLen );
|
||||
|
||||
sph_shavite512( &ctx.shavite, hash0, dataLen );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, dataLen );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, dataLen );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, dataLen );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, dataLen<<3 );
|
||||
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash1,
|
||||
(const BitSequence *)hash1, dataLen<<3 );
|
||||
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash2,
|
||||
(const BitSequence *)hash2, dataLen<<3 );
|
||||
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash3,
|
||||
(const BitSequence *)hash3, dataLen<<3 );
|
||||
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, dataLen<<3 );
|
||||
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, dataLen<<3 );
|
||||
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, dataLen<<3 );
|
||||
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, dataLen<<3 );
|
||||
|
||||
sph_hamsi512( &ctx.hamsi, hash0, dataLen );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash0 );
|
||||
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, hash1, dataLen );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash1 );
|
||||
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, hash2, dataLen );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash2 );
|
||||
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
|
||||
sph_hamsi512( &ctx.hamsi, hash3, dataLen );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash3 );
|
||||
|
||||
sph_fugue512( &ctx.fugue, hash0, dataLen );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash1, dataLen );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash2, dataLen );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash3, dataLen );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
sph_shabal512( &ctx.shabal, hash0, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash1, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash2, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash2 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash3, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash3 );
|
||||
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
|
||||
sizeof(sph_whirlpool_context) );
|
||||
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash1 );
|
||||
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
|
||||
sizeof(sph_whirlpool_context) );
|
||||
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash2 );
|
||||
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
|
||||
sizeof(sph_whirlpool_context) );
|
||||
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash3 );
|
||||
|
||||
SHA512_Update( &ctx.sha512, hash0, dataLen );
|
||||
SHA512_Final( (unsigned char*)hash0, &ctx.sha512 );
|
||||
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
|
||||
SHA512_Update( &ctx.sha512, hash1, dataLen );
|
||||
SHA512_Final( (unsigned char*)hash1, &ctx.sha512 );
|
||||
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
|
||||
SHA512_Update( &ctx.sha512, hash2, dataLen );
|
||||
SHA512_Final( (unsigned char*)hash2, &ctx.sha512 );
|
||||
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
|
||||
SHA512_Update( &ctx.sha512, hash3, dataLen );
|
||||
SHA512_Final( (unsigned char*)hash3, &ctx.sha512 );
|
||||
|
||||
sph_haval256_5( &ctx.haval, (const void*)hash0, dataLen );
|
||||
sph_haval256_5_close( &ctx.haval, hash0 );
|
||||
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
|
||||
sizeof(sph_haval256_5_context) );
|
||||
sph_haval256_5( &ctx.haval, (const void*)hash1, dataLen );
|
||||
sph_haval256_5_close( &ctx.haval, hash1 );
|
||||
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
|
||||
sizeof(sph_haval256_5_context) );
|
||||
sph_haval256_5( &ctx.haval, (const void*)hash2, dataLen );
|
||||
sph_haval256_5_close( &ctx.haval, hash2 );
|
||||
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
|
||||
sizeof(sph_haval256_5_context) );
|
||||
sph_haval256_5( &ctx.haval, (const void*)hash3, dataLen );
|
||||
sph_haval256_5_close( &ctx.haval, hash3 );
|
||||
|
||||
memcpy( output, hash0, 32 );
|
||||
memcpy( output+32, hash1, 32 );
|
||||
memcpy( output+64, hash2, 32 );
|
||||
memcpy( output+96, hash3, 32 );
|
||||
}
|
||||
|
||||
// Scan nonces four at a time with the 4-way Xevan hash.  On success the
// winning nonces are reported through work->nonces / work->nfound and the
// return value is the number of lanes that met the target (0 if none before
// max_nonce or a restart).  *hashes_done receives the number of nonces tried.
int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done )
{
    uint32_t hash[4*8] __attribute__ ((aligned (64)));     // 32-byte result per lane
    uint32_t vdata[24*4] __attribute__ ((aligned (64)));   // 4x64 interleaved header
    uint32_t _ALIGN(64) endiandata[20];
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;

    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t n = first_nonce;
    uint32_t *nonces = work->nonces;
    bool *found = work->nfound;
    int num_found = 0;
    // Header word 19 (the nonce) of lane L lives at 32-bit index 73 + 2*L in
    // the 4x64-interleaved buffer: qword 9 holds words 18/19, lane qwords sit
    // at 64-bit index 9*4 + L, and the nonce is the high 32-bit half.
    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
    uint32_t *noncep1 = vdata + 75;
    uint32_t *noncep2 = vdata + 77;
    uint32_t *noncep3 = vdata + 79;

    if ( opt_benchmark )
        ptarget[7] = 0x0cff;

    // Big-endian encode the 76 fixed header bytes once.
    for ( int k=0; k < 19; k++ )
        be32enc( &endiandata[k], pdata[k] );

    // Replicate the header into all four lanes (640 bits = 80 bytes).
    uint64_t *edata = (uint64_t*)endiandata;
    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

    // Blake-512 midstate over the first 64 bytes is nonce-independent.
    xevan_4way_blake512_midstate( vdata );

    do {
       found[0] = found[1] = found[2] = found[3] = false;
       be32enc( noncep0, n );
       be32enc( noncep1, n+1 );
       be32enc( noncep2, n+2 );
       be32enc( noncep3, n+3 );

       xevan_4way_hash( hash, vdata );

       pdata[19] = n;

       // Check each lane: quick high-word test, then the full 256-bit test.
       if ( ( hash[7] <= Htarg ) && fulltest( hash, ptarget ) )
       {
          found[0] = true;
          num_found++;
          nonces[0] = n;
          work_set_target_ratio( work, hash );
       }
       if ( ( (hash+8)[7] <= Htarg ) && fulltest( hash+8, ptarget ) )
       {
          found[1] = true;
          num_found++;
          nonces[1] = n+1;
          work_set_target_ratio( work, hash+8 );
       }
       if ( ( (hash+16)[7] <= Htarg ) && fulltest( hash+16, ptarget ) )
       {
          found[2] = true;
          num_found++;
          nonces[2] = n+2;
          work_set_target_ratio( work, hash+16 );
       }
       if ( ( (hash+24)[7] <= Htarg ) && fulltest( hash+24, ptarget ) )
       {
          found[3] = true;
          num_found++;
          nonces[3] = n+3;
          work_set_target_ratio( work, hash+24 );
       }
       n += 4;
    } while ( ( num_found == 0 ) && ( n < max_nonce )
              && !work_restart[thr_id].restart );
    *hashes_done = n - first_nonce + 1;
    return num_found;
}
||||
#endif
|
24
algo/x17/xevan-gate.c
Normal file
24
algo/x17/xevan-gate.c
Normal file
@@ -0,0 +1,24 @@
|
||||
#include "xevan-gate.h"
|
||||
|
||||
void xevan_set_target( struct work* work, double job_diff )
|
||||
{
|
||||
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
|
||||
}
|
||||
|
||||
// Register Xevan with the algo gate, selecting the 4-way AVX2/AES build or
// the scalar reference build at compile time.  Always returns true.
// (Also drops the stray ';' after the closing brace — an empty file-scope
// declaration that is not valid ISO C.)
bool register_xevan_algo( algo_gate_t* gate )
{
#if defined (XEVAN_4WAY)
  init_xevan_4way_ctx();
  gate->scanhash  = (void*)&scanhash_xevan_4way;
  gate->hash      = (void*)&xevan_4way_hash;
#else
  init_xevan_ctx();
  gate->scanhash  = (void*)&scanhash_xevan;
  gate->hash      = (void*)&xevan_hash;
#endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
  gate->set_target    = (void*)&xevan_set_target;
  gate->get_max64     = (void*)&get_max64_0xffffLL;
  return true;
}
|
32
algo/x17/xevan-gate.h
Normal file
32
algo/x17/xevan-gate.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef XEVAN_GATE_H__
#define XEVAN_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

// The 4-way path requires both the 4-way vector hash code (HASH_4WAY) and
// AES-NI for the lane-serial Groestl/Echo implementations.
#if defined(HASH_4WAY) && defined(__AES__)
  #define XEVAN_4WAY
#endif

// Hook Xevan into the algo gate (selects 4-way or scalar at compile time).
bool register_xevan_algo( algo_gate_t* gate );

#if defined(XEVAN_4WAY)

// Hash four interleaved 80-byte headers; writes 32 bytes per lane to state.
void xevan_4way_hash( void *state, const void *input );

int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done );

// One-time initialisation of the shared 4-way context prototype.
void init_xevan_4way_ctx();

#endif

// Scalar reference implementation.
void xevan_hash( void *state, const void *input );

int scanhash_xevan( int thr_id, struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done );

void init_xevan_ctx();

#endif
|
@@ -1,4 +1,4 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "xevan-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
@@ -286,19 +286,3 @@ int scanhash_xevan(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Derive the share target from the stratum job difficulty; Xevan scales
// difficulty by 256 on top of the user's --diff-factor.
void xevan_set_target( struct work* work, double job_diff )
{
        work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
||||
// Register the scalar Xevan implementation with the algo gate.
// Always returns true.
bool register_xevan_algo( algo_gate_t* gate )
{
    gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
    init_xevan_ctx();
    gate->scanhash = (void*)&scanhash_xevan;
    gate->hash = (void*)&xevan_hash;
    gate->set_target = (void*)&xevan_set_target;
    gate->get_max64 = (void*)&get_max64_0xffffLL;
    return true;
};
|
@@ -1,935 +0,0 @@
|
||||
/*-
|
||||
* Copyright 2009 Colin Percival
|
||||
* Copyright 2013,2014 Alexander Peslyak
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* This file was originally written by Colin Percival as part of the Tarsnap
|
||||
* online backup system.
|
||||
*/
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "sha256_Y.h"
|
||||
#include "sysendian.h"
|
||||
|
||||
#include "yescrypt-platform.h"
|
||||
|
||||
/*
 * Copy count 64-bit words from src to dest, four at a time.
 * Precondition (as in upstream yescrypt): count is a positive multiple of 4.
 */
static __inline void blkcpy(uint64_t * dest, const uint64_t * src, size_t count)
{
	size_t i = 0;

	do {
		dest[i]     = src[i];
		dest[i + 1] = src[i + 1];
		dest[i + 2] = src[i + 2];
		dest[i + 3] = src[i + 3];
		i += 4;
	} while (i < count);
}
||||
/*
 * XOR count 64-bit words of src into dest, four at a time.
 * Precondition (as in upstream yescrypt): count is a positive multiple of 4.
 */
static __inline void blkxor(uint64_t * dest, const uint64_t * src, size_t count)
{
	size_t i = 0;

	do {
		dest[i]     ^= src[i];
		dest[i + 1] ^= src[i + 1];
		dest[i + 2] ^= src[i + 2];
		dest[i + 3] ^= src[i + 3];
		i += 4;
	} while (i < count);
}
||||
/* A 64-byte Salsa20 block viewed as sixteen 32-bit or eight 64-bit words. */
typedef union {
	uint32_t w[16];
	uint64_t d[8];
} salsa20_blk_t;

/*
 * Permute a Salsa20 block into the SIMD-friendly layout used by the scrypt
 * family: d[i] packs one low 32-bit word and one high 32-bit word from the
 * canonical order.  Inverse of salsa20_simd_unshuffle().
 */
static __inline void salsa20_simd_shuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout)
{
	/* lo[i]/hi[i]: source w[] indices for the low/high halves of d[i]. */
	static const uint8_t lo[8] = { 0, 10, 4, 14, 8, 2, 12, 6 };
	static const uint8_t hi[8] = { 5, 15, 9, 3, 13, 7, 1, 11 };
	size_t i;

	for (i = 0; i < 8; i++)
		Bout->d[i] = (uint64_t)Bin->w[lo[i]] |
		    ((uint64_t)Bin->w[hi[i]] << 32);
}

/*
 * Undo salsa20_simd_shuffle(): rebuild the canonical 32-bit word order from
 * the packed 64-bit layout.
 */
static __inline void salsa20_simd_unshuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout)
{
	/* even[i]/odd[i]: source d[] indices for w[2i] (low) and w[2i+1] (high). */
	static const uint8_t even[8] = { 0, 5, 2, 7, 4, 1, 6, 3 };
	static const uint8_t odd[8]  = { 6, 3, 0, 5, 2, 7, 4, 1 };
	size_t i;

	for (i = 0; i < 8; i++) {
		Bout->w[i * 2]     = (uint32_t)Bin->d[even[i]];
		Bout->w[i * 2 + 1] = (uint32_t)(Bin->d[odd[i]] >> 32);
	}
}
|
||||
/**
 * salsa20_8(B):
 * Apply the salsa20/8 core to the provided block (64 bytes, in the packed
 * SIMD layout produced by salsa20_simd_shuffle).  The block is unshuffled
 * to canonical word order, run through 8 Salsa20 rounds, then the rotated
 * state is shuffled back and added word-wise into B (the feed-forward).
 */
static void salsa20_8(uint64_t B[8])
{
	size_t i;
	salsa20_blk_t X;
#define x X.w

	salsa20_simd_unshuffle((const salsa20_blk_t *)B, &X);

	/* 8 rounds = 4 double-rounds (column round + row round). */
	for (i = 0; i < 8; i += 2) {
#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
		/* Operate on columns */
		x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 8] ^= R(x[ 4]+x[ 0], 9);
		x[12] ^= R(x[ 8]+x[ 4],13);  x[ 0] ^= R(x[12]+x[ 8],18);

		x[ 9] ^= R(x[ 5]+x[ 1], 7);  x[13] ^= R(x[ 9]+x[ 5], 9);
		x[ 1] ^= R(x[13]+x[ 9],13);  x[ 5] ^= R(x[ 1]+x[13],18);

		x[14] ^= R(x[10]+x[ 6], 7);  x[ 2] ^= R(x[14]+x[10], 9);
		x[ 6] ^= R(x[ 2]+x[14],13);  x[10] ^= R(x[ 6]+x[ 2],18);

		x[ 3] ^= R(x[15]+x[11], 7);  x[ 7] ^= R(x[ 3]+x[15], 9);
		x[11] ^= R(x[ 7]+x[ 3],13);  x[15] ^= R(x[11]+x[ 7],18);

		/* Operate on rows */
		x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 2] ^= R(x[ 1]+x[ 0], 9);
		x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 0] ^= R(x[ 3]+x[ 2],18);

		x[ 6] ^= R(x[ 5]+x[ 4], 7);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
		x[ 4] ^= R(x[ 7]+x[ 6],13);  x[ 5] ^= R(x[ 4]+x[ 7],18);

		x[11] ^= R(x[10]+x[ 9], 7);  x[ 8] ^= R(x[11]+x[10], 9);
		x[ 9] ^= R(x[ 8]+x[11],13);  x[10] ^= R(x[ 9]+x[ 8],18);

		x[12] ^= R(x[15]+x[14], 7);  x[13] ^= R(x[12]+x[15], 9);
		x[14] ^= R(x[13]+x[12],13);  x[15] ^= R(x[14]+x[13],18);
#undef R
	}
#undef x

	/* Feed-forward: add the permuted round output into the input block. */
	{
		salsa20_blk_t Y;
		salsa20_simd_shuffle(&X, &Y);
		for (i = 0; i < 16; i += 4) {
			((salsa20_blk_t *)B)->w[i] += Y.w[i];
			((salsa20_blk_t *)B)->w[i + 1] += Y.w[i + 1];
			((salsa20_blk_t *)B)->w[i + 2] += Y.w[i + 2];
			((salsa20_blk_t *)B)->w[i + 3] += Y.w[i + 3];
		}
	}
}
|
||||
/**
 * blockmix_salsa8(Bin, Bout, X, r):
 * Compute Bout = BlockMix_{salsa20/8, r}(Bin).  The input Bin must be 128r
 * bytes in length; the output Bout must also be the same size.  The
 * temporary space X must be 64 bytes.
 * (Step numbers in the comments refer to the scrypt BlockMix algorithm.)
 */
static void
blockmix_salsa8(const uint64_t * Bin, uint64_t * Bout, uint64_t * X, size_t r)
{
	size_t i;

	/* 1: X <-- B_{2r - 1} */
	blkcpy(X, &Bin[(2 * r - 1) * 8], 8);

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < 2 * r; i += 2) {
		/* 3: X <-- H(X \xor B_i) */
		blkxor(X, &Bin[i * 8], 8);
		salsa20_8(X);

		/* 4: Y_i <-- X */
		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
		/* even-indexed outputs land in the first half of Bout */
		blkcpy(&Bout[i * 4], X, 8);

		/* 3: X <-- H(X \xor B_i) */
		blkxor(X, &Bin[i * 8 + 8], 8);
		salsa20_8(X);

		/* 4: Y_i <-- X */
		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
		/* odd-indexed outputs land in the second half of Bout */
		blkcpy(&Bout[i * 4 + r * 8], X, 8);
	}
}
|
||||
/* These are tunable */
#define S_BITS 8	/* log2 of the number of entries per S-box */
#define S_SIMD 2	/* 64-bit lanes processed together in pwxform */
#define S_P 4		/* parallel pwxform sub-blocks */
#define S_ROUNDS 6	/* pwxform rounds per sub-block */

/* Number of S-boxes.  Not tunable, hard-coded in a few places. */
#define S_N 2

/* Derived values.  Not tunable on their own. */
#define S_SIZE1 (1 << S_BITS)			/* entries per S-box */
#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8)	/* byte-offset mask into one S-box */
#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK) /* both lookup masks packed */
#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD)	/* total S-box size in 64-bit words */
#define S_P_SIZE (S_P * S_SIMD)			/* pwxform block size in 64-bit words */
#define S_MIN_R ((S_P * S_SIMD + 15) / 16)	/* minimum r for blockmix_pwxform */
|
||||
/**
 * pwxform(B):
 * Transform the provided block using the provided S-boxes.
 * B holds S_P sub-blocks of S_SIMD 64-bit lanes each; every lane is run
 * through S_ROUNDS of multiply-add-xor steps, with the lookup indices for
 * both S-boxes taken from lane 0's current value.
 */
static void block_pwxform(uint64_t * B, const uint64_t * S)
{
	uint64_t (*X)[S_SIMD] = (uint64_t (*)[S_SIMD])B;
	const uint8_t *S0 = (const uint8_t *)S;
	const uint8_t *S1 = (const uint8_t *)(S + S_SIZE1 * S_SIMD);
	size_t i, j;
#if S_SIMD > 2
	size_t k;
#endif

	for (j = 0; j < S_P; j++) {
		uint64_t *Xj = X[j];
		uint64_t x0 = Xj[0];
#if S_SIMD > 1
		uint64_t x1 = Xj[1];
#endif

		for (i = 0; i < S_ROUNDS; i++) {
			/* Both S-box byte offsets are packed in x0 (see S_MASK2). */
			uint64_t x = x0 & S_MASK2;
			const uint64_t *p0, *p1;

			p0 = (const uint64_t *)(S0 + (uint32_t)x);
			p1 = (const uint64_t *)(S1 + (x >> 32));

			/* 32x32->64 multiply of x0's halves, then add/xor lookups. */
			x0 = (uint64_t)(x0 >> 32) * (uint32_t)x0;
			x0 += p0[0];
			x0 ^= p1[0];

#if S_SIMD > 1
			x1 = (uint64_t)(x1 >> 32) * (uint32_t)x1;
			x1 += p0[1];
			x1 ^= p1[1];
#endif

#if S_SIMD > 2
			/* Remaining lanes share the same S-box pointers. */
			for (k = 2; k < S_SIMD; k++) {
				x = Xj[k];

				x = (uint64_t)(x >> 32) * (uint32_t)x;
				x += p0[k];
				x ^= p1[k];

				Xj[k] = x;
			}
#endif
		}

		Xj[0] = x0;
#if S_SIMD > 1
		Xj[1] = x1;
#endif
	}
}
|
||||
/**
|
||||
* blockmix_pwxform(Bin, Bout, S, r):
|
||||
* Compute Bout = BlockMix_pwxform{salsa20/8, S, r}(Bin). The input Bin must
|
||||
* be 128r bytes in length; the output Bout must also be the same size.
|
||||
*
|
||||
* S lacks const qualifier to match blockmix_salsa8()'s prototype, which we
|
||||
* need to refer to both functions via the same function pointers.
|
||||
*/
|
||||
static void blockmix_pwxform(const uint64_t * Bin, uint64_t * Bout, uint64_t * S, size_t r)
|
||||
{
|
||||
size_t r1, r2, i;
|
||||
|
||||
/* Convert 128-byte blocks to (S_P_SIZE * 64-bit) blocks */
|
||||
r1 = r * 128 / (S_P_SIZE * 8);
|
||||
|
||||
/* X <-- B_{r1 - 1} */
|
||||
blkcpy(Bout, &Bin[(r1 - 1) * S_P_SIZE], S_P_SIZE);
|
||||
|
||||
/* X <-- X \xor B_i */
|
||||
blkxor(Bout, Bin, S_P_SIZE);
|
||||
|
||||
/* X <-- H'(X) */
|
||||
/* B'_i <-- X */
|
||||
block_pwxform(Bout, S);
|
||||
|
||||
/* for i = 0 to r1 - 1 do */
|
||||
for (i = 1; i < r1; i++) {
|
||||
/* X <-- X \xor B_i */
|
||||
blkcpy(&Bout[i * S_P_SIZE], &Bout[(i - 1) * S_P_SIZE],
|
||||
S_P_SIZE);
|
||||
blkxor(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE], S_P_SIZE);
|
||||
|
||||
/* X <-- H'(X) */
|
||||
/* B'_i <-- X */
|
||||
block_pwxform(&Bout[i * S_P_SIZE], S);
|
||||
}
|
||||
|
||||
/* Handle partial blocks */
|
||||
if (i * S_P_SIZE < r * 16)
|
||||
blkcpy(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE],
|
||||
r * 16 - i * S_P_SIZE);
|
||||
|
||||
i = (r1 - 1) * S_P_SIZE / 8;
|
||||
/* Convert 128-byte blocks to 64-byte blocks */
|
||||
r2 = r * 2;
|
||||
|
||||
/* B'_i <-- H(B'_i) */
|
||||
salsa20_8(&Bout[i * 8]);
|
||||
i++;
|
||||
|
||||
for (; i < r2; i++) {
|
||||
/* B'_i <-- H(B'_i \xor B'_{i-1}) */
|
||||
blkxor(&Bout[i * 8], &Bout[(i - 1) * 8], 8);
|
||||
salsa20_8(&Bout[i * 8]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* integerify(B, r):
|
||||
* Return the result of parsing B_{2r-1} as a little-endian integer.
|
||||
*/
|
||||
static __inline uint64_t
|
||||
integerify(const uint64_t * B, size_t r)
|
||||
{
|
||||
/*
|
||||
* Our 64-bit words are in host byte order, and word 6 holds the second 32-bit
|
||||
* word of B_{2r-1} due to SIMD shuffling. The 64-bit value we return is also
|
||||
* in host byte order, as it should be.
|
||||
*/
|
||||
const uint64_t * X = &B[(2 * r - 1) * 8];
|
||||
uint32_t lo = (uint32_t) X[0];
|
||||
uint32_t hi = (uint32_t) (X[6] >> 32);
|
||||
return ((uint64_t)hi << 32) + lo;
|
||||
}
|
||||
|
||||
/**
|
||||
* smix1(B, r, N, flags, V, NROM, shared, XY, S):
|
||||
* Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in
|
||||
* length; the temporary storage V must be 128rN bytes in length; the temporary
|
||||
* storage XY must be 256r + 64 bytes in length. The value N must be even and
|
||||
* no smaller than 2.
|
||||
*/
|
||||
static void
|
||||
smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags,
|
||||
uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared,
|
||||
uint64_t * XY, uint64_t * S)
|
||||
{
|
||||
void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) =
|
||||
(S ? blockmix_pwxform : blockmix_salsa8);
|
||||
const uint64_t * VROM = shared->shared1.aligned;
|
||||
uint32_t VROM_mask = shared->mask1;
|
||||
size_t s = 16 * r;
|
||||
uint64_t * X = V;
|
||||
uint64_t * Y = &XY[s];
|
||||
uint64_t * Z = S ? S : &XY[2 * s];
|
||||
uint64_t n, i, j;
|
||||
size_t k;
|
||||
|
||||
/* 1: X <-- B */
|
||||
/* 3: V_i <-- X */
|
||||
for (i = 0; i < 2 * r; i++) {
|
||||
const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8];
|
||||
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
|
||||
salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8];
|
||||
for (k = 0; k < 16; k++)
|
||||
tmp->w[k] = le32dec(&src->w[k]);
|
||||
salsa20_simd_shuffle(tmp, dst);
|
||||
}
|
||||
|
||||
/* 4: X <-- H(X) */
|
||||
/* 3: V_i <-- X */
|
||||
blockmix(X, Y, Z, r);
|
||||
blkcpy(&V[s], Y, s);
|
||||
|
||||
X = XY;
|
||||
|
||||
if (NROM && (VROM_mask & 1)) {
|
||||
if ((1 & VROM_mask) == 1) {
|
||||
/* j <-- Integerify(X) mod NROM */
|
||||
j = integerify(Y, r) & (NROM - 1);
|
||||
|
||||
/* X <-- H(X \xor VROM_j) */
|
||||
blkxor(Y, &VROM[j * s], s);
|
||||
}
|
||||
|
||||
blockmix(Y, X, Z, r);
|
||||
|
||||
/* 2: for i = 0 to N - 1 do */
|
||||
for (n = 1, i = 2; i < N; i += 2) {
|
||||
/* 3: V_i <-- X */
|
||||
blkcpy(&V[i * s], X, s);
|
||||
|
||||
if ((i & (i - 1)) == 0)
|
||||
n <<= 1;
|
||||
|
||||
/* j <-- Wrap(Integerify(X), i) */
|
||||
j = integerify(X, r) & (n - 1);
|
||||
j += i - n;
|
||||
|
||||
/* X <-- X \xor V_j */
|
||||
blkxor(X, &V[j * s], s);
|
||||
|
||||
/* 4: X <-- H(X) */
|
||||
blockmix(X, Y, Z, r);
|
||||
|
||||
/* 3: V_i <-- X */
|
||||
blkcpy(&V[(i + 1) * s], Y, s);
|
||||
|
||||
j = integerify(Y, r);
|
||||
if (((i + 1) & VROM_mask) == 1) {
|
||||
/* j <-- Integerify(X) mod NROM */
|
||||
j &= NROM - 1;
|
||||
|
||||
/* X <-- H(X \xor VROM_j) */
|
||||
blkxor(Y, &VROM[j * s], s);
|
||||
} else {
|
||||
/* j <-- Wrap(Integerify(X), i) */
|
||||
j &= n - 1;
|
||||
j += i + 1 - n;
|
||||
|
||||
/* X <-- H(X \xor V_j) */
|
||||
blkxor(Y, &V[j * s], s);
|
||||
}
|
||||
|
||||
blockmix(Y, X, Z, r);
|
||||
}
|
||||
} else {
|
||||
yescrypt_flags_t rw = flags & YESCRYPT_RW;
|
||||
|
||||
/* 4: X <-- H(X) */
|
||||
blockmix(Y, X, Z, r);
|
||||
|
||||
/* 2: for i = 0 to N - 1 do */
|
||||
for (n = 1, i = 2; i < N; i += 2) {
|
||||
/* 3: V_i <-- X */
|
||||
blkcpy(&V[i * s], X, s);
|
||||
|
||||
if (rw) {
|
||||
if ((i & (i - 1)) == 0)
|
||||
n <<= 1;
|
||||
|
||||
/* j <-- Wrap(Integerify(X), i) */
|
||||
j = integerify(X, r) & (n - 1);
|
||||
j += i - n;
|
||||
|
||||
/* X <-- X \xor V_j */
|
||||
blkxor(X, &V[j * s], s);
|
||||
}
|
||||
|
||||
/* 4: X <-- H(X) */
|
||||
blockmix(X, Y, Z, r);
|
||||
|
||||
/* 3: V_i <-- X */
|
||||
blkcpy(&V[(i + 1) * s], Y, s);
|
||||
|
||||
if (rw) {
|
||||
/* j <-- Wrap(Integerify(X), i) */
|
||||
j = integerify(Y, r) & (n - 1);
|
||||
j += (i + 1) - n;
|
||||
|
||||
/* X <-- X \xor V_j */
|
||||
blkxor(Y, &V[j * s], s);
|
||||
}
|
||||
|
||||
/* 4: X <-- H(X) */
|
||||
blockmix(Y, X, Z, r);
|
||||
}
|
||||
}
|
||||
|
||||
/* B' <-- X */
|
||||
for (i = 0; i < 2 * r; i++) {
|
||||
const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8];
|
||||
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
|
||||
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8];
|
||||
for (k = 0; k < 16; k++)
|
||||
le32enc(&tmp->w[k], src->w[k]);
|
||||
salsa20_simd_unshuffle(tmp, dst);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S):
|
||||
* Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in
|
||||
* length; the temporary storage V must be 128rN bytes in length; the temporary
|
||||
* storage XY must be 256r + 64 bytes in length. The value N must be a
|
||||
* power of 2 greater than 1. The value Nloop must be even.
|
||||
*/
|
||||
static void
|
||||
smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop,
|
||||
yescrypt_flags_t flags,
|
||||
uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared,
|
||||
uint64_t * XY, uint64_t * S)
|
||||
{
|
||||
void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) =
|
||||
(S ? blockmix_pwxform : blockmix_salsa8);
|
||||
const uint64_t * VROM = shared->shared1.aligned;
|
||||
uint32_t VROM_mask = shared->mask1 | 1;
|
||||
size_t s = 16 * r;
|
||||
yescrypt_flags_t rw = flags & YESCRYPT_RW;
|
||||
uint64_t * X = XY;
|
||||
uint64_t * Y = &XY[s];
|
||||
uint64_t * Z = S ? S : &XY[2 * s];
|
||||
uint64_t i, j;
|
||||
size_t k;
|
||||
|
||||
if (Nloop == 0)
|
||||
return;
|
||||
|
||||
/* X <-- B' */
|
||||
for (i = 0; i < 2 * r; i++) {
|
||||
const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8];
|
||||
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
|
||||
salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8];
|
||||
for (k = 0; k < 16; k++)
|
||||
tmp->w[k] = le32dec(&src->w[k]);
|
||||
salsa20_simd_shuffle(tmp, dst);
|
||||
}
|
||||
|
||||
if (NROM) {
|
||||
/* 6: for i = 0 to N - 1 do */
|
||||
for (i = 0; i < Nloop; i += 2) {
|
||||
/* 7: j <-- Integerify(X) mod N */
|
||||
j = integerify(X, r) & (N - 1);
|
||||
|
||||
/* 8: X <-- H(X \xor V_j) */
|
||||
blkxor(X, &V[j * s], s);
|
||||
/* V_j <-- Xprev \xor V_j */
|
||||
if (rw)
|
||||
blkcpy(&V[j * s], X, s);
|
||||
blockmix(X, Y, Z, r);
|
||||
|
||||
j = integerify(Y, r);
|
||||
if (((i + 1) & VROM_mask) == 1) {
|
||||
/* j <-- Integerify(X) mod NROM */
|
||||
j &= NROM - 1;
|
||||
|
||||
/* X <-- H(X \xor VROM_j) */
|
||||
blkxor(Y, &VROM[j * s], s);
|
||||
} else {
|
||||
/* 7: j <-- Integerify(X) mod N */
|
||||
j &= N - 1;
|
||||
|
||||
/* 8: X <-- H(X \xor V_j) */
|
||||
blkxor(Y, &V[j * s], s);
|
||||
/* V_j <-- Xprev \xor V_j */
|
||||
if (rw)
|
||||
blkcpy(&V[j * s], Y, s);
|
||||
}
|
||||
|
||||
blockmix(Y, X, Z, r);
|
||||
}
|
||||
} else {
|
||||
/* 6: for i = 0 to N - 1 do */
|
||||
i = Nloop / 2;
|
||||
do {
|
||||
/* 7: j <-- Integerify(X) mod N */
|
||||
j = integerify(X, r) & (N - 1);
|
||||
|
||||
/* 8: X <-- H(X \xor V_j) */
|
||||
blkxor(X, &V[j * s], s);
|
||||
/* V_j <-- Xprev \xor V_j */
|
||||
if (rw)
|
||||
blkcpy(&V[j * s], X, s);
|
||||
blockmix(X, Y, Z, r);
|
||||
|
||||
/* 7: j <-- Integerify(X) mod N */
|
||||
j = integerify(Y, r) & (N - 1);
|
||||
|
||||
/* 8: X <-- H(X \xor V_j) */
|
||||
blkxor(Y, &V[j * s], s);
|
||||
/* V_j <-- Xprev \xor V_j */
|
||||
if (rw)
|
||||
blkcpy(&V[j * s], Y, s);
|
||||
blockmix(Y, X, Z, r);
|
||||
} while (--i);
|
||||
}
|
||||
|
||||
/* 10: B' <-- X */
|
||||
for (i = 0; i < 2 * r; i++) {
|
||||
const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8];
|
||||
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
|
||||
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8];
|
||||
for (k = 0; k < 16; k++)
|
||||
le32enc(&tmp->w[k], src->w[k]);
|
||||
salsa20_simd_unshuffle(tmp, dst);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* p2floor(x):
|
||||
* Largest power of 2 not greater than argument.
|
||||
*/
|
||||
static uint64_t
|
||||
p2floor(uint64_t x)
|
||||
{
|
||||
uint64_t y;
|
||||
while ((y = x & (x - 1)))
|
||||
x = y;
|
||||
return x;
|
||||
}
|
||||
|
||||
/**
|
||||
* smix(B, r, N, p, t, flags, V, NROM, shared, XY, S):
|
||||
* Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the
|
||||
* temporary storage V must be 128rN bytes in length; the temporary storage
|
||||
* XY must be 256r+64 or (256r+64)*p bytes in length (the larger size is
|
||||
* required with OpenMP-enabled builds). The value N must be a power of 2
|
||||
* greater than 1.
|
||||
*/
|
||||
static void
|
||||
smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t,
|
||||
yescrypt_flags_t flags,
|
||||
uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared,
|
||||
uint64_t * XY, uint64_t * S)
|
||||
{
|
||||
size_t s = 16 * r;
|
||||
uint64_t Nchunk = N / p, Nloop_all, Nloop_rw;
|
||||
uint32_t i;
|
||||
|
||||
Nloop_all = Nchunk;
|
||||
if (flags & YESCRYPT_RW) {
|
||||
if (t <= 1) {
|
||||
if (t)
|
||||
Nloop_all *= 2; /* 2/3 */
|
||||
Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */
|
||||
} else {
|
||||
Nloop_all *= t - 1;
|
||||
}
|
||||
} else if (t) {
|
||||
if (t == 1)
|
||||
Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */
|
||||
Nloop_all *= t;
|
||||
}
|
||||
|
||||
Nloop_rw = 0;
|
||||
if (flags & __YESCRYPT_INIT_SHARED)
|
||||
Nloop_rw = Nloop_all;
|
||||
else if (flags & YESCRYPT_RW)
|
||||
Nloop_rw = Nloop_all / p;
|
||||
|
||||
Nchunk &= ~(uint64_t)1; /* round down to even */
|
||||
Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */
|
||||
Nloop_rw &= ~(uint64_t)1; /* round down to even */
|
||||
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, shared, XY, S, s, Nchunk, Nloop_all, Nloop_rw)
|
||||
{
|
||||
#pragma omp for
|
||||
#endif
|
||||
for (i = 0; i < p; i++) {
|
||||
uint64_t Vchunk = i * Nchunk;
|
||||
uint64_t * Bp = &B[i * s];
|
||||
uint64_t * Vp = &V[Vchunk * s];
|
||||
#ifdef _OPENMP
|
||||
uint64_t * XYp = &XY[i * (2 * s + 8)];
|
||||
#else
|
||||
uint64_t * XYp = XY;
|
||||
#endif
|
||||
uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk);
|
||||
uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S;
|
||||
if (Sp)
|
||||
smix1(Bp, 1, S_SIZE_ALL / 16,
|
||||
flags & ~YESCRYPT_PWXFORM,
|
||||
Sp, NROM, shared, XYp, NULL);
|
||||
if (!(flags & __YESCRYPT_INIT_SHARED_2))
|
||||
smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp);
|
||||
smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp,
|
||||
NROM, shared, XYp, Sp);
|
||||
}
|
||||
|
||||
if (Nloop_all > Nloop_rw) {
|
||||
#ifdef _OPENMP
|
||||
#pragma omp for
|
||||
#endif
|
||||
for (i = 0; i < p; i++) {
|
||||
uint64_t * Bp = &B[i * s];
|
||||
#ifdef _OPENMP
|
||||
uint64_t * XYp = &XY[i * (2 * s + 8)];
|
||||
#else
|
||||
uint64_t * XYp = XY;
|
||||
#endif
|
||||
uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S;
|
||||
smix2(Bp, r, N, Nloop_all - Nloop_rw,
|
||||
flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp);
|
||||
}
|
||||
}
|
||||
#ifdef _OPENMP
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen,
|
||||
* N, r, p, t, flags, buf, buflen):
|
||||
* Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
|
||||
* p, buflen), or a revision of scrypt as requested by flags and shared, and
|
||||
* write the result into buf. The parameters r, p, and buflen must satisfy
|
||||
* r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power
|
||||
* of 2 greater than 1.
|
||||
*
|
||||
* t controls computation time while not affecting peak memory usage. shared
|
||||
* and flags may request special modes as described in yescrypt.h. local is
|
||||
* the thread-local data structure, allowing to preserve and reuse a memory
|
||||
* allocation across calls, thereby reducing its overhead.
|
||||
*
|
||||
* Return 0 on success; or -1 on error.
|
||||
*/
|
||||
int
|
||||
yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
|
||||
const uint8_t * passwd, size_t passwdlen,
|
||||
const uint8_t * salt, size_t saltlen,
|
||||
uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags,
|
||||
uint8_t * buf, size_t buflen)
|
||||
{
|
||||
yescrypt_region_t tmp;
|
||||
uint64_t NROM;
|
||||
size_t B_size, V_size, XY_size, need;
|
||||
uint64_t * B, * V, * XY, * S;
|
||||
uint64_t sha256[4];
|
||||
|
||||
/*
|
||||
* YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose,
|
||||
* so don't let it have side-effects. Without this adjustment, it'd
|
||||
* enable the SHA-256 password pre-hashing and output post-hashing,
|
||||
* because any deviation from classic scrypt implies those.
|
||||
*/
|
||||
if (p == 1)
|
||||
flags &= ~YESCRYPT_PARALLEL_SMIX;
|
||||
|
||||
/* Sanity-check parameters */
|
||||
if (flags & ~YESCRYPT_KNOWN_FLAGS) {
|
||||
errno = EINVAL;
|
||||
return -1;
|
||||
}
|
||||
#if SIZE_MAX > UINT32_MAX
|
||||
if (buflen > (((uint64_t)(1) << 32) - 1) * 32) {
|
||||
errno = EFBIG;
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) {
|
||||
errno = EFBIG;
|
||||
return -1;
|
||||
}
|
||||
if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) {
|
||||
errno = EINVAL;
|
||||
return -1;
|
||||
}
|
||||
if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) {
|
||||
errno = EINVAL;
|
||||
return -1;
|
||||
}
|
||||
#if S_MIN_R > 1
|
||||
if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) {
|
||||
errno = EINVAL;
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
if ((p > SIZE_MAX / ((size_t)256 * r + 64)) ||
|
||||
#if SIZE_MAX / 256 <= UINT32_MAX
|
||||
(r > SIZE_MAX / 256) ||
|
||||
#endif
|
||||
(N > SIZE_MAX / 128 / r)) {
|
||||
errno = ENOMEM;
|
||||
return -1;
|
||||
}
|
||||
if (N > UINT64_MAX / ((uint64_t)t + 1)) {
|
||||
errno = EFBIG;
|
||||
return -1;
|
||||
}
|
||||
#ifdef _OPENMP
|
||||
if (!(flags & YESCRYPT_PARALLEL_SMIX) &&
|
||||
(N > SIZE_MAX / 128 / (r * p))) {
|
||||
errno = ENOMEM;
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
if ((flags & YESCRYPT_PWXFORM) &&
|
||||
#ifndef _OPENMP
|
||||
(flags & YESCRYPT_PARALLEL_SMIX) &&
|
||||
#endif
|
||||
p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) {
|
||||
errno = ENOMEM;
|
||||
return -1;
|
||||
}
|
||||
|
||||
NROM = 0;
|
||||
if (shared->shared1.aligned) {
|
||||
NROM = shared->shared1.aligned_size / ((size_t)128 * r);
|
||||
if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) ||
|
||||
!(flags & YESCRYPT_RW)) {
|
||||
errno = EINVAL;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Allocate memory */
|
||||
V = NULL;
|
||||
V_size = (size_t)128 * r * N;
|
||||
#ifdef _OPENMP
|
||||
if (!(flags & YESCRYPT_PARALLEL_SMIX))
|
||||
V_size *= p;
|
||||
#endif
|
||||
need = V_size;
|
||||
if (flags & __YESCRYPT_INIT_SHARED) {
|
||||
if (local->aligned_size < need) {
|
||||
if (local->base || local->aligned ||
|
||||
local->base_size || local->aligned_size) {
|
||||
errno = EINVAL;
|
||||
return -1;
|
||||
}
|
||||
if (!alloc_region(local, need))
|
||||
return -1;
|
||||
}
|
||||
V = (uint64_t *)local->aligned;
|
||||
need = 0;
|
||||
}
|
||||
B_size = (size_t)128 * r * p;
|
||||
need += B_size;
|
||||
if (need < B_size) {
|
||||
errno = ENOMEM;
|
||||
return -1;
|
||||
}
|
||||
XY_size = (size_t)256 * r + 64;
|
||||
#ifdef _OPENMP
|
||||
XY_size *= p;
|
||||
#endif
|
||||
need += XY_size;
|
||||
if (need < XY_size) {
|
||||
errno = ENOMEM;
|
||||
return -1;
|
||||
}
|
||||
if (flags & YESCRYPT_PWXFORM) {
|
||||
size_t S_size = S_SIZE_ALL * sizeof(*S);
|
||||
#ifdef _OPENMP
|
||||
S_size *= p;
|
||||
#else
|
||||
if (flags & YESCRYPT_PARALLEL_SMIX)
|
||||
S_size *= p;
|
||||
#endif
|
||||
need += S_size;
|
||||
if (need < S_size) {
|
||||
errno = ENOMEM;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
if (flags & __YESCRYPT_INIT_SHARED) {
|
||||
if (!alloc_region(&tmp, need))
|
||||
return -1;
|
||||
B = (uint64_t *)tmp.aligned;
|
||||
XY = (uint64_t *)((uint8_t *)B + B_size);
|
||||
} else {
|
||||
init_region(&tmp);
|
||||
if (local->aligned_size < need) {
|
||||
if (free_region(local))
|
||||
return -1;
|
||||
if (!alloc_region(local, need))
|
||||
return -1;
|
||||
}
|
||||
B = (uint64_t *)local->aligned;
|
||||
V = (uint64_t *)((uint8_t *)B + B_size);
|
||||
XY = (uint64_t *)((uint8_t *)V + V_size);
|
||||
}
|
||||
S = NULL;
|
||||
if (flags & YESCRYPT_PWXFORM)
|
||||
S = (uint64_t *)((uint8_t *)XY + XY_size);
|
||||
|
||||
if (t || flags) {
|
||||
SHA256_CTX_Y ctx;
|
||||
SHA256_Init_Y(&ctx);
|
||||
SHA256_Update_Y(&ctx, passwd, passwdlen);
|
||||
SHA256_Final_Y((uint8_t *)sha256, &ctx);
|
||||
passwd = (uint8_t *)sha256;
|
||||
passwdlen = sizeof(sha256);
|
||||
}
|
||||
|
||||
/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
|
||||
PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1,
|
||||
(uint8_t *)B, B_size);
|
||||
|
||||
if (t || flags)
|
||||
blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0]));
|
||||
|
||||
if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) {
|
||||
smix(B, r, N, p, t, flags, V, NROM, shared, XY, S);
|
||||
} else {
|
||||
uint32_t i;
|
||||
|
||||
/* 2: for i = 0 to p - 1 do */
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, shared, XY, S)
|
||||
#endif
|
||||
for (i = 0; i < p; i++) {
|
||||
/* 3: B_i <-- MF(B_i, N) */
|
||||
#ifdef _OPENMP
|
||||
smix(&B[(size_t)16 * r * i], r, N, 1, t, flags,
|
||||
&V[(size_t)16 * r * i * N],
|
||||
NROM, shared,
|
||||
&XY[((size_t)32 * r + 8) * i],
|
||||
S ? &S[S_SIZE_ALL * i] : S);
|
||||
#else
|
||||
smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, V,
|
||||
NROM, shared, XY, S);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
|
||||
PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen);
|
||||
|
||||
/*
|
||||
* Except when computing classic scrypt, allow all computation so far
|
||||
* to be performed on the client. The final steps below match those of
|
||||
* SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so
|
||||
* far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of
|
||||
* SCRAM's use of SHA-1) would be usable with yescrypt hashes.
|
||||
*/
|
||||
if ((t || flags) && buflen == sizeof(sha256)) {
|
||||
/* Compute ClientKey */
|
||||
{
|
||||
HMAC_SHA256_CTX ctx;
|
||||
HMAC_SHA256_Init(&ctx, buf, buflen);
|
||||
HMAC_SHA256_Update(&ctx, salt, saltlen);
|
||||
HMAC_SHA256_Final((uint8_t *)sha256, &ctx);
|
||||
}
|
||||
/* Compute StoredKey */
|
||||
{
|
||||
SHA256_CTX_Y ctx;
|
||||
SHA256_Init_Y(&ctx);
|
||||
SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256));
|
||||
SHA256_Final_Y(buf, &ctx);
|
||||
}
|
||||
}
|
||||
|
||||
if (free_region(&tmp))
|
||||
return -1;
|
||||
|
||||
/* Success! */
|
||||
return 0;
|
||||
}
|
@@ -426,7 +426,7 @@ int64_t yescryptr16_get_max64()
|
||||
|
||||
bool register_yescrypt_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_yescrypt;
|
||||
gate->hash = (void*)&yescrypt_hash;
|
||||
gate->set_target = (void*)&scrypt_set_target;
|
||||
@@ -440,7 +440,7 @@ bool register_yescrypt_algo( algo_gate_t* gate )
|
||||
|
||||
bool register_yescryptr16_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX_OPT;
|
||||
gate->optimizations = SSE2_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_yescrypt;
|
||||
gate->hash = (void*)&yescrypt_hash;
|
||||
gate->set_target = (void*)&scrypt_set_target;
|
||||
|
708
avxdefs.h
708
avxdefs.h
@@ -1,71 +1,96 @@
|
||||
#ifndef AVXDEFS_H__
|
||||
#define AVXDEFS_H__
|
||||
|
||||
// Some tools to help using AVX and AVX2
|
||||
// At this time SSE2 is sufficient for all 128 bit code in this file.
|
||||
// Some tools to help using AVX and AVX2.
|
||||
// At this time SSE2 is sufficient for all 128 bit code in this file
|
||||
// but could change without notice.
|
||||
// 256 bit requires AVX2.
|
||||
// AVX512 has more powerful 256 bit instructions but with AVX512 available
|
||||
// there is little reason to use them.
|
||||
// Proper alignment of data is required, 16 bytes for 128 bit vectors and
|
||||
// 32 bytes for 256 bit vectors. 64 byte alignment is recommended for
|
||||
// best cache alignment.
|
||||
//
|
||||
// There exist dupplicates of some functions. In general the first defined
|
||||
// is preferred as it is more efficient but also more restrictive and may
|
||||
// not be applicable. The less efficient versions are more flexible.
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <immintrin.h>
|
||||
#include <memory.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
//
|
||||
// 128 bit utilities and shortcuts
|
||||
|
||||
//
|
||||
// Pseudo constants, there are no real vector constants.
|
||||
// These can't be used for compile time initialization.
|
||||
|
||||
// Constant zero
|
||||
#define mm_zero _mm_setzero_si128()
|
||||
#define mm_zero _mm_setzero_si128()
|
||||
|
||||
// Constant 1
|
||||
#define mm_one_128 _mm_set_epi64x( 0ULL, 1ULL )
|
||||
#define mm_one_64 _mm_set1_epi64x( 1ULL )
|
||||
#define mm_one_32 _mm_set1_epi32( 1UL )
|
||||
#define mm_one_16 _mm_set1_epi16( 1U )
|
||||
|
||||
// Constant minus 1
|
||||
#define mm_neg1 _mm_set1_epi64x( 0xFFFFFFFF )
|
||||
#define mm_neg1 _mm_set1_epi64x( 0xFFFFFFFFUL )
|
||||
|
||||
//
|
||||
// Basic operations without equivalent SIMD intrinsic
|
||||
|
||||
// Bitwise not (~x)
|
||||
#define mm_not( x ) _mm_xor_si128( (x), mm_neg1 )
|
||||
#define mm_not( x ) _mm_xor_si128( (x), mm_neg1 )
|
||||
|
||||
// Unary negation (-a)
|
||||
#define mm_negate_64( a ) _mm_sub_epi64( mm_zero, a )
|
||||
#define mm_negate_32( a ) _mm_sub_epi32( mm_zero, a )
|
||||
#define mm_negate_16( a ) _mm_sub_epi16( mm_zero, a )
|
||||
|
||||
//
|
||||
// Bit operations, functional but not very efficient
|
||||
// Bit operations
|
||||
|
||||
// Return x with bit n set/clear in all elements
|
||||
#define mm_bitset_128( x, n ) \
|
||||
_mm_or_si128( _mm_slli_si128( _mm_set_epi64x( 0ULL, 1ULL ), n ) )
|
||||
|
||||
#define mm_bitclr_128( x, n ) \
|
||||
_mm_and_si128( x, mm_not( _mm_slli_si128( \
|
||||
_mm_set_epi64x( 0ULL, 1ULL ), n ) ) )
|
||||
|
||||
#define mm_bitset_64( x, n ) \
|
||||
_mm_or_si128( _mm_slli_epi64( _mm_set1_epi64x( 1ULL ), n ) )
|
||||
|
||||
#define mm_bitclr_64( x, n ) \
|
||||
_mm_and_si128( x, mm_not( _mm_slli_epi64( _mm_set1_epi64x( 1ULL ), n ) ) )
|
||||
|
||||
#define mm_bitset_32( x, n ) \
|
||||
_mm_or_si128( _mm_slli_epi32( _mm_set1_epi32( 1UL ), n ) )
|
||||
|
||||
#define mm_bitclr_32( x, n ) \
|
||||
_mm_and_si128( x, mm_not( _mm_slli_epi32( _mm_set1_epi32( 1UL ), n ) ) )
|
||||
|
||||
#define mm_bitset_16( x, n ) \
|
||||
_mm_or_si128( _mm_slli_epi16( _mm_set1_epi16( 1U ), n ) )
|
||||
|
||||
#define mm_bitclr_16( x, n ) \
|
||||
_mm_and_si128( x, mm_not( _mm_slli_epi16( _mm_set1_epi16( 1U ), n ) ) )
|
||||
|
||||
// return vector of bool
|
||||
#define mm_bittest_128( x, n ) \
|
||||
_mm_and_si256( _mm_srli_si128( x, n ), _mm_set_epi64x( 0ULL, 1ULL ) )
|
||||
// Return bit n in position, all other bits zeroed.
|
||||
#define mm_bitextract_64 ( x, n ) \
|
||||
_mm_and_si128( _mm_set1_epi64x( 1ULL << (n) ), x )
|
||||
#define mm_bitextract_32 ( x, n ) \
|
||||
_mm_and_si128( _mm_set1_epi32( 1UL << (n) ), x )
|
||||
#define mm_bitextract_16 ( x, n ) \
|
||||
_mm_and_si128( _mm_set1_epi16( 1U << (n) ), x )
|
||||
|
||||
// Return bit n as bool
|
||||
#define mm_bittest_64( x, n ) \
|
||||
_mm_and_si256( _mm_srli_epi64( x, n ), _mm_set1_epi64x( 1ULL ) )
|
||||
|
||||
_mm_and_si256( mm_one_64, _mm_srli_epi64( x, n ) )
|
||||
#define mm_bittest_32( x, n ) \
|
||||
_mm_and_si256( _mm_srli_epi32( x, n ), _mm_set1_epi32( 1UL ) )
|
||||
|
||||
_mm_and_si256( mm_one_32, _mm_srli_epi32( x, n ) )
|
||||
#define mm_bittest_16( x, n ) \
|
||||
_mm_and_si256( _mm_srli_epi16( x, n ), _mm_set1_epi16( 1U ) )
|
||||
_mm_and_si256( mm_one_16, _mm_srli_epi16( x, n ) )
|
||||
|
||||
// Return x with bit n set/cleared in all elements
|
||||
#define mm_bitset_64( x, n ) \
|
||||
_mm_or_si128( _mm_slli_epi64( mm_one_64, n ), x )
|
||||
#define mm_bitclr_64( x, n ) \
|
||||
_mm_andnot_si128( _mm_slli_epi64( mm_one_64, n ), x )
|
||||
#define mm_bitset_32( x, n ) \
|
||||
_mm_or_si128( _mm_slli_epi32( mm_one_32, n ), x )
|
||||
#define mm_bitclr_32( x, n ) \
|
||||
_mm_andnot_si128( _mm_slli_epi32( mm_one_32, n ), x )
|
||||
#define mm_bitset_16( x, n ) \
|
||||
_mm_or_si128( _mm_slli_epi16( mm_one_16, n ), x )
|
||||
#define mm_bitclr_16( x, n ) \
|
||||
_mm_andnot_si128( _mm_slli_epi16( mm_one_16, n ), x )
|
||||
|
||||
// Return x with bit n toggled
|
||||
#define mm_bitflip_64( x, n ) \
|
||||
_mm_xor_si128( _mm_slli_epi64( mm_one_64, n ), x )
|
||||
#define mm_bitflip_32( x, n ) \
|
||||
_mm_xor_si128( _mm_slli_epi32( mm_one_32, n ), x )
|
||||
#define mm_bitflip_16( x, n ) \
|
||||
_mm_xor_si128( _mm_slli_epi16( mm_one_16, n ), x )
|
||||
|
||||
|
||||
//
|
||||
// Memory functions
|
||||
@@ -86,13 +111,33 @@ inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
|
||||
for ( int i = 0; i < n; i ++ ) dst[i] = src[i];
|
||||
}
|
||||
|
||||
// Scalar 64 bit copy, n = bytes/8
|
||||
inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n )
|
||||
// Compare data in memory, return true if different
|
||||
inline bool memcmp_128( __m128i src1, __m128i src2, int n )
|
||||
{
|
||||
for ( int i = 0; i < n; i++ )
|
||||
dst[i] = src[i];
|
||||
if ( src1[i] != src2[i] ) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
// A couple of 64 bit scalar functions
|
||||
// n = bytes/8
|
||||
|
||||
inline void memcpy_64( uint64_t *dst, const uint64_t *src, int n )
|
||||
{
|
||||
for ( int i = 0; i < n; i++ ) dst[i] = src[i];
|
||||
}
|
||||
|
||||
inline void memset_zero_64( uint64_t *src, int n )
|
||||
{
|
||||
for ( int i = 0; i < n; i++ ) src[i] = 0;
|
||||
}
|
||||
|
||||
inline void memset_64( uint64_t *dst, uint64_t a, int n )
|
||||
{
|
||||
for ( int i = 0; i < n; i++ ) dst[i] = a;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Pointer cast
|
||||
|
||||
@@ -108,149 +153,136 @@ inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n )
|
||||
// returns p[i]
|
||||
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
|
||||
|
||||
//
|
||||
// Bit rotations
|
||||
|
||||
// XOP is an obsolete AMD feature that has native rotation.
|
||||
// _mm_roti_epi64( w, c)
|
||||
// Never implemented by Intel and since removed from Zen by AMD.
|
||||
|
||||
// Rotate bits in vector elements
|
||||
|
||||
#define mm_rotr_64( w, c ) _mm_or_si128( _mm_srli_epi64( w, c ), \
|
||||
_mm_slli_epi64( w, 64-c ) )
|
||||
|
||||
_mm_slli_epi64( w, 64-(c) ) )
|
||||
#define mm_rotl_64( w, c ) _mm_or_si128( _mm_slli_epi64( w, c ), \
|
||||
_mm_srli_epi64( w, 64-c ) )
|
||||
|
||||
_mm_srli_epi64( w, 64-(c) ) )
|
||||
#define mm_rotr_32( w, c ) _mm_or_si128( _mm_srli_epi32( w, c ), \
|
||||
_mm_slli_epi32( w, 32-c ) )
|
||||
|
||||
_mm_slli_epi32( w, 32-(c) ) )
|
||||
#define mm_rotl_32( w, c ) _mm_or_si128( _mm_slli_epi32( w, c ), \
|
||||
_mm_srli_epi32( w, 32-c ) )
|
||||
|
||||
_mm_srli_epi32( w, 32-(c) ) )
|
||||
#define mm_rotr_16( w, c ) _mm_or_si128( _mm_srli_epi16( w, c ), \
|
||||
_mm_slli_epi16( w, 16-c ) )
|
||||
|
||||
_mm_slli_epi16( w, 16-(c) ) )
|
||||
#define mm_rotl_16( w, c ) _mm_or_si128( _mm_slli_epi16( w, c ), \
|
||||
_mm_srli_epi16( w, 16-c ) )
|
||||
_mm_srli_epi16( w, 16-(c) ) )
|
||||
|
||||
//
|
||||
// Shuffle vector elements
|
||||
// Rotate elements in vector
|
||||
|
||||
// Swap upper and lower 64 bits of 128 bit source vector
|
||||
#define mm_swap_64(s) _mm_shuffle_epi32( s, 0x4e )
|
||||
// Optimized shuffle
|
||||
|
||||
// Rotate 128 vector by 1 32 bit element.
|
||||
// Swap hi/lo 64 bits in 128 bit vector
|
||||
#define mm_swap_64( w ) _mm_shuffle_epi32( w, 0x4e )
|
||||
|
||||
// rotate 128 bit vector by 32 bits
|
||||
#define mm_rotr_1x32( w ) _mm_shuffle_epi32( w, 0x39 )
|
||||
#define mm_rotl_1x32( w ) _mm_shuffle_epi32( w, 0x93 )
|
||||
|
||||
// Shuffle elements across two 128 bit vectors
|
||||
// Swap hi/lo 32 bits in each 64 bit element
|
||||
#define mm_swap64_32( x ) _mm_shuffle_epi32( x, 0xb1 )
|
||||
|
||||
// Swap 128 bit source vectors in place.
|
||||
// Less efficient but more versatile. Use only for odd number rotations.
|
||||
// Use shuffle above when possible.
|
||||
|
||||
// Rotate vector by n bytes.
|
||||
#define mm_rotr128_x8( w, n ) \
|
||||
_mm_or_si128( _mm_srli_si128( w, n ), _mm_slli_si128( w, 16-(n) ) )
|
||||
#define mm_rotl128_x8( w, n ) \
|
||||
_mm_or_si128( _mm_slli_si128( w, n ), _mm_srli_si128( w, 16-(n) ) )
|
||||
|
||||
// Rotate vector by c elements, use only for odd number rotations
|
||||
#define mm_rotr128_x32( w, c ) mm_rotr128_x8( w, (c)>>2 )
|
||||
#define mm_rotl128_x32( w, c ) mm_rotl128_x8( w, (c)>>2 )
|
||||
#define mm_rotr128_x16( w, c ) mm_rotr128_x8( w, (c)>>1 )
|
||||
#define mm_rotl128_x16( w, c ) mm_rotl128_x8( w, (c)>>1 )
|
||||
|
||||
//
|
||||
// Rotate elements across two 128 bit vectors as one 256 bit vector {hi,lo}
|
||||
|
||||
// Swap 128 bit source vectors in place, aka rotate 256 bits by 128 bits.
|
||||
// void mm128_swap128( __m128i, __m128i )
|
||||
#define mm_swap_128(hi, lo) hi = _mm_xor_si128(hi, lo); \
|
||||
lo = _mm_xor_si128(hi, lo); \
|
||||
hi = _mm_xor_si128(hi, lo);
|
||||
|
||||
// Rotate two 128 bit vectors in place as one 256 vector by 1 element
|
||||
#define mm_rotl256_1x64( s0, s1 ) \
|
||||
do { \
|
||||
__m128i t; \
|
||||
s0 = mm_swap_64( s0 ); \
|
||||
s1 = mm_swap_64( s1 ); \
|
||||
t = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
|
||||
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
|
||||
s0 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm_rotr256_1x64( s0, s1 ) \
|
||||
do { \
|
||||
__m128i t; \
|
||||
s0 = mm_swap_64( s0 ); \
|
||||
s1 = mm_swap_64( s1 ); \
|
||||
t = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
|
||||
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
|
||||
s0 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm_rotl256_1x32( s0, s1 ) \
|
||||
do { \
|
||||
__m128i t; \
|
||||
s0 = mm_swap_64( s0 ); \
|
||||
s1 = mm_swap_64( s1 ); \
|
||||
t = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \
|
||||
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
|
||||
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \
|
||||
0ul, 0ul, 0ul, 0xfffffffful )); \
|
||||
s0 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm_rotr256_1x32( s0, s1 ) \
|
||||
do { \
|
||||
__m128i t; \
|
||||
s0 = mm_swap_64( s0 ); \
|
||||
s1 = mm_swap_64( s1 ); \
|
||||
t = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \
|
||||
0ul, 0ul, 0ul, 0xfffffffful )); \
|
||||
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \
|
||||
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
|
||||
s0 = t; \
|
||||
} while(0)
|
||||
|
||||
// Older slower
|
||||
#define mm_rotl256_1x64x( s0, s1 ) \
|
||||
do { \
|
||||
__m128i t; \
|
||||
s0 = mm_swap_64( s0 ); \
|
||||
s1 = mm_swap_64( s1 ); \
|
||||
t = _mm_or_si128( \
|
||||
_mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
|
||||
_mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
|
||||
s1 = _mm_or_si128( \
|
||||
_mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
|
||||
_mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
|
||||
s0 = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm_rotr256_1x64x( s0, s1 ) \
|
||||
do { \
|
||||
__m128i t; \
|
||||
s0 = mm_swap_64( s0 ) ; \
|
||||
s1 = mm_swap_64( s1 ); \
|
||||
t = _mm_or_si128( \
|
||||
_mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
|
||||
_mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
|
||||
s1 = _mm_or_si128( \
|
||||
_mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
|
||||
_mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
|
||||
s0 = t; \
|
||||
} while(0)
|
||||
|
||||
// need a better name, not rot, poke? step?
|
||||
// Return s0 with elements shifted right/left and low/high element from
|
||||
// s1 shifted into the vacated high/low element of s0.
|
||||
// Partially rotate elements in two 128 bit vectors as one 256 bit vector
|
||||
// and return the rotated s0.
|
||||
// Similar to mm_rotr256_1x32 but only a partial rotation as s1 is not
|
||||
// completed. It's faster than a full rotation.
|
||||
|
||||
inline __m128i mm_rotr256_32( __m128i s0, __m128i s1, int n )
|
||||
{
|
||||
return _mm_or_si128( _mm_srli_si128( s0, n<<2 ),
|
||||
_mm_slli_si128( s1, 16 - (n<<2) ) );
|
||||
#define mm_swap_128(hi, lo) \
|
||||
{ \
|
||||
hi = _mm_xor_si128(hi, lo); \
|
||||
lo = _mm_xor_si128(hi, lo); \
|
||||
hi = _mm_xor_si128(hi, lo); \
|
||||
}
|
||||
|
||||
inline __m128i mm_rotl256_32( __m128i s0, __m128i s1, int n )
|
||||
// Rotate two 128 bit vectors in place as one 256 vector by 1 element
|
||||
#define mm_rotl256_1x64( hi, lo ) \
|
||||
do { \
|
||||
__m128i t; \
|
||||
hi = mm_swap_64( hi ); \
|
||||
lo = mm_swap_64( lo ); \
|
||||
t = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
|
||||
lo = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
|
||||
hi = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm_rotr256_1x64( hi, lo ) \
|
||||
do { \
|
||||
__m128i t; \
|
||||
hi = mm_swap_64( hi ); \
|
||||
lo = mm_swap_64( lo ); \
|
||||
t = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
|
||||
lo = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
|
||||
hi = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm_rotl256_1x32( hi, lo ) \
|
||||
do { \
|
||||
__m128i t; \
|
||||
hi = mm_swap_64( hi ); \
|
||||
lo = mm_swap_64( lo ); \
|
||||
t = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \
|
||||
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
|
||||
lo = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \
|
||||
0ul, 0ul, 0ul, 0xfffffffful )); \
|
||||
hi = t; \
|
||||
} while(0)
|
||||
|
||||
#define mm_rotr256_1x32( hi, lo ) \
|
||||
do { \
|
||||
__m128i t; \
|
||||
hi = mm_swap_64( hi ); \
|
||||
lo = mm_swap_64( lo ); \
|
||||
t = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \
|
||||
0ul, 0ul, 0ul, 0xfffffffful )); \
|
||||
lo = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \
|
||||
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
|
||||
hi = t; \
|
||||
} while(0)
|
||||
|
||||
// Return hi 128 bits with elements shifted one lane with vacated lane filled
|
||||
// with data rotated from lo.
|
||||
// Partially rotate elements in two 128 bit vectors as one 256 bit vector
|
||||
// and return the rotated high 128 bits.
|
||||
// Similar to mm_rotr256_1x32 but only a partial rotation as lo is not
|
||||
// completed. It's faster than a full rotation.
|
||||
|
||||
inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n )
|
||||
{
|
||||
return _mm_or_si128( _mm_slli_si128( s0, n<<2 ),
|
||||
_mm_srli_si128( s1, 16 - (n<<2) ) );
|
||||
return _mm_or_si128( _mm_srli_si128( hi, n<<2 ),
|
||||
_mm_slli_si128( lo, 16 - (n<<2) ) );
|
||||
}
|
||||
|
||||
inline __m128i mm_rotl256hi_1x32( __m128i hi, __m128i lo, int n )
|
||||
{
|
||||
return _mm_or_si128( _mm_slli_si128( hi, n<<2 ),
|
||||
_mm_srli_si128( lo, 16 - (n<<2) ) );
|
||||
}
|
||||
|
||||
//
|
||||
// Swap bytes in vector elements
|
||||
|
||||
inline __m128i mm_byteswap_32( __m128i x )
|
||||
{
|
||||
return _mm_shuffle_epi8( x, _mm_set_epi8(
|
||||
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
|
||||
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
|
||||
}
|
||||
|
||||
inline __m128i mm_byteswap_64( __m128i x )
|
||||
{
|
||||
return _mm_shuffle_epi8( x, _mm_set_epi8(
|
||||
@@ -258,96 +290,95 @@ inline __m128i mm_byteswap_64( __m128i x )
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
|
||||
}
|
||||
|
||||
// older slower
|
||||
inline __m128i mm_byteswap_32x( __m128i x )
|
||||
inline __m128i mm_byteswap_32( __m128i x )
|
||||
{
|
||||
__m128i x1 = _mm_and_si128( x, _mm_set1_epi32( 0x0000ff00 ) );
|
||||
__m128i x2 = _mm_and_si128( x, _mm_set1_epi32( 0x00ff0000 ) );
|
||||
__m128i x0 = _mm_slli_epi32( x, 24 ); // x0 = x << 24
|
||||
x1 = _mm_slli_epi32( x1, 8 ); // x1 = mask(x) << 8
|
||||
x2 = _mm_srli_epi32( x2, 8 ); // x2 = mask(x) >> 8
|
||||
__m128i x3 = _mm_srli_epi32( x, 24 ); // x3 = x >> 24
|
||||
return _mm_or_si128( _mm_or_si128( x0, x1 ), _mm_or_si128( x2, x3 ) );
|
||||
return _mm_shuffle_epi8( x, _mm_set_epi8(
|
||||
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
|
||||
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
|
||||
}
|
||||
|
||||
inline __m128i mm_byteswap_64x( __m128i x )
|
||||
inline __m128i mm_byteswap_16( __m128i x )
|
||||
{
|
||||
x = _mm_or_si128( _mm_srli_epi64( x, 32 ), _mm_slli_epi64( x, 32 ));
|
||||
|
||||
x = _mm_or_si128( _mm_srli_epi64( _mm_and_si128( x,
|
||||
_mm_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ),
|
||||
_mm_slli_epi64( _mm_and_si128( x,
|
||||
_mm_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 ));
|
||||
|
||||
return _mm_or_si128( _mm_srli_epi64( _mm_and_si128( x,
|
||||
_mm_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ),
|
||||
_mm_slli_epi64( _mm_and_si128( x,
|
||||
_mm_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 ));
|
||||
return _mm_shuffle_epi8( x, _mm_set_epi8(
|
||||
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
|
||||
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
//
|
||||
// 256 bit utilities and Shortcuts
|
||||
|
||||
//
|
||||
// Pseudo constants, there are no real vector constants.
|
||||
// These can't be used for compile time initialization
|
||||
|
||||
// Constant zero
|
||||
#define mm256_zero _mm256_setzero_si256()
|
||||
|
||||
// Constant 1
|
||||
#define mm256_one_128 _mm256_set_epi64x( 0ULL, 1ULL, 0ULL, 1ULL )
|
||||
#define mm256_one_64 _mm256_set1_epi64x( 1ULL )
|
||||
#define mm256_one_32 _mm256_set1_epi32( 1UL )
|
||||
#define mm256_one_16 _mm256_set1_epi16( 1U )
|
||||
|
||||
// Constant minus 1
|
||||
#define mm256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFF )
|
||||
#define mm256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
|
||||
|
||||
//
|
||||
// Basic operations without SIMD equivalent
|
||||
|
||||
// Bitwise not ( ~x )
|
||||
#define mm256_not( x ) _mm256_xor_si256( (x), mm256_neg1 ) \
|
||||
#define mm256_not( x ) _mm256_xor_si256( (x), mm256_neg1 ) \
|
||||
|
||||
// Unary negation ( -a )
|
||||
#define mm256_negate_64( a ) _mm256_sub_epi64( mm256_zero, a )
|
||||
#define mm256_negate_32( a ) _mm256_sub_epi32( mm256_zero, a )
|
||||
#define mm256_negate_16( a ) _mm256_sub_epi16( mm256_zero, a )
|
||||
|
||||
//
|
||||
// Bit operations
|
||||
|
||||
// Return x with bit n set/clear in all elements
|
||||
#define mm256_bitset_128( x, n ) \
|
||||
_mm256_or_si256( _mm256_slli_si256( _mm256_set_m128i( 1U, 1U ), n ) )
|
||||
|
||||
#define mm256_bitclr_128( x, n ) \
|
||||
_mm256_and_si256( x, mm256_not( \
|
||||
_mm256_slli_si256( _mm256_set_m128i( 1U, 1U ), n ) ) )
|
||||
|
||||
#define mm256_bitset_64( x, n ) \
|
||||
_mm256_or_si256( x, _mm256_set1_epi64x( 1ULL << n ) )
|
||||
|
||||
#define mm256_bitclr_64( x, n ) \
|
||||
_mm256_and_si256( x, mm256_not( _mm256_set1_epi64x( 1ULL << n ) ) )
|
||||
|
||||
#define mm256_bitset_32( x, n ) \
|
||||
_mm256_or_si256( x, _mm256_set1_epi32( 1UL << n ) )
|
||||
|
||||
#define mm256_bitclr_32( x, n ) \
|
||||
_mm256_and_si256( x, mm256_not( _mm256_set1_epi32( 1UL << n ) ) )
|
||||
|
||||
#define mm256_bitset_16( x, n ) \
|
||||
_mm256_or_si256( x, _mm256_set1_epi16( 1U << n ) )
|
||||
|
||||
#define mm256_bitclr_16( x, n ) \
|
||||
_mm256_and_si256( x, mm256_not( _mm256_set1_epi16( 1U << n ) ) )
|
||||
|
||||
// return vector of bool
|
||||
#define mm256_bittest_128( x, n ) \
|
||||
_mm256_and_si256( _mm256_srli_si256( x, n ), \
|
||||
_mm256_set_m128i( _mm_set_epi64x( 0ULL, 1ULL ) ) )
|
||||
// return bit n in position, all othr bits cleared
|
||||
#define mm256_bitextract_64 ( x, n ) \
|
||||
_mm256_and_si128( _mm256_set1_epi64x( 0ULL << (n) ), x )
|
||||
#define mm256_bitextract_32 ( x, n ) \
|
||||
_mm256_and_si128( _mm256_set1_epi32( 0UL << (n) ), x )
|
||||
#define mm256_bitextract_16 ( x, n ) \
|
||||
_mm256_and_si128( _mm256_set1_epi16( 0U << (n) ), x )
|
||||
|
||||
// Return bit n as bool (bit 0)
|
||||
#define mm256_bittest_64( x, n ) \
|
||||
_mm256_and_si256( _mm256_srli_epi64( x, n ), \
|
||||
_mm256_set1_epi64x( 1ULL << n ) )
|
||||
|
||||
_mm256_and_si256( mm256_one_64, _mm256_srli_epi64( x, n ) )
|
||||
#define mm256_bittest_32( x, n ) \
|
||||
_mm256_and_si256( _mm256_srli_epi32( x, n ), \
|
||||
_mm256_set1_epi32( 1UL << n ) )
|
||||
|
||||
_mm256_and_si256( mm256_one_32, _mm256_srli_epi32( x, n ) )
|
||||
#define mm256_bittest_16( x, n ) \
|
||||
_mm256_and_si256( _mm256_srli_epi16( x, n ), \
|
||||
_mm256_set1_epi16( 1U << n ) )
|
||||
_mm256_and_si256( mm256_one_16, _mm256_srli_epi16( x, n ) )
|
||||
|
||||
// Return x with bit n set/cleared in all elements
|
||||
#define mm256_bitset_64( x, n ) \
|
||||
_mm256_or_si256( _mm256_set1_epi64x( 1ULL << (n) ), x )
|
||||
#define mm256_bitclr_64( x, n ) \
|
||||
_mm256_andnot_si256( _mm256_set1_epi64x( 1ULL << (n) ), x )
|
||||
#define mm256_bitset_32( x, n ) \
|
||||
_mm256_or_si256( _mm256_set1_epi32( 1UL << (n) ), x )
|
||||
#define mm256_bitclr_32( x, n ) \
|
||||
_mm256_andnot_si256( mm256_not( _mm256_set1_epi32( 1UL << (n) ), x )
|
||||
#define mm256_bitset_16( x, n ) \
|
||||
_mm256_or_si256( _mm256_set1_epi16( 1U << (n) ), x )
|
||||
#define mm256_bitclr_16( x, n ) \
|
||||
_mm256_andnot_si256( _mm256_set1_epi16( 1U << (n) ), x )
|
||||
|
||||
// Return x with bit n toggled
|
||||
#define mm256_bitflip_64( x, n ) \
|
||||
_mm256_xor_si128( _mm256_slli_epi64( mm256_one_64, n ), x )
|
||||
#define mm256_bitflip_32( x, n ) \
|
||||
_mm256_xor_si128( _mm256_slli_epi32( mm256_one_32, n ), x )
|
||||
#define mm256_bitflip_16( x, n ) \
|
||||
_mm256_xor_si128( _mm256_slli_epi16( mm256_one_16, n ), x )
|
||||
|
||||
|
||||
//
|
||||
// Memory functions
|
||||
@@ -368,6 +399,14 @@ inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
|
||||
for ( int i = 0; i < n; i ++ ) dst[i] = src[i];
|
||||
}
|
||||
|
||||
// Compare data in memory, return true if different
|
||||
inline bool memcmp_256( __m256i src1, __m256i src2, int n )
|
||||
{
|
||||
for ( int i = 0; i < n; i++ )
|
||||
if ( src1[i] != src2[i] ) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
//
|
||||
// Pointer casting
|
||||
|
||||
@@ -383,39 +422,128 @@ inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
|
||||
// returns p[i]
|
||||
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
|
||||
|
||||
//
|
||||
// Bit rotations
|
||||
|
||||
//
|
||||
// Rotate bits in vector elements
|
||||
// w = packed data, c = number of bits to rotate
|
||||
|
||||
// Rotate bits in 64 bit elements
|
||||
// w = packed 64 bit data, c = number of bits to rotate
|
||||
#define mm256_rotr_64( w, c ) \
|
||||
_mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) )
|
||||
|
||||
_mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64-(c)) )
|
||||
#define mm256_rotl_64( w, c ) \
|
||||
_mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64 - c) )
|
||||
|
||||
// Rotate bits in 32 bit elements
|
||||
_mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64-(c)) )
|
||||
#define mm256_rotr_32( w, c ) \
|
||||
_mm256_or_si256( _mm256_srli_epi32(w, c), _mm256_slli_epi32(w, 32 - c) )
|
||||
|
||||
_mm256_or_si256( _mm256_srli_epi32(w, c), _mm256_slli_epi32(w, 32-(c)) )
|
||||
#define mm256_rotl_32( w, c ) \
|
||||
_mm256_or_si256( _mm256_slli_epi32(w, c), _mm256_srli_epi32(w, 32 - c) )
|
||||
_mm256_or_si256( _mm256_slli_epi32(w, c), _mm256_srli_epi32(w, 32-(c)) )
|
||||
#define mm256_rotr_16( w, c ) \
|
||||
_mm256_or_si256( _mm256_srli_epi16(w, c), _mm256_slli_epi16(w, 32-(c)) )
|
||||
#define mm256_rotl_16( w, c ) \
|
||||
_mm256_or_si256( _mm256_slli_epi16(w, c), _mm256_srli_epi16(w, 32-(c)) )
|
||||
|
||||
//
|
||||
// Rotate elements in vector
|
||||
// There is no full vector permute for elements less than 64 bits or 256 bit
|
||||
// shift, a little more work is needed.
|
||||
|
||||
// Swap 128 bit elements (aka rotate by two 64 bit, four 32 bit elements))
|
||||
// Identical functionality but "f" is AVX and "x" iis AVX2, likely faster.
|
||||
#define mm256_swap_128( w ) _mm256_permute2x128_si256( w, w, 1 )
|
||||
//#define mm256_swap_128( w ) _mm256_permute2f128_si256( w, w, 1 )
|
||||
// Optimized 64 bit permutations
|
||||
// Swap 128, aka rotate 2x64, 4x32, 8x16, 16x8
|
||||
#define mm256_swap_128( w ) _mm256_permute4x64_epi64( w, 0x4e )
|
||||
//#define mm256_swap_128( w ) _mm256_permute2x128_si256( w, w, 1 )
|
||||
|
||||
// Rotate vector by one 64 bit element (aka two 32 bit elements)
|
||||
//__m256i mm256_rotl256_1x64( _mm256i, int )
|
||||
// Rotate 256 bit vector by one 64 bit element, aka 2x32, 4x16, 8x8
|
||||
#define mm256_rotl256_1x64( w ) _mm256_permute4x64_epi64( w, 0x93 )
|
||||
#define mm256_rotr256_1x64( w ) _mm256_permute4x64_epi64( w, 0x39 )
|
||||
|
||||
// Rotate by one 32 bit element (aka two 16 bit elements)
|
||||
#define mm256_rotl256_1x32( w ) _mm256_shuffle_epi32( w, 0x93 )
|
||||
#define mm256_rotr256_1x32( w ) _mm256_shuffle_epi32( w, 0x39 )
|
||||
// Swap hi/lo 64 bits in each 128 bit element
|
||||
#define mm256_swap128_64( x ) _mm256_shuffle_epi32( x, 0x4e )
|
||||
|
||||
// Rotate 128 bit elements by 32 bits
|
||||
#define mm256_rotr128_1x32( x ) _mm256_shuffle_epi32( x, 0x39 )
|
||||
#define mm256_rotl128_1x32( x ) _mm256_shuffle_epi32( x, 0x93 )
|
||||
|
||||
// Swap hi/lo 32 bits in each 64 bit element
|
||||
#define mm256_swap64_32( x ) _mm256_shuffle_epi32( x, 0xb1 )
|
||||
|
||||
// Less efficient but more versatile. Use only for rotations that are not
|
||||
// integrals of 64 bits. Use permutations above when possible.
|
||||
|
||||
// Rotate 256 bit vector by c bytes.
|
||||
#define mm256_rotr256_x8( w, c ) \
|
||||
_mm256_or_si256( _mm256_srli_si256( w, c ), \
|
||||
mm256_swap_128( _mm256i_slli_si256( w, 32-(c) ) ) )
|
||||
#define mm256_rotl256_x8( w, c ) \
|
||||
_mm256_or_si256( _mm256_slli_si256( w, c ), \
|
||||
mm256_swap_128( _mm256i_srli_si256( w, 32-(c) ) ) )
|
||||
|
||||
// Rotate 256 bit vector by c elements, use only for odd value rotations
|
||||
#define mm256_rotr256_x32( w, c ) mm256_rotr256_x8( w, (c)>>2 )
|
||||
#define mm256_rotl256_x32( w, c ) mm256_rotl256_x8( w, (c)>>2 )
|
||||
#define mm256_rotr256_x16( w, c ) mm256_rotr256_x8( w, (c)>>1 )
|
||||
#define mm256_rotl256_x16( w, c ) mm256_rotl256_x8( w, (c)>>1 )
|
||||
|
||||
//
|
||||
// Rotate two 256 bit vectors as one 512 bit vector
|
||||
|
||||
// Fast but limited to 128 bit granularity
|
||||
#define mm256_swap512_256(a, b) _mm256_permute2x128_si256( a, b, 0x1032 )
|
||||
#define mm256_rotr512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x0321 )
|
||||
#define mm256_rotl512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x2103 )
|
||||
|
||||
// Much slower, for 64 and 32 bit granularity
|
||||
#define mm256_rotr512_1x64(a, b) \
|
||||
do { \
|
||||
__m256i t; \
|
||||
t = _mm256_or_si256( _mm256_srli_si256(a,8), _mm256_slli_si256(b,24) ); \
|
||||
b = _mm256_or_si256( _mm256_srli_si256(b,8), _mm256_slli_si256(a,24) ); \
|
||||
a = t; \
|
||||
while (0);
|
||||
|
||||
#define mm256_rotl512_1x64(a, b) \
|
||||
do { \
|
||||
__m256i t; \
|
||||
t = _mm256_or_si256( _mm256_slli_si256(a,8), _mm256_srli_si256(b,24) ); \
|
||||
b = _mm256_or_si256( _mm256_slli_si256(b,8), _mm256_srli_si256(a,24) ); \
|
||||
a = t; \
|
||||
while (0);
|
||||
|
||||
#define mm256_rotr512_1x32(a, b) \
|
||||
do { \
|
||||
__m256i t; \
|
||||
t = _mm256_or_si256( _mm256_srli_si256(a,4), _mm256_slli_si256(b,28) ); \
|
||||
b = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a,28) ); \
|
||||
a = t; \
|
||||
while (0);
|
||||
|
||||
#define mm256_rotl512_1x32(a, b) \
|
||||
do { \
|
||||
__m256i t; \
|
||||
t = _mm256_or_si256( _mm256_slli_si256(a,4), _mm256_srli_si256(b,28) ); \
|
||||
b = _mm256_or_si256( _mm256_slli_si256(b,4), _mm256_srli_si256(a,28) ); \
|
||||
a = t; \
|
||||
while (0);
|
||||
|
||||
// Byte granularity but even a bit slower
|
||||
#define mm256_rotr512_x8( a, b, n ) \
|
||||
do { \
|
||||
__m256i t; \
|
||||
t = _mm256_or_si256( _mm256_srli_epi64( a, n ), \
|
||||
_mm256_slli_epi64( b, ( 32 - (n) ) ) ); \
|
||||
b = _mm256_or_si256( _mm256_srli_epi64( b, n ), \
|
||||
_mm256_slli_epi64( a, ( 32 - (n) ) ) ); \
|
||||
a = t; \
|
||||
while (0);
|
||||
|
||||
#define mm256_rotl512_x8( a, b, n ) \
|
||||
do { \
|
||||
__m256i t; \
|
||||
t = _mm256_or_si256( _mm256_slli_epi64( a, n ), \
|
||||
_mm256_srli_epi64( b, ( 32 - (n) ) ) ); \
|
||||
b = _mm256_or_si256( _mm256_slli_epi64( b, n ), \
|
||||
_mm256_srli_epi64( a, ( 32 - (n) ) ) ); \
|
||||
a = t; \
|
||||
while (0);
|
||||
|
||||
//
|
||||
// Swap bytes in vector elements
|
||||
@@ -438,47 +566,30 @@ inline __m256i mm256_byteswap_32( __m256i x )
|
||||
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
|
||||
}
|
||||
|
||||
// older, slower
|
||||
inline __m256i mm256_byteswap_32x( __m256i x )
|
||||
inline __m256i mm256_byteswap_16( __m256i x )
|
||||
{
|
||||
__m256i x1 = _mm256_and_si256( x, _mm256_set1_epi32( 0x0000ff00 ) );
|
||||
__m256i x2 = _mm256_and_si256( x, _mm256_set1_epi32( 0x00ff0000 ) );
|
||||
__m256i x0 = _mm256_slli_epi32( x, 24 ); // x0 = x << 24
|
||||
x1 = _mm256_slli_epi32( x1, 8 ); // x1 = mask1(x) << 8
|
||||
x2 = _mm256_srli_epi32( x2, 8 ); // x2 = mask2(x) >> 8
|
||||
__m256i x3 = _mm256_srli_epi32( x, 24 ); // x3 = x >> 24
|
||||
return _mm256_or_si256( _mm256_or_si256( x0, x1 ),
|
||||
_mm256_or_si256( x2, x3 ) );
|
||||
}
|
||||
|
||||
inline __m256i mm256_byteswap_64x( __m256i x )
|
||||
{
|
||||
x = _mm256_or_si256( _mm256_srli_epi64( x, 32 ), _mm256_slli_epi64( x, 32 ));
|
||||
|
||||
x = _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x,
|
||||
_mm256_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ),
|
||||
_mm256_slli_epi64( _mm256_and_si256( x,
|
||||
_mm256_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 ));
|
||||
|
||||
return _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x,
|
||||
_mm256_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ),
|
||||
_mm256_slli_epi64( _mm256_and_si256( x,
|
||||
_mm256_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 ));
|
||||
return _mm256_shuffle_epi8( x, _mm256_set_epi8(
|
||||
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
|
||||
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01,
|
||||
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
|
||||
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
|
||||
}
|
||||
|
||||
|
||||
// Pack/Unpack two 128 bit vectors into/from one 256 bit vector
|
||||
// usefulness tbd
|
||||
// __m128i hi, __m128i lo, returns __m256i
|
||||
#define mm256_pack_2x128( hi, lo ) \
|
||||
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 ) \
|
||||
|
||||
// __m128i hi, __m128i lo, __m256i src
|
||||
#define mm256_unpack_2x128( hi, lo, src ) \
|
||||
lo = _mm256_castsi256_si128( src ); \
|
||||
hi = _mm256_castsi256_si128( mm256_swap_128( src ) );
|
||||
// hi = _mm256_extracti128_si256( src, 1 );
|
||||
|
||||
// Pseudo parallel AES
|
||||
// Probably noticeably slower than using pure 128 bit vectors
|
||||
// More efficient if one key for both lanes.
|
||||
inline __m256i mm256_aesenc_2x128( __m256i x, __m256i k )
|
||||
{
|
||||
__m128i hi, lo, khi, klo;
|
||||
@@ -487,7 +598,6 @@ inline __m256i mm256_aesenc_2x128( __m256i x, __m256i k )
|
||||
mm256_unpack_2x128( khi, klo, k );
|
||||
lo = _mm_aesenc_si128( lo, klo );
|
||||
hi = _mm_aesenc_si128( hi, khi );
|
||||
|
||||
return mm256_pack_2x128( hi, lo );
|
||||
}
|
||||
|
||||
@@ -498,7 +608,6 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x )
|
||||
mm256_unpack_2x128( hi, lo, x );
|
||||
lo = _mm_aesenc_si128( lo, mm_zero );
|
||||
hi = _mm_aesenc_si128( hi, mm_zero );
|
||||
|
||||
return mm256_pack_2x128( hi, lo );
|
||||
}
|
||||
|
||||
@@ -533,8 +642,6 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x )
|
||||
|
||||
// interleave 4 arrays of 32 bit elements for 128 bit processing
|
||||
// bit_len must be 256, 512 or 640 bits.
|
||||
// Vector indexing doesn't work with 32 bit data.
|
||||
// There's no vector indexing here!!!
|
||||
inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1,
|
||||
const void *src2, const void *src3, int bit_len )
|
||||
{
|
||||
@@ -591,8 +698,6 @@ inline void mm_interleave_4x32x( void *dst, void *src0, void *src1,
|
||||
}
|
||||
}
|
||||
|
||||
// doesn't work with 32 bit elements
|
||||
// no vector indexing here?
|
||||
inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
|
||||
void *dst3, const void *src, int bit_len )
|
||||
{
|
||||
@@ -632,7 +737,6 @@ inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
|
||||
d3[4] = _mm_set_epi32( s[79], s[75], s[71], s[67] );
|
||||
}
|
||||
|
||||
|
||||
// deinterleave 4 arrays into individual buffers for scalarm processing
|
||||
// bit_len must be multiple of 32
|
||||
inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2,
|
||||
@@ -656,7 +760,7 @@ inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2,
|
||||
#if defined (__AVX2__)
|
||||
|
||||
// Interleave 4 source buffers containing 64 bit data into the destination
|
||||
// buffer
|
||||
// buffer. Only bit_len 256, 512, 640 & 1024 are supported.
|
||||
inline void mm256_interleave_4x64( void *dst, const void *src0,
|
||||
const void *src1, const void *src2, const void *src3, int bit_len )
|
||||
{
|
||||
@@ -682,6 +786,17 @@ inline void mm256_interleave_4x64( void *dst, const void *src0,
|
||||
|
||||
d[8] = _mm256_set_epi64x( s3[8], s2[8], s1[8], s0[8] );
|
||||
d[9] = _mm256_set_epi64x( s3[9], s2[9], s1[9], s0[9] );
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
d[10] = _mm256_set_epi64x( s3[10], s2[10], s1[10], s0[10] );
|
||||
d[11] = _mm256_set_epi64x( s3[11], s2[11], s1[11], s0[11] );
|
||||
|
||||
d[12] = _mm256_set_epi64x( s3[12], s2[12], s1[12], s0[12] );
|
||||
d[13] = _mm256_set_epi64x( s3[13], s2[13], s1[13], s0[13] );
|
||||
d[14] = _mm256_set_epi64x( s3[14], s2[14], s1[14], s0[14] );
|
||||
d[15] = _mm256_set_epi64x( s3[15], s2[15], s1[15], s0[15] );
|
||||
// bit_len == 1024
|
||||
}
|
||||
|
||||
// Slower version
|
||||
@@ -705,7 +820,7 @@ inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1,
|
||||
}
|
||||
|
||||
// Deinterleave 4 buffers of 64 bit data from the source buffer.
|
||||
// bit_len must be 256, 512 or 640 bits.
|
||||
// bit_len must be 256, 512, 640 or 1024 bits.
|
||||
// Requires overrun padding for 640 bit len.
|
||||
inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2,
|
||||
void *dst3, const void *src, int bit_len )
|
||||
@@ -730,11 +845,26 @@ inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2,
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
// null change to overrun area
|
||||
d0[2] = _mm256_set_epi64x( d0[2][3], d0[2][2], s[36], s[32] );
|
||||
d1[2] = _mm256_set_epi64x( d1[2][3], d1[2][2], s[37], s[33] );
|
||||
d2[2] = _mm256_set_epi64x( d2[2][3], d2[2][2], s[38], s[34] );
|
||||
d3[2] = _mm256_set_epi64x( d3[2][3], d3[2][2], s[39], s[35] );
|
||||
if ( bit_len <= 640 )
|
||||
{
|
||||
// null change to overrun area
|
||||
d0[2] = _mm256_set_epi64x( d0[2][3], d0[2][2], s[36], s[32] );
|
||||
d1[2] = _mm256_set_epi64x( d1[2][3], d1[2][2], s[37], s[33] );
|
||||
d2[2] = _mm256_set_epi64x( d2[2][3], d2[2][2], s[38], s[34] );
|
||||
d3[2] = _mm256_set_epi64x( d3[2][3], d3[2][2], s[39], s[35] );
|
||||
return;
|
||||
}
|
||||
|
||||
d0[2] = _mm256_set_epi64x( s[44], s[40], s[36], s[32] );
|
||||
d1[2] = _mm256_set_epi64x( s[45], s[41], s[37], s[33] );
|
||||
d2[2] = _mm256_set_epi64x( s[46], s[42], s[38], s[34] );
|
||||
d3[2] = _mm256_set_epi64x( s[47], s[43], s[39], s[35] );
|
||||
|
||||
d0[3] = _mm256_set_epi64x( s[60], s[56], s[52], s[48] );
|
||||
d1[3] = _mm256_set_epi64x( s[61], s[57], s[53], s[49] );
|
||||
d2[3] = _mm256_set_epi64x( s[62], s[58], s[54], s[50] );
|
||||
d3[3] = _mm256_set_epi64x( s[63], s[59], s[55], s[51] );
|
||||
// bit_len == 1024
|
||||
}
|
||||
|
||||
// Slower version
|
||||
@@ -785,9 +915,9 @@ inline void mm256_interleave_8x32( void *dst, const void *src0,
|
||||
s3[4], s2[4], s1[4], s0[4] );
|
||||
d[ 5] = _mm256_set_epi32( s7[5], s6[5], s5[5], s4[5],
|
||||
s3[5], s2[5], s1[5], s0[5] );
|
||||
d [6] = _mm256_set_epi32( s7[6], s6[6], s5[6], s4[6],
|
||||
d[ 6] = _mm256_set_epi32( s7[6], s6[6], s5[6], s4[6],
|
||||
s3[6], s2[6], s1[6], s0[6] );
|
||||
d [7] = _mm256_set_epi32( s7[7], s6[7], s5[7], s4[7],
|
||||
d[ 7] = _mm256_set_epi32( s7[7], s6[7], s5[7], s4[7],
|
||||
s3[7], s2[7], s1[7], s0[7] );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
@@ -904,22 +1034,22 @@ inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2,
|
||||
d = ((uint32_t*)d1) + 8;
|
||||
d1[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
|
||||
s[153], s[145], s[137], s[129] );
|
||||
d = ((uint32_t*)d1) + 8;
|
||||
d = ((uint32_t*)d2) + 8;
|
||||
d2[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
|
||||
s[154], s[146], s[138], s[130]);
|
||||
d = ((uint32_t*)d1) + 8;
|
||||
d = ((uint32_t*)d3) + 8;
|
||||
d3[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
|
||||
s[155], s[147], s[139], s[131] );
|
||||
d = ((uint32_t*)d1) + 8;
|
||||
d = ((uint32_t*)d4) + 8;
|
||||
d4[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
|
||||
s[156], s[148], s[140], s[132] );
|
||||
d = ((uint32_t*)d1) + 8;
|
||||
d = ((uint32_t*)d5) + 8;
|
||||
d5[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
|
||||
s[157], s[149], s[141], s[133] );
|
||||
d = ((uint32_t*)d1) + 8;
|
||||
d = ((uint32_t*)d6) + 8;
|
||||
d6[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
|
||||
s[158], s[150], s[142], s[134] );
|
||||
d = ((uint32_t*)d1) + 8;
|
||||
d = ((uint32_t*)d7) + 8;
|
||||
d7[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
|
||||
s[159], s[151], s[143], s[135] );
|
||||
}
|
||||
|
@@ -1,10 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
#if [ "$OS" = "Windows_NT" ]; then
|
||||
# ./mingw64.sh
|
||||
# exit 0
|
||||
#fi
|
||||
|
||||
# Linux build
|
||||
|
||||
make distclean || echo clean
|
||||
@@ -12,14 +7,8 @@ make distclean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
|
||||
# Ubuntu 10.04 (gcc 4.4)
|
||||
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
|
||||
|
||||
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
|
||||
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
|
||||
|
||||
CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl
|
||||
#CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
|
||||
#CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl --with-crypto=$HOME/usr
|
||||
CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl
|
||||
|
||||
make -j 4
|
||||
|
||||
|
@@ -3,7 +3,7 @@
|
||||
make distclean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA -DFOUR_WAY" ./configure --with-curl
|
||||
CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure --with-curl
|
||||
make -j 4
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-4way.exe
|
||||
@@ -13,7 +13,7 @@ mv cpuminer cpuminer-4way
|
||||
make clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
|
||||
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl
|
||||
make -j 4
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-aes-avx2.exe
|
||||
@@ -23,7 +23,7 @@ mv cpuminer cpuminer-aes-avx2
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
CFLAGS="-O3 -march=corei7-avx -Wall -DUSE_SPH_SHA" ./configure --with-curl
|
||||
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl
|
||||
make -j 4
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-aes-avx.exe
|
||||
@@ -33,7 +33,7 @@ mv cpuminer cpuminer-aes-avx
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
CFLAGS="-O3 -maes -msse4.2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
|
||||
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
|
||||
make -j 4
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-aes-sse42.exe
|
||||
@@ -43,7 +43,7 @@ mv cpuminer cpuminer-aes-sse42
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
CFLAGS="-O3 -march=corei7 -Wall -DUSE_SPH_SHA" ./configure --with-curl
|
||||
CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl
|
||||
make -j 4
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-sse42.exe
|
||||
@@ -53,7 +53,7 @@ mv cpuminer cpuminer-sse42
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
CFLAGS="-O3 -march=core2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
|
||||
CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl
|
||||
make -j 4
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-sse2.exe
|
||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.7.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.8.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.7.7'
|
||||
PACKAGE_STRING='cpuminer-opt 3.7.7'
|
||||
PACKAGE_VERSION='3.7.8'
|
||||
PACKAGE_STRING='cpuminer-opt 3.7.8'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.7.7 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.7.8 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1392,7 +1392,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.7.7:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.7.8:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1497,7 +1497,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.7.7
|
||||
cpuminer-opt configure 3.7.8
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.7.7, which was
|
||||
It was created by cpuminer-opt $as_me 3.7.8, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2981,7 +2981,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.7.7'
|
||||
VERSION='3.7.8'
|
||||
|
||||
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.7.7, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.7.8, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -6743,7 +6743,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.7.7
|
||||
cpuminer-opt config.status 3.7.8
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.7.7])
|
||||
AC_INIT([cpuminer-opt], [3.7.8])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
4
miner.h
4
miner.h
@@ -358,8 +358,8 @@ struct work {
|
||||
char *job_id;
|
||||
size_t xnonce2_len;
|
||||
unsigned char *xnonce2;
|
||||
uint32_t nonces[4];
|
||||
bool nfound[4];
|
||||
uint32_t nonces[8];
|
||||
bool nfound[8];
|
||||
};
|
||||
|
||||
struct stratum_job {
|
||||
|
82
winbuild-cross.sh
Executable file
82
winbuild-cross.sh
Executable file
@@ -0,0 +1,82 @@
|
||||
#!/bin/bash
|
||||
|
||||
LOCAL_LIB="$HOME/usr/lib"
|
||||
|
||||
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
|
||||
|
||||
F="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
|
||||
|
||||
sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
|
||||
|
||||
mkdir release
|
||||
cp README.txt release/
|
||||
cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
|
||||
cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
|
||||
cp /usr/lib/gcc/x86_64-w64-mingw32/5.3-win32/libstdc++-6.dll release/
|
||||
cp /usr/lib/gcc/x86_64-w64-mingw32/5.3-win32/libgcc_s_seh-1.dll release/
|
||||
cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
|
||||
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
|
||||
|
||||
make distclean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
CFLAGS="-O3 -march=core-avx2 -msha -Wall -DFOUR_WAY" ./configure $F
|
||||
make
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-4way-sha.exe
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure $F
|
||||
make
|
||||
mv cpuminer.exe release/cpuminer-4way.exe
|
||||
|
||||
CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $F
|
||||
make
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx-sha.exe
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure $F
|
||||
make
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-aes-avx2.exe
|
||||
|
||||
#make clean || echo clean
|
||||
#rm -f config.status
|
||||
#CFLAGS="-O3 -march=znver1 -Wall" ./configure $F
|
||||
#make -j
|
||||
#strip -s cpuminer.exe
|
||||
#mv cpuminer.exe release/cpuminer-aes-sha.exe
|
||||
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $F
|
||||
make
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-aes-avx.exe
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $F
|
||||
make
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-aes-sse42.exe
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=corei7 -Wall" ./configure $F
|
||||
make
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-sse42.exe
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=core2 -Wall" ./configure $F
|
||||
make
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-sse2.exe
|
||||
make clean || echo clean
|
||||
|
Reference in New Issue
Block a user