This commit is contained in:
Jay D Dee
2018-01-16 15:11:44 -05:00
parent bee78eac76
commit a90d75b8f5
77 changed files with 3408 additions and 1214 deletions

View File

@@ -46,8 +46,10 @@ cpuminer_SOURCES = \
algo/blake/sph_blake2b.c \
algo/blake/blake2b.c \
algo/blake/blake2s.c \
algo/blake/blakecoin-gate.c \
algo/blake/mod_blakecoin.c \
algo/blake/blakecoin.c \
algo/blake/blakecoin-4way.c \
algo/blake/decred-gate.c \
algo/blake/decred.c \
algo/blake/decred-4way.c \
@@ -99,13 +101,17 @@ cpuminer_SOURCES = \
algo/luffa/sse2/luffa_for_sse2.c \
algo/lyra2/lyra2.c \
algo/lyra2/sponge.c \
algo/lyra2/lyra2rev2-gate.c \
algo/lyra2/lyra2rev2.c \
algo/lyra2/lyra2rev2-4way.c \
algo/lyra2/lyra2re.c \
algo/lyra2/lyra2z-gate.c \
algo/lyra2/lyra2z.c \
algo/lyra2/lyra2z-4way.c \
algo/lyra2/lyra2z330.c \
algo/lyra2/lyra2h-gate.c \
algo/lyra2/lyra2h.c \
algo/lyra2/lyra2h-4way.c \
algo/m7m.c \
algo/neoscrypt/neoscrypt.c \
algo/nist5/nist5-gate.c \
@@ -113,7 +119,9 @@ cpuminer_SOURCES = \
algo/nist5/nist5.c \
algo/nist5/zr5.c \
algo/pluck.c \
algo/quark/quark-gate.c \
algo/quark/quark.c \
algo/quark/quark-4way.c \
algo/qubit/qubit.c \
algo/qubit/deep.c \
algo/ripemd/sph_ripemd.c \
@@ -140,9 +148,8 @@ cpuminer_SOURCES = \
algo/skein/skein2-4way.c \
algo/skein/skein2-gate.c \
algo/sm3/sm3.c \
algo/sm3/sm3-hash-4way.c \
algo/tiger/sph_tiger.c \
algo/timetravel.c \
algo/timetravel10.c \
algo/whirlpool/sph_whirlpool.c \
algo/whirlpool/whirlpool-hash-4way.c \
algo/whirlpool/whirlpool-gate.c \
@@ -161,8 +168,16 @@ cpuminer_SOURCES = \
algo/x11/tribus-gate.c \
algo/x11/tribus.c \
algo/x11/tribus-4way.c \
algo/x11/timetravel-gate.c \
algo/x11/timetravel.c \
algo/x11/timetravel-4way.c \
algo/x11/timetravel10-gate.c \
algo/x11/timetravel10.c \
algo/x11/timetravel10-4way.c \
algo/x11/fresh.c \
algo/x11/x11evo.c \
algo/x11/x11evo-4way.c \
algo/x11/x11evo-gate.c \
algo/x13/x13-gate.c \
algo/x13/x13.c \
algo/x13/x13-4way.c \

View File

@@ -165,6 +165,13 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log
----------
v3.7.10
4way optimizations for lyra2rev2, lyra2h, quark, timetravel8, timetravel10
x11evo, blakecoin.
Faster x13sm3 (hsr).
Added share difficulty to accepted message.
v3.7.9
Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.

View File

@@ -1,31 +1,22 @@
#include "blake-gate.h"
#include "sph_blake.h"
#if defined (__AVX__)
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#if defined (BLAKE_4WAY)
blake256r14_4way_context blake_ctx;
void blakehash_4way(void *state, const void *input)
{
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
uint32_t hash0[4] __attribute__ ((aligned (32)));
uint32_t hash1[4] __attribute__ ((aligned (32)));
uint32_t hash2[4] __attribute__ ((aligned (32)));
uint32_t hash3[4] __attribute__ ((aligned (32)));
blake256_4way_context ctx;
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 16 );
blake256_4way_close( &ctx, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash1, 32 );
memcpy( state+96, hash1, 32 );
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r14_4way_context ctx;
memcpy( &ctx, &blake_ctx, sizeof ctx );
blake256r14_4way( &ctx, input + (64<<2), 16 );
blake256r14_4way_close( &ctx, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -36,21 +27,24 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
// uint32_t HTarget = ptarget[7];
uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
// if (opt_benchmark)
// HTarget = 0x7f;
if (opt_benchmark)
HTarget = 0x7f;
// we need big endian data...
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256r14_4way_init( &blake_ctx );
blake256r14_4way( &blake_ctx, vdata, 64 );
uint32_t *noncep = vdata + 76; // 19*4
do {
found[0] = found[1] = found[2] = found[3] = false;
@@ -61,45 +55,36 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
blakehash_4way( hash, vdata );
if ( hash[7] == 0 )
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
{
if ( fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
}
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] == 0 )
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
if ( fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
}
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] == 0 )
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
if ( fulltest( hash+8, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
}
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] == 0 )
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
if ( fulltest( hash+8, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
}
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
*hashes_done = n - first_nonce + 1;
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );

View File

@@ -491,14 +491,9 @@ do { \
(state)->T1 = T1; \
} while (0)
//#define BLAKE32_ROUNDS 8
#ifndef BLAKE32_ROUNDS
#define BLAKE32_ROUNDS 14
#endif
#if SPH_COMPACT_BLAKE_32
#define COMPRESS32_4WAY do { \
#define COMPRESS32_4WAY( rounds ) do { \
__m128i M[16]; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
@@ -539,7 +534,7 @@ do { \
M[0xD] = mm_byteswap_32( *(buf + 13) ); \
M[0xE] = mm_byteswap_32( *(buf + 14) ); \
M[0xF] = mm_byteswap_32( *(buf + 15) ); \
for (r = 0; r < BLAKE32_ROUNDS; r ++) \
for (r = 0; r < rounds; r ++) \
ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( S0, V0 ), V8 ), H0 ); \
@@ -563,80 +558,70 @@ do { \
// current impl
#define COMPRESS32_4WAY do { \
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M0 = mm_byteswap_32( * buf ); \
M1 = mm_byteswap_32( *(buf+1) ); \
M2 = mm_byteswap_32( *(buf+2) ); \
M3 = mm_byteswap_32( *(buf+3) ); \
M4 = mm_byteswap_32( *(buf+4) ); \
M5 = mm_byteswap_32( *(buf+5) ); \
M6 = mm_byteswap_32( *(buf+6) ); \
M7 = mm_byteswap_32( *(buf+7) ); \
M8 = mm_byteswap_32( *(buf+8) ); \
M9 = mm_byteswap_32( *(buf+9) ); \
MA = mm_byteswap_32( *(buf+10) ); \
MB = mm_byteswap_32( *(buf+11) ); \
MC = mm_byteswap_32( *(buf+12) ); \
MD = mm_byteswap_32( *(buf+13) ); \
ME = mm_byteswap_32( *(buf+14) ); \
MF = mm_byteswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
ROUND_S_4WAY(4); \
ROUND_S_4WAY(5); \
ROUND_S_4WAY(6); \
ROUND_S_4WAY(7); \
if (BLAKE32_ROUNDS == 14) { \
ROUND_S_4WAY(8); \
ROUND_S_4WAY(9); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
} \
H0 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( V8, V0 ), S0 ), H0 ); \
H1 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( V9, V1 ), S1 ), H1 ); \
H2 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VA, V2 ), S2 ), H2 ); \
H3 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VB, V3 ), S3 ), H3 ); \
H4 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VC, V4 ), S0 ), H4 ); \
H5 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VD, V5 ), S1 ), H5 ); \
H6 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VE, V6 ), S2 ), H6 ); \
H7 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VF, V7 ), S3 ), H7 ); \
} while (0)
#define COMPRESS32_4WAY( rounds ) \
do { \
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
M0 = mm_byteswap_32( * buf ); \
M1 = mm_byteswap_32( *(buf+1) ); \
M2 = mm_byteswap_32( *(buf+2) ); \
M3 = mm_byteswap_32( *(buf+3) ); \
M4 = mm_byteswap_32( *(buf+4) ); \
M5 = mm_byteswap_32( *(buf+5) ); \
M6 = mm_byteswap_32( *(buf+6) ); \
M7 = mm_byteswap_32( *(buf+7) ); \
M8 = mm_byteswap_32( *(buf+8) ); \
M9 = mm_byteswap_32( *(buf+9) ); \
MA = mm_byteswap_32( *(buf+10) ); \
MB = mm_byteswap_32( *(buf+11) ); \
MC = mm_byteswap_32( *(buf+12) ); \
MD = mm_byteswap_32( *(buf+13) ); \
ME = mm_byteswap_32( *(buf+14) ); \
MF = mm_byteswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
ROUND_S_4WAY(4); \
ROUND_S_4WAY(5); \
ROUND_S_4WAY(6); \
ROUND_S_4WAY(7); \
if (rounds == 14) \
{ \
ROUND_S_4WAY(8); \
ROUND_S_4WAY(9); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
} \
H0 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
H1 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
H2 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
H3 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
H4 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
H5 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
H6 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
H7 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
} while (0)
#endif
@@ -832,15 +817,16 @@ static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 };
static void
blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt)
const sph_u32 *salt, int rounds )
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm_set1_epi32( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm_set1_epi32( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm_set1_epi32( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm_set1_epi32( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
sc->rounds = rounds;
}
static void
@@ -878,7 +864,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
{
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
T1 = SPH_T32(T1 + 1);
COMPRESS32_4WAY;
COMPRESS32_4WAY( sc->rounds );
ptr = 0;
}
}
@@ -1079,10 +1065,11 @@ blake64_4way_close( blake_4way_big_context *sc,
#endif
// default 14 rounds, backward compatibility
void
blake256_4way_init(void *cc)
{
blake32_4way_init(cc, IV256, salt_zero_small);
blake32_4way_init( cc, IV256, salt_zero_small, 14 );
}
void
@@ -1094,13 +1081,43 @@ blake256_4way(void *cc, const void *data, size_t len)
void
blake256_4way_close(void *cc, void *dst)
{
blake256_4way_addbits_and_close(cc, 0, 0, dst);
blake32_4way_close(cc, 0, 0, dst, 8);
}
// 14 rounds blake, decred
void blake256r14_4way_init(void *cc)
{
blake32_4way_init( cc, IV256, salt_zero_small, 14 );
}
void
blake256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
blake256r14_4way(void *cc, const void *data, size_t len)
{
blake32_4way_close(cc, ub, n, dst, 8);
blake32_4way(cc, data, len);
}
void
blake256r14_4way_close(void *cc, void *dst)
{
blake32_4way_close(cc, 0, 0, dst, 8);
}
// 8 rounds blakecoin, vanilla
void blake256r8_4way_init(void *cc)
{
blake32_4way_init( cc, IV256, salt_zero_small, 8 );
}
void
blake256r8_4way(void *cc, const void *data, size_t len)
{
blake32_4way(cc, data, len);
}
void
blake256r8_4way_close(void *cc, void *dst)
{
blake32_4way_close(cc, 0, 0, dst, 8);
}
#if defined (__AVX2__)

View File

@@ -35,7 +35,9 @@
*/
#ifndef __BLAKE_HASH_4WAY__
#define __BLAKE_HASH_4WAY___
#define __BLAKE_HASH_4WAY__
#ifdef __AVX__
#ifdef __cplusplus
extern "C"{
@@ -45,38 +47,36 @@ extern "C"{
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
/**
* Output size (in bits) for BLAKE-256.
*/
#define SPH_SIZE_blake256 256
#if SPH_64
/**
* Output size (in bits) for BLAKE-512.
*/
#define SPH_SIZE_blake512 512
#endif
#ifdef __AVX__
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i H[8];
__m128i S[4];
size_t ptr;
sph_u32 T0, T1;
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i H[8];
__m128i S[4];
size_t ptr;
sph_u32 T0, T1;
int rounds; // 14 for blake, 8 for blakecoin & vanilla
} blake_4way_small_context;
// Default 14 rounds
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *cc);
void blake256_4way(void *cc, const void *data, size_t len);
void blake256_4way_close(void *cc, void *dst);
void blake256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
// 14 rounds, blake, decred
typedef blake_4way_small_context blake256r14_4way_context;
void blake256r14_4way_init(void *cc);
void blake256r14_4way(void *cc, const void *data, size_t len);
void blake256r14_4way_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_4way_small_context blake256r8_4way_context;
void blake256r8_4way_init(void *cc);
void blake256r8_4way(void *cc, const void *data, size_t len);
void blake256r8_4way_close(void *cc, void *dst);
#ifdef __AVX2__
@@ -103,3 +103,5 @@ void blake512_4way_addbits_and_close(
#endif
#endif
#endif

106
algo/blake/blakecoin-4way.c Normal file
View File

@@ -0,0 +1,106 @@
#include "blakecoin-gate.h"
#if defined (__AVX__)
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
// Midstate context shared by scanhash; holds the first 64 bytes of the
// interleaved header, hashed once per work unit.
blake256r8_4way_context blakecoin_ctx;

// Finish the 8-round blake hash of four interleaved headers: resume from
// the precomputed midstate, absorb the final 16 bytes, then de-interleave
// the four 256-bit digests straight into the caller's output buffer.
void blakecoin_4way_hash(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r8_4way_context rounds8;

memcpy( &rounds8, &blakecoin_ctx, sizeof rounds8 );
blake256r8_4way( &rounds8, input + (64<<2), 16 );
blake256r8_4way_close( &rounds8, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
// Scan nonces four at a time with the 8-round 4-way blake kernel.
// Returns the number of lanes that met the target; found[]/nonces[] in
// *work report which lanes and which nonce values.
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
if (opt_benchmark)
   HTarget = 0x7f;
// blake wants big-endian input
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
// precompute the midstate over the first 64 bytes, shared by all nonces
blake256r8_4way_init( &blakecoin_ctx );
blake256r8_4way( &blakecoin_ctx, vdata, 64 );
uint32_t *noncep = vdata + 76;   // 19*4: nonce word of each interleaved lane
do {
   // Load a distinct nonce into each of the four lanes.
   for ( int i = 0; i < 4; i++ )
   {
      found[i] = false;
      be32enc( noncep + i, n + i );
   }
   blakecoin_4way_hash( hash, vdata );
   pdata[19] = n;
   // One result block per lane; lane i's digest starts at hash + 8*i.
   // (Replaces four copy-pasted blocks with a loop.)
   for ( int i = 0; i < 4; i++ )
   {
      uint32_t *lane_hash = hash + (i << 3);
      if ( lane_hash[7] <= HTarget && fulltest( lane_hash, ptarget ) )
      {
         found[i] = true;
         num_found++;
         nonces[i] = n + i;
         work_set_target_ratio( work, lane_hash );
      }
   }
   n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
// workaround to prevent flood of hash reports when nonce range exhausted
// and thread is spinning waiting for new work
if ( ( n >= max_nonce ) && ( *hashes_done < 10 ) )
{
   *hashes_done = 0;
   sleep(1);
}
return num_found;
}
#endif

View File

@@ -0,0 +1,71 @@
#include "blakecoin-gate.h"
#include <memory.h>
// Upper bound on nonces scanned per work unit for blakecoin.
// (cpuminer-multi-decred uses the larger 0x3fffffLL here.)
int64_t blakecoin_get_max64 ()
{
const int64_t max_scan = 0x7ffffLL;
return max_scan;
}
// Blakecoin 4 way hashes so fast it runs out of nonces.
// This is an attempt to solve this but the result may be
// to rehash old nonces until new work is received.
// Fetch or advance work for blakecoin 4-way: takes fresh work when the
// current data is stale or the per-thread nonce range is spent, otherwise
// just bumps the nonce. May rehash old nonces until new work arrives.
void bc4w_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *end_nonce_ptr, bool clean_job )
{
// Pointer to the nonce word inside this algo's work data.
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
//
// if ( have_stratum && ( *nonceptr >= *end_nonce_ptr ) )
// algo_gate.stratum_gen_work( &stratum, g_work );
// NOTE(review): && binds tighter than ||, so clean_job only qualifies the
// job_id comparison, not the first two conditions -- confirm this is the
// intended grouping (the commented-out variant below groups differently).
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
|| ( *nonceptr >= *end_nonce_ptr )
|| ( work->job_id != g_work->job_id ) && clean_job )
/*
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr )
|| ( work->job_id != g_work->job_id ) ) )
*/
{
work_free( work );
work_copy( work, g_work );
// Partition the 32-bit nonce space evenly across threads.
*nonceptr = 0xffffffffU / opt_n_threads * thr_id;
if ( opt_randomize )
*nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
// End just short (0x20) of the next thread's range start.
*end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
// try incrementing the xnonce to change the data
// for ( int i = 0; i < work->xnonce2_size && !( ++work->xnonce2[i] ); i++ );
}
else
++(*nonceptr);
}
// vanilla uses default gen merkle root, otherwise identical to blakecoin.
// Installs the 4-way or scalar implementations depending on build options.
bool register_vanilla_algo( algo_gate_t* gate )
{
#if defined(BLAKECOIN_4WAY)
  gate->scanhash = (void*)&scanhash_blakecoin_4way;
  gate->hash     = (void*)&blakecoin_4way_hash;
  // gate->get_new_work = (void*)&bc4w_get_new_work;  // experimental, disabled
#else
  gate->scanhash = (void*)&scanhash_blakecoin;
  gate->hash     = (void*)&blakecoinhash;
#endif
  // Set once for both paths. (The 4-way-only FOUR_WAY_OPT assignment that
  // was here was a dead store, immediately overwritten by this line.)
  gate->optimizations = AVX2_OPT | FOUR_WAY_OPT;
  gate->get_max64 = (void*)&blakecoin_get_max64;
  return true;
}
// Blakecoin registration: same gate as vanilla, but the merkle root is
// generated with SHA256_gen_merkle_root instead of the default.
bool register_blakecoin_algo( algo_gate_t* gate )
{
bool ok = register_vanilla_algo( gate );
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return ok;
}

View File

@@ -0,0 +1,21 @@
#ifndef __BLAKECOIN_GATE_H__
#define __BLAKECOIN_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX__)
#define BLAKECOIN_4WAY
#endif
#if defined (BLAKECOIN_4WAY)
void blakecoin_4way_hash(void *state, const void *input);
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void blakecoinhash( void *state, const void *input );
int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "blakecoin-gate.h"
#define BLAKE32_ROUNDS 8
#include "sph_blake.h"
@@ -98,7 +98,7 @@ void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
}
*/
/*
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blakecoin_get_max64 ()
{
@@ -121,4 +121,4 @@ bool register_blakecoin_algo( algo_gate_t* gate )
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return true;
}
*/

View File

@@ -1,5 +1,4 @@
#include "decred-gate.h"
#include "sph_blake.h"
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
@@ -9,7 +8,6 @@
#if defined (DECRED_4WAY)
static __thread blake256_4way_context blake_mid;
static __thread bool ctx_midstate_done = false;
void decred_hash_4way( void *state, const void *input )
{
@@ -18,50 +16,14 @@ void decred_hash_4way( void *state, const void *input )
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
blake256_4way_context ctx __attribute__ ((aligned (64)));
sph_blake256_context ctx2 __attribute__ ((aligned (64)));
uint32_t hash[16] __attribute__ ((aligned (64)));
uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );
void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
int tail_len = 180 - DECRED_MIDSTATE_LEN;
blake256_4way_context ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
blake256_4way( &ctx, tail, tail_len );
blake256_4way_close( &ctx, vhash );
/*
sph_blake256_init( &ctx2 );
sph_blake256( &ctx2, sin0, 180 );
sph_blake256_close( &ctx2, hash );
*/
/*
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 180 );
blake256_4way_close( &ctx, vhash );
*/
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
/*
for ( int i = 0; i < 8; i++ )
if ( hash[i] != hash0[i] )
printf(" hash mismatch, i = %u\n",i);
printf("hash: %08lx %08lx %08lx %08lx\n", *hash, *(hash+1),
*(hash+2), *(hash+3) );
printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
*(hash0+2), *(hash0+3) );
printf("\n");
*/
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
// memcpy( state, hash, 32 );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -69,21 +31,21 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t vdata[48*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t _ALIGN(64) edata[48];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
uint32_t n = first_nonce;
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t _ALIGN(64) edata[48];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
uint32_t n = first_nonce;
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
ctx_midstate_done = false;
memcpy( edata, pdata, 180 );
// copy to buffer guaranteed to be aligned.
memcpy( edata, pdata, 180 );
// use the old way until new way updated for size.
mm_interleave_4x32( vdata, edata, edata, edata, edata, 180*8 );
mm_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );
blake256_4way_init( &blake_mid );
blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
@@ -106,22 +68,13 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
nonces[0] = n;
pdata[DECRED_NONCE_INDEX] = n;
}
/*
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
printf("found 1\n");
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[8], hash[9], hash[10],hash[11] );
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[12], hash[13], hash[14],hash[15] );
printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
work_set_target_ratio( work, hash+8 );
found[1] = true;
num_found++;
nonces[1] = n+1;
}
*/
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
work_set_target_ratio( work, hash+16 );
@@ -129,24 +82,15 @@ printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[
num_found++;
nonces[2] = n+2;
}
/*
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
printf("found 3\n");
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[0], hash[1], hash[2],hash[3] );
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[4], hash[5], hash[6],hash[7] );
printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
work_set_target_ratio( work, hash+24 );
found[3] = true;
num_found++;
nonces[3] = n+3;
}
*/
n += 2;
// n += 4;
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );

View File

@@ -1,4 +1,7 @@
#include "pentablake-gate.h"
#ifdef __AVX2__
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -9,8 +12,6 @@
//#define DEBUG_ALGO
#ifdef PENTABLAKE_4WAY
extern void pentablakehash_4way( void *output, const void *input )
{
unsigned char _ALIGN(32) hash[128];

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX__)
#if defined(FOUR_WAY) && defined(__AVX2__)
#define PENTABLAKE_4WAY
#endif

View File

@@ -41,19 +41,13 @@
extern "C"{
#endif
//#include "sph_bmw.h"
//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
#define SPH_SMALL_FOOTPRINT_BMW 1
//#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
//#undef SPH_ROTL64
//#define SPH_ROTL64(x,n) (((x) << (n)) | ((x) >> (64 - (n))))
//#define SPH_ROTL64(x,n) mm256_rotl_64(x,n)
#define LPAR (
// BMW256
static const sph_u32 IV256[] = {
SPH_C32(0x40414243), SPH_C32(0x44454647),
@@ -66,8 +60,7 @@ static const sph_u32 IV256[] = {
SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
};
#if SPH_64
// BMW512
static const sph_u64 IV512[] = {
SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
@@ -79,74 +72,113 @@ static const sph_u64 IV512[] = {
SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
};
#endif
// BMW256
#define XCAT(x, y) XCAT_(x, y)
#define XCAT_(x, y) x ## y
#define ss0(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 3) ), \
_mm_xor_si128( mm_rotl_32( (x), 4), \
mm_rotl_32( (x), 19) ) )
#define LPAR (
#define ss1(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm_rotl_32( (x), 8), \
mm_rotl_32( (x), 23) ) )
/*
#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \
^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \
^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23))
#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \
^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \
^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
#define ss4(x) (((x) >> 1) ^ (x))
#define ss5(x) (((x) >> 2) ^ (x))
#define rs1(x) SPH_ROTL32(x, 3)
#define rs2(x) SPH_ROTL32(x, 7)
#define rs3(x) SPH_ROTL32(x, 13)
#define rs4(x) SPH_ROTL32(x, 16)
#define rs5(x) SPH_ROTL32(x, 19)
#define rs6(x) SPH_ROTL32(x, 23)
#define rs7(x) SPH_ROTL32(x, 27)
#define ss2(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 1) ), \
_mm_xor_si128( mm_rotl_32( (x), 12), \
mm_rotl_32( (x), 25) ) )
#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
#define ss3(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm_rotl_32( (x), 15), \
mm_rotl_32( (x), 29) ) )
#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
#define ss4(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
#define expand1s_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define ss5(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
#define expand1s(qf, mf, hf, i16) \
expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand1s_(qf, mf, hf, i16, ix, iy) \
expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
#define rs1(x) mm_rotl_32( x, 3 )
#define rs2(x) mm_rotl_32( x, 7 )
#define rs3(x) mm_rotl_32( x, 13 )
#define rs4(x) mm_rotl_32( x, 16 )
#define rs5(x) mm_rotl_32( x, 19 )
#define rs6(x) mm_rotl_32( x, 23 )
#define rs7(x) mm_rotl_32( x, 27 )
#define expand2s_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define rol_off_32( M, j, off ) \
mm_rotl_32( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define expand2s(qf, mf, hf, i16) \
expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand2s_(qf, mf, hf, i16, ix, iy) \
expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
*/
#if SPH_64
#define add_elt_s( M, H, j ) \
_mm_xor_si128( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
rol_off_32( M, j, 3 ) ), \
rol_off_32( M, j, 10 ) ), \
_mm_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) ), \
H[ ( (j)+7 ) & 0xF ] )
#define expand1s( qt, M, H, i ) \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)-16 ] ), \
ss2( qt[ (i)-15 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)-14 ] ), \
ss0( qt[ (i)-13 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)-12 ] ), \
ss2( qt[ (i)-11 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)-10 ] ), \
ss0( qt[ (i)- 9 ] ) ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)- 8 ] ), \
ss2( qt[ (i)- 7 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)- 6 ] ), \
ss0( qt[ (i)- 5 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)- 4 ] ), \
ss2( qt[ (i)- 3 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)- 2 ] ), \
ss0( qt[ (i)- 1 ] ) ) ) ) ), \
add_elt_s( M, H, (i)-16 ) )
// BMW256 4-way second-stage expansion (W'-function). All lane arithmetic
// in the 256-bit variant is 32-bit: the single _mm_add_epi64 that was here
// was a copy/paste slip from the 512-bit (epi64) variant and would merge
// adjacent 32-bit lanes, corrupting the result.
#define expand2s( qt, M, H, i) \
   _mm_add_epi32( \
      _mm_add_epi32( \
         _mm_add_epi32( \
            _mm_add_epi32( \
               _mm_add_epi32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ) ), \
               _mm_add_epi32( qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ) ), \
            _mm_add_epi32( \
               _mm_add_epi32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ) ), \
               _mm_add_epi32( qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ) ) ), \
         _mm_add_epi32( \
            _mm_add_epi32( \
               _mm_add_epi32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ) ), \
               _mm_add_epi32( qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ) ), \
            _mm_add_epi32( \
               _mm_add_epi32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ) ), \
               _mm_add_epi32( ss4( qt[ (i)- 2 ] ), \
                              ss5( qt[ (i)- 1 ] ) ) ) ) ), \
      add_elt_s( M, H, (i)-16 ) )
// BMW512
#define sb0(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
_mm256_slli_epi64( (x), 3) ), \
_mm256_xor_si256( mm256_rotl_64( (x), 4), \
_mm256_xor_si256( mm256_rotl_64( (x), 4), \
mm256_rotl_64( (x), 37) ) )
#define sb1(x) \
@@ -181,18 +213,18 @@ static const sph_u64 IV512[] = {
#define rb6(x) mm256_rotl_64( x, 43 )
#define rb7(x) mm256_rotl_64( x, 53 )
#define rol_off( M, j, off ) \
mm256_rotl_64( M[ ( (j) + (off) ) & 15 ] , \
( ( (j) + (off) ) & 15 ) + 1 )
#define rol_off_64( M, j, off ) \
mm256_rotl_64( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define add_elt_b( M, H, j ) \
_mm256_xor_si256( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_add_epi64( rol_off( M, j, 0 ), \
rol_off( M, j, 3 ) ), \
rol_off( M, j, 10 ) ), \
_mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \
rol_off_64( M, j, 3 ) ), \
rol_off_64( M, j, 10 ) ), \
_mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
H[ ( (j)+7 ) & 15 ] )
H[ ( (j)+7 ) & 0xF ] )
#define expand1b( qt, M, H, i ) \
_mm256_add_epi64( \
@@ -241,132 +273,301 @@ static const sph_u64 IV512[] = {
sb5( qt[ (i)- 1 ] ) ) ) ) ), \
add_elt_b( M, H, (i)-16 ) )
#endif
// BMW256
/*
#define MAKE_W( i0, op01, i1, op12, i2, op23, i3, op34, i4) \
((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
*/
#define Ws0 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_xor_si128( M[13], H[13] ) ), \
_mm_xor_si128( M[14], H[14] ) )
/*
#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14)
#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15)
#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15)
#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13)
#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14)
#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15)
#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13)
#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14)
#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15)
#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14)
#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15)
#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9)
#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10)
#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11)
#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12)
#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13)
#define Ws1 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_xor_si128( M[14], H[14] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#if SPH_SMALL_FOOTPRINT_BMW
#define Ws2 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_xor_si128( M[12], H[12] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#define MAKE_Qas do { \
unsigned u; \
sph_u32 Ws[16]; \
Ws[ 0] = Ws0; \
Ws[ 1] = Ws1; \
Ws[ 2] = Ws2; \
Ws[ 3] = Ws3; \
Ws[ 4] = Ws4; \
Ws[ 5] = Ws5; \
Ws[ 6] = Ws6; \
Ws[ 7] = Ws7; \
Ws[ 8] = Ws8; \
Ws[ 9] = Ws9; \
Ws[10] = Ws10; \
Ws[11] = Ws11; \
Ws[12] = Ws12; \
Ws[13] = Ws13; \
Ws[14] = Ws14; \
Ws[15] = Ws15; \
for (u = 0; u < 15; u += 5) { \
qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \
qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \
qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \
qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \
qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \
} \
qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \
} while (0)
#define Ws3 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_xor_si128( M[13], H[13] ) )
#define MAKE_Qbs do { \
qt[16] = expand1s(Qs, M, H, 16); \
qt[17] = expand1s(Qs, M, H, 17); \
qt[18] = expand2s(Qs, M, H, 18); \
qt[19] = expand2s(Qs, M, H, 19); \
qt[20] = expand2s(Qs, M, H, 20); \
qt[21] = expand2s(Qs, M, H, 21); \
qt[22] = expand2s(Qs, M, H, 22); \
qt[23] = expand2s(Qs, M, H, 23); \
qt[24] = expand2s(Qs, M, H, 24); \
qt[25] = expand2s(Qs, M, H, 25); \
qt[26] = expand2s(Qs, M, H, 26); \
qt[27] = expand2s(Qs, M, H, 27); \
qt[28] = expand2s(Qs, M, H, 28); \
qt[29] = expand2s(Qs, M, H, 29); \
qt[30] = expand2s(Qs, M, H, 30); \
qt[31] = expand2s(Qs, M, H, 31); \
} while (0)
#define Ws4 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_xor_si128( M[14], H[14] ) )
#else
#define Ws5 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_xor_si128( M[12], H[12] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#define MAKE_Qas do { \
qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
} while (0)
#define Ws6 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_xor_si128( M[13], H[13] ) )
#define MAKE_Qbs do { \
qt[16] = expand1s(Qs, M, H, 16); \
qt[17] = expand1s(Qs, M, H, 17); \
qt[18] = expand2s(Qs, M, H, 18); \
qt[19] = expand2s(Qs, M, H, 19); \
qt[20] = expand2s(Qs, M, H, 20); \
qt[21] = expand2s(Qs, M, H, 21); \
qt[22] = expand2s(Qs, M, H, 22); \
qt[23] = expand2s(Qs, M, H, 23); \
qt[24] = expand2s(Qs, M, H, 24); \
qt[25] = expand2s(Qs, M, H, 25); \
qt[26] = expand2s(Qs, M, H, 26); \
qt[27] = expand2s(Qs, M, H, 27); \
qt[28] = expand2s(Qs, M, H, 28); \
qt[29] = expand2s(Qs, M, H, 29); \
qt[30] = expand2s(Qs, M, H, 30); \
qt[31] = expand2s(Qs, M, H, 31); \
} while (0)
#define Ws7 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[12], H[12] ) ), \
_mm_xor_si128( M[14], H[14] ) )
#endif
#define Ws8 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[13], H[13] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#define MAKE_Qs do { \
MAKE_Qas; \
MAKE_Qbs; \
} while (0)
#define Ws9 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[14], H[14] ) )
#define Qs(j) (qt[j])
*/
#if SPH_64
#define Ws10 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#define Ws11 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) )
#define Ws12 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_xor_si128( M[10], H[10] ) )
#define Ws13 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_xor_si128( M[11], H[11] ) )
#define Ws14 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_xor_si128( M[12], H[12] ) )
#define Ws15 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_xor_si128( M[13], H[13] ) )
void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
{
__m128i qt[32], xl, xh; \
qt[ 0] = ss0( Ws0 ) + H[ 1];
qt[ 1] = ss1( Ws1 ) + H[ 2];
qt[ 2] = ss2( Ws2 ) + H[ 3];
qt[ 3] = ss3( Ws3 ) + H[ 4];
qt[ 4] = ss4( Ws4 ) + H[ 5];
qt[ 5] = ss0( Ws5 ) + H[ 6];
qt[ 6] = ss1( Ws6 ) + H[ 7];
qt[ 7] = ss2( Ws7 ) + H[ 8];
qt[ 8] = ss3( Ws8 ) + H[ 9];
qt[ 9] = ss4( Ws9 ) + H[10];
qt[10] = ss0( Ws10) + H[11];
qt[11] = ss1( Ws11) + H[12];
qt[12] = ss2( Ws12) + H[13];
qt[13] = ss3( Ws13) + H[14];
qt[14] = ss4( Ws14) + H[15];
qt[15] = ss0( Ws15) + H[ 0];
qt[16] = expand1s( qt, M, H, 16 );
qt[17] = expand1s( qt, M, H, 17 );
qt[18] = expand2s( qt, M, H, 18 );
qt[19] = expand2s( qt, M, H, 19 );
qt[20] = expand2s( qt, M, H, 20 );
qt[21] = expand2s( qt, M, H, 21 );
qt[22] = expand2s( qt, M, H, 22 );
qt[23] = expand2s( qt, M, H, 23 );
qt[24] = expand2s( qt, M, H, 24 );
qt[25] = expand2s( qt, M, H, 25 );
qt[26] = expand2s( qt, M, H, 26 );
qt[27] = expand2s( qt, M, H, 27 );
qt[28] = expand2s( qt, M, H, 28 );
qt[29] = expand2s( qt, M, H, 29 );
qt[30] = expand2s( qt, M, H, 30 );
qt[31] = expand2s( qt, M, H, 31 );
xl = _mm_xor_si128(
_mm_xor_si128( _mm_xor_si128( qt[16], qt[17] ),
_mm_xor_si128( qt[18], qt[19] ) ),
_mm_xor_si128( _mm_xor_si128( qt[20], qt[21] ),
_mm_xor_si128( qt[22], qt[23] ) ) );
xh = _mm_xor_si128( xl,
_mm_xor_si128(
_mm_xor_si128( _mm_xor_si128( qt[24], qt[25] ),
_mm_xor_si128( qt[26], qt[27] ) ),
_mm_xor_si128( _mm_xor_si128( qt[28], qt[29] ),
_mm_xor_si128( qt[30], qt[31] ) )));
dH[ 0] = _mm_add_epi32(
_mm_xor_si128( M[0],
_mm_xor_si128( _mm_slli_epi32( xh, 5 ),
_mm_srli_epi32( qt[16], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ));
dH[ 1] = _mm_add_epi32(
_mm_xor_si128( M[1],
_mm_xor_si128( _mm_srli_epi32( xh, 7 ),
_mm_slli_epi32( qt[17], 8 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ));
dH[ 2] = _mm_add_epi32(
_mm_xor_si128( M[2],
_mm_xor_si128( _mm_srli_epi32( xh, 5 ),
_mm_slli_epi32( qt[18], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ));
dH[ 3] = _mm_add_epi32(
_mm_xor_si128( M[3],
_mm_xor_si128( _mm_srli_epi32( xh, 1 ),
_mm_slli_epi32( qt[19], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ));
dH[ 4] = _mm_add_epi32(
_mm_xor_si128( M[4],
_mm_xor_si128( _mm_srli_epi32( xh, 3 ),
_mm_slli_epi32( qt[20], 0 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ));
dH[ 5] = _mm_add_epi32(
_mm_xor_si128( M[5],
_mm_xor_si128( _mm_slli_epi32( xh, 6 ),
_mm_srli_epi32( qt[21], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ));
dH[ 6] = _mm_add_epi32(
_mm_xor_si128( M[6],
_mm_xor_si128( _mm_srli_epi32( xh, 4 ),
_mm_slli_epi32( qt[22], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ));
dH[ 7] = _mm_add_epi32(
_mm_xor_si128( M[7],
_mm_xor_si128( _mm_srli_epi32( xh, 11 ),
_mm_slli_epi32( qt[23], 2 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
dH[ 8] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[4], 9 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
_mm_xor_si128( _mm_slli_epi32( xl, 8 ),
_mm_xor_si128( qt[23], qt[ 8] ) ) );
dH[ 9] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[5], 10 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
_mm_xor_si128( _mm_srli_epi32( xl, 6 ),
_mm_xor_si128( qt[16], qt[ 9] ) ) );
dH[10] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[6], 11 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
_mm_xor_si128( _mm_slli_epi32( xl, 6 ),
_mm_xor_si128( qt[17], qt[10] ) ) );
dH[11] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[7], 12 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
_mm_xor_si128( _mm_slli_epi32( xl, 4 ),
_mm_xor_si128( qt[18], qt[11] ) ) );
dH[12] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[0], 13 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
_mm_xor_si128( _mm_srli_epi32( xl, 3 ),
_mm_xor_si128( qt[19], qt[12] ) ) );
dH[13] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[1], 14 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
_mm_xor_si128( _mm_srli_epi32( xl, 4 ),
_mm_xor_si128( qt[20], qt[13] ) ) );
dH[14] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[2], 15 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
_mm_xor_si128( _mm_srli_epi32( xl, 7 ),
_mm_xor_si128( qt[21], qt[14] ) ) );
dH[15] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[3], 16 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
_mm_xor_si128( _mm_srli_epi32( xl, 2 ),
_mm_xor_si128( qt[22], qt[15] ) ) );
}
// BMW512
#define Wb0 \
_mm256_add_epi64( \
@@ -564,6 +765,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
qt[29] = expand2b( qt, M, H, 29 );
qt[30] = expand2b( qt, M, H, 30 );
qt[31] = expand2b( qt, M, H, 31 );
xl = _mm256_xor_si256(
_mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
_mm256_xor_si256( qt[18], qt[19] ) ),
@@ -575,6 +777,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
_mm256_xor_si256( qt[26], qt[27] ) ),
_mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
_mm256_xor_si256( qt[30], qt[31] ) )));
dH[ 0] = _mm256_add_epi64(
_mm256_xor_si256( M[0],
_mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
@@ -657,137 +860,113 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
_mm256_xor_si256( qt[22], qt[15] ) ) );
}
#endif // 64
// BMW256
//#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_ROTL32, M, Qs, dH)
/*
static void
compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
static const __m128i final_s[16] =
{
#define M(x) sph_dec32le_aligned(data + 4 * (x))
#define H(x) (h[x])
#define dH(x) (dh[x])
FOLDs;
#undef M
#undef H
#undef dH
}
static const sph_u32 final_s[16] = {
SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2),
SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5),
SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8),
SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab),
SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae),
SPH_C32(0xaaaaaaaf)
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
{ 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 },
{ 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 },
{ 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 },
{ 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 },
{ 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 },
{ 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 },
{ 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 },
{ 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 },
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
{ 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab },
{ 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac },
{ 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad },
{ 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
{ 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
};
static void
bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
{
memcpy(sc->H, iv, sizeof sc->H);
sc->ptr = 0;
#if SPH_64
sc->bit_count = 0;
#else
sc->bit_count_high = 0;
sc->bit_count_low = 0;
#endif
for ( int i = 0; i < 16; i++ )
sc->H[i] = _mm_set1_epi32( iv[i] );
sc->ptr = 0;
sc->bit_count = 0;
}
static void
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
{
unsigned char *buf;
size_t ptr;
sph_u32 htmp[16];
sph_u32 *h1, *h2;
#if !SPH_64
sph_u32 tmp;
#endif
__m128i *vdata = (__m128i*)data;
__m128i *buf;
__m128i htmp[16];
__m128i *h1, *h2;
size_t ptr;
const int buf_size = 64; // bytes of one lane, compatible with len
#if SPH_64
sc->bit_count += (sph_u64)len << 3;
#else
tmp = sc->bit_count_low;
sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3));
if (sc->bit_count_low < tmp)
sc->bit_count_high ++;
sc->bit_count_high += len >> 29;
#endif
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
h2 = htmp;
while (len > 0) {
size_t clen;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
data = (const unsigned char *)data + clen;
len -= clen;
ptr += clen;
if (ptr == sizeof sc->buf) {
sph_u32 *ht;
compress_small(buf, h1, h2);
ht = h1;
h1 = h2;
h2 = ht;
ptr = 0;
}
}
sc->ptr = ptr;
if (h1 != sc->H)
memcpy(sc->H, h1, sizeof sc->H);
sc->bit_count += (sph_u64)len << 3;
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
h2 = htmp;
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen >> 2 );
vdata += ( clen >> 2 );
len -= clen;
ptr += clen;
if ( ptr == buf_size )
{
__m128i *ht;
compress_small( buf, h1, h2 );
ht = h1;
h1 = h2;
h2 = ht;
ptr = 0;
}
}
sc->ptr = ptr;
if ( h1 != sc->H )
memcpy_128( sc->H, h1, 16 );
}
static void
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
unsigned char *buf, *out;
size_t ptr, u, v;
unsigned z;
sph_u32 h1[16], h2[16], *h;
__m128i *buf;
__m128i h1[16], h2[16], *h;
size_t ptr, u, v;
unsigned z;
const int buf_size = 64; // bytes of one lane, compatible with len
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
buf[ptr ++] = ((ub & -z) | z) & 0xFF;
h = sc->H;
if (ptr > (sizeof sc->buf) - 8) {
memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
compress_small(buf, h, h1);
ptr = 0;
h = h1;
}
memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr);
#if SPH_64
sph_enc64le_aligned(buf + (sizeof sc->buf) - 8,
SPH_T64(sc->bit_count + n));
#else
sph_enc32le_aligned(buf + (sizeof sc->buf) - 8,
sc->bit_count_low + n);
sph_enc32le_aligned(buf + (sizeof sc->buf) - 4,
SPH_T32(sc->bit_count_high));
#endif
compress_small(buf, h, h2);
for (u = 0; u < 16; u ++)
sph_enc32le_aligned(buf + 4 * u, h2[u]);
compress_small(buf, final_s, h1);
out = dst;
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
sph_enc32le(out + 4 * u, h1[v]);
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
buf[ ptr>>2 ] = _mm_set1_epi32( z );
ptr += 4;
h = sc->H;
// assume bit_count fits in 32 bits
if ( ptr > buf_size - 4 )
{
memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 );
compress_small( buf, h, h1 );
ptr = 0;
h = h1;
}
memset_zero_128( buf + (ptr>>2), (buf_size - 4 - ptr) >> 2 );
buf[ (buf_size - 4) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
compress_small( buf, h, h2 );
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_small( buf, final_s, h1 );
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
casti_m128i( dst, u ) = h1[v];
}
*/
#if SPH_64
// BMW512
static const __m256i final_b[16] =
{
@@ -908,33 +1087,33 @@ bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
casti_m256i(dst,u) = h1[v];
}
#endif
// BMW256
void
bmw256_4way_init(void *cc)
{
// bmw32_4way_init(cc, IV256);
bmw32_4way_init(cc, IV256);
}
void
bmw256_4way(void *cc, const void *data, size_t len)
{
// bmw32_4way(cc, data, len);
bmw32_4way(cc, data, len);
}
void
bmw256_4way_close(void *cc, void *dst)
{
// bmw256_4way_addbits_and_close(cc, 0, 0, dst);
bmw256_4way_addbits_and_close(cc, 0, 0, dst);
}
void
bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
// bmw32_4way_close(cc, ub, n, dst, 8);
bmw32_4way_close(cc, ub, n, dst, 8);
}
#if SPH_64
// BMW512
void
bmw512_4way_init(void *cc)
@@ -960,10 +1139,8 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
bmw64_4way_close(cc, ub, n, dst, 8);
}
#endif
#ifdef __cplusplus
}
#endif
#endif
#endif // __AVX2__

View File

@@ -46,94 +46,37 @@ extern "C"{
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
/**
* Output size (in bits) for BMW-224.
*/
#define SPH_SIZE_bmw224 224
/**
* Output size (in bits) for BMW-256.
*/
#define SPH_SIZE_bmw256 256
#if SPH_64
/**
* Output size (in bits) for BMW-384.
*/
#define SPH_SIZE_bmw384 384
/**
* Output size (in bits) for BMW-512.
*/
#define SPH_SIZE_bmw512 512
#endif
/**
* This structure is a context for BMW-224 and BMW-256 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a BMW computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running BMW
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
size_t ptr;
sph_u32 H[16];
#if SPH_64
sph_u64 bit_count;
#else
sph_u32 bit_count_high, bit_count_low;
#endif
#endif
__m128i buf[64];
__m128i H[16];
size_t ptr;
sph_u32 bit_count; // assume bit_count fits in 32 bits
} bmw_4way_small_context;
typedef bmw_4way_small_context bmw256_4way_context;
#if SPH_64
/**
* This structure is a context for BMW-384 and BMW-512 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a BMW computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running BMW
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
__m256i buf[16];
__m256i H[16];
// unsigned char buf[128]; /* first field, for alignment */
size_t ptr;
// sph_u64 H[16];
sph_u64 bit_count;
#endif
size_t ptr;
sph_u64 bit_count;
} bmw_4way_big_context;
typedef bmw_4way_big_context bmw512_4way_context;
#endif
void bmw256_4way_init(void *cc);
void bmw256_4way(void *cc, const void *data, size_t len);
void bmw256_4way_close(void *cc, void *dst);
void bmw256_addbits_and_close(
void bmw256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#if SPH_64
void bmw512_4way_init(void *cc);
void bmw512_4way(void *cc, const void *data, size_t len);
@@ -150,5 +93,3 @@ void bmw512_4way_addbits_and_close(
#endif
#endif
#endif

View File

@@ -23,12 +23,12 @@ void jha_hash_4way( void *out, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
__m256i mask, mask0, mask1;
__m256i* vh = (__m256i*)vhash;
__m256i* vh0 = (__m256i*)vhash0;
__m256i* vh1 = (__m256i*)vhash1;
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
blake512_4way_context ctx_blake;
hashState_groestl ctx_groestl;
@@ -40,127 +40,69 @@ void jha_hash_4way( void *out, const void *input )
keccak512_4way( &ctx_keccak, input, 80 );
keccak512_4way_close( &ctx_keccak, vhash );
// memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
// keccak512_4way( &ctx_keccak, input + (64<<2), 16 );
// keccak512_4way_close( &ctx_keccak, vhash );
// Heavy & Light Pair Loop
for ( int round = 0; round < 3; round++ )
{
// select next function based on bit 0 of previous hash.
// Specutively execute both functions and use mask to
// select results from correct function for each lane.
// hash = mask : vhash0 ? vhash1
mask = mm256_negate_64(
_mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
// second version
// mask0 = mask
// mask1 = mm256_not( mask );
// first version
// mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
// _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );
// groestl (serial) vs skein
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash0,
(char*)hash0, 512 );
(char*)hash0, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash1,
(char*)hash1, 512 );
(char*)hash1, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash2,
(char*)hash2, 512 );
(char*)hash2, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
// skein
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, vhash1 );
skein512_4way_close( &ctx_skein, vhashB );
// merge vectored hash
for ( int i = 0; i < 8; i++ )
{
// blend should be faster
vh[i] = _mm256_blendv_epi8( vh0[i], vh1[i], mask );
// second version
// vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
// _mm256_and_si256( vh1[i], mask1 ) );
// first version
/*
vh0[i] = _mm256_maskload_epi64(
vhash0 + i*4, mm256_not( mask ) );
vh1[i] = _mm256_maskload_epi64(
vhash1 + i*4, mask );
vh[i] = _mm256_or_si256( vh0[i], vh1[i] );
*/
}
// blake v jh
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
blake512_4way_init( &ctx_blake );
blake512_4way( &ctx_blake, vhash, 64 );
blake512_4way_close( &ctx_blake, vhash0 );
blake512_4way_close( &ctx_blake, vhashA );
jh512_4way_init( &ctx_jh );
jh512_4way( &ctx_jh, vhash, 64 );
jh512_4way_close( &ctx_jh, vhash1 );
jh512_4way_close( &ctx_jh, vhashB );
// merge hash
for ( int i = 0; i < 8; i++ )
{
vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
_mm256_and_si256( vh1[i], mask1 ) );
/*
vha256[i] = _mm256_maskload_epi64(
vhasha + i*4, mm256_not( mask ) );
vhb256[i] = _mm256_maskload_epi64(
vhashb + i*4, mask );
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
*/
}
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
}
mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
// memcpy( output, hash0, 32 );
// memcpy( output+32, hash1, 32 );
// memcpy( output+64, hash2, 32 );
// memcpy( output+96, hash3, 32 );
}
int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
uint64_t htmax[] = {
uint64_t htmax[] = {
0,
0xF,
0xFF,
@@ -168,7 +110,7 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
@@ -177,17 +119,12 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
0
};
// we need bigendian data...
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// precalc midstate for keccak
// keccak512_4way_init( &jha_kec_mid );
// keccak512_4way( &jha_kec_mid, vdata, 64 );
for ( int m = 0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
@@ -201,7 +138,6 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
be32enc( noncep3, n+3 );
jha_hash_4way( hash, vdata );
pdata[19] = n;
if ( ( !(hash[7] & mask) )
@@ -239,11 +175,9 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
}
*hashes_done = n - first_nonce + 1;
return num_found;
}

128
algo/lyra2/lyra2h-4way.c Normal file
View File

@@ -0,0 +1,128 @@
#include "lyra2h-gate.h"
#ifdef LYRA2H_4WAY
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
__thread uint64_t* lyra2h_4way_matrix;
bool lyra2h_4way_thread_init()
{
return ( lyra2h_4way_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
}
static __thread blake256_4way_context l2h_4way_blake_mid;
void lyra2h_4way_midstate( const void* input )
{
blake256_4way_init( &l2h_4way_blake_mid );
blake256_4way( &l2h_4way_blake_mid, input, 64 );
}
void lyra2h_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 19; i++ )
be32enc( &edata[i], pdata[i] );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
lyra2h_4way_midstate( vdata );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( &edata[19], n );
lyra2h_4way_hash( hash, vdata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

25
algo/lyra2/lyra2h-gate.c Normal file
View File

@@ -0,0 +1,25 @@
#include "lyra2h-gate.h"
#include "lyra2.h"
void lyra2h_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
// Wire the algo_gate for lyra2h.  Selects the 4-way SIMD implementation
// when it was compiled in (LYRA2H_4WAY), otherwise the scalar path.
// Always returns true.
bool register_lyra2h_algo( algo_gate_t* gate )
{
#ifdef LYRA2H_4WAY
// 4-way path: per-thread matrix allocation plus vectorized scanhash/hash.
gate->miner_thread_init = (void*)&lyra2h_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h_4way;
gate->hash = (void*)&lyra2h_4way_hash;
#else
gate->miner_thread_init = (void*)&lyra2h_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h;
gate->hash = (void*)&lyra2h_hash;
#endif
// Advertise which CPU feature levels benefit this algo.
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2h_set_target;
return true;
};

32
algo/lyra2/lyra2h-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef LYRA2H_GATE_H__
#define LYRA2H_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY)
#define LYRA2H_4WAY
#endif
#define LYRA2H_MATRIX_SIZE BLOCK_LEN_INT64 * 16 * 16 * 8
#if defined(LYRA2H_4WAY)
void lyra2h_4way_hash( void *state, const void *input );
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_4way_thread_init();
#endif
void lyra2h_hash( void *state, const void *input );
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_thread_init();
#endif

View File

@@ -1,6 +1,6 @@
#include "lyra2h-gate.h"
#include <memory.h>
#include <mm_malloc.h>
#include "algo-gate-api.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
@@ -8,8 +8,7 @@ __thread uint64_t* lyra2h_matrix;
bool lyra2h_thread_init()
{
const int i = 16 * 16 * 96;
lyra2h_matrix = _mm_malloc( i, 64 );
lyra2h_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 );
return lyra2h_matrix;
}
@@ -74,20 +73,3 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
void lyra2h_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_lyra2h_algo( algo_gate_t* gate )
{
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2h_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h;
gate->hash = (void*)&lyra2h_hash;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2h_set_target;
return true;
};

View File

@@ -106,6 +106,7 @@ int scanhash_lyra2re(int thr_id, struct work *work,
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio( work, hash );
return 1;
}
}

177
algo/lyra2/lyra2rev2-4way.c Normal file
View File

@@ -0,0 +1,177 @@
#include "lyra2rev2-gate.h"
#include <memory.h>
#ifdef __AVX2__
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
cubehashParam cube;
skein256_4way_context skein;
sph_bmw256_context bmw;
} lyra2v2_4way_ctx_holder;
static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
void init_lyra2rev2_4way_ctx()
{
// blake256_4way_init( &l2v2_4way_ctx.blake );
keccak256_4way_init( &l2v2_4way_ctx.keccak );
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_4way_ctx.skein );
sph_bmw256_init( &l2v2_4way_ctx.bmw );
}
// 4-way Lyra2REv2 hash chain: Blake256 -> Keccak256 -> CubeHash256 ->
// Lyra2 -> Skein256 -> CubeHash256 -> BMW256, four nonces per call.
// Blake resumes from a midstate the caller (scanhash) placed in
// l2v2_4way_ctx.blake covering the first 64 header bytes; only the
// final 16 bytes per lane are hashed here.
// Output: 4 x 32-byte digests written contiguously to state.
void lyra2rev2_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash64[4*4] __attribute__ ((aligned (64)));
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
// Work on a copy so the prototype contexts stay pristine.
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
// input is 4x32 interleaved; byte offset 64<<2 == 256 is where the
// last 16 bytes of each lane's 80-byte header begin.
blake256_4way( &ctx.blake, input + (64<<2), 16 );
// blake256_4way( &ctx.blake, input, 80 );
blake256_4way_close( &ctx.blake, vhash );
// Keccak256 is a 64-bit 4-way implementation: convert lane layout
// from 4x32 to 4x64 in place.
mm256_reinterleave_4x64( vhash64, vhash, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
// CubeHash and Lyra2 are scalar per lane: split into four 32-byte
// hashes, resetting the cube context between lanes.
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
// Lyra2REv2 core, in place per lane, using the per-thread matrix.
LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
// Back to 4x64 vector form for Skein256.
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
// Second scalar CubeHash pass per lane.
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
// Final scalar BMW256 per lane, reloading the context between lanes.
sph_bmw256( &ctx.bmw, hash0, 32 );
sph_bmw256_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
sph_bmw256( &ctx.bmw, hash1, 32 );
sph_bmw256_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
sph_bmw256( &ctx.bmw, hash2, 32 );
sph_bmw256_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
sph_bmw256( &ctx.bmw, hash3, 32 );
sph_bmw256_close( &ctx.bmw, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
/* Scan a nonce range four lanes at a time with the 4-way Lyra2REv2 hash.
 * Byte-swaps and interleaves the 80-byte header, precomputes the Blake256
 * midstate over the first 64 bytes, then hashes four consecutive nonces
 * per iteration.  Returns the number of lanes that found a share (0 if
 * the range was exhausted or a work restart was requested); per-lane
 * results are reported via work->nfound and work->nonces.
 */
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done )
{
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

   swab32_array( edata, pdata, 20 );
   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

   // Midstate over the nonce-independent first 64 bytes; the hash
   // function resumes from this prototype context.
   blake256_4way_init( &l2v2_4way_ctx.blake );
   blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );

   do {
      // Lane l tests nonce n+l; its slot is word 19 of lane l,
      // i.e. vdata[19*4 + l] in the 4x32 interleaved layout.
      for ( int lane = 0; lane < 4; lane++ )
      {
         found[lane] = false;
         be32enc( vdata + 76 + lane, n + lane );
      }

      lyra2rev2_4way_hash( hash, vdata );
      pdata[19] = n;

      for ( int lane = 0; lane < 4; lane++ )
      {
         uint32_t *lane_hash = hash + ( lane << 3 );
         if ( lane_hash[7] <= Htarg && fulltest( lane_hash, ptarget ) )
         {
            found[lane] = true;
            num_found++;
            nonces[lane] = n + lane;
            work_set_target_ratio( work, lane_hash );
         }
      }
      n += 4;
   } while ( (num_found == 0) && (n < max_nonce-4)
             && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
   return num_found;
}
#endif

View File

@@ -0,0 +1,38 @@
#include "lyra2rev2-gate.h"
__thread uint64_t* l2v2_wholeMatrix;
// Lyra2REv2 share-target scaling: job difficulty is reduced by a fixed
// factor of 256 and by the user-configurable difficulty factor.
void lyra2rev2_set_target( struct work* work, double job_diff )
{
   const double divisor = 256.0 * opt_diff_factor;
   work_set_target( work, job_diff / divisor );
}
/* One-time per-miner-thread allocation of the Lyra2 scratch matrix.
 * Size = BLOCK_LEN_INT64 * nCols(4) int64s per row * 8 bytes * nRows(4).
 * Fix: the byte count was narrowed into an 'int' before the call;
 * keep it in int64_t and convert once at the allocation site.
 * Returns true on success, false if the aligned allocation failed.
 */
bool lyra2rev2_thread_init()
{
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
   const int64_t size = ROW_LEN_BYTES * 4; // nRows
   l2v2_wholeMatrix = _mm_malloc( (size_t)size, 64 );
   return l2v2_wholeMatrix != NULL;
}
// Wire the algo_gate for lyra2rev2.  Chooses the 4-way implementation
// when LYRA2REV2_4WAY was compiled in, otherwise the scalar one; in
// both cases initializes the corresponding hash contexts up front.
// Always returns true.
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV2_4WAY)
init_lyra2rev2_4way_ctx();
gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
gate->hash = (void*)&lyra2rev2_4way_hash;
#else
init_lyra2rev2_ctx();
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif
// Feature levels that benefit this algo.
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
// Common to both paths: per-thread matrix allocation and target scaling.
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->set_target = (void*)&lyra2rev2_set_target;
return true;
};

View File

@@ -0,0 +1,35 @@
#ifndef LYRA2REV2_GATE_H__
#define LYRA2REV2_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#include "lyra2.h"
#if defined(HASH_4WAY)
#define LYRA2REV2_4WAY
#endif
extern __thread uint64_t* l2v2_wholeMatrix;
bool register_lyra2rev2_algo( algo_gate_t* gate );
#if defined(LYRA2REV2_4WAY)
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_lyra2rev2_4way_ctx();
#endif
void lyra2rev2_hash( void *state, const void *input );
int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_lyra2rev2_ctx();
#endif

View File

@@ -1,20 +1,12 @@
#include "lyra2rev2-gate.h"
#include <memory.h>
#include "algo-gate-api.h"
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "lyra2.h"
#include "avxdefs.h"
// This gets allocated when miner_thread starts up and is never freed.
// It's not a leak because the only way to allocate it again is to exit
// the thread and that only occurs when the entire program exits.
__thread uint64_t* l2v2_wholeMatrix;
//#include "lyra2.h"
typedef struct {
cubehashParam cube1;
@@ -106,6 +98,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
if( fulltest(hash, ptarget) )
{
pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce;
return 1;
}
@@ -119,30 +112,3 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
return 0;
}
void lyra2rev2_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool lyra2rev2_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v2_wholeMatrix = _mm_malloc( i, 64 );
return l2v2_wholeMatrix;
}
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
init_lyra2rev2_ctx();
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
gate->set_target = (void*)&lyra2rev2_set_target;
return true;
};

View File

@@ -85,8 +85,8 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( &edata[19], n );
lyra2z_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{

View File

@@ -82,41 +82,3 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
/*
//int64_t get_max64_0xffffLL() { return 0xffffLL; };
void lyra2z_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
{
work->height = sctx->bloc_height;
return false;
}
bool lyra2z_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
lyra2z_wholeMatrix = _mm_malloc( i, 64 );
return lyra2z_wholeMatrix;
}
bool register_lyra2z_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2z_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2z_set_target;
// gate->prevent_dupes = (void*)&zcoin_get_work_height;
return true;
};
*/

View File

@@ -346,6 +346,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
hash_str,
target_str);
}
work_set_target_ratio( work, hash );
pdata[19] = data[19];
goto out;
}

View File

@@ -132,6 +132,7 @@ int scanhash_nist5(int thr_id, struct work *work,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -172,6 +172,7 @@ int scanhash_zr5( int thr_id, struct work *work,
pdata[0] = tmpdata[0];
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
work_set_target_ratio( work, hash );
if (opt_debug)
applog(LOG_INFO, "found nonce %x", nonce);
return 1;

207
algo/quark/quark-4way.c Normal file
View File

@@ -0,0 +1,207 @@
#include "cpuminer-config.h"
#include "quark-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
jh512_4way_context jh;
skein512_4way_context skein;
keccak512_4way_context keccak;
} quark_4way_ctx_holder;
quark_4way_ctx_holder quark_4way_ctx __attribute__ ((aligned (64)));
// One-time initialization of the shared prototype contexts for the
// 4-way Quark hash.  quark_4way_hash() memcpy's this holder into a
// local copy each call, so these init functions run only once.
void init_quark_4way_ctx()
{
blake512_4way_init( &quark_4way_ctx.blake );
bmw512_4way_init( &quark_4way_ctx.bmw );
init_groestl( &quark_4way_ctx.groestl, 64 );
skein512_4way_init( &quark_4way_ctx.skein );
jh512_4way_init( &quark_4way_ctx.jh );
keccak512_4way_init( &quark_4way_ctx.keccak );
}
// 4-way Quark hash.  Quark's chain contains three data-dependent
// branches: after certain stages, bit 3 of the current hash selects
// which of two algorithms processes that lane next.  Here both
// candidates are always computed (into vhashA / vhashB) and the result
// is selected per lane with a blend mask, since the four lanes may
// branch differently.
// Output: 4 x 32-byte digests deinterleaved into state.
void quark_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
int i;
quark_4way_ctx_holder ctx;
// Local copy of the prototype contexts initialized once at startup.
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Branch 1 mask: all-ones in lanes where bit 3 of the first 64-bit
// hash word is clear (cmpeq with zero), used below to blend in the
// skein result; lanes with bit 3 set keep the groestl result.
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
// Groestl is AES-NI scalar code: run each lane separately.
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
// Candidate A = groestl output, candidate B = skein output.
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
// Unconditional groestl stage, again scalar per lane.
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// Branch 2 mask from the jh output: blend bmw (bit 3 clear) against
// blake (bit 3 set).
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// Branch 3 mask from the skein output: blend jh (bit 3 clear)
// against keccak (bit 3 set).
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
// Only the first 256 bits of each lane are needed by the caller.
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
}
/* Scan a nonce range four lanes at a time with the 4-way Quark hash.
 *
 * The header is byte-swapped and interleaved 4x64; each lane's nonce is
 * the first 32-bit half of 64-bit word 9, i.e. vdata + 73 + 2*lane.
 * Quark's cheap pre-filter requires the top 24 bits of hash word 7 to
 * be zero before the full target test runs.
 *
 * Fix: work_set_target_ratio() now receives the hash of the lane that
 * actually matched (hash+8 / hash+16 / hash+24) instead of always
 * lane 0's hash, matching the other 4-way scanhash functions.
 *
 * Returns the number of matching lanes (0 on restart or exhaustion);
 * per-lane results are reported via work->nfound and work->nonces.
 */
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done)
{
    uint32_t hash[4*8] __attribute__ ((aligned (64)));
    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
    uint32_t endiandata[20] __attribute__((aligned(64)));
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    uint32_t *nonces = work->nonces;
    bool *found = work->nfound;
    int num_found = 0;
    // 4x64 interleave: lane l's nonce word is at 9*8 + 1 + 2*l.
    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
    uint32_t *noncep1 = vdata + 75;
    uint32_t *noncep2 = vdata + 77;
    uint32_t *noncep3 = vdata + 79;

    swab32_array( endiandata, pdata, 20 );

    uint64_t *edata = (uint64_t*)endiandata;
    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

    do
    {
       found[0] = found[1] = found[2] = found[3] = false;
       be32enc( noncep0, n );
       be32enc( noncep1, n+1 );
       be32enc( noncep2, n+2 );
       be32enc( noncep3, n+3 );

       quark_4way_hash( hash, vdata );
       pdata[19] = n;

       if ( ( hash[7] & 0xFFFFFF00 ) == 0 && fulltest( hash, ptarget ) )
       {
          found[0] = true;
          num_found++;
          nonces[0] = n;
          work_set_target_ratio( work, hash );
       }
       if ( ( (hash+8)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+8, ptarget ) )
       {
          found[1] = true;
          num_found++;
          nonces[1] = n+1;
          work_set_target_ratio( work, hash+8 );    // was lane 0's hash
       }
       if ( ( (hash+16)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+16, ptarget ) )
       {
          found[2] = true;
          num_found++;
          nonces[2] = n+2;
          work_set_target_ratio( work, hash+16 );   // was lane 0's hash
       }
       if ( ( (hash+24)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+24, ptarget ) )
       {
          found[3] = true;
          num_found++;
          nonces[3] = n+3;
          work_set_target_ratio( work, hash+24 );   // was lane 0's hash
       }
       n += 4;
    } while ( ( num_found == 0 ) && ( n < max_nonce )
              && !work_restart[thr_id].restart );

    *hashes_done = n - first_nonce + 1;
    return num_found;
}
#endif

17
algo/quark/quark-gate.c Normal file
View File

@@ -0,0 +1,17 @@
#include "quark-gate.h"
// Wire the algo_gate for quark.  Chooses the 4-way implementation when
// QUARK_4WAY (AVX2 + AES) was compiled in, otherwise the scalar one;
// initializes the matching contexts up front.  Always returns true.
bool register_quark_algo( algo_gate_t* gate )
{
#if defined (QUARK_4WAY)
init_quark_4way_ctx();
gate->scanhash = (void*)&scanhash_quark_4way;
gate->hash = (void*)&quark_4way_hash;
#else
init_quark_ctx();
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
// Feature levels that benefit this algo.
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};

32
algo/quark/quark-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef QUARK_GATE_H__
#define QUARK_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define QUARK_4WAY
#endif
bool register_quark_algo( algo_gate_t* gate );
#if defined(QUARK_4WAY)
void quark_4way_hash( void *state, const void *input );
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_quark_4way_ctx();
#endif
void quark_hash( void *state, const void *input );
int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_quark_ctx();
#endif

View File

@@ -1,5 +1,5 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#include "quark-gate.h"
#include <stdio.h>
#include <string.h>
@@ -47,7 +47,7 @@ void init_quark_ctx()
#endif
}
inline static void quarkhash(void *state, const void *input)
void quark_hash(void *state, const void *input)
{
unsigned char hashbuf[128];
size_t hashptr;
@@ -187,11 +187,12 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
quarkhash(hash64, &endiandata);
quark_hash(hash64, &endiandata);
if ((hash64[7]&0xFFFFFF00)==0)
{
if (fulltest(hash64, ptarget))
{
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}
@@ -203,12 +204,3 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
bool register_quark_algo( algo_gate_t* gate )
{
init_quark_ctx();
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quarkhash;
return true;
};

View File

@@ -122,6 +122,7 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -134,6 +134,7 @@ int scanhash_qubit(int thr_id, struct work *work,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -754,6 +754,7 @@ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
if (unlikely(hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget))) {
*hashes_done = n - pdata[19] + 1;
pdata[19] = data[i * 20 + 19];
work_set_target_ratio( work, hash );
return 1;
}
}

View File

@@ -114,7 +114,7 @@ available_implementations() {
return flags;
}
#endif
/*
static int
scrypt_test_mix() {
static const uint8_t expected[16] = {
@@ -145,4 +145,4 @@ scrypt_test_mix() {
return ret;
}
*/

View File

@@ -26,7 +26,7 @@
#include "scrypt-jane-pbkdf2.h"
#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */
/*
static int
scrypt_test_hash() {
scrypt_hash_state st;
@@ -45,4 +45,4 @@ scrypt_test_hash() {
scrypt_hash_finish(&st, final);
return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
}
*/

View File

@@ -342,17 +342,6 @@ do { \
do { \
sph_u64 t0, t1, t2; \
__m256i h8; \
/* can LE be assumed? \
dec64le does nothing when SPH_LITTLE endian is set, as it is. \
__m256i m0 = _mm256_dec64le( buf ); \
__m256i m1 = _mm256_dec64le( buf + 8*4 ); \
__m256i m2 = _mm256_dec64le( buf + 16*4 ); \
__m256i m3 = _mm256_dec64le( buf + 24*4 ); \
__m256i m4 = _mm256_dec64le( buf + 32*4 ); \
__m256i m5 = _mm256_dec64le( buf + 40*4 ); \
__m256i m6 = _mm256_dec64le( buf + 48*4 ); \
__m256i m7 = _mm256_dec64le( buf + 56*4 ); \
*/ \
__m256i m0 = buf[0]; \
__m256i m1 = buf[1]; \
__m256i m2 = buf[2]; \

View File

@@ -39,7 +39,9 @@
*/
#ifndef __SKEIN_HASH_4WAY_H__
#define __SKEIN_HASH_4WAY_H__
#define __SKEIN_HASH_4WAY_H__ 1
#ifdef __AVX2__
#ifdef __cplusplus
extern "C"{
@@ -53,14 +55,15 @@ extern "C"{
#define SPH_SIZE_skein256 256
#define SPH_SIZE_skein512 512
#ifdef __AVX2__
typedef struct {
__m256i buf[8] __attribute__ ((aligned (32)));
__m256i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
sph_u64 bcount;
} skein512_4way_context;
} sph_skein_4way_big_context;
typedef sph_skein_4way_big_context skein512_4way_context;
typedef sph_skein_4way_big_context skein256_4way_context;
void skein512_4way_init(void *cc);
void skein512_4way(void *cc, const void *data, size_t len);
@@ -68,26 +71,15 @@ void skein512_4way_close(void *cc, void *dst);
//void sph_skein512_addbits_and_close(
// void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __AVX__
typedef struct {
__m128i buf[8] __attribute__ ((aligned (32)));
__m128i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
sph_u64 bcount;
} skein256_4way_context;
void skein256_4way_init(void *cc);
void skein256_4way(void *cc, const void *data, size_t len);
void skein256_4way_close(void *cc, void *dst);
//void sph_skein256_addbits_and_close(
// void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif
#endif

231
algo/sm3/sm3-hash-4way.c Normal file
View File

@@ -0,0 +1,231 @@
/* ====================================================================
* Copyright (c) 2014 - 2017 The GmSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the GmSSL Project.
* (http://gmssl.org/)"
*
* 4. The name "GmSSL Project" must not be used to endorse or promote
* products derived from this software without prior written
* permission. For written permission, please contact
* guanzhi1980@gmail.com.
*
* 5. Products derived from this software may not be called "GmSSL"
* nor may "GmSSL" appear in their names without prior written
* permission of the GmSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the GmSSL Project
* (http://gmssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*/
#include <string.h>
#include "sm3-hash-4way.h"
#ifdef __AVX__
// Initialize a 4-lane SM3 state: every lane receives the standard
// SM3 initial value; counters start at zero.
void sm3_4way_init( sm3_4way_ctx_t *ctx )
{
   // SM3 IV as specified by the standard, one 32-bit word per digest slot.
   static const uint32_t sm3_iv[8] = {
      0x7380166F, 0x4914B2B9, 0x172442D7, 0xDA8A0600,
      0xA96F30BC, 0x163138AA, 0xE38DEE4D, 0xB0FB0E4E
   };
   for ( int i = 0; i < 8; i++ )
      ctx->digest[i] = _mm_set1_epi32( sm3_iv[i] );
   ctx->nblocks = 0;
   ctx->num = 0;
}
// Absorb len bytes (per lane) into the 4-lane SM3 state.  The buffer
// and data are arrays of __m128i, one 32-bit message word across the
// four lanes per element, so pointer math advances in words (len>>2).
// NOTE(review): partial-block handling copies len>>2 words and adds
// len bytes to ctx->num — correct only when len and ctx->num stay
// multiples of 4; the mining callers use 32/64-byte inputs.  Confirm
// before feeding unaligned lengths.
void sm3_4way( void *cc, const void *data, size_t len )
{
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
__m128i *block = (__m128i*)ctx->block;
__m128i *vdata = (__m128i*)data;
// Fill a previously started partial block first.
if ( ctx->num )
{
unsigned int left = SM3_BLOCK_SIZE - ctx->num;
if ( len < left )
{
memcpy_128( block + (ctx->num >> 2), vdata , len>>2 );
ctx->num += len;
return;
}
else
{
memcpy_128( block + (ctx->num >> 2), vdata , left>>2 );
sm3_4way_compress( ctx->digest, block );
ctx->nblocks++;
vdata += left>>2;
len -= left;
}
}
// Compress full 64-byte blocks directly from the input.
while ( len >= SM3_BLOCK_SIZE )
{
sm3_4way_compress( ctx->digest, vdata );
ctx->nblocks++;
vdata += SM3_BLOCK_SIZE>>2;
len -= SM3_BLOCK_SIZE;
}
// Stash any tail bytes for the next call or for close().
ctx->num = len;
if ( len )
memcpy_128( block, vdata, len>>2 );
}
/* Finalize 4-lane SM3: append the 0x80 padding word and the 64-bit
 * big-endian bit length, compress, and byte-swap the digest into dst.
 *
 * Fixes:
 * - the padding word is written at word index ctx->num >> 2 — block[]
 *   holds one 32-bit word per __m128i (16 per block), and the adjacent
 *   memset already starts at (ctx->num >> 2) + 1; indexing by the byte
 *   count could write past the 16-word buffer.
 * - the no-room-for-length branch's memset count is in __m128i words,
 *   so it is (SM3_BLOCK_SIZE >> 2) - (ctx->num >> 2) - 1, not a byte
 *   count.
 * NOTE(review): like sm3_4way(), this assumes ctx->num is a multiple
 * of 4 (word granularity) — confirm for unaligned inputs.
 */
void sm3_4way_close( void *cc, void *dst )
{
   sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
   __m128i *hash = (__m128i*)dst;
   __m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
   __m128i *block = (__m128i*)ctx->block;
   int i;

   // Padding byte 0x80 goes into the word right after the data.
   block[ctx->num >> 2] = _mm_set1_epi32( 0x80 );

   if ( ctx->num + 8 <= SM3_BLOCK_SIZE )
   {
      // Length fits in this block: zero between padding and length.
      memset_zero_128( block + (ctx->num >> 2) + 1,
                       ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
   }
   else
   {
      // No room for the length: zero out the rest of this block,
      // compress it, then start a fresh block holding only the length.
      memset_zero_128( block + (ctx->num >> 2) + 1,
                       ( SM3_BLOCK_SIZE >> 2 ) - (ctx->num >> 2) - 1 );
      sm3_4way_compress( ctx->digest, block );
      memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
   }

   // Total message length in bits = nblocks*512 + num*8, stored as two
   // big-endian 32-bit words (high word = nblocks >> 23).
   count[0] = mm_byteswap_32(
                _mm_set1_epi32( ctx->nblocks >> 23 ) );
   count[1] = mm_byteswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
                                              ( ctx->num << 3 ) ) );
   sm3_4way_compress( ctx->digest, block );

   // SM3 output is big-endian.
   for ( i = 0; i < 8 ; i++ )
      hash[i] = mm_byteswap_32( ctx->digest[i] );
}
// SM3 permutation P0: x ^ (x <<< 9) ^ (x <<< 17), vectorized 4 wide.
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 9 ), \
mm_rotl_32( x, 17 ) ) )
// SM3 permutation P1: x ^ (x <<< 15) ^ (x <<< 23), used in expansion.
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 15 ), \
mm_rotl_32( x, 23 ) ) )
// Boolean function FF for rounds 0-15: x ^ y ^ z.
#define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) )
// Boolean function FF for rounds 16-63: majority(x, y, z).
#define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \
_mm_and_si128( x, z ) ), \
_mm_and_si128( y, z ) )
// Boolean function GG for rounds 0-15: same as FF0.
#define GG0(x,y,z) FF0(x,y,z)
// Boolean function GG for rounds 16-63: (x & y) | (~x & z).
#define GG1(x,y,z) _mm_or_si128( _mm_and_si128( x, y ), \
_mm_andnot_si128( x, z ) )
// One SM3 compression over a 64-byte block, 4 lanes in parallel.
// block[] holds 16 big-endian message words, one __m128i per word.
void sm3_4way_compress( __m128i *digest, __m128i *block )
{
__m128i W[68], W1[64];
__m128i A = digest[ 0 ];
__m128i B = digest[ 1 ];
__m128i C = digest[ 2 ];
__m128i D = digest[ 3 ];
__m128i E = digest[ 4 ];
__m128i F = digest[ 5 ];
__m128i G = digest[ 6 ];
__m128i H = digest[ 7 ];
__m128i SS1, SS2, TT1, TT2, T;
int j;
// Message expansion: load W0..W15 (byte-swapped to native order),
// then derive W16..W67 and W'0..W'63 = Wj ^ Wj+4.
for ( j = 0; j < 16; j++ )
W[j] = mm_byteswap_32( block[j] );
for ( j = 16; j < 68; j++ )
W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
W[ j-9 ] ),
mm_rotl_32( W[ j-3 ], 15 ) ) ),
_mm_xor_si128( mm_rotl_32( W[ j-13 ], 7 ),
W[ j-6 ] ) );
for( j = 0; j < 64; j++ )
W1[j] = _mm_xor_si128( W[j], W[j+4] );
// Rounds 0-15 use constant T = 0x79CC4519 (rotated by the round
// number) and the FF0/GG0 boolean functions.
T = _mm_set1_epi32( 0x79CC4519UL );
for( j =0; j < 16; j++ )
{
SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ),
mm_rotl_32( T, j ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm_rotl_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm_rotl_32( F, 19 );
F = E;
E = P0( TT2 );
}
// Rounds 16-63 use T = 0x7A879D8A (rotation amount j mod 32) and the
// FF1/GG1 boolean functions.
T = _mm_set1_epi32( 0x7A879D8AUL );
for( j =16; j < 64; j++ )
{
SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ),
mm_rotl_32( T, j&31 ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm_rotl_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm_rotl_32( F, 19 );
F = E;
E = P0( TT2 );
}
// Davies-Meyer style feed-forward: XOR the round output into the state.
digest[0] = _mm_xor_si128( digest[0], A );
digest[1] = _mm_xor_si128( digest[1], B );
digest[2] = _mm_xor_si128( digest[2], C );
digest[3] = _mm_xor_si128( digest[3], D );
digest[4] = _mm_xor_si128( digest[4], E );
digest[5] = _mm_xor_si128( digest[5], F );
digest[6] = _mm_xor_si128( digest[6], G );
digest[7] = _mm_xor_si128( digest[7], H );
}
#endif

89
algo/sm3/sm3-hash-4way.h Normal file
View File

@@ -0,0 +1,89 @@
/* ====================================================================
* Copyright (c) 2014 - 2016 The GmSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the GmSSL Project.
* (http://gmssl.org/)"
*
* 4. The name "GmSSL Project" must not be used to endorse or promote
* products derived from this software without prior written
* permission. For written permission, please contact
* guanzhi1980@gmail.com.
*
* 5. Products derived from this software may not be called "GmSSL"
* nor may "GmSSL" appear in their names without prior written
* permission of the GmSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the GmSSL Project
* (http://gmssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*/
#ifndef SPH_SM3_HASH_4WAY_H
#define SPH_SM3_HASH_4WAY_H
#define SM3_DIGEST_LENGTH 32
#define SM3_BLOCK_SIZE 64
#define SM3_CBLOCK (SM3_BLOCK_SIZE)
#define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH)
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "avxdefs.h"
#ifdef __cplusplus
extern "C" {
#endif
// Four-lane SM3 streaming context: lane k of each __m128i element carries
// stream k's data.
typedef struct {
    __m128i block[16] __attribute__ ((aligned (64)));  // buffered message block, one 64-byte block per lane
    __m128i digest[8];                                 // chaining state A..H per lane
    uint32_t nblocks;   // presumably the count of 64-byte blocks processed so far — confirm against sm3_4way()
    uint32_t num;       // presumably bytes currently buffered in 'block' (0..63) — confirm against sm3_4way()
} sm3_4way_ctx_t;
void sm3_4way_init( sm3_4way_ctx_t *ctx );
//void sm3_4way_update( sm3_4way_ctx_t *ctx, const unsigned char* data,
// size_t data_len );
//void sm3_4way_final( sm3_4way_ctx_t *ctx,
// unsigned char digest[SM3_DIGEST_LENGTH] );
void sm3_4way_compress( __m128i *digest, __m128i *block );
void sm3_4way(void *cc, const void *data, size_t len);
void sm3_4way_close(void *cc, void *dst);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -189,7 +189,7 @@ void sm3_compress(uint32_t digest[8], const unsigned char block[64])
for(j =16; j < 64; j++) {
T[j] = 0x7A879D8A;
SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7);
SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j&31)), 7);
SS2 = SS1 ^ ROTATELEFT(A,12);
TT1 = FF1(A,B,C) + D + SS2 + W1[j];
TT2 = GG1(E,F,G) + H + SS1 + W[j];

View File

@@ -3468,9 +3468,10 @@ sph_ ## name ## _close(void *cc, void *dst) \
for (i = 0; i < 8; i ++) \
sph_enc64le((unsigned char *)dst + 8 * i, sc->state[i]); \
}
// sph_ ## name ## _init(cc); \
//}
/*
sph_ ## name ## _init(cc); \
}
*/
MAKE_CLOSE(whirlpool)
MAKE_CLOSE(whirlpool0)
MAKE_CLOSE(whirlpool1)

View File

@@ -22,6 +22,7 @@ void whirlpool_hash( void *state, const void *input );
int scanhash_whirlpool( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_whirlpool_ctx();
#endif
#endif

View File

@@ -3345,8 +3345,10 @@ do { \
#define READ_STATE MUL8(READ_STATE_W)
#define ROUND0 MUL8(ROUND0_W)
#define UPDATE_STATE MUL8(UPDATE_STATE_W)
//#define BYTE(x, n) \
// _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
/*
#define BYTE(x, n) \
_mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
*/
#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF)

View File

@@ -162,7 +162,8 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
work_set_target_ratio( work, hash );
return 1;
}
nonce++;
} while ( nonce < max_nonce && !(*restart) );

274
algo/x11/timetravel-4way.c Normal file
View File

@@ -0,0 +1,274 @@
#include "timetravel-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
// One context per hash function in the timetravel-8 suite.
// blake/bmw/skein/jh/keccak are 4-way SIMD contexts; groestl, luffa and
// cubehash are single-stream and are run once per lane in the hash loop.
typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    hashState_groestl       groestl;
    skein512_4way_context   skein;
    jh512_4way_context      jh;
    keccak512_4way_context  keccak;
    hashState_luffa         luffa;
    cubehashParam           cube;
} tt8_4way_ctx_holder;
tt8_4way_ctx_holder tt8_4way_ctx __attribute__ ((aligned (64)));
// Initialise the pristine one-time contexts for all eight timetravel hash
// functions.  timetravel_4way_hash() memcpy's this master copy into a
// per-call working context, so this only needs to run once at registration.
void init_tt8_4way_ctx()
{
   blake512_4way_init( &tt8_4way_ctx.blake );
   bmw512_4way_init( &tt8_4way_ctx.bmw );
   init_groestl( &tt8_4way_ctx.groestl, 64 );
   skein512_4way_init( &tt8_4way_ctx.skein );
   jh512_4way_init( &tt8_4way_ctx.jh );
   keccak512_4way_init( &tt8_4way_ctx.keccak );
   init_luffa( &tt8_4way_ctx.luffa, 512 );
   cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 );
}
// Compute the timetravel (8-function) chained hash for four interleaved
// inputs at once.
//   input:  4x64-bit interleaved data, 80 bytes per lane (block header).
//   output: four 32-byte results stored back-to-back (lane0 at output+0,
//           lane1 at output+32, lane2 at output+64, lane3 at output+96).
// The order the eight functions are chained in comes from the file-scope
// 'permutation' array, which scanhash_timetravel_4way() derives from the
// block timestamp.
void timetravel_4way_hash(void *output, const void *input)
{
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t hash2[8] __attribute__ ((aligned (64)));
   uint64_t hash3[8] __attribute__ ((aligned (64)));
   // Ping-pong scratch buffers holding the interleaved 4-way data.
   uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
   uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
   uint64_t *vhashA, *vhashB;    // current stage's source / destination
   tt8_4way_ctx_holder ctx __attribute__ ((aligned (64)));
   uint32_t dataLen = 64;
   int i;

   // Work on a private copy so the pristine master contexts stay reusable.
   memcpy( &ctx, &tt8_4way_ctx, sizeof(tt8_4way_ctx) );

   for ( i = 0; i < TT8_FUNC_COUNT; i++ )
   {
      if (i == 0)
      {
         // First stage consumes the 80-byte header directly from input.
         dataLen = 80;
         vhashA = (uint64_t*)input;
         vhashB = vhashX;
      }
      else
      {
         // Later stages hash the previous 64-byte digest, alternating
         // between the two scratch buffers.
         dataLen = 64;
         if ( i % 2 == 0 )
         {
            vhashA = vhashY;
            vhashB = vhashX;
         }
         else
         {
            vhashA = vhashX;
            vhashB = vhashY;
         }
      }

      // 4-way algos consume/produce interleaved data directly.  The
      // single-stream algos (groestl, luffa, cubehash) deinterleave into
      // hash0..hash3, run once per lane, then re-interleave.  On the final
      // stage (i == 7) the result is left deinterleaved in hash0..hash3
      // ready for output.
      switch ( permutation[i] )
      {
        case 0:
           blake512_4way( &ctx.blake, vhashA, dataLen );
           blake512_4way_close( &ctx.blake, vhashB );
           if ( i == 7 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 1:
           bmw512_4way( &ctx.bmw, vhashA, dataLen );
           bmw512_4way_close( &ctx.bmw, vhashB );
           if ( i == 7 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 2:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                     (char*)hash0, dataLen<<3 );
           reinit_groestl( &ctx.groestl );
           update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                     (char*)hash1, dataLen<<3 );
           reinit_groestl( &ctx.groestl );
           update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                     (char*)hash2, dataLen<<3 );
           reinit_groestl( &ctx.groestl );
           update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                     (char*)hash3, dataLen<<3 );
           if ( i != 7 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        case 3:
           skein512_4way( &ctx.skein, vhashA, dataLen );
           skein512_4way_close( &ctx.skein, vhashB );
           if ( i == 7 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 4:
           jh512_4way( &ctx.jh, vhashA, dataLen );
           jh512_4way_close( &ctx.jh, vhashB );
           if ( i == 7 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 5:
           keccak512_4way( &ctx.keccak, vhashA, dataLen );
           keccak512_4way_close( &ctx.keccak, vhashB );
           if ( i == 7 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 6:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           // Luffa keeps no master reinit helper, so restore the pristine
           // context by copying before each extra lane.
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
                                   (const BitSequence *)hash0, dataLen );
           memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
                                   (const BitSequence*)hash1, dataLen );
           memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
                                   (const BitSequence*)hash2, dataLen );
           memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
                                   (const BitSequence*)hash3, dataLen );
           if ( i != 7 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        case 7:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
                                 (const byte*)hash0, dataLen );
           memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
                                 (const byte*)hash1, dataLen );
           memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
                                 (const byte*)hash2, dataLen );
           memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
                                 (const byte*)hash3, dataLen );
           if ( i != 7 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        default:
           applog(LOG_ERR,"SWERR: timetravel invalid permutation");
        break;
      }
   }

   // Emit the first 32 bytes (256 bits) of each lane's final 512-bit digest.
   // (Arithmetic on void* is a GCC extension used throughout this codebase.)
   memcpy( output,    hash0, 32 );
   memcpy( output+32, hash1, 32 );
   memcpy( output+64, hash2, 32 );
   memcpy( output+96, hash3, 32 );
}
// Scan nonces four at a time with the 4-way timetravel hash.
// Returns the number of lanes whose hash met the target (0 if none found
// before max_nonce or a work restart).  Winning nonces are reported via
// work->nonces[] with the matching work->nfound[] flags set.
int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
                              uint64_t *hashes_done )
{
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;
   // In 4x64 interleaved layout each lane's nonce (32-bit word 19, the
   // high half of 64-bit word 9) sits at uint32 index (9*4 + lane)*2 + 1.
   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
   uint32_t *noncep1 = vdata + 75;
   uint32_t *noncep2 = vdata + 77;
   uint32_t *noncep3 = vdata + 79;
   const uint32_t Htarg = ptarget[7];
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   int i;

   if ( opt_benchmark )
      ptarget[7] = 0x0cff;

   // Byteswap header words 0..18 only; word 19 (the nonce) is written per
   // lane below.  NOTE(review): endiandata[19] is never initialised here
   // and gets interleaved into vdata, but every lane's nonce slot is then
   // overwritten by the be32enc() calls before hashing — confirm.
   for ( int k = 0; k < 19; k++ )
      be32enc( &endiandata[k], pdata[k] );

   const uint32_t timestamp = endiandata[17];
   if ( timestamp != s_ntime )
   {
      // ntime changed: rebuild the function ordering by stepping the
      // identity permutation (ntime - genesis) mod 8! times.
      const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP )
                    % TT8_FUNC_COUNT_PERMUTATIONS;
      for ( i = 0; i < TT8_FUNC_COUNT; i++ )
         permutation[i] = i;
      for ( i = 0; i < steps; i++ )
         tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT );
      s_ntime = timestamp;
   }

   // Replicate the header into all four lanes (640 bits = 80 bytes each).
   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

   do
   {
      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep0, n );
      be32enc( noncep1, n+1 );
      be32enc( noncep2, n+2 );
      be32enc( noncep3, n+3 );

      timetravel_4way_hash( hash, vdata );
      pdata[19] = n;

      // Lane results are 8 uint32 apart; cheap high-word pre-check before
      // the full 256-bit fulltest().
      if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
      {
         found[0] = true;
         num_found++;
         nonces[0] = n;
         work_set_target_ratio( work, hash );
      }
      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) )
      {
         found[1] = true;
         num_found++;
         nonces[1] = n+1;
         work_set_target_ratio( work, hash+8 );
      }
      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) )
      {
         found[2] = true;
         num_found++;
         nonces[2] = n+2;
         work_set_target_ratio( work, hash+16 );
      }
      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) )
      {
         found[3] = true;
         num_found++;
         nonces[3] = n+3;
         work_set_target_ratio( work, hash+24 );
      }
      n += 4;
   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );

   *hashes_done = n - first_nonce + 1;
   return num_found;
}
#endif

View File

@@ -0,0 +1,78 @@
#include "timetravel-gate.h"
// Timetravel shares are submitted at 1/256 of the job difficulty,
// further scaled by the user's --diff-factor option.
void tt8_set_target( struct work* work, double job_diff )
{
   double scaled_diff = job_diff / ( 256.0 * opt_diff_factor );
   work_set_target( work, scaled_diff );
}
// Register the timetravel (8-function, Machinecoin) algorithm with the
// algo gate.  Selects the AVX2/AES 4-way implementation when compiled in,
// otherwise the scalar path.  Always returns true.
bool register_timetravel_algo( algo_gate_t* gate )
{
#ifdef TIMETRAVEL_4WAY
  init_tt8_4way_ctx();
  gate->scanhash      = (void*)&scanhash_timetravel_4way;
  gate->hash          = (void*)&timetravel_4way_hash;
#else
  init_tt8_ctx();
  gate->scanhash      = (void*)&scanhash_timetravel;
  gate->hash          = (void*)&timetravel_hash;
#endif
  gate->set_target    = (void*)&tt8_set_target;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
  gate->get_max64     = (void*)&get_max64_0xffffLL;
  return true;
}
// Exchange the two ints pointed to by a and b.
// Made 'static inline': a plain 'inline' definition at file scope provides
// no external definition in C99/C11, which can break linking at -O0, and
// internal linkage also prevents clashes with the identically named
// helpers in other gate files.
static inline void tt_swap( int *a, int *b )
{
   int tmp = *a;
   *a = *b;
   *b = tmp;
}
// Reverse the int range [pbegin, pend) in place; empty and single-element
// ranges are no-ops.  The element exchange is done inline with a local
// temporary.  Made 'static inline': a plain 'inline' definition at file
// scope provides no external definition in C99/C11, and the non-static
// 'reverse' here would collide with the identical helper defined in
// timetravel10-gate.c when both are linked.
static inline void reverse( int *pbegin, int *pend )
{
   while ( ( pbegin != pend ) && ( pbegin != --pend ) )
   {
      int tmp = *pbegin;
      *pbegin = *pend;
      *pend   = tmp;
      pbegin++;
   }
}
// Advance the int range [pbegin, pend) to its next lexicographic
// permutation, wrapping around to the fully sorted (first) permutation
// after the last one.  Direct port of the classic std::next_permutation
// algorithm; the original's bool result is discarded (see the
// "return; // true/false" comments).  Empty and single-element ranges
// are left unchanged.
void tt8_next_permutation( int *pbegin, int *pend )
{
   if ( pbegin == pend )
	return;

   int *i = pbegin;
   ++i;
   if ( i == pend )
	return;

   i = pend;
   --i;

   while (1)
   {
	int *j = i;
	--i;

        // Found the rightmost ascent *i < *j: swap *i with the rightmost
        // element greater than it, then reverse the suffix to minimise it.
	if ( *i < *j )
        {
	   int *k = pend;

	   while ( !(*i < *--k) ) /* do nothing */ ;

	   tt_swap( i, k );
	   reverse(j, pend);
		return; // true
	}

        // Whole range is descending: this was the last permutation, so
        // wrap back to the sorted (first) one.
	if ( i == pbegin )
        {
	   reverse(pbegin, pend);
		return; // false
	}
        // else?
   }
}

View File

@@ -0,0 +1,40 @@
#ifndef TIMETRAVEL_GATE_H__
#define TIMETRAVEL_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define TIMETRAVEL_4WAY
#endif
// Machinecoin Genesis Timestamp
#define TT8_FUNC_BASE_TIMESTAMP 1389040865
#define TT8_FUNC_COUNT 8
#define TT8_FUNC_COUNT_PERMUTATIONS 40320
void tt8_next_permutation( int *pbegin, int *pend );
bool register_timetravel_algo( algo_gate_t* gate );
#if defined(TIMETRAVEL_4WAY)
void timetravel_4way_hash( void *state, const void *input );
int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_tt8_4way_ctx();
#endif
void timetravel_hash( void *state, const void *input );
int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_tt8_ctx();
#endif

View File

@@ -1,11 +1,9 @@
#include "algo-gate-api.h"
#include "timetravel-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "avxdefs.h"
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
@@ -13,75 +11,14 @@
#include "algo/skein/sph_skein.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h"
#else
#include "algo/groestl/aes_ni/hash-groestl.h"
#endif
// Machinecoin Genesis Timestamp
#define HASH_FUNC_BASE_TIMESTAMP 1389040865
#define HASH_FUNC_COUNT 8
#define HASH_FUNC_COUNT_PERMUTATIONS 40320
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[HASH_FUNC_COUNT] = { 0 };
inline void tt_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt_swap( pbegin, pend );
pbegin++;
}
}
static void next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}
static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
typedef struct {
sph_blake512_context blake;
@@ -101,7 +38,7 @@ typedef struct {
tt_ctx_holder tt_ctx __attribute__ ((aligned (64)));
__thread tt_ctx_holder tt_mid __attribute__ ((aligned (64)));
void init_tt_ctx()
void init_tt8_ctx()
{
sph_blake512_init( &tt_ctx.blake );
sph_bmw512_init( &tt_ctx.bmw );
@@ -119,7 +56,7 @@ void init_tt_ctx()
void timetravel_hash(void *output, const void *input)
{
uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t hash[ 16 * TT8_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t *hashA, *hashB;
tt_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64;
@@ -130,7 +67,7 @@ void timetravel_hash(void *output, const void *input)
memcpy( &ctx, &tt_ctx, sizeof(tt_ctx) );
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
{
if (i == 0)
{
@@ -270,7 +207,7 @@ void timetravel_hash(void *output, const void *input)
}
}
memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32);
memcpy(output, &hash[16 * (TT8_FUNC_COUNT - 1)], 32);
}
int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
@@ -296,12 +233,12 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime )
{
const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
% HASH_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP )
% TT8_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
permutation[i] = i;
for ( i = 0; i < steps; i++ )
next_permutation( permutation, permutation + HASH_FUNC_COUNT );
tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT );
s_ntime = timestamp;
// do midstate precalc for first function
@@ -359,6 +296,7 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
work_set_target_ratio( work, hash );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio( work, hash );
return 1;
}
nonce++;
@@ -370,19 +308,4 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
void timetravel_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_tt_ctx();
gate->scanhash = (void*)&scanhash_timetravel;
gate->hash = (void*)&timetravel_hash;
gate->set_target = (void*)&timetravel_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

View File

@@ -0,0 +1,316 @@
#include "timetravel10-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
// One context per hash function in the timetravel10 (BitCore) suite.
// blake/bmw/skein/jh/keccak are 4-way SIMD contexts; groestl, luffa,
// cubehash, shavite and simd are single-stream and run once per lane.
typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
    hashState_groestl       groestl;
    skein512_4way_context   skein;
    jh512_4way_context      jh;
    keccak512_4way_context  keccak;
    hashState_luffa         luffa;
    cubehashParam           cube;
    sph_shavite512_context  shavite;
    hashState_sd            simd;
} tt10_4way_ctx_holder;
tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64)));
// Initialise the pristine one-time contexts for all ten timetravel10 hash
// functions.  timetravel10_4way_hash() memcpy's this master copy into a
// per-call working context, so this only needs to run once at registration.
void init_tt10_4way_ctx()
{
   blake512_4way_init( &tt10_4way_ctx.blake );
   bmw512_4way_init( &tt10_4way_ctx.bmw );
   init_groestl( &tt10_4way_ctx.groestl, 64 );
   skein512_4way_init( &tt10_4way_ctx.skein );
   jh512_4way_init( &tt10_4way_ctx.jh );
   keccak512_4way_init( &tt10_4way_ctx.keccak );
   init_luffa( &tt10_4way_ctx.luffa, 512 );
   cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 );
   sph_shavite512_init( &tt10_4way_ctx.shavite );
   init_sd( &tt10_4way_ctx.simd, 512 );
}
// Compute the timetravel10 (BitCore, 10-function) chained hash for four
// interleaved inputs at once.
//   input:  4x64-bit interleaved data, 80 bytes per lane (block header).
//   output: four 32-byte results stored back-to-back (lane0 at output+0,
//           lane1 at output+32, lane2 at output+64, lane3 at output+96).
// The chaining order of the ten functions comes from the file-scope
// 'permutation' array, recomputed per ntime by scanhash_timetravel10_4way().
void timetravel10_4way_hash(void *output, const void *input)
{
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t hash2[8] __attribute__ ((aligned (64)));
   uint64_t hash3[8] __attribute__ ((aligned (64)));
   // Ping-pong scratch buffers holding the interleaved 4-way data.
   uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
   uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
   uint64_t *vhashA, *vhashB;    // current stage's source / destination
   tt10_4way_ctx_holder ctx __attribute__ ((aligned (64)));
   uint32_t dataLen = 64;
   int i;

   // Work on a private copy so the pristine master contexts stay reusable.
   memcpy( &ctx, &tt10_4way_ctx, sizeof(tt10_4way_ctx) );

   for ( i = 0; i < TT10_FUNC_COUNT; i++ )
   {
      if (i == 0)
      {
         // First stage consumes the 80-byte header directly from input.
         dataLen = 80;
         vhashA = (uint64_t*)input;
         vhashB = vhashX;
      }
      else
      {
         // Later stages hash the previous 64-byte digest, alternating
         // between the two scratch buffers.
         dataLen = 64;
         if ( i % 2 == 0 )
         {
            vhashA = vhashY;
            vhashB = vhashX;
         }
         else
         {
            vhashA = vhashX;
            vhashB = vhashY;
         }
      }

      // 4-way algos consume/produce interleaved data directly.  The
      // single-stream algos (groestl, luffa, cubehash, shavite, simd)
      // deinterleave into hash0..hash3, run once per lane, then
      // re-interleave.  On the final stage (i == 9) the result is left
      // deinterleaved in hash0..hash3 ready for output.
      switch ( permutation[i] )
      {
        case 0:
           blake512_4way( &ctx.blake, vhashA, dataLen );
           blake512_4way_close( &ctx.blake, vhashB );
           if ( i == 9 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 1:
           bmw512_4way( &ctx.bmw, vhashA, dataLen );
           bmw512_4way_close( &ctx.bmw, vhashB );
           if ( i == 9 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 2:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                     (char*)hash0, dataLen<<3 );
           reinit_groestl( &ctx.groestl );
           update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                     (char*)hash1, dataLen<<3 );
           reinit_groestl( &ctx.groestl );
           update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                     (char*)hash2, dataLen<<3 );
           reinit_groestl( &ctx.groestl );
           update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                     (char*)hash3, dataLen<<3 );
           if ( i != 9 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        case 3:
           skein512_4way( &ctx.skein, vhashA, dataLen );
           skein512_4way_close( &ctx.skein, vhashB );
           if ( i == 9 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 4:
           jh512_4way( &ctx.jh, vhashA, dataLen );
           jh512_4way_close( &ctx.jh, vhashB );
           if ( i == 9 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 5:
           keccak512_4way( &ctx.keccak, vhashA, dataLen );
           keccak512_4way_close( &ctx.keccak, vhashB );
           if ( i == 9 )
              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                       vhashB, dataLen<<3 );
        break;
        case 6:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           // Restore the pristine luffa context by copy before each extra
           // lane (no reinit helper is used here).
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
                                   (const BitSequence *)hash0, dataLen );
           memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
                                   (const BitSequence*)hash1, dataLen );
           memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
                                   (const BitSequence*)hash2, dataLen );
           memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
                                   (const BitSequence*)hash3, dataLen );
           if ( i != 9 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        case 7:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
                                 (const byte*)hash0, dataLen );
           memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
                                 (const byte*)hash1, dataLen );
           memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
                                 (const byte*)hash2, dataLen );
           memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
           cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
                                 (const byte*)hash3, dataLen );
           if ( i != 9 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        case 8:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           sph_shavite512( &ctx.shavite, hash0, dataLen );
           sph_shavite512_close( &ctx.shavite, hash0 );
           memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
           sph_shavite512( &ctx.shavite, hash1, dataLen );
           sph_shavite512_close( &ctx.shavite, hash1 );
           memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
           sph_shavite512( &ctx.shavite, hash2, dataLen );
           sph_shavite512_close( &ctx.shavite, hash2 );
           memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
           sph_shavite512( &ctx.shavite, hash3, dataLen );
           sph_shavite512_close( &ctx.shavite, hash3 );
           if ( i != 9 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        case 9:
           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
                                    vhashA, dataLen<<3 );
           // simd (sd) takes its length in bits, hence dataLen<<3.
           update_final_sd( &ctx.simd, (BitSequence *)hash0,
                            (const BitSequence *)hash0, dataLen<<3 );
           memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
           update_final_sd( &ctx.simd, (BitSequence *)hash1,
                            (const BitSequence *)hash1, dataLen<<3 );
           memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
           update_final_sd( &ctx.simd, (BitSequence *)hash2,
                            (const BitSequence *)hash2, dataLen<<3 );
           memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
           update_final_sd( &ctx.simd, (BitSequence *)hash3,
                            (const BitSequence *)hash3, dataLen<<3 );
           if ( i != 9 )
              mm256_interleave_4x64( vhashB,
                                     hash0, hash1, hash2, hash3, dataLen<<3 );
        break;
        default:
           applog(LOG_ERR,"SWERR: timetravel invalid permutation");
        break;
      }
   }

   // Emit the first 32 bytes (256 bits) of each lane's final 512-bit digest.
   memcpy( output,    hash0, 32 );
   memcpy( output+32, hash1, 32 );
   memcpy( output+64, hash2, 32 );
   memcpy( output+96, hash3, 32 );
}
// Scan nonces four at a time with the 4-way timetravel10 hash.
// Returns the number of lanes whose hash met the target (0 if none found
// before max_nonce or a work restart).  Winning nonces are reported via
// work->nonces[] with the matching work->nfound[] flags set.
int scanhash_timetravel10_4way( int thr_id, struct work *work,
                                uint32_t max_nonce, uint64_t *hashes_done )
{
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;
   // In 4x64 interleaved layout each lane's nonce (32-bit word 19, the
   // high half of 64-bit word 9) sits at uint32 index (9*4 + lane)*2 + 1.
   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
   uint32_t *noncep1 = vdata + 75;
   uint32_t *noncep2 = vdata + 77;
   uint32_t *noncep3 = vdata + 79;
   const uint32_t Htarg = ptarget[7];
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   int i;

   if ( opt_benchmark )
      ptarget[7] = 0x0cff;

   // Byteswap header words 0..18 only; word 19 (the nonce) is written per
   // lane below.  NOTE(review): endiandata[19] is never initialised here
   // and gets interleaved into vdata, but every lane's nonce slot is then
   // overwritten by the be32enc() calls before hashing — confirm.
   for ( int k = 0; k < 19; k++ )
      be32enc( &endiandata[k], pdata[k] );

   const uint32_t timestamp = endiandata[17];
   if ( timestamp != s_ntime )
   {
      // ntime changed: rebuild the function ordering by stepping the
      // identity permutation (ntime - genesis) mod the permutation count.
      const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP )
                    % TT10_FUNC_COUNT_PERMUTATIONS;
      for ( i = 0; i < TT10_FUNC_COUNT; i++ )
         permutation[i] = i;
      for ( i = 0; i < steps; i++ )
         tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT );
      s_ntime = timestamp;
   }

   // Replicate the header into all four lanes (640 bits = 80 bytes each).
   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

   do
   {
      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep0, n );
      be32enc( noncep1, n+1 );
      be32enc( noncep2, n+2 );
      be32enc( noncep3, n+3 );

      timetravel10_4way_hash( hash, vdata );
      pdata[19] = n;

      // Lane results are 8 uint32 apart; cheap high-word pre-check before
      // the full 256-bit fulltest().
      if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
      {
         found[0] = true;
         num_found++;
         nonces[0] = n;
         work_set_target_ratio( work, hash );
      }
      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) )
      {
         found[1] = true;
         num_found++;
         nonces[1] = n+1;
         work_set_target_ratio( work, hash+8 );
      }
      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) )
      {
         found[2] = true;
         num_found++;
         nonces[2] = n+2;
         work_set_target_ratio( work, hash+16 );
      }
      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) )
      {
         found[3] = true;
         num_found++;
         nonces[3] = n+3;
         work_set_target_ratio( work, hash+24 );
      }
      n += 4;
   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );

   *hashes_done = n - first_nonce + 1;
   return num_found;
}
#endif

View File

@@ -0,0 +1,78 @@
#include "timetravel10-gate.h"

// Timetravel10 shares are submitted at 1/256 of the job difficulty,
// further scaled by the user's --diff-factor option.
void tt10_set_target( struct work* work, double job_diff )
{
   double scaled_diff = job_diff / ( 256.0 * opt_diff_factor );
   work_set_target( work, scaled_diff );
}
// Register the timetravel10 (BitCore) algorithm with the algo gate.
// Selects the AVX2/AES 4-way implementation when compiled in, otherwise
// the scalar path.  Always returns true.
bool register_timetravel10_algo( algo_gate_t* gate )
{
#ifdef TIMETRAVEL10_4WAY
  init_tt10_4way_ctx();
  gate->scanhash      = (void*)&scanhash_timetravel10_4way;
  gate->hash          = (void*)&timetravel10_4way_hash;
#else
  init_tt10_ctx();
  gate->scanhash      = (void*)&scanhash_timetravel10;
  gate->hash          = (void*)&timetravel10_hash;
#endif
  gate->set_target    = (void*)&tt10_set_target;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
  gate->get_max64     = (void*)&get_max64_0xffffLL;
  return true;
}
// Exchange the two ints pointed to by a and b.
// Made 'static inline': a plain 'inline' definition at file scope provides
// no external definition in C99/C11, which can break linking at -O0, and
// internal linkage keeps this private to the timetravel10 gate.
static inline void tt10_swap( int *a, int *b )
{
   int tmp = *a;
   *a = *b;
   *b = tmp;
}
// Reverse the int range [pbegin, pend) in place; empty and single-element
// ranges are no-ops.  The element exchange is done inline with a local
// temporary.  Made 'static inline': a plain 'inline' definition at file
// scope provides no external definition in C99/C11, and the non-static
// 'reverse' here would collide with the identical helper defined in
// timetravel-gate.c when both are linked.
static inline void reverse( int *pbegin, int *pend )
{
   while ( ( pbegin != pend ) && ( pbegin != --pend ) )
   {
      int tmp = *pbegin;
      *pbegin = *pend;
      *pend   = tmp;
      pbegin++;
   }
}
// Advance the int range [pbegin, pend) to its next lexicographic
// permutation, wrapping around to the fully sorted (first) permutation
// after the last one.  Direct port of the classic std::next_permutation
// algorithm; the original's bool result is discarded (see the
// "return; // true/false" comments).  Empty and single-element ranges
// are left unchanged.
void tt10_next_permutation( int *pbegin, int *pend )
{
   if ( pbegin == pend )
	return;

   int *i = pbegin;
   ++i;
   if ( i == pend )
	return;

   i = pend;
   --i;

   while (1)
   {
	int *j = i;
	--i;

        // Found the rightmost ascent *i < *j: swap *i with the rightmost
        // element greater than it, then reverse the suffix to minimise it.
	if ( *i < *j )
        {
	   int *k = pend;

	   while ( !(*i < *--k) ) /* do nothing */ ;

	   tt10_swap( i, k );
	   reverse(j, pend);
		return; // true
	}

        // Whole range is descending: this was the last permutation, so
        // wrap back to the sorted (first) one.
	if ( i == pbegin )
        {
	   reverse(pbegin, pend);
		return; // false
	}
        // else?
   }
}

View File

@@ -0,0 +1,39 @@
#ifndef TIMETRAVEL10_GATE_H__
#define TIMETRAVEL10_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define TIMETRAVEL10_4WAY
#endif
// BitCore Genesis Timestamp
#define TT10_FUNC_BASE_TIMESTAMP 1492973331U
#define TT10_FUNC_COUNT 10
#define TT10_FUNC_COUNT_PERMUTATIONS 40320
void tt10_next_permutation( int *pbegin, int *pend );
bool register_timetravel10_algo( algo_gate_t* gate );
#if defined(TIMETRAVEL10_4WAY)
void timetravel10_4way_hash( void *state, const void *input );
int scanhash_timetravel10_4way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done );
void init_tt10_4way_ctx();
#endif
void timetravel10_hash( void *state, const void *input );
int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_tt10_ctx();
#endif

View File

@@ -1,11 +1,8 @@
#include "algo-gate-api.h"
#include "timetravel10-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "avxdefs.h"
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
@@ -22,68 +19,8 @@
#include "algo/groestl/aes_ni/hash-groestl.h"
#endif
// BitCore Genesis Timestamp
#define HASH_FUNC_BASE_TIMESTAMP 1492973331U
#define HASH_FUNC_COUNT 10
#define HASH_FUNC_COUNT_PERMUTATIONS 40320
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[HASH_FUNC_COUNT] = { 0 };
inline void tt10_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt10_swap( pbegin, pend );
pbegin++;
}
}
static void next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt10_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
typedef struct {
sph_blake512_context blake;
@@ -125,7 +62,7 @@ void init_tt10_ctx()
void timetravel10_hash(void *output, const void *input)
{
uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t hash[ 16 * TT10_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t *hashA, *hashB;
tt10_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64;
@@ -136,7 +73,7 @@ void timetravel10_hash(void *output, const void *input)
memcpy( &ctx, &tt10_ctx, sizeof(tt10_ctx) );
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
for ( i = 0; i < TT10_FUNC_COUNT; i++ )
{
if (i == 0)
{
@@ -302,7 +239,7 @@ void timetravel10_hash(void *output, const void *input)
}
}
memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32);
memcpy(output, &hash[16 * (TT10_FUNC_COUNT - 1)], 32);
}
int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
@@ -328,12 +265,12 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime )
{
const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
% HASH_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < HASH_FUNC_COUNT; i++ )
const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP )
% TT10_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < TT10_FUNC_COUNT; i++ )
permutation[i] = i;
for ( i = 0; i < steps; i++ )
next_permutation( permutation, permutation + HASH_FUNC_COUNT );
tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT );
s_ntime = timestamp;
// do midstate precalc for first function
@@ -398,6 +335,7 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
{
work_set_target_ratio( work, hash );
pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce;
return 1;
}
@@ -409,20 +347,3 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
void timetravel10_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel10_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_tt10_ctx();
gate->scanhash = (void*)&scanhash_timetravel10;
gate->hash = (void*)&timetravel10_hash;
gate->set_target = (void*)&timetravel10_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

View File

@@ -179,6 +179,7 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
if ( fulltest( hash64, ptarget ) )
{
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
}
@@ -189,14 +190,3 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
/*
bool register_x11_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_x11_ctx();
gate->scanhash = (void*)&scanhash_x11;
gate->hash = (void*)&x11_hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};
*/

340
algo/x11/x11evo-4way.c Normal file
View File

@@ -0,0 +1,340 @@
#include "cpuminer-config.h"
#include "x11evo-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <string.h>
#include <stdint.h>
#include <compat/portable_endian.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} x11evo_4way_ctx_holder;
static x11evo_4way_ctx_holder x11evo_4way_ctx __attribute__ ((aligned (64)));
void init_x11evo_4way_ctx()
{
blake512_4way_init( &x11evo_4way_ctx.blake );
bmw512_4way_init( &x11evo_4way_ctx.bmw );
init_groestl( &x11evo_4way_ctx.groestl, 64 );
skein512_4way_init( &x11evo_4way_ctx.skein );
jh512_4way_init( &x11evo_4way_ctx.jh );
keccak512_4way_init( &x11evo_4way_ctx.keccak );
init_luffa( &x11evo_4way_ctx.luffa, 512 );
cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11evo_4way_ctx.shavite );
init_sd( &x11evo_4way_ctx.simd, 512 );
init_echo( &x11evo_4way_ctx.echo, 512 );
}
static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
static __thread uint32_t s_ntime = UINT32_MAX;
void x11evo_4way_hash( void *state, const void *input )
{
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
x11evo_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &x11evo_4way_ctx, sizeof(x11evo_4way_ctx) );
if ( s_seq == -1 )
{
uint32_t *data = (uint32_t*) input;
const uint32_t ntime = data[17];
evo_twisted_code( ntime, hashOrder );
}
int i;
int len = strlen( hashOrder );
for ( i = 0; i < len; i++ )
{
char elem = hashOrder[i];
uint8_t idx;
if ( elem >= 'A' )
idx = elem - 'A' + 10;
else
idx = elem - '0';
// int size = 64;
switch ( idx )
{
case 0:
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 2:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 7:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*) hash1, 64 );
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*) hash2, 64 );
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*) hash3, 64 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 8:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 10:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
}
}
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
//static const uint32_t diff1targ = 0x0000ffff;
int scanhash_x11evo_4way( int thr_id, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
swab32_array( endiandata, pdata, 20 );
int ntime = endiandata[17];
if ( ntime != s_ntime || s_seq == -1 )
{
evo_twisted_code( ntime, hashOrder );
s_ntime = ntime;
}
uint32_t hmask = 0xFFFFFFFF;
if ( Htarg > 0 )
{
if ( Htarg <= 0xF )
hmask = 0xFFFFFFF0;
else if ( Htarg <= 0xFF )
hmask = 0xFFFFFF00;
else if ( Htarg <= 0xFFF )
hmask = 0xFFFF000;
else if ( Htarg <= 0xFFFF )
hmask = 0xFFFF000;
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x11evo_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & hmask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & hmask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & hmask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & hmask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

95
algo/x11/x11evo-gate.c Normal file
View File

@@ -0,0 +1,95 @@
#include "x11evo-gate.h"
int s_seq = -1;
static inline int getCurrentAlgoSeq( uint32_t current_time )
{
// change once per day
return (int) (current_time - X11EVO_INITIAL_DATE) / (60 * 60 * 24);
}
// swap_vars doesn't work here
void evo_swap( uint8_t *a, uint8_t *b )
{
uint8_t __tmp = *a;
*a = *b;
*b = __tmp;
}
void initPerm( uint8_t n[], uint8_t count )
{
int i;
for ( i = 0; i<count; i++ )
n[i] = i;
}
int nextPerm( uint8_t n[], uint32_t count )
{
uint32_t tail = 0, i = 0, j = 0;
if (unlikely( count <= 1 ))
return 0;
for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
tail = i;
if ( tail > 0 )
for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
evo_swap( &n[tail - 1], &n[j] );
for ( i = tail, j = count - 1; i<j; i++, j-- )
evo_swap( &n[i], &n[j] );
return ( tail != 0 );
}
void getAlgoString( char *str, uint32_t count )
{
uint8_t algoList[X11EVO_FUNC_COUNT];
char *sptr;
int j;
int k;
initPerm( algoList, X11EVO_FUNC_COUNT );
for ( k = 0; k < count; k++ )
nextPerm( algoList, X11EVO_FUNC_COUNT );
sptr = str;
for ( j = 0; j < X11EVO_FUNC_COUNT; j++ )
{
if ( algoList[j] >= 10 )
sprintf( sptr, "%c", 'A' + (algoList[j] - 10) );
else
sprintf( sptr, "%u", algoList[j] );
sptr++;
}
*sptr = 0;
//applog(LOG_DEBUG, "nextPerm %s", str);
}
void evo_twisted_code( uint32_t ntime, char *permstr )
{
int seq = getCurrentAlgoSeq( ntime );
if ( s_seq != seq )
{
getAlgoString( permstr, seq );
s_seq = seq;
}
}
bool register_x11evo_algo( algo_gate_t* gate )
{
#if defined (X11EVO_4WAY)
init_x11evo_4way_ctx();
gate->scanhash = (void*)&scanhash_x11evo_4way;
gate->hash = (void*)&x11evo_4way_hash;
#else
init_x11evo_ctx();
gate->scanhash = (void*)&scanhash_x11evo;
gate->hash = (void*)&x11evo_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};

39
algo/x11/x11evo-gate.h Normal file
View File

@@ -0,0 +1,39 @@
#ifndef X11EVO_GATE_H__
#define X11EVO_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X11EVO_4WAY
#endif
#define X11EVO_INITIAL_DATE 1462060800
#define X11EVO_FUNC_COUNT 11
extern int s_seq;
bool register_x11evo_algo( algo_gate_t* gate );
#if defined(X11EVO_4WAY)
void x11evo_4way_hash( void *state, const void *input );
int scanhash_x11evo_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11evo_4way_ctx();
#endif
void x11evo_hash( void *state, const void *input );
int scanhash_x11evo( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11evo_ctx();
void evo_twisted_code( uint32_t ntime, char *permstr );
#endif

View File

@@ -1,5 +1,5 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#include "x11evo-gate.h"
#include <string.h>
#include <stdint.h>
@@ -26,9 +26,6 @@
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#define INITIAL_DATE 1462060800
#define HASH_FUNC_COUNT 11
typedef struct {
#ifdef NO_AES_NI
sph_groestl512_context groestl;
@@ -70,94 +67,10 @@ void init_x11evo_ctx()
sph_shavite512_init( &x11evo_ctx.shavite );
}
/*
uint32_t getCurrentAlgoSeq(uint32_t current_time, uint32_t base_time)
{
return (current_time - base_time) / (60 * 60 * 24);
}
*/
static inline int getCurrentAlgoSeq( uint32_t current_time )
{
// change once per day
return (int) (current_time - INITIAL_DATE) / (60 * 60 * 24);
}
// swap_vars doesn't work here
void evo_swap( uint8_t *a, uint8_t *b )
{
uint8_t __tmp = *a;
*a = *b;
*b = __tmp;
}
void initPerm( uint8_t n[], uint8_t count )
{
int i;
for ( i = 0; i<count; i++ )
n[i] = i;
}
int nextPerm( uint8_t n[], uint32_t count )
{
uint32_t tail = 0, i = 0, j = 0;
if (unlikely( count <= 1 ))
return 0;
for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
tail = i;
if ( tail > 0 )
for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
evo_swap( &n[tail - 1], &n[j] );
for ( i = tail, j = count - 1; i<j; i++, j-- )
evo_swap( &n[i], &n[j] );
return ( tail != 0 );
}
void getAlgoString( char *str, uint32_t count )
{
uint8_t algoList[HASH_FUNC_COUNT];
char *sptr;
int j;
int k;
initPerm( algoList, HASH_FUNC_COUNT );
for ( k = 0; k < count; k++ )
nextPerm( algoList, HASH_FUNC_COUNT );
sptr = str;
for ( j = 0; j < HASH_FUNC_COUNT; j++ )
{
if ( algoList[j] >= 10 )
sprintf( sptr, "%c", 'A' + (algoList[j] - 10) );
else
sprintf( sptr, "%u", algoList[j] );
sptr++;
}
*sptr = 0;
//applog(LOG_DEBUG, "nextPerm %s", str);
}
static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
static __thread uint32_t s_ntime = UINT32_MAX;
static int s_seq = -1;
static void evo_twisted_code(uint32_t ntime, char *permstr)
{
int seq = getCurrentAlgoSeq(ntime);
if (s_seq != seq)
{
getAlgoString(permstr, seq);
s_seq = seq;
}
}
static inline void x11evo_hash( void *state, const void *input )
void x11evo_hash( void *state, const void *input )
{
uint32_t hash[16] __attribute__ ((aligned (64)));
x11evo_ctx_holder ctx __attribute__ ((aligned (64)));
@@ -242,10 +155,10 @@ static inline void x11evo_hash( void *state, const void *input )
memcpy( state, hash, 32 );
}
static const uint32_t diff1targ = 0x0000ffff;
//static const uint32_t diff1targ = 0x0000ffff;
int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
unsigned long *hashes_done )
uint64_t *hashes_done )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
@@ -274,19 +187,20 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
else if ( Htarg <= 0xFFF )
hmask = 0xFFFF000;
else if ( Htarg <= 0xFFFF )
hmask = 0xFFFF000;
hmask = 0xFFFF000;
}
do
{
pdata[19] = ++n;
be32enc( &endiandata[19], n );
x11evo_hash( hash64, &endiandata );
x11evo_hash( hash64, endiandata );
if ( ( hash64[7] & hmask ) == 0 )
{
if ( fulltest( hash64, ptarget ) )
{
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
}
@@ -296,13 +210,3 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
bool register_x11evo_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_x11evo;
gate->hash = (void*)&x11evo_hash;
init_x11evo_ctx();
return true;
};

View File

@@ -161,6 +161,7 @@ int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio( work, hash );
return 1;
}
nonce++;

View File

@@ -116,6 +116,7 @@ int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce;
return 1;
}

View File

@@ -70,6 +70,7 @@ int scanhash_skunk( int thr_id, struct work *work, uint32_t max_nonce,
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio( work, hash );
return 1;
}
nonce++;

View File

@@ -234,6 +234,7 @@ int scanhash_x13(int thr_id, struct work *work, uint32_t max_nonce,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -17,7 +17,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/sm3/sph_sm3.h"
#include "algo/sm3/sm3-hash-4way.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
@@ -33,7 +33,7 @@ typedef struct {
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sm3_ctx_t sm3;
sm3_4way_ctx_t sm3;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
} x13sm3_4way_ctx_holder;
@@ -54,7 +54,7 @@ void init_x13sm3_4way_ctx()
sph_shavite512_init( &x13sm3_4way_ctx.shavite );
init_sd( &x13sm3_4way_ctx.simd, 512 );
init_echo( &x13sm3_4way_ctx.echo, 512 );
sm3_init( &x13sm3_4way_ctx.sm3 );
sm3_4way_init( &x13sm3_4way_ctx.sm3 );
sph_hamsi512_init( &x13sm3_4way_ctx.hamsi );
sph_fugue512_init( &x13sm3_4way_ctx.fugue );
};
@@ -85,14 +85,11 @@ void x13sm3_4way_hash( void *state, const void *input )
// Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
sizeof(hashState_groestl) );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
sizeof(hashState_groestl) );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
sizeof(hashState_groestl) );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
@@ -178,6 +175,8 @@ void x13sm3_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 );
// SM3
uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
memset( sm3_vhash, 0, sizeof sm3_vhash );
uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
memset( sm3_hash0, 0, sizeof sm3_hash0 );
uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
@@ -187,17 +186,11 @@ void x13sm3_4way_hash( void *state, const void *input )
uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
memset( sm3_hash3, 0, sizeof sm3_hash3 );
sph_sm3( &ctx.sm3, hash0, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash0 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash1, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash1 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash2, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash2 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash3, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash3 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sm3_4way( &ctx.sm3, vhash, 64 );
sm3_4way_close( &ctx.sm3, sm3_vhash );
mm_deinterleave_4x32( sm3_hash0, sm3_hash1, sm3_hash2, sm3_hash3,
sm3_vhash, 1024 );
// Hamsi
sph_hamsi512( &ctx.hamsi, sm3_hash0, 64 );

View File

@@ -224,6 +224,7 @@ int scanhash_x13sm3( int thr_id, struct work *work,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -65,6 +65,7 @@ int scanhash_axiom(int thr_id, struct work *work,
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
work_set_target_ratio( work, hash64 );
return true;
}
n++;

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#if defined(__AVX2__) && defined(__AES__)
#define POLYTIMOS_4WAY
#endif

View File

@@ -233,6 +233,7 @@ int scanhash_x14(int thr_id, struct work *work,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -47,7 +47,6 @@ void init_x15_4way_ctx()
{
blake512_4way_init( &x15_4way_ctx.blake );
bmw512_4way_init( &x15_4way_ctx.bmw );
sph_bmw512_init( &x15_4way_ctx.bmw );
init_groestl( &x15_4way_ctx.groestl, 64 );
skein512_4way_init( &x15_4way_ctx.skein );
jh512_4way_init( &x15_4way_ctx.jh );

View File

@@ -245,6 +245,7 @@ int scanhash_x15(int thr_id, struct work *work,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -266,6 +266,7 @@ int scanhash_x17(int thr_id, struct work *work,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}
@@ -281,13 +282,3 @@ int scanhash_x17(int thr_id, struct work *work,
pdata[19] = n;
return 0;
}
/*
bool register_x17_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_x17_ctx();
gate->scanhash = (void*)&scanhash_x17;
gate->hash = (void*)&x17hash;
return true;
};
*/

View File

@@ -35,10 +35,18 @@
#define mm_one_64 _mm_set1_epi64x( 1ULL )
#define mm_one_32 _mm_set1_epi32( 1UL )
#define mm_one_16 _mm_set1_epi16( 1U )
#define mm_one_8 _mm_set1_epi8( 1U )
// Constant minus 1
#define mm_neg1 _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
// Lane index, useful for byte rotate using shuffle
#define mm_lanex_64 _mm_set_epi64( 1ULL, 0ULL );
#define mm_lanex_32 _mm_set_epi32( 3UL, 2UL, 1UL, 0UL );
#define mm_lanex_16 _mm_set_epi16( 7U, 6U, 5U, 4U, 3U, 2U, 1U, 0U );
#define mm_lanex_8 _mm_set_epi8( 15U, 14U, 13U, 12U, 11U, 10U , 9U, 8U, \
7U, 6U, 5U, 4U, 3U, 2U, 1U, 0U );
//
// Basic operations without equivalent SIMD intrinsic
@@ -327,6 +335,16 @@ inline __m128i mm_byteswap_16( __m128i x )
// Constant minus 1
#define mm256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
// Lane index, useful for rotate using permutevar
#define mm256_lane_64 _mm_set_epi64x( 3ULL, 2ULL, 1ULL, 0ULL );
#define mm256_lane_32 _mm_set_epi32( 7UL, 6UL, 5UL, 4UL, 3UL, 2UL, 1UL, 0UL );
#define mm256_lane_16 _mm_set_epi16( 15U, 14U, 13U, 12U, 11U, 10U , 9U, 8U, \
7U, 6U, 5U, 4U, 3U, 2U, 1U, 0U );
#define mm256_lane_8 _mm_set_epi8( 31U, 30U, 29U, 28U, 27U, 26U, 25U, 24U, \
23U, 22U, 21U, 20U, 19U, 18U, 17U, 16U, \
15U, 14U, 13U, 12U, 11U, 10U , 9U, 8U, \
7U, 6U, 5U, 4U, 3U, 2U, 1U, 0U );
//
// Basic operations without SIMD equivalent
@@ -1109,7 +1127,7 @@ inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1,
}
// Can't do it in place
inline void mm256_reinterleave_4x64x( void *dst, void *src, int bit_len )
inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len )
{
__m256i* d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
@@ -1146,7 +1164,8 @@ inline void mm256_reinterleave_4x64x( void *dst, void *src, int bit_len )
// likely of no use.
// convert 4x32 byte (128 bit) vectors to 4x64 (256 bit) vectors for AVX2
// bit_len must be multiple of 64
inline void mm256_reinterleave_4x64( uint64_t *dst, uint32_t *src,
// broken
inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src,
int bit_len )
{
uint32_t *d = (uint32_t*)dst;
@@ -1200,6 +1219,7 @@ inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len )
// bit_len == 1024
}
// not used
inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len )
{
uint32_t *d = (uint32_t*)dst;

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.9.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.10.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.7.9'
PACKAGE_STRING='cpuminer-opt 3.7.9'
PACKAGE_VERSION='3.7.10'
PACKAGE_STRING='cpuminer-opt 3.7.10'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.7.9 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.7.10 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.7.9:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.7.10:";;
esac
cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.7.9
cpuminer-opt configure 3.7.10
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.7.9, which was
It was created by cpuminer-opt $as_me 3.7.10, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.7.9'
VERSION='3.7.10'
cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.7.9, which was
This file was extended by cpuminer-opt $as_me 3.7.10, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.7.9
cpuminer-opt config.status 3.7.10
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.7.9])
AC_INIT([cpuminer-opt], [3.7.10])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -346,6 +346,7 @@ static bool work_decode( const json_t *val, struct work *work )
work->targetdiff = target_to_diff( work->target );
// for api stats, on longpoll pools
stratum_diff = work->targetdiff;
work->sharediff = 0;
algo_gate.display_extra_data( work, &net_blocks );
return true;
}
@@ -755,6 +756,7 @@ static int share_result( int result, struct work *work, const char *reason )
uint32_t total_submits;
float rate;
char rate_s[8] = {0};
double sharediff = work ? work->sharediff : stratum.sharediff;
int i;
pthread_mutex_lock(&stats_lock);
@@ -814,6 +816,8 @@ static int share_result( int result, struct work *work, const char *reason )
sprintf(hr, "%.2f", hashrate );
}
if ( sharediff == 0 )
{
#if ((defined(_WIN64) || defined(__WINDOWS__)))
applog( LOG_NOTICE, "%s %lu/%lu (%s%%), %s %sH, %s %sH/s",
sres, ( result ? accepted_count : rejected_count ),
@@ -824,6 +828,20 @@ static int share_result( int result, struct work *work, const char *reason )
total_submits, rate_s, hc, hc_units, hr, hr_units,
(uint32_t)cpu_temp(0) );
#endif
}
else
{
#if ((defined(_WIN64) || defined(__WINDOWS__)))
applog( LOG_NOTICE, "%s %lu/%lu (%s%%), diff %.3g, %s %sH/s",
sres, ( result ? accepted_count : rejected_count ),
total_submits, rate_s, sharediff, hr, hr_units );
#else
applog( LOG_NOTICE, "%s %lu/%lu (%s%%), diff %.3g, %s %sH/s, %dC",
sres, ( result ? accepted_count : rejected_count ),
total_submits, rate_s, sharediff, hr, hr_units,
(uint32_t)cpu_temp(0) );
#endif
}
if (reason)
{
@@ -1026,6 +1044,7 @@ static bool submit_upstream_work( CURL *curl, struct work *work )
}
if ( have_stratum )
{
stratum.sharediff = work->sharediff;
algo_gate.build_stratum_request( req, work, &stratum );
if ( unlikely( !stratum_send_line( &stratum, req ) ) )
{
@@ -1569,7 +1588,7 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *end_nonce_ptr, bool clean_job )
{
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr )
|| ( work->job_id != g_work->job_id ) ) )
@@ -2984,25 +3003,22 @@ bool check_cpu_capability ()
}
if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) )
{
if ( sw_has_4way && algo_has_4way )
printf( "A CPU with AES and AVX2 is required to use 4way!\n" );
else if ( algo_has_avx2 )
printf( "A CPU with AES and AVX2 is required!\n" );
printf( "The SW build requires a CPU with AES and AVX2!\n" );
return false;
}
if ( sw_has_avx && !( cpu_has_avx && cpu_has_aes ) )
if ( sw_has_avx && !cpu_has_avx )
{
printf( "A CPU with AES and AVX2 is required!\n" );
printf( "The SW build requires a CPU with AVX!\n" );
return false;
}
if ( sw_has_aes && algo_has_aes && !cpu_has_aes )
if ( sw_has_aes && !cpu_has_aes )
{
printf( "A CPU with AES is required!\n" );
printf( "The SW build requires a CPU with AES!\n" );
return false;
}
if ( sw_has_sha && algo_has_sha && !cpu_has_sha )
if ( sw_has_sha && !cpu_has_sha )
{
printf( "A CPU with SHA is required!\n" );
printf( "The SW build requires a CPU with SHA!\n" );
return false;
}
@@ -3187,6 +3203,9 @@ int main(int argc, char *argv[])
}
#endif
if ( num_cpus != opt_n_threads )
applog( LOG_INFO,"%u CPU cores available, %u miner threads selected.",
num_cpus, opt_n_threads );
if ( opt_affinity != -1 )
{
if ( num_cpus > 64 )

View File

@@ -736,7 +736,7 @@ Options:\n\
whirlpool\n\
whirlpoolx\n\
x11 Dash\n\
x11evo Revolvercoin\n\
x11evo Revolvercoin (XRE)\n\
x11gost sib (SibCoin)\n\
x13 X13\n\
x13sm3 hsr (Hshare)\n\