Jay D Dee
2018-01-16 15:11:44 -05:00
parent bee78eac76
commit a90d75b8f5
77 changed files with 3408 additions and 1214 deletions

View File

@@ -46,8 +46,10 @@ cpuminer_SOURCES = \
   algo/blake/sph_blake2b.c \
   algo/blake/blake2b.c \
   algo/blake/blake2s.c \
+  algo/blake/blakecoin-gate.c \
   algo/blake/mod_blakecoin.c \
   algo/blake/blakecoin.c \
+  algo/blake/blakecoin-4way.c \
   algo/blake/decred-gate.c \
   algo/blake/decred.c \
   algo/blake/decred-4way.c \
@@ -99,13 +101,17 @@ cpuminer_SOURCES = \
   algo/luffa/sse2/luffa_for_sse2.c \
   algo/lyra2/lyra2.c \
   algo/lyra2/sponge.c \
+  algo/lyra2/lyra2rev2-gate.c \
   algo/lyra2/lyra2rev2.c \
+  algo/lyra2/lyra2rev2-4way.c \
   algo/lyra2/lyra2re.c \
   algo/lyra2/lyra2z-gate.c \
   algo/lyra2/lyra2z.c \
   algo/lyra2/lyra2z-4way.c \
   algo/lyra2/lyra2z330.c \
+  algo/lyra2/lyra2h-gate.c \
   algo/lyra2/lyra2h.c \
+  algo/lyra2/lyra2h-4way.c \
   algo/m7m.c \
   algo/neoscrypt/neoscrypt.c \
   algo/nist5/nist5-gate.c \
@@ -113,7 +119,9 @@ cpuminer_SOURCES = \
   algo/nist5/nist5.c \
   algo/nist5/zr5.c \
   algo/pluck.c \
+  algo/quark/quark-gate.c \
   algo/quark/quark.c \
+  algo/quark/quark-4way.c \
   algo/qubit/qubit.c \
   algo/qubit/deep.c \
   algo/ripemd/sph_ripemd.c \
@@ -140,9 +148,8 @@ cpuminer_SOURCES = \
   algo/skein/skein2-4way.c \
   algo/skein/skein2-gate.c \
   algo/sm3/sm3.c \
+  algo/sm3/sm3-hash-4way.c \
   algo/tiger/sph_tiger.c \
-  algo/timetravel.c \
-  algo/timetravel10.c \
   algo/whirlpool/sph_whirlpool.c \
   algo/whirlpool/whirlpool-hash-4way.c \
   algo/whirlpool/whirlpool-gate.c \
@@ -161,8 +168,16 @@ cpuminer_SOURCES = \
   algo/x11/tribus-gate.c \
   algo/x11/tribus.c \
   algo/x11/tribus-4way.c \
+  algo/x11/timetravel-gate.c \
+  algo/x11/timetravel.c \
+  algo/x11/timetravel-4way.c \
+  algo/x11/timetravel10-gate.c \
+  algo/x11/timetravel10.c \
+  algo/x11/timetravel10-4way.c \
   algo/x11/fresh.c \
   algo/x11/x11evo.c \
+  algo/x11/x11evo-4way.c \
+  algo/x11/x11evo-gate.c \
   algo/x13/x13-gate.c \
   algo/x13/x13.c \
   algo/x13/x13-4way.c \

View File

@@ -165,6 +165,13 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
 Change Log
 ----------

+v3.7.10
+
+4way optimizations for lyra2rev2, lyra2h, quark, timetravel8, timetravel10,
+x11evo, blakecoin.
+Faster x13sm3 (hsr).
+Added share difficulty to accepted message.
+
 v3.7.9

 Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.

View File

@@ -1,31 +1,22 @@
#include "blake-gate.h" #include "blake-gate.h"
#include "sph_blake.h"
#if defined (__AVX__)
#include "blake-hash-4way.h" #include "blake-hash-4way.h"
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
#include <memory.h> #include <memory.h>
#if defined (BLAKE_4WAY) blake256r14_4way_context blake_ctx;
void blakehash_4way(void *state, const void *input) void blakehash_4way(void *state, const void *input)
{ {
uint32_t vhash[4*4] __attribute__ ((aligned (64))); uint32_t vhash[8*4] __attribute__ ((aligned (64)));
uint32_t hash0[4] __attribute__ ((aligned (32))); blake256r14_4way_context ctx;
uint32_t hash1[4] __attribute__ ((aligned (32))); memcpy( &ctx, &blake_ctx, sizeof ctx );
uint32_t hash2[4] __attribute__ ((aligned (32))); blake256r14_4way( &ctx, input + (64<<2), 16 );
uint32_t hash3[4] __attribute__ ((aligned (32))); blake256r14_4way_close( &ctx, vhash );
blake256_4way_context ctx; mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 16 );
blake256_4way_close( &ctx, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash1, 32 );
memcpy( state+96, hash1, 32 );
} }
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce, int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -36,21 +27,24 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t first_nonce = pdata[19];
-// uint32_t HTarget = ptarget[7];
+   uint32_t HTarget = ptarget[7];
    uint32_t _ALIGN(32) edata[20];
    uint32_t n = first_nonce;
    uint32_t *nonces = work->nonces;
    bool *found = work->nfound;
    int num_found = 0;

-// if (opt_benchmark)
-//    HTarget = 0x7f;
+   if (opt_benchmark)
+      HTarget = 0x7f;

    // we need big endian data...
    swab32_array( edata, pdata, 20 );

    mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

+   blake256r14_4way_init( &blake_ctx );
+   blake256r14_4way( &blake_ctx, vdata, 64 );
+
    uint32_t *noncep = vdata + 76;   // 19*4

    do {
       found[0] = found[1] = found[2] = found[3] = false;
@@ -61,45 +55,36 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
       blakehash_4way( hash, vdata );

-      if ( hash[7] == 0 )
-      {
-         if ( fulltest( hash, ptarget ) )
-         {
+      if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
+      {
          found[0] = true;
          num_found++;
          nonces[0] = n;
          pdata[19] = n;
+         work_set_target_ratio( work, hash );
       }
-      }
-      if ( (hash+8)[7] == 0 )
-      {
-         if ( fulltest( hash+8, ptarget ) )
-         {
+      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
+      {
          found[1] = true;
          num_found++;
          nonces[1] = n+1;
+         work_set_target_ratio( work, hash+8 );
       }
-      }
-      if ( (hash+16)[7] == 0 )
-      {
-         if ( fulltest( hash+8, ptarget ) )
-         {
+      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
+      {
          found[2] = true;
          num_found++;
          nonces[2] = n+2;
+         work_set_target_ratio( work, hash+16 );
       }
-      }
-      if ( (hash+24)[7] == 0 )
-      {
-         if ( fulltest( hash+8, ptarget ) )
-         {
+      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
+      {
          found[3] = true;
          num_found++;
          nonces[3] = n+3;
+         work_set_target_ratio( work, hash+24 );
       }
-      }
       n += 4;
-      *hashes_done = n - first_nonce + 1;
   } while ( (num_found == 0) && (n < max_nonce)
             && !work_restart[thr_id].restart );

View File

@@ -491,14 +491,9 @@ do { \
    (state)->T1 = T1; \
 } while (0)

-//#define BLAKE32_ROUNDS 8
-#ifndef BLAKE32_ROUNDS
-#define BLAKE32_ROUNDS 14
-#endif
-
 #if SPH_COMPACT_BLAKE_32

-#define COMPRESS32_4WAY do { \
+#define COMPRESS32_4WAY( rounds ) do { \
    __m128i M[16]; \
    __m128i V0, V1, V2, V3, V4, V5, V6, V7; \
    __m128i V8, V9, VA, VB, VC, VD, VE, VF; \
@@ -539,7 +534,7 @@ do { \
    M[0xD] = mm_byteswap_32( *(buf + 13) ); \
    M[0xE] = mm_byteswap_32( *(buf + 14) ); \
    M[0xF] = mm_byteswap_32( *(buf + 15) ); \
-   for (r = 0; r < BLAKE32_ROUNDS; r ++) \
+   for (r = 0; r < rounds; r ++) \
       ROUND_S_4WAY(r); \
    H0 = _mm_xor_si128( _mm_xor_si128( \
         _mm_xor_si128( S0, V0 ), V8 ), H0 ); \
@@ -563,7 +558,8 @@ do { \
 // current impl

-#define COMPRESS32_4WAY do { \
+#define COMPRESS32_4WAY( rounds ) \
+do { \
    __m128i M0, M1, M2, M3, M4, M5, M6, M7; \
    __m128i M8, M9, MA, MB, MC, MD, ME, MF; \
    __m128i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -580,14 +576,10 @@ do { \
    V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
    VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
    VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
-   VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
-                       _mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
-   VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
-                       _mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
-   VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
-                       _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
-   VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
-                       _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
+   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
+   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
+   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
+   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
    M0 = mm_byteswap_32( * buf ); \
    M1 = mm_byteswap_32( *(buf+1) ); \
    M2 = mm_byteswap_32( *(buf+2) ); \
@@ -612,7 +604,8 @@ do { \
    ROUND_S_4WAY(5); \
    ROUND_S_4WAY(6); \
    ROUND_S_4WAY(7); \
-   if (BLAKE32_ROUNDS == 14) { \
+   if (rounds == 14) \
+   { \
       ROUND_S_4WAY(8); \
       ROUND_S_4WAY(9); \
       ROUND_S_4WAY(0); \
@@ -620,22 +613,14 @@ do { \
       ROUND_S_4WAY(2); \
       ROUND_S_4WAY(3); \
    } \
-   H0 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
-   H1 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
-   H2 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
-   H3 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
-   H4 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
-   H5 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
-   H6 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
-   H7 = _mm_xor_si128( _mm_xor_si128( \
-           _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
+   H0 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
+   H1 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
+   H2 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
+   H3 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
+   H4 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
+   H5 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
+   H6 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
+   H7 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
 } while (0)

 #endif
@@ -832,7 +817,7 @@ static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 };
 static void
 blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
-                   const sph_u32 *salt )
+                   const sph_u32 *salt, int rounds )
 {
    int i;
    for ( i = 0; i < 8; i++ )
@@ -841,6 +826,7 @@ blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
       sc->S[i] = _mm_set1_epi32( salt[i] );
    sc->T0 = sc->T1 = 0;
    sc->ptr = 0;
+   sc->rounds = rounds;
 }

 static void
@@ -878,7 +864,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
    {
       if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
          T1 = SPH_T32(T1 + 1);
-      COMPRESS32_4WAY;
+      COMPRESS32_4WAY( sc->rounds );
       ptr = 0;
    }
 }
@@ -1079,10 +1065,11 @@ blake64_4way_close( blake_4way_big_context *sc,
 #endif

+// default 14 rounds, backward compatibility
 void
 blake256_4way_init(void *cc)
 {
-   blake32_4way_init( cc, IV256, salt_zero_small );
+   blake32_4way_init( cc, IV256, salt_zero_small, 14 );
 }

 void
@@ -1094,13 +1081,43 @@ blake256_4way(void *cc, const void *data, size_t len)
 void
 blake256_4way_close(void *cc, void *dst)
 {
-   blake256_4way_addbits_and_close(cc, 0, 0, dst);
+   blake32_4way_close(cc, 0, 0, dst, 8);
+}
+
+// 14 rounds blake, decred
+void blake256r14_4way_init(void *cc)
+{
+   blake32_4way_init( cc, IV256, salt_zero_small, 14 );
 }

 void
-blake256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+blake256r14_4way(void *cc, const void *data, size_t len)
 {
-   blake32_4way_close(cc, ub, n, dst, 8);
+   blake32_4way(cc, data, len);
+}
+
+void
+blake256r14_4way_close(void *cc, void *dst)
+{
+   blake32_4way_close(cc, 0, 0, dst, 8);
+}
+
+// 8 rounds blakecoin, vanilla
+void blake256r8_4way_init(void *cc)
+{
+   blake32_4way_init( cc, IV256, salt_zero_small, 8 );
+}
+
+void
+blake256r8_4way(void *cc, const void *data, size_t len)
+{
+   blake32_4way(cc, data, len);
+}
+
+void
+blake256r8_4way_close(void *cc, void *dst)
+{
+   blake32_4way_close(cc, 0, 0, dst, 8);
 }

 #if defined (__AVX2__)

View File

@@ -35,7 +35,9 @@
 */

 #ifndef __BLAKE_HASH_4WAY__
-#define __BLAKE_HASH_4WAY___
+#define __BLAKE_HASH_4WAY__
+
+#ifdef __AVX__

 #ifdef __cplusplus
 extern "C"{
@@ -45,38 +47,36 @@ extern "C"{
 #include "algo/sha/sph_types.h"
 #include "avxdefs.h"

-/**
- * Output size (in bits) for BLAKE-256.
- */
 #define SPH_SIZE_blake256   256

-#if SPH_64
-/**
- * Output size (in bits) for BLAKE-512.
- */
 #define SPH_SIZE_blake512   512
-#endif

-#ifdef __AVX__
 typedef struct {
    __m128i buf[16] __attribute__ ((aligned (64)));
    __m128i H[8];
    __m128i S[4];
    size_t ptr;
    sph_u32 T0, T1;
+   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
 } blake_4way_small_context;

+// Default 14 rounds
 typedef blake_4way_small_context blake256_4way_context;
 void blake256_4way_init(void *cc);
 void blake256_4way(void *cc, const void *data, size_t len);
 void blake256_4way_close(void *cc, void *dst);
-void blake256_4way_addbits_and_close(
-   void *cc, unsigned ub, unsigned n, void *dst);
-#endif
+
+// 14 rounds, blake, decred
+typedef blake_4way_small_context blake256r14_4way_context;
+void blake256r14_4way_init(void *cc);
+void blake256r14_4way(void *cc, const void *data, size_t len);
+void blake256r14_4way_close(void *cc, void *dst);
+
+// 8 rounds, blakecoin, vanilla
+typedef blake_4way_small_context blake256r8_4way_context;
+void blake256r8_4way_init(void *cc);
+void blake256r8_4way(void *cc, const void *data, size_t len);
+void blake256r8_4way_close(void *cc, void *dst);

 #ifdef __AVX2__
@@ -103,3 +103,5 @@ void blake512_4way_addbits_and_close(
 #endif

 #endif
+
+#endif
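
Note: a sketch of driving the new fixed-round contexts declared above. It
assumes the mm_interleave_4x32 / mm_deinterleave_4x32 helpers behave as they
are used in blake-4way.c (last argument is a size in bits); len passed to the
update function is bytes per lane:

#include <stdint.h>
#include "blake-hash-4way.h"   // declarations shown above; avxdefs.h provides the interleave helpers

// Hash four independent 80-byte inputs at once with the 8-round variant.
void blake256r8_hash4( uint32_t *out0, uint32_t *out1, uint32_t *out2,
                       uint32_t *out3, const uint32_t in[4][20] )
{
   uint32_t vin [20*4] __attribute__ ((aligned (64)));   // interleaved input
   uint32_t vout[ 8*4] __attribute__ ((aligned (64)));   // interleaved output
   blake256r8_4way_context ctx;

   mm_interleave_4x32( vin, in[0], in[1], in[2], in[3], 640 );  // 640 bits/lane
   blake256r8_4way_init( &ctx );
   blake256r8_4way( &ctx, vin, 80 );
   blake256r8_4way_close( &ctx, vout );
   mm_deinterleave_4x32( out0, out1, out2, out3, vout, 256 );   // 256-bit digests
}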

algo/blake/blakecoin-4way.c (new file, 106 lines)
View File

@@ -0,0 +1,106 @@
#include "blakecoin-gate.h"
#if defined (__AVX__)
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
blake256r8_4way_context blakecoin_ctx;
void blakecoin_4way_hash(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r8_4way_context ctx;
memcpy( &ctx, &blakecoin_ctx, sizeof ctx );
blake256r8_4way( &ctx, input + (64<<2), 16 );
blake256r8_4way_close( &ctx, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
if (opt_benchmark)
HTarget = 0x7f;
// we need big endian data...
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256r8_4way_init( &blakecoin_ctx );
blake256r8_4way( &blakecoin_ctx, vdata, 64 );
uint32_t *noncep = vdata + 76; // 19*4
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
blakecoin_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
// workaround to prevent a flood of hash reports when the nonce range is
// exhausted and the thread is spinning waiting for new work
if ( ( n >= max_nonce ) && ( *hashes_done < 10 ) )
{
*hashes_done = 0;
sleep(1);
}
return num_found;
}
#endif
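
Note: the `hash[7] <= HTarget` tests above are a cheap 32-bit prefilter. The
256-bit hash and target are stored little-endian as eight uint32s, so word 7
is the most significant; if it already exceeds the target's word 7 the share
cannot qualify and the full 256-bit fulltest() comparison is skipped. A
minimal statement of that invariant (hypothetical helper name):

#include <stdint.h>

// Returns 0 only when hash > target is certain from the top word alone;
// returns 1 when the hash might qualify and fulltest() must decide.
static inline int maybe_share( const uint32_t hash[8], const uint32_t target[8] )
{
   return hash[7] <= target[7];
}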

View File

@@ -0,0 +1,71 @@
#include "blakecoin-gate.h"
#include <memory.h>
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blakecoin_get_max64 ()
{
return 0x7ffffLL;
// return 0x3fffffLL;
}
// Blakecoin 4-way hashes so fast that it runs out of nonces.
// This is an attempt to solve that, but the result may be to rehash
// old nonces until new work is received.
void bc4w_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *end_nonce_ptr, bool clean_job )
{
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
//
// if ( have_stratum && ( *nonceptr >= *end_nonce_ptr ) )
// algo_gate.stratum_gen_work( &stratum, g_work );
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
|| ( *nonceptr >= *end_nonce_ptr )
|| ( work->job_id != g_work->job_id ) && clean_job )
/*
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr )
|| ( work->job_id != g_work->job_id ) ) )
*/
{
work_free( work );
work_copy( work, g_work );
*nonceptr = 0xffffffffU / opt_n_threads * thr_id;
if ( opt_randomize )
*nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
*end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
// try incrementing the xnonce to change the data
// for ( int i = 0; i < work->xnonce2_size && !( ++work->xnonce2[i] ); i++ );
}
else
++(*nonceptr);
}
// vanilla uses default gen merkle root, otherwise identical to blakecoin
bool register_vanilla_algo( algo_gate_t* gate )
{
#if defined(BLAKECOIN_4WAY)
// four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_blakecoin_4way;
gate->hash = (void*)&blakecoin_4way_hash;
// gate->get_new_work = (void*)&bc4w_get_new_work;
// blakecoin_4way_init( &blake_4way_init_ctx );
#else
gate->scanhash = (void*)&scanhash_blakecoin;
gate->hash = (void*)&blakecoinhash;
// blakecoin_init( &blake_init_ctx );
#endif
gate->optimizations = AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&blakecoin_get_max64;
return true;
}
bool register_blakecoin_algo( algo_gate_t* gate )
{
register_vanilla_algo( gate );
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return true;
}
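
Note: bc4w_get_new_work above splits the 32-bit nonce space evenly across
threads, with a 0x20 safety margin at the top of each range. A runnable worked
example of that arithmetic (nthreads stands in for opt_n_threads):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const uint32_t nthreads = 4;
   for ( uint32_t t = 0; t < nthreads; t++ )
   {
      uint32_t start = 0xffffffffU / nthreads * t;
      uint32_t end   = 0xffffffffU / nthreads * (t+1) - 0x20;
      printf( "thread %u: %08x .. %08x\n", t, start, end );
   }
   // prints 0x00000000..0x3fffffdf, 0x3fffffff..0x7fffffde,
   //        0x7ffffffe..0xbfffffdd, 0xbffffffd..0xffffffdc
   return 0;
}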

View File

@@ -0,0 +1,21 @@
#ifndef __BLAKECOIN_GATE_H__
#define __BLAKECOIN_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX__)
#define BLAKECOIN_4WAY
#endif
#if defined (BLAKECOIN_4WAY)
void blakecoin_4way_hash(void *state, const void *input);
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void blakecoinhash( void *state, const void *input );
int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h" #include "blakecoin-gate.h"
#define BLAKE32_ROUNDS 8 #define BLAKE32_ROUNDS 8
#include "sph_blake.h" #include "sph_blake.h"
@@ -98,7 +98,7 @@ void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
    SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
 }
 */

+/*
 // changed to get_max64_0x3fffffLL in cpuminer-multi-decred
 int64_t blakecoin_get_max64 ()
 {
@@ -121,4 +121,4 @@ bool register_blakecoin_algo( algo_gate_t* gate )
    gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
    return true;
 }
+*/

View File

@@ -1,5 +1,4 @@
#include "decred-gate.h" #include "decred-gate.h"
#include "sph_blake.h"
#include "blake-hash-4way.h" #include "blake-hash-4way.h"
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
@@ -9,7 +8,6 @@
 #if defined (DECRED_4WAY)

 static __thread blake256_4way_context blake_mid;
-static __thread bool ctx_midstate_done = false;

 void decred_hash_4way( void *state, const void *input )
 {
@@ -18,50 +16,14 @@ void decred_hash_4way( void *state, const void *input )
    uint32_t hash1[8] __attribute__ ((aligned (32)));
    uint32_t hash2[8] __attribute__ ((aligned (32)));
    uint32_t hash3[8] __attribute__ ((aligned (32)));
-   blake256_4way_context ctx __attribute__ ((aligned (64)));
-   sph_blake256_context ctx2 __attribute__ ((aligned (64)));
-   uint32_t hash[16] __attribute__ ((aligned (64)));
-   uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
-
-   mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );

    void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
    int tail_len = 180 - DECRED_MIDSTATE_LEN;
+
+   blake256_4way_context ctx __attribute__ ((aligned (64)));
    memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
    blake256_4way( &ctx, tail, tail_len );
    blake256_4way_close( &ctx, vhash );

-/*
-   sph_blake256_init( &ctx2 );
-   sph_blake256( &ctx2, sin0, 180 );
-   sph_blake256_close( &ctx2, hash );
-*/
-/*
-   blake256_4way_init( &ctx );
-   blake256_4way( &ctx, input, 180 );
-   blake256_4way_close( &ctx, vhash );
-*/
-   mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
-/*
-   for ( int i = 0; i < 8; i++ )
-      if ( hash[i] != hash0[i] )
-         printf(" hash mismatch, i = %u\n", i );
-   printf("hash:  %08lx %08lx %08lx %08lx\n", *hash, *(hash+1),
-          *(hash+2), *(hash+3) );
-   printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
-          *(hash0+2), *(hash0+3) );
-   printf("\n");
-*/
-   memcpy( state,    hash0, 32 );
-   memcpy( state+32, hash1, 32 );
-   memcpy( state+64, hash2, 32 );
-   memcpy( state+96, hash3, 32 );
-// memcpy( state, hash, 32 );
+   mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -79,11 +41,11 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
    bool *found = work->nfound;
    int num_found = 0;

-   ctx_midstate_done = false;
+   // copy to buffer guaranteed to be aligned.
    memcpy( edata, pdata, 180 );

    // use the old way until new way updated for size.
-   mm_interleave_4x32( vdata, edata, edata, edata, edata, 180*8 );
+   mm_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );

    blake256_4way_init( &blake_mid );
    blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
@@ -106,22 +68,13 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
       nonces[0] = n;
       pdata[DECRED_NONCE_INDEX] = n;
    }
-/*
    if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
    {
-printf("found 1\n");
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[8], hash[9], hash[10], hash[11] );
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[12], hash[13], hash[14], hash[15] );
-printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2], shash[3] );
-printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6], shash[7] );
       work_set_target_ratio( work, hash+8 );
       found[1] = true;
       num_found++;
       nonces[1] = n+1;
    }
-*/
    if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
    {
       work_set_target_ratio( work, hash+16 );
@@ -129,24 +82,15 @@
       num_found++;
       nonces[2] = n+2;
    }
-/*
    if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
    {
-printf("found 3\n");
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[0], hash[1], hash[2], hash[3] );
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[4], hash[5], hash[6], hash[7] );
-printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2], shash[3] );
-printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6], shash[7] );
       work_set_target_ratio( work, hash+24 );
       found[3] = true;
       num_found++;
       nonces[3] = n+3;
    }
-*/
-   n += 2;
-// n += 4;
+   n += 4;
 } while ( (num_found == 0) && (n < max_nonce)
           && !work_restart[thr_id].restart );

View File

@@ -1,4 +1,7 @@
#include "pentablake-gate.h" #include "pentablake-gate.h"
#ifdef __AVX2__
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
@@ -9,8 +12,6 @@
 //#define DEBUG_ALGO

-#ifdef PENTABLAKE_4WAY
-
 extern void pentablakehash_4way( void *output, const void *input )
 {
    unsigned char _ALIGN(32) hash[128];

View File

@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX__)
+#if defined(FOUR_WAY) && defined(__AVX2__)
   #define PENTABLAKE_4WAY
 #endif

View File

@@ -41,19 +41,13 @@
 extern "C"{
 #endif

-//#include "sph_bmw.h"
-
-//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
-#define SPH_SMALL_FOOTPRINT_BMW 1
-//#endif
-
 #ifdef _MSC_VER
 #pragma warning (disable: 4146)
 #endif

-//#undef SPH_ROTL64
-//#define SPH_ROTL64(x,n) (((x) << (n)) | ((x) >> (64 - (n))))
-//#define SPH_ROTL64(x,n) mm256_rotl_64(x,n)
+#define LPAR (
+
+// BMW256

 static const sph_u32 IV256[] = {
    SPH_C32(0x40414243), SPH_C32(0x44454647),
@@ -66,8 +60,7 @@ static const sph_u32 IV256[] = {
    SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
 };

-#if SPH_64
-
+// BMW512
 static const sph_u64 IV512[] = {
    SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
    SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
@@ -79,69 +72,108 @@ static const sph_u64 IV512[] = {
    SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
 };

-#endif
-
-#define XCAT(x, y)    XCAT_(x, y)
-#define XCAT_(x, y)   x ## y
-
-#define LPAR (
-
-/*
-#define ss0(x)    (((x) >> 1) ^ SPH_T32((x) << 3) \
-                  ^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
-#define ss1(x)    (((x) >> 1) ^ SPH_T32((x) << 2) \
-                  ^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23))
-#define ss2(x)    (((x) >> 2) ^ SPH_T32((x) << 1) \
-                  ^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
-#define ss3(x)    (((x) >> 2) ^ SPH_T32((x) << 2) \
-                  ^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
-#define ss4(x)    (((x) >> 1) ^ (x))
-#define ss5(x)    (((x) >> 2) ^ (x))
-#define rs1(x)    SPH_ROTL32(x, 3)
-#define rs2(x)    SPH_ROTL32(x, 7)
-#define rs3(x)    SPH_ROTL32(x, 13)
-#define rs4(x)    SPH_ROTL32(x, 16)
-#define rs5(x)    SPH_ROTL32(x, 19)
-#define rs6(x)    SPH_ROTL32(x, 23)
-#define rs7(x)    SPH_ROTL32(x, 27)
-
-#define Ks(j)   SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
-
-#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
-   (SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
-      - SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
-
-#define expand1s_inner(qf, mf, hf, i16, \
-      i0, i1, i2, i3, i4, i5, i6, i7, i8, \
-      i9, i10, i11, i12, i13, i14, i15, \
-      i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
-   SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
-      + ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
-      + ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
-      + ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
-      + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
-
-#define expand1s(qf, mf, hf, i16) \
-   expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
-#define expand1s_(qf, mf, hf, i16, ix, iy) \
-   expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
-
-#define expand2s_inner(qf, mf, hf, i16, \
-      i0, i1, i2, i3, i4, i5, i6, i7, i8, \
-      i9, i10, i11, i12, i13, i14, i15, \
-      i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
-   SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
-      + qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
-      + qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
-      + qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
-      + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
-
-#define expand2s(qf, mf, hf, i16) \
-   expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
-#define expand2s_(qf, mf, hf, i16, ix, iy) \
-   expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
-*/
-#if SPH_64
+// BMW256
+#define ss0(x) \
+   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
+                                 _mm_slli_epi32( (x), 3) ), \
+                  _mm_xor_si128( mm_rotl_32( (x),  4), \
+                                 mm_rotl_32( (x), 19) ) )
+
+#define ss1(x) \
+   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
+                                 _mm_slli_epi32( (x), 2) ), \
+                  _mm_xor_si128( mm_rotl_32( (x),  8), \
+                                 mm_rotl_32( (x), 23) ) )
+
+#define ss2(x) \
+   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
+                                 _mm_slli_epi32( (x), 1) ), \
+                  _mm_xor_si128( mm_rotl_32( (x), 12), \
+                                 mm_rotl_32( (x), 25) ) )
+
+#define ss3(x) \
+   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
+                                 _mm_slli_epi32( (x), 2) ), \
+                  _mm_xor_si128( mm_rotl_32( (x), 15), \
+                                 mm_rotl_32( (x), 29) ) )
+
+#define ss4(x) \
+   _mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
+
+#define ss5(x) \
+   _mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
+
+#define rs1(x)    mm_rotl_32( x,  3 )
+#define rs2(x)    mm_rotl_32( x,  7 )
+#define rs3(x)    mm_rotl_32( x, 13 )
+#define rs4(x)    mm_rotl_32( x, 16 )
+#define rs5(x)    mm_rotl_32( x, 19 )
+#define rs6(x)    mm_rotl_32( x, 23 )
+#define rs7(x)    mm_rotl_32( x, 27 )
+
+#define rol_off_32( M, j, off ) \
+   mm_rotl_32( M[ ( (j) + (off) ) & 0xF ] , \
+               ( ( (j) + (off) ) & 0xF ) + 1 )
+
+#define add_elt_s( M, H, j ) \
+   _mm_xor_si128( \
+      _mm_add_epi32( \
+         _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
+                                       rol_off_32( M, j, 3 ) ), \
+                        rol_off_32( M, j, 10 ) ), \
+         _mm_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) ), \
+      H[ ( (j)+7 ) & 0xF ] )
+
+#define expand1s( qt, M, H, i ) \
+   _mm_add_epi32( \
+      _mm_add_epi32( \
+         _mm_add_epi32( \
+            _mm_add_epi32( \
+               _mm_add_epi32( ss1( qt[ (i)-16 ] ), \
+                              ss2( qt[ (i)-15 ] ) ), \
+               _mm_add_epi32( ss3( qt[ (i)-14 ] ), \
+                              ss0( qt[ (i)-13 ] ) ) ), \
+            _mm_add_epi32( \
+               _mm_add_epi32( ss1( qt[ (i)-12 ] ), \
+                              ss2( qt[ (i)-11 ] ) ), \
+               _mm_add_epi32( ss3( qt[ (i)-10 ] ), \
+                              ss0( qt[ (i)- 9 ] ) ) ) ), \
+         _mm_add_epi32( \
+            _mm_add_epi32( \
+               _mm_add_epi32( ss1( qt[ (i)- 8 ] ), \
+                              ss2( qt[ (i)- 7 ] ) ), \
+               _mm_add_epi32( ss3( qt[ (i)- 6 ] ), \
+                              ss0( qt[ (i)- 5 ] ) ) ), \
+            _mm_add_epi32( \
+               _mm_add_epi32( ss1( qt[ (i)- 4 ] ), \
+                              ss2( qt[ (i)- 3 ] ) ), \
+               _mm_add_epi32( ss3( qt[ (i)- 2 ] ), \
+                              ss0( qt[ (i)- 1 ] ) ) ) ) ), \
+      add_elt_s( M, H, (i)-16 ) )
+
+#define expand2s( qt, M, H, i) \
+   _mm_add_epi32( \
+      _mm_add_epi32( \
+         _mm_add_epi32( \
+            _mm_add_epi32( \
+               _mm_add_epi32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ) ), \
+               _mm_add_epi32( qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ) ), \
+            _mm_add_epi32( \
+               _mm_add_epi32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ) ), \
+               _mm_add_epi32( qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ) ) ), \
+         _mm_add_epi32( \
+            _mm_add_epi32( \
+               _mm_add_epi32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ) ), \
+               _mm_add_epi32( qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ) ), \
+            _mm_add_epi32( \
+               _mm_add_epi32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ) ), \
+               _mm_add_epi32( ss4( qt[ (i)- 2 ] ), \
+                              ss5( qt[ (i)- 1 ] ) ) ) ) ), \
+      add_elt_s( M, H, (i)-16 ) )
+
+// BMW512

 #define sb0(x) \
   _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
@@ -181,18 +213,18 @@ static const sph_u64 IV512[] = {
 #define rb6(x)    mm256_rotl_64( x, 43 )
 #define rb7(x)    mm256_rotl_64( x, 53 )

-#define rol_off( M, j, off ) \
-   mm256_rotl_64( M[ ( (j) + (off) ) & 15 ] , \
-                  ( ( (j) + (off) ) & 15 ) + 1 )
+#define rol_off_64( M, j, off ) \
+   mm256_rotl_64( M[ ( (j) + (off) ) & 0xF ] , \
+                  ( ( (j) + (off) ) & 0xF ) + 1 )

 #define add_elt_b( M, H, j ) \
   _mm256_xor_si256( \
      _mm256_add_epi64( \
-        _mm256_sub_epi64( _mm256_add_epi64( rol_off( M, j, 0 ), \
-                                            rol_off( M, j, 3 ) ), \
-                          rol_off( M, j, 10 ) ), \
+        _mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \
+                                            rol_off_64( M, j, 3 ) ), \
+                          rol_off_64( M, j, 10 ) ), \
         _mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
-     H[ ( (j)+7 ) & 15 ] )
+     H[ ( (j)+7 ) & 0xF ] )

 #define expand1b( qt, M, H, i ) \
    _mm256_add_epi64( \
@@ -241,132 +273,301 @@ static const sph_u64 IV512[] = {
          sb5( qt[ (i)- 1 ] ) ) ) ) ), \
    add_elt_b( M, H, (i)-16 ) )

-#endif
-
-/*
-#define MAKE_W( i0, op01, i1, op12, i2, op23, i3, op34, i4) \
-   ((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
-   op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
-*/
-/*
-#define Ws0    MAKE_W(SPH_T32,  5, -,  7, +, 10, +, 13, +, 14)
-#define Ws1    MAKE_W(SPH_T32,  6, -,  8, +, 11, +, 14, -, 15)
-#define Ws2    MAKE_W(SPH_T32,  0, +,  7, +,  9, -, 12, +, 15)
-#define Ws3    MAKE_W(SPH_T32,  0, -,  1, +,  8, -, 10, +, 13)
-#define Ws4    MAKE_W(SPH_T32,  1, +,  2, +,  9, -, 11, -, 14)
-#define Ws5    MAKE_W(SPH_T32,  3, -,  2, +, 10, -, 12, +, 15)
-#define Ws6    MAKE_W(SPH_T32,  4, -,  0, -,  3, -, 11, +, 13)
-#define Ws7    MAKE_W(SPH_T32,  1, -,  4, -,  5, -, 12, -, 14)
-#define Ws8    MAKE_W(SPH_T32,  2, -,  5, -,  6, +, 13, -, 15)
-#define Ws9    MAKE_W(SPH_T32,  0, -,  3, +,  6, -,  7, +, 14)
-#define Ws10   MAKE_W(SPH_T32,  8, -,  1, -,  4, -,  7, +, 15)
-#define Ws11   MAKE_W(SPH_T32,  8, -,  0, -,  2, -,  5, +,  9)
-#define Ws12   MAKE_W(SPH_T32,  1, +,  3, -,  6, -,  9, +, 10)
-#define Ws13   MAKE_W(SPH_T32,  2, +,  4, +,  7, +, 10, +, 11)
-#define Ws14   MAKE_W(SPH_T32,  3, -,  5, +,  8, -, 11, -, 12)
-#define Ws15   MAKE_W(SPH_T32, 12, -,  4, -,  6, -,  9, +, 13)
-
-#if SPH_SMALL_FOOTPRINT_BMW
-
-#define MAKE_Qas do { \
-   unsigned u; \
-   sph_u32 Ws[16]; \
-   Ws[ 0] = Ws0; \
-   Ws[ 1] = Ws1; \
-   Ws[ 2] = Ws2; \
-   Ws[ 3] = Ws3; \
-   Ws[ 4] = Ws4; \
-   Ws[ 5] = Ws5; \
-   Ws[ 6] = Ws6; \
-   Ws[ 7] = Ws7; \
-   Ws[ 8] = Ws8; \
-   Ws[ 9] = Ws9; \
-   Ws[10] = Ws10; \
-   Ws[11] = Ws11; \
-   Ws[12] = Ws12; \
-   Ws[13] = Ws13; \
-   Ws[14] = Ws14; \
-   Ws[15] = Ws15; \
-   for (u = 0; u < 15; u += 5) { \
-      qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \
-      qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \
-      qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \
-      qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \
-      qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \
-   } \
-   qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \
-} while (0)
-
-#define MAKE_Qbs do { \
-   qt[16] = expand1s(Qs, M, H, 16); \
-   qt[17] = expand1s(Qs, M, H, 17); \
-   qt[18] = expand2s(Qs, M, H, 18); \
-   qt[19] = expand2s(Qs, M, H, 19); \
-   qt[20] = expand2s(Qs, M, H, 20); \
-   qt[21] = expand2s(Qs, M, H, 21); \
-   qt[22] = expand2s(Qs, M, H, 22); \
-   qt[23] = expand2s(Qs, M, H, 23); \
-   qt[24] = expand2s(Qs, M, H, 24); \
-   qt[25] = expand2s(Qs, M, H, 25); \
-   qt[26] = expand2s(Qs, M, H, 26); \
-   qt[27] = expand2s(Qs, M, H, 27); \
-   qt[28] = expand2s(Qs, M, H, 28); \
-   qt[29] = expand2s(Qs, M, H, 29); \
-   qt[30] = expand2s(Qs, M, H, 30); \
-   qt[31] = expand2s(Qs, M, H, 31); \
-} while (0)
-
-#else
-
-#define MAKE_Qas do { \
-   qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
-   qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
-   qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
-   qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
-   qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
-   qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
-   qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
-   qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
-   qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
-   qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
-   qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
-   qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
-   qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
-   qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
-   qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
-   qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
-} while (0)
-
-#define MAKE_Qbs do { \
-   qt[16] = expand1s(Qs, M, H, 16); \
-   qt[17] = expand1s(Qs, M, H, 17); \
-   qt[18] = expand2s(Qs, M, H, 18); \
-   qt[19] = expand2s(Qs, M, H, 19); \
-   qt[20] = expand2s(Qs, M, H, 20); \
-   qt[21] = expand2s(Qs, M, H, 21); \
-   qt[22] = expand2s(Qs, M, H, 22); \
-   qt[23] = expand2s(Qs, M, H, 23); \
-   qt[24] = expand2s(Qs, M, H, 24); \
-   qt[25] = expand2s(Qs, M, H, 25); \
-   qt[26] = expand2s(Qs, M, H, 26); \
-   qt[27] = expand2s(Qs, M, H, 27); \
-   qt[28] = expand2s(Qs, M, H, 28); \
-   qt[29] = expand2s(Qs, M, H, 29); \
-   qt[30] = expand2s(Qs, M, H, 30); \
-   qt[31] = expand2s(Qs, M, H, 31); \
-} while (0)
-
-#endif
-
-#define MAKE_Qs do { \
-   MAKE_Qas; \
-   MAKE_Qbs; \
-} while (0)
-
-#define Qs(j)   (qt[j])
-*/
-#if SPH_64
+// BMW256
+#define Ws0 \
+   _mm_add_epi32( \
+      _mm_add_epi32( \
+         _mm_add_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
+                           _mm_xor_si128( M[ 7], H[ 7] ) ), \
+            _mm_xor_si128( M[10], H[10] ) ), \
+         _mm_xor_si128( M[13], H[13] ) ), \
+      _mm_xor_si128( M[14], H[14] ) )
+
+#define Ws1 \
+   _mm_sub_epi32( \
+      _mm_add_epi32( \
+         _mm_add_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
+                           _mm_xor_si128( M[ 8], H[ 8] ) ), \
+            _mm_xor_si128( M[11], H[11] ) ), \
+         _mm_xor_si128( M[14], H[14] ) ), \
+      _mm_xor_si128( M[15], H[15] ) )
+
+#define Ws2 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_add_epi32( \
+            _mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
+                           _mm_xor_si128( M[ 7], H[ 7] ) ), \
+            _mm_xor_si128( M[ 9], H[ 9] ) ), \
+         _mm_xor_si128( M[12], H[12] ) ), \
+      _mm_xor_si128( M[15], H[15] ) )
+
+#define Ws3 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_add_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
+                           _mm_xor_si128( M[ 1], H[ 1] ) ), \
+            _mm_xor_si128( M[ 8], H[ 8] ) ), \
+         _mm_xor_si128( M[10], H[10] ) ), \
+      _mm_xor_si128( M[13], H[13] ) )
+
+#define Ws4 \
+   _mm_sub_epi32( \
+      _mm_sub_epi32( \
+         _mm_add_epi32( \
+            _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
+                           _mm_xor_si128( M[ 2], H[ 2] ) ), \
+            _mm_xor_si128( M[ 9], H[ 9] ) ), \
+         _mm_xor_si128( M[11], H[11] ) ), \
+      _mm_xor_si128( M[14], H[14] ) )
+
+#define Ws5 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_add_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
+                           _mm_xor_si128( M[ 2], H[ 2] ) ), \
+            _mm_xor_si128( M[10], H[10] ) ), \
+         _mm_xor_si128( M[12], H[12] ) ), \
+      _mm_xor_si128( M[15], H[15] ) )
+
+#define Ws6 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
+                           _mm_xor_si128( M[ 0], H[ 0] ) ), \
+            _mm_xor_si128( M[ 3], H[ 3] ) ), \
+         _mm_xor_si128( M[11], H[11] ) ), \
+      _mm_xor_si128( M[13], H[13] ) )
+
+#define Ws7 \
+   _mm_sub_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
+                           _mm_xor_si128( M[ 4], H[ 4] ) ), \
+            _mm_xor_si128( M[ 5], H[ 5] ) ), \
+         _mm_xor_si128( M[12], H[12] ) ), \
+      _mm_xor_si128( M[14], H[14] ) )
+
+#define Ws8 \
+   _mm_sub_epi32( \
+      _mm_add_epi32( \
+         _mm_sub_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
+                           _mm_xor_si128( M[ 5], H[ 5] ) ), \
+            _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         _mm_xor_si128( M[13], H[13] ) ), \
+      _mm_xor_si128( M[15], H[15] ) )
+
+#define Ws9 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_add_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
+                           _mm_xor_si128( M[ 3], H[ 3] ) ), \
+            _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         _mm_xor_si128( M[ 7], H[ 7] ) ), \
+      _mm_xor_si128( M[14], H[14] ) )
+
+#define Ws10 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
+                           _mm_xor_si128( M[ 1], H[ 1] ) ), \
+            _mm_xor_si128( M[ 4], H[ 4] ) ), \
+         _mm_xor_si128( M[ 7], H[ 7] ) ), \
+      _mm_xor_si128( M[15], H[15] ) )
+
+#define Ws11 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
+                           _mm_xor_si128( M[ 0], H[ 0] ) ), \
+            _mm_xor_si128( M[ 2], H[ 2] ) ), \
+         _mm_xor_si128( M[ 5], H[ 5] ) ), \
+      _mm_xor_si128( M[ 9], H[ 9] ) )
+
+#define Ws12 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( \
+            _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
+                           _mm_xor_si128( M[ 3], H[ 3] ) ), \
+            _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         _mm_xor_si128( M[ 9], H[ 9] ) ), \
+      _mm_xor_si128( M[10], H[10] ) )
+
+#define Ws13 \
+   _mm_add_epi32( \
+      _mm_add_epi32( \
+         _mm_add_epi32( \
+            _mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
+                           _mm_xor_si128( M[ 4], H[ 4] ) ), \
+            _mm_xor_si128( M[ 7], H[ 7] ) ), \
+         _mm_xor_si128( M[10], H[10] ) ), \
+      _mm_xor_si128( M[11], H[11] ) )
+
+#define Ws14 \
+   _mm_sub_epi32( \
+      _mm_sub_epi32( \
+         _mm_add_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
+                           _mm_xor_si128( M[ 5], H[ 5] ) ), \
+            _mm_xor_si128( M[ 8], H[ 8] ) ), \
+         _mm_xor_si128( M[11], H[11] ) ), \
+      _mm_xor_si128( M[12], H[12] ) )
+
+#define Ws15 \
+   _mm_add_epi32( \
+      _mm_sub_epi32( \
+         _mm_sub_epi32( \
+            _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
+                           _mm_xor_si128( M[ 4], H[ 4] ) ), \
+            _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         _mm_xor_si128( M[ 9], H[ 9] ) ), \
+      _mm_xor_si128( M[13], H[13] ) )
+
+void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
+{
+   __m128i qt[32], xl, xh;
+
+   // first 16 quad words from the W expressions
+   qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
+   qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
+   qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
+   qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
+   qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
+   qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
+   qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
+   qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
+   qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
+   qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
+   qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
+   qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
+   qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
+   qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
+   qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
+   qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
+   // expand to 32 quad words
+   qt[16] = expand1s( qt, M, H, 16 );
+   qt[17] = expand1s( qt, M, H, 17 );
+   qt[18] = expand2s( qt, M, H, 18 );
+   qt[19] = expand2s( qt, M, H, 19 );
+   qt[20] = expand2s( qt, M, H, 20 );
+   qt[21] = expand2s( qt, M, H, 21 );
+   qt[22] = expand2s( qt, M, H, 22 );
+   qt[23] = expand2s( qt, M, H, 23 );
+   qt[24] = expand2s( qt, M, H, 24 );
+   qt[25] = expand2s( qt, M, H, 25 );
+   qt[26] = expand2s( qt, M, H, 26 );
+   qt[27] = expand2s( qt, M, H, 27 );
+   qt[28] = expand2s( qt, M, H, 28 );
+   qt[29] = expand2s( qt, M, H, 29 );
+   qt[30] = expand2s( qt, M, H, 30 );
+   qt[31] = expand2s( qt, M, H, 31 );
+
+   xl = _mm_xor_si128(
+           _mm_xor_si128( _mm_xor_si128( qt[16], qt[17] ),
+                          _mm_xor_si128( qt[18], qt[19] ) ),
+           _mm_xor_si128( _mm_xor_si128( qt[20], qt[21] ),
+                          _mm_xor_si128( qt[22], qt[23] ) ) );
+   xh = _mm_xor_si128( xl,
+           _mm_xor_si128(
+               _mm_xor_si128( _mm_xor_si128( qt[24], qt[25] ),
+                              _mm_xor_si128( qt[26], qt[27] ) ),
+               _mm_xor_si128( _mm_xor_si128( qt[28], qt[29] ),
+                              _mm_xor_si128( qt[30], qt[31] ) )));
+
+   dH[ 0] = _mm_add_epi32(
+               _mm_xor_si128( M[0],
+                  _mm_xor_si128( _mm_slli_epi32( xh, 5 ),
+                                 _mm_srli_epi32( qt[16], 5 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ));
+   dH[ 1] = _mm_add_epi32(
+               _mm_xor_si128( M[1],
+                  _mm_xor_si128( _mm_srli_epi32( xh, 7 ),
+                                 _mm_slli_epi32( qt[17], 8 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ));
+   dH[ 2] = _mm_add_epi32(
+               _mm_xor_si128( M[2],
+                  _mm_xor_si128( _mm_srli_epi32( xh, 5 ),
+                                 _mm_slli_epi32( qt[18], 5 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ));
+   dH[ 3] = _mm_add_epi32(
+               _mm_xor_si128( M[3],
+                  _mm_xor_si128( _mm_srli_epi32( xh, 1 ),
+                                 _mm_slli_epi32( qt[19], 5 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ));
+   dH[ 4] = _mm_add_epi32(
+               _mm_xor_si128( M[4],
+                  _mm_xor_si128( _mm_srli_epi32( xh, 3 ),
+                                 _mm_slli_epi32( qt[20], 0 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ));
+   dH[ 5] = _mm_add_epi32(
+               _mm_xor_si128( M[5],
+                  _mm_xor_si128( _mm_slli_epi32( xh, 6 ),
+                                 _mm_srli_epi32( qt[21], 6 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ));
+   dH[ 6] = _mm_add_epi32(
+               _mm_xor_si128( M[6],
+                  _mm_xor_si128( _mm_srli_epi32( xh, 4 ),
+                                 _mm_slli_epi32( qt[22], 6 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ));
+   dH[ 7] = _mm_add_epi32(
+               _mm_xor_si128( M[7],
+                  _mm_xor_si128( _mm_srli_epi32( xh, 11 ),
+                                 _mm_slli_epi32( qt[23], 2 ) ) ),
+               _mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
+   dH[ 8] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[4], 9 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
+               _mm_xor_si128( _mm_slli_epi32( xl, 8 ),
+                              _mm_xor_si128( qt[23], qt[ 8] ) ) );
+   dH[ 9] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[5], 10 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
+               _mm_xor_si128( _mm_srli_epi32( xl, 6 ),
+                              _mm_xor_si128( qt[16], qt[ 9] ) ) );
+   dH[10] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[6], 11 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
+               _mm_xor_si128( _mm_slli_epi32( xl, 6 ),
+                              _mm_xor_si128( qt[17], qt[10] ) ) );
+   dH[11] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[7], 12 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
+               _mm_xor_si128( _mm_slli_epi32( xl, 4 ),
+                              _mm_xor_si128( qt[18], qt[11] ) ) );
+   dH[12] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[0], 13 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
+               _mm_xor_si128( _mm_srli_epi32( xl, 3 ),
+                              _mm_xor_si128( qt[19], qt[12] ) ) );
+   dH[13] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[1], 14 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
+               _mm_xor_si128( _mm_srli_epi32( xl, 4 ),
+                              _mm_xor_si128( qt[20], qt[13] ) ) );
+   dH[14] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[2], 15 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
+               _mm_xor_si128( _mm_srli_epi32( xl, 7 ),
+                              _mm_xor_si128( qt[21], qt[14] ) ) );
+   dH[15] = _mm_add_epi32( _mm_add_epi32(
+               mm_rotl_32( dH[3], 16 ),
+               _mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
+               _mm_xor_si128( _mm_srli_epi32( xl, 2 ),
+                              _mm_xor_si128( qt[22], qt[15] ) ) );
+}
+
+// BMW512

 #define Wb0 \
    _mm256_add_epi64( \
@@ -564,6 +765,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
    qt[29] = expand2b( qt, M, H, 29 );
    qt[30] = expand2b( qt, M, H, 30 );
    qt[31] = expand2b( qt, M, H, 31 );
+
    xl = _mm256_xor_si256(
            _mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
                              _mm256_xor_si256( qt[18], qt[19] ) ),
@@ -575,6 +777,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
                              _mm256_xor_si256( qt[26], qt[27] ) ),
            _mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
                              _mm256_xor_si256( qt[30], qt[31] ) )));
+
    dH[ 0] = _mm256_add_epi64(
               _mm256_xor_si256( M[0],
                  _mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
@@ -657,85 +860,65 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
                              _mm256_xor_si256( qt[22], qt[15] ) ) );
 }

-#endif // 64
+// BMW256

-//#define FOLDs   FOLD(sph_u32, MAKE_Qs, SPH_ROTL32, M, Qs, dH)
-/*
-static void
-compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
-{
-#define M(x)    sph_dec32le_aligned(data + 4 * (x))
-#define H(x)    (h[x])
-#define dH(x)   (dh[x])
-
-   FOLDs;
-
-#undef M
-#undef H
-#undef dH
-}
-
-static const sph_u32 final_s[16] = {
-   SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2),
-   SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5),
-   SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8),
-   SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab),
-   SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae),
-   SPH_C32(0xaaaaaaaf)
-};
+static const __m128i final_s[16] =
+{
+   { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
+   { 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
+   { 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 },
+   { 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 },
+   { 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 },
+   { 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 },
+   { 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 },
+   { 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 },
+   { 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 },
+   { 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 },
+   { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
+   { 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab },
+   { 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac },
+   { 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad },
+   { 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
+   { 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
+};

 static void
 bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
 {
-   memcpy(sc->H, iv, sizeof sc->H);
+   for ( int i = 0; i < 16; i++ )
+      sc->H[i] = _mm_set1_epi32( iv[i] );
    sc->ptr = 0;
-#if SPH_64
    sc->bit_count = 0;
-#else
-   sc->bit_count_high = 0;
-   sc->bit_count_low = 0;
-#endif
 }

 static void
 bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
 {
-   unsigned char *buf;
+   __m128i *vdata = (__m128i*)data;
+   __m128i *buf;
+   __m128i htmp[16];
+   __m128i *h1, *h2;
    size_t ptr;
-   sph_u32 htmp[16];
-   sph_u32 *h1, *h2;
-#if !SPH_64
-   sph_u32 tmp;
-#endif
+   const int buf_size = 64;   // bytes of one lane, compatible with len

-#if SPH_64
    sc->bit_count += (sph_u64)len << 3;
-#else
-   tmp = sc->bit_count_low;
-   sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3));
-   if (sc->bit_count_low < tmp)
-      sc->bit_count_high ++;
-   sc->bit_count_high += len >> 29;
-#endif
    buf = sc->buf;
    ptr = sc->ptr;
    h1 = sc->H;
    h2 = htmp;
-   while (len > 0) {
+   while ( len > 0 )
+   {
      size_t clen;
-     clen = (sizeof sc->buf) - ptr;
+     clen = buf_size - ptr;
      if ( clen > len )
        clen = len;
-     memcpy(buf + ptr, data, clen);
-     data = (const unsigned char *)data + clen;
+     memcpy_128( buf + (ptr>>2), vdata, clen >> 2 );
+     vdata += ( clen >> 2 );
      len -= clen;
      ptr += clen;
-     if (ptr == sizeof sc->buf) {
-        sph_u32 *ht;
+     if ( ptr == buf_size )
+     {
+        __m128i *ht;
         compress_small( buf, h1, h2 );
         ht = h1;
         h1 = h2;
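
Note: the braced final_s initializers above rely on GCC treating __m128i as
two 64-bit lanes, so each 64-bit literal packs the 32-bit constant twice; the
result is the same vector as _mm_set1_epi32, which cannot appear in a static
initializer. A runtime-built equivalent for one element (hypothetical helper):

#include <immintrin.h>
#include <stdint.h>

static inline __m128i bmw_final_elem( uint32_t k )
{
   return _mm_set1_epi32( (int32_t)k );   // e.g. k = 0xaaaaaaa0 for element 0
}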
@@ -745,49 +928,45 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
   }
   sc->ptr = ptr;

   if ( h1 != sc->H )
-      memcpy(sc->H, h1, sizeof sc->H);
+      memcpy_128( sc->H, h1, 16 );
 }

 static void
 bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
                  void *dst, size_t out_size_w32)
 {
-   unsigned char *buf, *out;
+   __m128i *buf;
+   __m128i h1[16], h2[16], *h;
    size_t ptr, u, v;
    unsigned z;
-   sph_u32 h1[16], h2[16], *h;
+   const int buf_size = 64;   // bytes of one lane, compatible with len

    buf = sc->buf;
    ptr = sc->ptr;
    z = 0x80 >> n;
-   buf[ptr ++] = ((ub & -z) | z) & 0xFF;
+   buf[ ptr>>2 ] = _mm_set1_epi32( z );
+   ptr += 4;
    h = sc->H;
-   if (ptr > (sizeof sc->buf) - 8) {
-      memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+
+   // assume bit_count fits in 32 bits
+   if ( ptr > buf_size - 4 )
+   {
+      memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 );
       compress_small( buf, h, h1 );
       ptr = 0;
       h = h1;
    }
-   memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr);
-#if SPH_64
-   sph_enc64le_aligned(buf + (sizeof sc->buf) - 8,
-                       SPH_T64(sc->bit_count + n));
-#else
-   sph_enc32le_aligned(buf + (sizeof sc->buf) - 8,
-                       sc->bit_count_low + n);
-   sph_enc32le_aligned(buf + (sizeof sc->buf) - 4,
-                       SPH_T32(sc->bit_count_high));
-#endif
+   memset_zero_128( buf + (ptr>>2), (buf_size - 4 - ptr) >> 2 );
+   buf[ (buf_size - 4) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
    compress_small( buf, h, h2 );
    for ( u = 0; u < 16; u ++ )
-      sph_enc32le_aligned(buf + 4 * u, h2[u]);
+      buf[u] = h2[u];
    compress_small( buf, final_s, h1 );
-   out = dst;
    for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
-      sph_enc32le(out + 4 * u, h1[v]);
+      casti_m128i( dst, u ) = h1[v];
 }
-*/

-#if SPH_64
+// BMW512

 static const __m256i final_b[16] =
 {
@@ -908,33 +1087,33 @@ bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
    casti_m256i(dst,u) = h1[v];
 }

-#endif
+// BMW256

 void
 bmw256_4way_init(void *cc)
 {
-// bmw32_4way_init(cc, IV256);
+   bmw32_4way_init(cc, IV256);
 }

 void
 bmw256_4way(void *cc, const void *data, size_t len)
 {
-// bmw32_4way(cc, data, len);
+   bmw32_4way(cc, data, len);
 }

 void
 bmw256_4way_close(void *cc, void *dst)
 {
-// bmw256_4way_addbits_and_close(cc, 0, 0, dst);
+   bmw256_4way_addbits_and_close(cc, 0, 0, dst);
 }

 void
 bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
-// bmw32_4way_close(cc, ub, n, dst, 8);
+   bmw32_4way_close(cc, ub, n, dst, 8);
 }

-#if SPH_64
+// BMW512

 void
 bmw512_4way_init(void *cc)
@@ -960,10 +1139,8 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
    bmw64_4way_close(cc, ub, n, dst, 8);
 }

-#endif
-
 #ifdef __cplusplus
 }
 #endif

-#endif
+#endif // __AVX2__

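Note: the SSE2 BMW256 port above leans on the mm_rotl_32 rotate helper from
avxdefs.h. SSE2 has vector shifts but no vector rotate, so such a helper is
conventionally two shifts and an OR; a sketch of the usual definition, under
the assumption that avxdefs.h does the equivalent:

#include <immintrin.h>

// Rotate each 32-bit lane of x left by c bits (0 < c < 32).
#define mm_rotl_32_sketch( x, c ) \
   _mm_or_si128( _mm_slli_epi32( (x), (c) ), _mm_srli_epi32( (x), 32-(c) ) )
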
View File

@@ -46,94 +46,37 @@ extern "C"{
 #include "algo/sha/sph_types.h"
 #include "avxdefs.h"

-/**
- * Output size (in bits) for BMW-224.
- */
-#define SPH_SIZE_bmw224   224
-
-/**
- * Output size (in bits) for BMW-256.
- */
 #define SPH_SIZE_bmw256   256

-#if SPH_64
-/**
- * Output size (in bits) for BMW-384.
- */
-#define SPH_SIZE_bmw384   384
-
-/**
- * Output size (in bits) for BMW-512.
- */
 #define SPH_SIZE_bmw512   512
-#endif

-/**
- * This structure is a context for BMW-224 and BMW-256 computations:
- * it contains the intermediate values and some data from the last
- * entered block. Once a BMW computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running BMW
- * computation can be cloned by copying the context (e.g. with a simple
- * <code>memcpy()</code>).
- */
 typedef struct {
-#ifndef DOXYGEN_IGNORE
-   unsigned char buf[64];    /* first field, for alignment */
+   __m128i buf[64];
+   __m128i H[16];
    size_t ptr;
-   sph_u32 H[16];
-#if SPH_64
-   sph_u64 bit_count;
-#else
-   sph_u32 bit_count_high, bit_count_low;
-#endif
-#endif
+   sph_u32 bit_count;   // assume bit_count fits in 32 bits
 } bmw_4way_small_context;

 typedef bmw_4way_small_context bmw256_4way_context;

-#if SPH_64
-/**
- * This structure is a context for BMW-384 and BMW-512 computations:
- * it contains the intermediate values and some data from the last
- * entered block. Once a BMW computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running BMW
- * computation can be cloned by copying the context (e.g. with a simple
- * <code>memcpy()</code>).
- */
 typedef struct {
-#ifndef DOXYGEN_IGNORE
    __m256i buf[16];
    __m256i H[16];
-// unsigned char buf[128];   /* first field, for alignment */
    size_t ptr;
-// sph_u64 H[16];
    sph_u64 bit_count;
-#endif
 } bmw_4way_big_context;

 typedef bmw_4way_big_context bmw512_4way_context;
-#endif

 void bmw256_4way_init(void *cc);
 void bmw256_4way(void *cc, const void *data, size_t len);
 void bmw256_4way_close(void *cc, void *dst);

-void bmw256_addbits_and_close(
+void bmw256_4way_addbits_and_close(
    void *cc, unsigned ub, unsigned n, void *dst);

-#if SPH_64
-
 void bmw512_4way_init(void *cc);
 void bmw512_4way(void *cc, const void *data, size_t len);
@@ -150,5 +93,3 @@ void bmw512_4way_addbits_and_close(
#endif #endif
#endif #endif
#endif
algo/jha/jha-4way.c
@@ -23,12 +23,12 @@ void jha_hash_4way( void *out, const void *input )
    uint64_t hash2[8] __attribute__ ((aligned (64)));
    uint64_t hash3[8] __attribute__ ((aligned (64)));
    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-   uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
-   uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
-   __m256i mask, mask0, mask1;
+   uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
+   uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
    __m256i* vh  = (__m256i*)vhash;
-   __m256i* vh0 = (__m256i*)vhash0;
-   __m256i* vh1 = (__m256i*)vhash1;
+   __m256i* vhA = (__m256i*)vhashA;
+   __m256i* vhB = (__m256i*)vhashB;
+   __m256i vh_mask;
    blake512_4way_context ctx_blake;
    hashState_groestl ctx_groestl;

@@ -40,32 +40,13 @@ void jha_hash_4way( void *out, const void *input )
    keccak512_4way( &ctx_keccak, input, 80 );
    keccak512_4way_close( &ctx_keccak, vhash );

-// memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
-// keccak512_4way( &ctx_keccak, input + (64<<2), 16 );
-// keccak512_4way_close( &ctx_keccak, vhash );

    // Heavy & Light Pair Loop
    for ( int round = 0; round < 3; round++ )
    {
-      // Select the next function based on bit 0 of the previous hash.
-      // Speculatively execute both functions and use a mask to
-      // select results from the correct function for each lane.
-      // hash = mask : vhash0 ? vhash1
-      mask = mm256_negate_64(
-              _mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
-      // second version
-      // mask0 = mask
-      // mask1 = mm256_not( mask );
-      // first version
-      // mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
-      //          _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );
-      // groestl (serial) vs skein
+      vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
+                      vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero );

       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash0,
                                 (char*)hash0, 512 );

@@ -78,67 +59,28 @@ void jha_hash_4way( void *out, const void *input )
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash3,
                                 (char*)hash3, 512 );
-      mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
+      mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );

-      // skein
       skein512_4way_init( &ctx_skein );
       skein512_4way( &ctx_skein, vhash, 64 );
-      skein512_4way_close( &ctx_skein, vhash1 );
+      skein512_4way_close( &ctx_skein, vhashB );

-      // merge vectored hash
       for ( int i = 0; i < 8; i++ )
-      {
-         // blend should be faster
-         vh[i] = _mm256_blendv_epi8( vh0[i], vh1[i], mask );
-         // second version
-         // vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
-         //                          _mm256_and_si256( vh1[i], mask1 ) );
-         // first version
-         /*
-         vh0[i] = _mm256_maskload_epi64( vhash0 + i*4, mm256_not( mask ) );
-         vh1[i] = _mm256_maskload_epi64( vhash1 + i*4, mask );
-         vh[i]  = _mm256_or_si256( vh0[i], vh1[i] );
-         */
-      }
+         vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );

-      // blake v jh
       blake512_4way_init( &ctx_blake );
       blake512_4way( &ctx_blake, vhash, 64 );
-      blake512_4way_close( &ctx_blake, vhash0 );
+      blake512_4way_close( &ctx_blake, vhashA );

       jh512_4way_init( &ctx_jh );
       jh512_4way( &ctx_jh, vhash, 64 );
-      jh512_4way_close( &ctx_jh, vhash1 );
+      jh512_4way_close( &ctx_jh, vhashB );

-      // merge hash
       for ( int i = 0; i < 8; i++ )
-      {
-         vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
-                                  _mm256_and_si256( vh1[i], mask1 ) );
-         /*
-         vha256[i] = _mm256_maskload_epi64( vhasha + i*4, mm256_not( mask ) );
-         vhb256[i] = _mm256_maskload_epi64( vhashb + i*4, mask );
-         vh256[i]  = _mm256_or_si256( vha256[i], vhb256[i] );
-         */
-      }
+         vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
    }

    mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );

-// memcpy( output,    hash0, 32 );
-// memcpy( output+32, hash1, 32 );
-// memcpy( output+64, hash2, 32 );
-// memcpy( output+96, hash3, 32 );
 }

 int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,

@@ -177,17 +119,12 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
       0
    };

-   // we need bigendian data...
    for ( int i=0; i < 19; i++ )
      be32enc( &endiandata[i], pdata[i] );

    uint64_t *edata = (uint64_t*)endiandata;
    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

-   // precalc midstate for keccak
-   // keccak512_4way_init( &jha_kec_mid );
-   // keccak512_4way( &jha_kec_mid, vdata, 64 );

    for ( int m = 0; m < 6; m++ )
    {
      if ( Htarg <= htmax[m] )

@@ -201,7 +138,6 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
         be32enc( noncep3, n+3 );

         jha_hash_4way( hash, vdata );
         pdata[19] = n;

         if ( ( !(hash[7] & mask) )

@@ -239,11 +175,9 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
            n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
                   && !work_restart[thr_id].restart );
         break;
      }
    }

    *hashes_done = n - first_nonce + 1;
    return num_found;
 }
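The loop above is the core trick of the 4-way port: both candidate functions are computed for all four lanes and the correct result is then selected per lane with a vector mask. A minimal standalone sketch of that selection step (hypothetical helper, not part of the commit; assumes AVX2):

#include <immintrin.h>

// Select, per 64-bit lane, resB where the lane's selector bit is 0 and
// resA where it is 1 -- the same cmpeq/blendv pattern used above.
// n is the number of __m256i words in each result buffer.
static void blend_lanes( __m256i *dst, const __m256i *resA,
                         const __m256i *resB, __m256i selector, int n )
{
   // All-ones in lanes whose selector bit is clear, all-zeros otherwise.
   __m256i mask = _mm256_cmpeq_epi64(
                     _mm256_and_si256( selector, _mm256_set1_epi64x( 1 ) ),
                     _mm256_setzero_si256() );
   for ( int i = 0; i < n; i++ )
      // blendv picks from the second operand where the mask bit is set.
      dst[i] = _mm256_blendv_epi8( resA[i], resB[i], mask );
}

Both branches always execute, so the per-lane divergence of jha costs one extra hash function per round instead of breaking vectorization.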
algo/lyra2/lyra2h-4way.c (new file, 128 lines)
@@ -0,0 +1,128 @@
#include "lyra2h-gate.h"
#ifdef LYRA2H_4WAY
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
__thread uint64_t* lyra2h_4way_matrix;
bool lyra2h_4way_thread_init()
{
return ( lyra2h_4way_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
}
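// The first 64 bytes of the 80-byte block header are constant for a job,
// so the first Blake-256 block is hashed once here and the saved midstate
// is reused for every nonce in the scan loop.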
static __thread blake256_4way_context l2h_4way_blake_mid;
void lyra2h_4way_midstate( const void* input )
{
blake256_4way_init( &l2h_4way_blake_mid );
blake256_4way( &l2h_4way_blake_mid, input, 64 );
}
void lyra2h_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 19; i++ )
be32enc( &edata[i], pdata[i] );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
lyra2h_4way_midstate( vdata );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( &edata[19], n );
lyra2h_4way_hash( hash, vdata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
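All of the 4-way scan loops above depend on the word-interleaved data layout produced by the avxdefs.h helpers. As a rough reference for what mm_interleave_4x32 computes, here is a hypothetical scalar model (illustration only, not the SSE implementation from the commit):

#include <stdint.h>

// Scalar model of 4x32 interleaving: dst[4*i + lane] = src_lane[i].
// bit_len is the per-lane data length in bits, as in the real helpers.
static void interleave_4x32_ref( uint32_t *dst, const uint32_t *s0,
                                 const uint32_t *s1, const uint32_t *s2,
                                 const uint32_t *s3, int bit_len )
{
   for ( int i = 0; i < bit_len/32; i++ )
   {
      dst[ 4*i     ] = s0[i];
      dst[ 4*i + 1 ] = s1[i];
      dst[ 4*i + 2 ] = s2[i];
      dst[ 4*i + 3 ] = s3[i];
   }
}

This layout is why lane 0's nonce sits at vdata + 76 in the loop above: header word 19 of lane 0 lands at index 19*4 = 76, with lanes 1-3 at 77-79.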
algo/lyra2/lyra2h-gate.c (new file, 25 lines)
@@ -0,0 +1,25 @@
#include "lyra2h-gate.h"
#include "lyra2.h"
void lyra2h_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_lyra2h_algo( algo_gate_t* gate )
{
#ifdef LYRA2H_4WAY
gate->miner_thread_init = (void*)&lyra2h_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h_4way;
gate->hash = (void*)&lyra2h_4way_hash;
#else
gate->miner_thread_init = (void*)&lyra2h_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h;
gate->hash = (void*)&lyra2h_hash;
#endif
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2h_set_target;
return true;
};
algo/lyra2/lyra2h-gate.h (new file, 32 lines)
@@ -0,0 +1,32 @@
#ifndef LYRA2H_GATE_H__
#define LYRA2H_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY)
#define LYRA2H_4WAY
#endif
#define LYRA2H_MATRIX_SIZE BLOCK_LEN_INT64 * 16 * 16 * 8
#if defined(LYRA2H_4WAY)
void lyra2h_4way_hash( void *state, const void *input );
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_4way_thread_init();
#endif
void lyra2h_hash( void *state, const void *input );
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_thread_init();
#endif
algo/lyra2/lyra2h.c
@@ -1,6 +1,6 @@
+#include "lyra2h-gate.h"
 #include <memory.h>
 #include <mm_malloc.h>
-#include "algo-gate-api.h"
 #include "lyra2.h"
 #include "algo/blake/sph_blake.h"

@@ -8,8 +8,7 @@ __thread uint64_t* lyra2h_matrix;

 bool lyra2h_thread_init()
 {
-   const int i = 16 * 16 * 96;
-   lyra2h_matrix = _mm_malloc( i, 64 );
+   lyra2h_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 );
    return lyra2h_matrix;
 }

@@ -74,20 +73,3 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
    *hashes_done = pdata[19] - first_nonce + 1;
    return 0;
 }

-void lyra2h_set_target( struct work* work, double job_diff )
-{
-   work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool register_lyra2h_algo( algo_gate_t* gate )
-{
-   gate->optimizations = AVX_OPT | AVX2_OPT;
-   gate->miner_thread_init = (void*)&lyra2h_thread_init;
-   gate->scanhash = (void*)&scanhash_lyra2h;
-   gate->hash = (void*)&lyra2h_hash;
-   gate->get_max64 = (void*)&get_max64_0xffffLL;
-   gate->set_target = (void*)&lyra2h_set_target;
-   return true;
-};
algo/lyra2/lyra2re.c
@@ -106,6 +106,7 @@ int scanhash_lyra2re(int thr_id, struct work *work,
       {
          pdata[19] = nonce;
          *hashes_done = pdata[19] - first_nonce;
+         work_set_target_ratio( work, hash );
          return 1;
       }
    }
algo/lyra2/lyra2rev2-4way.c (new file, 177 lines)
@@ -0,0 +1,177 @@
#include "lyra2rev2-gate.h"
#include <memory.h>
#ifdef __AVX2__
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
cubehashParam cube;
skein256_4way_context skein;
sph_bmw256_context bmw;
} lyra2v2_4way_ctx_holder;
static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
void init_lyra2rev2_4way_ctx()
{
// blake256_4way_init( &l2v2_4way_ctx.blake );
keccak256_4way_init( &l2v2_4way_ctx.keccak );
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_4way_ctx.skein );
sph_bmw256_init( &l2v2_4way_ctx.bmw );
}
void lyra2rev2_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash64[4*4] __attribute__ ((aligned (64)));
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
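   // ctx.blake already holds the midstate of the first 64 header bytes,
   // computed once per scan in scanhash_lyra2rev2_4way, so only the final
   // 16 bytes are hashed per nonce.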
blake256_4way( &ctx.blake, input + (64<<2), 16 );
// blake256_4way( &ctx.blake, input, 80 );
blake256_4way_close( &ctx.blake, vhash );
mm256_reinterleave_4x64( vhash64, vhash, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
sph_bmw256( &ctx.bmw, hash0, 32 );
sph_bmw256_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
sph_bmw256( &ctx.bmw, hash1, 32 );
sph_bmw256_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
sph_bmw256( &ctx.bmw, hash2, 32 );
sph_bmw256_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
sph_bmw256( &ctx.bmw, hash3, 32 );
sph_bmw256_close( &ctx.bmw, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
lyra2rev2_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
algo/lyra2/lyra2rev2-gate.c (new file, 38 lines)
@@ -0,0 +1,38 @@
#include "lyra2rev2-gate.h"
__thread uint64_t* l2v2_wholeMatrix;
void lyra2rev2_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool lyra2rev2_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v2_wholeMatrix = _mm_malloc( i, 64 );
return l2v2_wholeMatrix;
}
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV2_4WAY)
init_lyra2rev2_4way_ctx();
gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
gate->hash = (void*)&lyra2rev2_4way_hash;
#else
init_lyra2rev2_ctx();
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->set_target = (void*)&lyra2rev2_set_target;
return true;
};
algo/lyra2/lyra2rev2-gate.h (new file, 35 lines)
@@ -0,0 +1,35 @@
#ifndef LYRA2REV2_GATE_H__
#define LYRA2REV2_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#include "lyra2.h"
#if defined(HASH_4WAY)
#define LYRA2REV2_4WAY
#endif
extern __thread uint64_t* l2v2_wholeMatrix;
bool register_lyra2rev2_algo( algo_gate_t* gate );
#if defined(LYRA2REV2_4WAY)
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_lyra2rev2_4way_ctx();
#endif
void lyra2rev2_hash( void *state, const void *input );
int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_lyra2rev2_ctx();
#endif
algo/lyra2/lyra2rev2.c
@@ -1,20 +1,12 @@
+#include "lyra2rev2-gate.h"
 #include <memory.h>
-#include "algo-gate-api.h"
 #include "algo/blake/sph_blake.h"
 #include "algo/cubehash/sph_cubehash.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
-#include "lyra2.h"
-#include "avxdefs.h"
+//#include "lyra2.h"

-// This gets allocated when miner_thread starts up and is never freed.
-// It's not a leak because the only way to allocate it again is to exit
-// the thread and that only occurs when the entire program exits.
-__thread uint64_t* l2v2_wholeMatrix;

 typedef struct {
    cubehashParam cube1;

@@ -106,6 +98,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
       if( fulltest(hash, ptarget) )
       {
          pdata[19] = nonce;
+         work_set_target_ratio( work, hash );
          *hashes_done = pdata[19] - first_nonce;
          return 1;
       }

@@ -119,30 +112,3 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
    return 0;
 }

-void lyra2rev2_set_target( struct work* work, double job_diff )
-{
-   work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool lyra2rev2_thread_init()
-{
-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-
-   int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
-   l2v2_wholeMatrix = _mm_malloc( i, 64 );
-   return l2v2_wholeMatrix;
-}
-
-bool register_lyra2rev2_algo( algo_gate_t* gate )
-{
-   init_lyra2rev2_ctx();
-   gate->optimizations = AVX_OPT | AVX2_OPT;
-   gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
-   gate->scanhash = (void*)&scanhash_lyra2rev2;
-   gate->hash = (void*)&lyra2rev2_hash;
-   gate->set_target = (void*)&lyra2rev2_set_target;
-   return true;
-};
algo/lyra2/lyra2z-4way.c
@@ -85,8 +85,8 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
       be32enc( noncep2, n+2 );
       be32enc( noncep3, n+3 );
-      be32enc( &edata[19], n );

       lyra2z_4way_hash( hash, vdata );
+      pdata[19] = n;

       if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
       {
algo/lyra2/lyra2z.c
@@ -82,41 +82,3 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
    return 0;
 }

-/*
-//int64_t get_max64_0xffffLL() { return 0xffffLL; };
-
-void lyra2z_set_target( struct work* work, double job_diff )
-{
-   work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
-{
-   work->height = sctx->bloc_height;
-   return false;
-}
-
-bool lyra2z_thread_init()
-{
-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-
-   int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
-   lyra2z_wholeMatrix = _mm_malloc( i, 64 );
-   return lyra2z_wholeMatrix;
-}
-
-bool register_lyra2z_algo( algo_gate_t* gate )
-{
-   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-   gate->miner_thread_init = (void*)&lyra2z_thread_init;
-   gate->scanhash = (void*)&scanhash_lyra2z;
-   gate->hash = (void*)&lyra2z_hash;
-   gate->get_max64 = (void*)&get_max64_0xffffLL;
-   gate->set_target = (void*)&lyra2z_set_target;
-//   gate->prevent_dupes = (void*)&zcoin_get_work_height;
-   return true;
-};
-*/
algo/m7m.c
@@ -346,6 +346,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
                hash_str,
                target_str);
          }
+         work_set_target_ratio( work, hash );
          pdata[19] = data[19];
          goto out;
       }
algo/nist5/nist5.c
@@ -132,6 +132,7 @@ int scanhash_nist5(int thr_id, struct work *work,
          if (!(hash64[7] & mask)) {
             printf("[%d]",thr_id);
             if (fulltest(hash64, ptarget)) {
+               work_set_target_ratio( work, hash64 );
                *hashes_done = n - first_nonce + 1;
                return true;
             }
algo/nist5/zr5.c
@@ -172,6 +172,7 @@ int scanhash_zr5( int thr_id, struct work *work,
             pdata[0] = tmpdata[0];
             pdata[19] = nonce;
             *hashes_done = pdata[19] - first_nonce + 1;
+            work_set_target_ratio( work, hash );
             if (opt_debug)
                applog(LOG_INFO, "found nonce %x", nonce);
             return 1;
algo/quark/quark-4way.c (new file, 207 lines)
@@ -0,0 +1,207 @@
#include "cpuminer-config.h"
#include "quark-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
jh512_4way_context jh;
skein512_4way_context skein;
keccak512_4way_context keccak;
} quark_4way_ctx_holder;
quark_4way_ctx_holder quark_4way_ctx __attribute__ ((aligned (64)));
void init_quark_4way_ctx()
{
blake512_4way_init( &quark_4way_ctx.blake );
bmw512_4way_init( &quark_4way_ctx.bmw );
init_groestl( &quark_4way_ctx.groestl, 64 );
skein512_4way_init( &quark_4way_ctx.skein );
jh512_4way_init( &quark_4way_ctx.jh );
keccak512_4way_init( &quark_4way_ctx.keccak );
}
void quark_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
int i;
quark_4way_ctx_holder ctx;
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
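   // Quark picks the next function from bit 3 of the previous hash; both
   // candidates are computed for all 4 lanes and vh_mask blends the right
   // result into each lane.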
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
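   // Data is interleaved 4x64 here, so each lane's qword 9 spans two 32-bit
   // words and the nonce pointers step by 2 (vs a step of 1 for 4x32 data).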
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
quark_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & 0xFFFFFF00 ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
algo/quark/quark-gate.c (new file, 17 lines)
@@ -0,0 +1,17 @@
#include "quark-gate.h"
bool register_quark_algo( algo_gate_t* gate )
{
#if defined (QUARK_4WAY)
init_quark_4way_ctx();
gate->scanhash = (void*)&scanhash_quark_4way;
gate->hash = (void*)&quark_4way_hash;
#else
init_quark_ctx();
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};
algo/quark/quark-gate.h (new file, 32 lines)
@@ -0,0 +1,32 @@
#ifndef QUARK_GATE_H__
#define QUARK_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define QUARK_4WAY
#endif
bool register_quark_algo( algo_gate_t* gate );
#if defined(QUARK_4WAY)
void quark_4way_hash( void *state, const void *input );
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_quark_4way_ctx();
#endif
void quark_hash( void *state, const void *input );
int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_quark_ctx();
#endif
algo/quark/quark.c
@@ -1,5 +1,5 @@
 #include "cpuminer-config.h"
-#include "algo-gate-api.h"
+#include "quark-gate.h"

 #include <stdio.h>
 #include <string.h>

@@ -47,7 +47,7 @@ void init_quark_ctx()
 #endif
 }

-inline static void quarkhash(void *state, const void *input)
+void quark_hash(void *state, const void *input)
 {
    unsigned char hashbuf[128];
    size_t hashptr;

@@ -187,11 +187,12 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
    do {
       pdata[19] = ++n;
       be32enc(&endiandata[19], n);
-      quarkhash(hash64, &endiandata);
+      quark_hash(hash64, &endiandata);
       if ((hash64[7]&0xFFFFFF00)==0)
       {
          if (fulltest(hash64, ptarget))
          {
+            work_set_target_ratio( work, hash64 );
             *hashes_done = n - first_nonce + 1;
             return true;
          }

@@ -203,12 +204,3 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
    return 0;
 }

-bool register_quark_algo( algo_gate_t* gate )
-{
-   init_quark_ctx();
-   gate->optimizations = SSE2_OPT | AES_OPT;
-   gate->scanhash = (void*)&scanhash_quark;
-   gate->hash = (void*)&quarkhash;
-   return true;
-};
algo/qubit/deep.c
@@ -122,6 +122,7 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
          if (!(hash64[7] & mask)) {
             printf("[%d]",thr_id);
             if (fulltest(hash64, ptarget)) {
+               work_set_target_ratio( work, hash64 );
                *hashes_done = n - first_nonce + 1;
                return true;
             }
algo/qubit/qubit.c
@@ -134,6 +134,7 @@ int scanhash_qubit(int thr_id, struct work *work,
          if (!(hash64[7] & mask)) {
             printf("[%d]",thr_id);
             if (fulltest(hash64, ptarget)) {
+               work_set_target_ratio( work, hash64 );
                *hashes_done = n - first_nonce + 1;
                return true;
             }
@@ -754,6 +754,7 @@ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
       if (unlikely(hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget))) {
          *hashes_done = n - pdata[19] + 1;
          pdata[19] = data[i * 20 + 19];
+         work_set_target_ratio( work, hash );
          return 1;
       }
    }
@@ -114,7 +114,7 @@ available_implementations() {
    return flags;
 }
 #endif

+/*
 static int
 scrypt_test_mix() {
    static const uint8_t expected[16] = {

@@ -145,4 +145,4 @@ scrypt_test_mix() {

    return ret;
 }
+*/
@@ -26,7 +26,7 @@
#include "scrypt-jane-pbkdf2.h" #include "scrypt-jane-pbkdf2.h"
#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */ #define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */
/*
static int static int
scrypt_test_hash() { scrypt_test_hash() {
scrypt_hash_state st; scrypt_hash_state st;
@@ -45,4 +45,4 @@ scrypt_test_hash() {
scrypt_hash_finish(&st, final); scrypt_hash_finish(&st, final);
return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE); return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
} }
*/
algo/skein/skein-hash-4way.c
@@ -342,17 +342,6 @@ do { \
 do { \
    sph_u64 t0, t1, t2; \
    __m256i h8; \
-/* can LE be assumed? \
-   dec64le does nothing when SPH_LITTLE endian is set, as it is. \
-   __m256i m0 = _mm256_dec64le( buf ); \
-   __m256i m1 = _mm256_dec64le( buf +  8*4 ); \
-   __m256i m2 = _mm256_dec64le( buf + 16*4 ); \
-   __m256i m3 = _mm256_dec64le( buf + 24*4 ); \
-   __m256i m4 = _mm256_dec64le( buf + 32*4 ); \
-   __m256i m5 = _mm256_dec64le( buf + 40*4 ); \
-   __m256i m6 = _mm256_dec64le( buf + 48*4 ); \
-   __m256i m7 = _mm256_dec64le( buf + 56*4 ); \
-*/ \
    __m256i m0 = buf[0]; \
    __m256i m1 = buf[1]; \
    __m256i m2 = buf[2]; \
algo/skein/skein-hash-4way.h
@@ -39,7 +39,9 @@
  */

 #ifndef __SKEIN_HASH_4WAY_H__
-#define __SKEIN_HASH_4WAY_H__
+#define __SKEIN_HASH_4WAY_H__ 1
+
+#ifdef __AVX2__

 #ifdef __cplusplus
 extern "C"{

@@ -53,14 +55,15 @@ extern "C"{
 #define SPH_SIZE_skein256   256
 #define SPH_SIZE_skein512   512

-#ifdef __AVX2__
 typedef struct {
    __m256i buf[8] __attribute__ ((aligned (32)));
    __m256i h0, h1, h2, h3, h4, h5, h6, h7;
    size_t ptr;
    sph_u64 bcount;
-} skein512_4way_context;
+} sph_skein_4way_big_context;
+
+typedef sph_skein_4way_big_context skein512_4way_context;
+typedef sph_skein_4way_big_context skein256_4way_context;

 void skein512_4way_init(void *cc);
 void skein512_4way(void *cc, const void *data, size_t len);

@@ -68,26 +71,15 @@ void skein512_4way_close(void *cc, void *dst);
 //void sph_skein512_addbits_and_close(
 //   void *cc, unsigned ub, unsigned n, void *dst);

-#endif
-
-#ifdef __AVX__
-
-typedef struct {
-   __m128i buf[8] __attribute__ ((aligned (32)));
-   __m128i h0, h1, h2, h3, h4, h5, h6, h7;
-   size_t ptr;
-   sph_u64 bcount;
-} skein256_4way_context;
-
 void skein256_4way_init(void *cc);
 void skein256_4way(void *cc, const void *data, size_t len);
 void skein256_4way_close(void *cc, void *dst);

 //void sph_skein256_addbits_and_close(
 //   void *cc, unsigned ub, unsigned n, void *dst);

-#endif

 #ifdef __cplusplus
 }
 #endif

 #endif
+
+#endif // __AVX2__
algo/sm3/sm3-hash-4way.c (new file, 231 lines)
@@ -0,0 +1,231 @@
/* ====================================================================
* Copyright (c) 2014 - 2017 The GmSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the GmSSL Project.
* (http://gmssl.org/)"
*
* 4. The name "GmSSL Project" must not be used to endorse or promote
* products derived from this software without prior written
* permission. For written permission, please contact
* guanzhi1980@gmail.com.
*
* 5. Products derived from this software may not be called "GmSSL"
* nor may "GmSSL" appear in their names without prior written
* permission of the GmSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the GmSSL Project
* (http://gmssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*/
#include <string.h>
#include "sm3-hash-4way.h"
#ifdef __AVX__
void sm3_4way_init( sm3_4way_ctx_t *ctx )
{
ctx->digest[0] = _mm_set1_epi32( 0x7380166F );
ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 );
ctx->digest[2] = _mm_set1_epi32( 0x172442D7 );
ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 );
ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC );
ctx->digest[5] = _mm_set1_epi32( 0x163138AA );
ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D );
ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E );
ctx->nblocks = 0;
ctx->num = 0;
}
void sm3_4way( void *cc, const void *data, size_t len )
{
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
__m128i *block = (__m128i*)ctx->block;
__m128i *vdata = (__m128i*)data;
if ( ctx->num )
{
unsigned int left = SM3_BLOCK_SIZE - ctx->num;
if ( len < left )
{
memcpy_128( block + (ctx->num >> 2), vdata , len>>2 );
ctx->num += len;
return;
}
else
{
memcpy_128( block + (ctx->num >> 2), vdata , left>>2 );
sm3_4way_compress( ctx->digest, block );
ctx->nblocks++;
vdata += left>>2;
len -= left;
}
}
while ( len >= SM3_BLOCK_SIZE )
{
sm3_4way_compress( ctx->digest, vdata );
ctx->nblocks++;
vdata += SM3_BLOCK_SIZE>>2;
len -= SM3_BLOCK_SIZE;
}
ctx->num = len;
if ( len )
memcpy_128( block, vdata, len>>2 );
}
void sm3_4way_close( void *cc, void *dst )
{
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
__m128i *hash = (__m128i*)dst;
__m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
__m128i *block = (__m128i*)ctx->block;
int i;
block[ctx->num] = _mm_set1_epi32( 0x80 );
if ( ctx->num + 8 <= SM3_BLOCK_SIZE )
{
memset_zero_128( block + (ctx->num >> 2) + 1,
( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
}
else
{
memset_zero_128( block + (ctx->num >> 2) + 1,
( SM3_BLOCK_SIZE - (ctx->num >> 2) - 1 ) );
sm3_4way_compress( ctx->digest, block );
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
}
count[0] = mm_byteswap_32(
_mm_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm_byteswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
sm3_4way_compress( ctx->digest, block );
for ( i = 0; i < 8 ; i++ )
hash[i] = mm_byteswap_32( ctx->digest[i] );
}
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 9 ), \
mm_rotl_32( x, 17 ) ) )
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 15 ), \
mm_rotl_32( x, 23 ) ) )
#define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) )
#define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \
_mm_and_si128( x, z ) ), \
_mm_and_si128( y, z ) )
#define GG0(x,y,z) FF0(x,y,z)
#define GG1(x,y,z) _mm_or_si128( _mm_and_si128( x, y ), \
_mm_andnot_si128( x, z ) )
void sm3_4way_compress( __m128i *digest, __m128i *block )
{
__m128i W[68], W1[64];
__m128i A = digest[ 0 ];
__m128i B = digest[ 1 ];
__m128i C = digest[ 2 ];
__m128i D = digest[ 3 ];
__m128i E = digest[ 4 ];
__m128i F = digest[ 5 ];
__m128i G = digest[ 6 ];
__m128i H = digest[ 7 ];
__m128i SS1, SS2, TT1, TT2, T;
int j;
for ( j = 0; j < 16; j++ )
W[j] = mm_byteswap_32( block[j] );
for ( j = 16; j < 68; j++ )
W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
W[ j-9 ] ),
mm_rotl_32( W[ j-3 ], 15 ) ) ),
_mm_xor_si128( mm_rotl_32( W[ j-13 ], 7 ),
W[ j-6 ] ) );
for( j = 0; j < 64; j++ )
W1[j] = _mm_xor_si128( W[j], W[j+4] );
T = _mm_set1_epi32( 0x79CC4519UL );
for( j =0; j < 16; j++ )
{
SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ),
mm_rotl_32( T, j ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm_rotl_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm_rotl_32( F, 19 );
F = E;
E = P0( TT2 );
}
T = _mm_set1_epi32( 0x7A879D8AUL );
for( j =16; j < 64; j++ )
{
SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ),
mm_rotl_32( T, j&31 ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm_rotl_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm_rotl_32( F, 19 );
F = E;
E = P0( TT2 );
}
digest[0] = _mm_xor_si128( digest[0], A );
digest[1] = _mm_xor_si128( digest[1], B );
digest[2] = _mm_xor_si128( digest[2], C );
digest[3] = _mm_xor_si128( digest[3], D );
digest[4] = _mm_xor_si128( digest[4], E );
digest[5] = _mm_xor_si128( digest[5], F );
digest[6] = _mm_xor_si128( digest[6], G );
digest[7] = _mm_xor_si128( digest[7], H );
}
#endif
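For reference, this is roughly how the 4-way SM3 API above is driven; a minimal usage sketch (hypothetical, not part of the commit), assuming the mm_interleave_4x32/mm_deinterleave_4x32 helpers from avxdefs.h used elsewhere in this commit:

#include <stdint.h>
#include "sm3-hash-4way.h"

// Hash four independent 64-byte messages in parallel, one per 32-bit lane.
void sm3_4way_example( const uint32_t msg[4][16], uint32_t digest[4][8] )
{
   uint32_t vmsg [16*4] __attribute__ ((aligned (64)));  // interleaved input
   uint32_t vhash[ 8*4] __attribute__ ((aligned (64)));  // interleaved output
   sm3_4way_ctx_t ctx;

   // lane-interleave the four messages: vmsg[4*i + lane] = msg[lane][i]
   mm_interleave_4x32( vmsg, msg[0], msg[1], msg[2], msg[3], 512 );
   sm3_4way_init( &ctx );
   sm3_4way( &ctx, vmsg, 64 );         // 64 bytes per lane
   sm3_4way_close( &ctx, vhash );
   // split the interleaved 256-bit digests back into per-lane buffers
   mm_deinterleave_4x32( digest[0], digest[1], digest[2], digest[3],
                         vhash, 256 );
}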
algo/sm3/sm3-hash-4way.h (new file, 89 lines)
@@ -0,0 +1,89 @@
/* ====================================================================
* Copyright (c) 2014 - 2016 The GmSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. All advertising materials mentioning features or use of this
* software must display the following acknowledgment:
* "This product includes software developed by the GmSSL Project.
* (http://gmssl.org/)"
*
* 4. The name "GmSSL Project" must not be used to endorse or promote
* products derived from this software without prior written
* permission. For written permission, please contact
* guanzhi1980@gmail.com.
*
* 5. Products derived from this software may not be called "GmSSL"
* nor may "GmSSL" appear in their names without prior written
* permission of the GmSSL Project.
*
* 6. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by the GmSSL Project
* (http://gmssl.org/)"
*
* THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ====================================================================
*/
#ifndef SPH_SM3_HASH_4WAY_H
#define SPH_SM3_HASH_4WAY_H
#define SM3_DIGEST_LENGTH 32
#define SM3_BLOCK_SIZE 64
#define SM3_CBLOCK (SM3_BLOCK_SIZE)
#define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH)
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "avxdefs.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
__m128i block[16] __attribute__ ((aligned (64)));
__m128i digest[8];
uint32_t nblocks;
uint32_t num;
} sm3_4way_ctx_t;
void sm3_4way_init( sm3_4way_ctx_t *ctx );
//void sm3_4way_update( sm3_4way_ctx_t *ctx, const unsigned char* data,
// size_t data_len );
//void sm3_4way_final( sm3_4way_ctx_t *ctx,
// unsigned char digest[SM3_DIGEST_LENGTH] );
void sm3_4way_compress( __m128i *digest, __m128i *block );
void sm3_4way(void *cc, const void *data, size_t len);
void sm3_4way_close(void *cc, void *dst);
#ifdef __cplusplus
}
#endif
#endif
algo/sm3/sm3.c
@@ -189,7 +189,7 @@ void sm3_compress(uint32_t digest[8], const unsigned char block[64])
    for(j =16; j < 64; j++) {
       T[j] = 0x7A879D8A;
-      SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7);
+      SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j&31)), 7);
       SS2 = SS1 ^ ROTATELEFT(A,12);
       TT1 = FF1(A,B,C) + D + SS2 + W1[j];
       TT2 = GG1(E,F,G) + H + SS1 + W[j];
algo/whirlpool/sph_whirlpool.c
@@ -3468,9 +3468,10 @@ sph_ ## name ## _close(void *cc, void *dst) \
for (i = 0; i < 8; i ++) \ for (i = 0; i < 8; i ++) \
sph_enc64le((unsigned char *)dst + 8 * i, sc->state[i]); \ sph_enc64le((unsigned char *)dst + 8 * i, sc->state[i]); \
} }
// sph_ ## name ## _init(cc); \ /*
//} sph_ ## name ## _init(cc); \
}
*/
MAKE_CLOSE(whirlpool) MAKE_CLOSE(whirlpool)
MAKE_CLOSE(whirlpool0) MAKE_CLOSE(whirlpool0)
MAKE_CLOSE(whirlpool1) MAKE_CLOSE(whirlpool1)
algo/whirlpool/whirlpool-gate.h
@@ -22,6 +22,7 @@ void whirlpool_hash( void *state, const void *input );
 int scanhash_whirlpool( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done );
+void init_whirlpool_ctx();

 #endif
 #endif
algo/whirlpool/whirlpool-hash-4way.c
@@ -3345,8 +3345,10 @@ do { \
 #define READ_STATE     MUL8(READ_STATE_W)
 #define ROUND0         MUL8(ROUND0_W)
 #define UPDATE_STATE   MUL8(UPDATE_STATE_W)

-//#define BYTE(x, n) \
-//   _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
+/*
+#define BYTE(x, n) \
+   _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
+*/

 #define BYTE(x, n)   ((unsigned)((x) >> (8 * (n))) & 0xFF)
algo/x11/c11.c
@@ -162,6 +162,7 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
       {
          pdata[19] = nonce;
          *hashes_done = pdata[19] - first_nonce;
+         work_set_target_ratio( work, hash );
          return 1;
       }
       nonce++;
algo/x11/timetravel-4way.c (new file, 274 lines)
@@ -0,0 +1,274 @@
#include "timetravel-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
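// The permutation of the 8 hash functions is derived from the block
// timestamp (see scanhash below) and cached per thread until ntime changes.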
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
} tt8_4way_ctx_holder;
tt8_4way_ctx_holder tt8_4way_ctx __attribute__ ((aligned (64)));
void init_tt8_4way_ctx()
{
blake512_4way_init( &tt8_4way_ctx.blake );
bmw512_4way_init( &tt8_4way_ctx.bmw );
init_groestl( &tt8_4way_ctx.groestl, 64 );
skein512_4way_init( &tt8_4way_ctx.skein );
jh512_4way_init( &tt8_4way_ctx.jh );
keccak512_4way_init( &tt8_4way_ctx.keccak );
init_luffa( &tt8_4way_ctx.luffa, 512 );
cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 );
};
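// The hash ping-pongs between two interleaved buffers, vhashX and vhashY.
// Functions without 4-way implementations (groestl, luffa, cubehash)
// deinterleave to per-lane buffers, hash each lane serially, then
// reinterleave, except on the final round where the per-lane output is kept.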
void timetravel_4way_hash(void *output, const void *input)
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
uint64_t *vhashA, *vhashB;
tt8_4way_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64;
int i;
memcpy( &ctx, &tt8_4way_ctx, sizeof(tt8_4way_ctx) );
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
{
if (i == 0)
{
dataLen = 80;
vhashA = (uint64_t*)input;
vhashB = vhashX;
}
else
{
dataLen = 64;
if ( i % 2 == 0 )
{
vhashA = vhashY;
vhashB = vhashX;
}
else
{
vhashA = vhashX;
vhashB = vhashY;
}
}
switch ( permutation[i] )
{
case 0:
blake512_4way( &ctx.blake, vhashA, dataLen );
blake512_4way_close( &ctx.blake, vhashB );
if ( i == 7 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhashA, dataLen );
bmw512_4way_close( &ctx.bmw, vhashB );
if ( i == 7 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 2:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, dataLen<<3 );
if ( i != 7 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
if ( i == 7 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhashA, dataLen );
jh512_4way_close( &ctx.jh, vhashB );
if ( i == 7 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhashA, dataLen );
keccak512_4way_close( &ctx.keccak, vhashB );
if ( i == 7 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence *)hash0, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
if ( i != 7 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 7:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*)hash0, dataLen );
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*)hash1, dataLen );
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*)hash2, dataLen );
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*)hash3, dataLen );
if ( i != 7 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
default:
applog(LOG_ERR,"SWERR: timetravel invalid permutation");
break;
}
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
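// The nonce is data word 19, i.e. the high half of 64-bit word 9. After
// 4x64 interleaving, row 9 holds word 9 of all four lanes, so lane N's
// nonce sits at 32-bit index 9*8 + 2*N + 1.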
const uint32_t Htarg = ptarget[7];
volatile uint8_t *restart = &(work_restart[thr_id].restart);
int i;
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime )
{
const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP )
% TT8_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < TT8_FUNC_COUNT; i++ )
permutation[i] = i;
for ( i = 0; i < steps; i++ )
tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT );
s_ntime = timestamp;
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
timetravel_4way_hash( hash, vdata );
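// hash now holds four candidate results, one per lane, 8 words apart.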
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x11/timetravel-gate.c Normal file

@@ -0,0 +1,78 @@
#include "timetravel-gate.h"
void tt8_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel_algo( algo_gate_t* gate )
{
#ifdef TIMETRAVEL_4WAY
init_tt8_4way_ctx();
gate->scanhash = (void*)&scanhash_timetravel_4way;
gate->hash = (void*)&timetravel_4way_hash;
#else
init_tt8_ctx();
gate->scanhash = (void*)&scanhash_timetravel;
gate->hash = (void*)&timetravel_hash;
#endif
gate->set_target = (void*)&tt8_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};
inline void tt_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt_swap( pbegin, pend );
pbegin++;
}
}
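// Lexicographic next-permutation (the same algorithm as C++
// std::next_permutation), used to step through the function orderings.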
void tt8_next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}

algo/x11/timetravel-gate.h Normal file

@@ -0,0 +1,40 @@
#ifndef TIMETRAVEL_GATE_H__
#define TIMETRAVEL_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define TIMETRAVEL_4WAY
#endif
// Machinecoin Genesis Timestamp
#define TT8_FUNC_BASE_TIMESTAMP 1389040865
#define TT8_FUNC_COUNT 8
#define TT8_FUNC_COUNT_PERMUTATIONS 40320
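// 40320 = 8!, the number of possible orderings of the 8 hash functions.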
void tt8_next_permutation( int *pbegin, int *pend );
bool register_timetravel_algo( algo_gate_t* gate );
#if defined(TIMETRAVEL_4WAY)
void timetravel_4way_hash( void *state, const void *input );
int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_tt8_4way_ctx();
#endif
void timetravel_hash( void *state, const void *input );
int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_tt8_ctx();
#endif

algo/x11/timetravel.c

@@ -1,11 +1,9 @@
#include "algo-gate-api.h" #include "timetravel-gate.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>
#include "avxdefs.h"
#include "algo/blake/sph_blake.h" #include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h" #include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h" #include "algo/jh/sph_jh.h"
@@ -13,75 +11,14 @@
#include "algo/skein/sph_skein.h" #include "algo/skein/sph_skein.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#ifdef NO_AES_NI #ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h" #include "algo/groestl/sph_groestl.h"
#else #else
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#endif #endif
// Machinecoin Genesis Timestamp
#define HASH_FUNC_BASE_TIMESTAMP 1389040865
#define HASH_FUNC_COUNT 8
#define HASH_FUNC_COUNT_PERMUTATIONS 40320
static __thread uint32_t s_ntime = UINT32_MAX; static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[HASH_FUNC_COUNT] = { 0 }; static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
inline void tt_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt_swap( pbegin, pend );
pbegin++;
}
}
static void next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}
typedef struct { typedef struct {
sph_blake512_context blake; sph_blake512_context blake;
@@ -101,7 +38,7 @@ typedef struct {
tt_ctx_holder tt_ctx __attribute__ ((aligned (64))); tt_ctx_holder tt_ctx __attribute__ ((aligned (64)));
__thread tt_ctx_holder tt_mid __attribute__ ((aligned (64))); __thread tt_ctx_holder tt_mid __attribute__ ((aligned (64)));
void init_tt_ctx() void init_tt8_ctx()
{ {
sph_blake512_init( &tt_ctx.blake ); sph_blake512_init( &tt_ctx.blake );
sph_bmw512_init( &tt_ctx.bmw ); sph_bmw512_init( &tt_ctx.bmw );
@@ -119,7 +56,7 @@ void init_tt_ctx()
void timetravel_hash(void *output, const void *input) void timetravel_hash(void *output, const void *input)
{ {
uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64))); uint32_t hash[ 16 * TT8_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t *hashA, *hashB; uint32_t *hashA, *hashB;
tt_ctx_holder ctx __attribute__ ((aligned (64))); tt_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64; uint32_t dataLen = 64;
@@ -130,7 +67,7 @@ void timetravel_hash(void *output, const void *input)
memcpy( &ctx, &tt_ctx, sizeof(tt_ctx) ); memcpy( &ctx, &tt_ctx, sizeof(tt_ctx) );
for ( i = 0; i < HASH_FUNC_COUNT; i++ ) for ( i = 0; i < TT8_FUNC_COUNT; i++ )
{ {
if (i == 0) if (i == 0)
{ {
@@ -270,7 +207,7 @@ void timetravel_hash(void *output, const void *input)
} }
} }
memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32); memcpy(output, &hash[16 * (TT8_FUNC_COUNT - 1)], 32);
} }
int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce, int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
@@ -296,12 +233,12 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t timestamp = endiandata[17]; const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime ) if ( timestamp != s_ntime )
{ {
const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP ) const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP )
% HASH_FUNC_COUNT_PERMUTATIONS; % TT8_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < HASH_FUNC_COUNT; i++ ) for ( i = 0; i < TT8_FUNC_COUNT; i++ )
permutation[i] = i; permutation[i] = i;
for ( i = 0; i < steps; i++ ) for ( i = 0; i < steps; i++ )
next_permutation( permutation, permutation + HASH_FUNC_COUNT ); tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT );
s_ntime = timestamp; s_ntime = timestamp;
// do midstate precalc for first function // do midstate precalc for first function
@@ -359,6 +296,7 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
work_set_target_ratio( work, hash ); work_set_target_ratio( work, hash );
pdata[19] = nonce; pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce; *hashes_done = pdata[19] - first_nonce;
work_set_target_ratio( work, hash );
return 1; return 1;
} }
nonce++; nonce++;
@@ -370,19 +308,4 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
return 0; return 0;
} }
void timetravel_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_tt_ctx();
gate->scanhash = (void*)&scanhash_timetravel;
gate->hash = (void*)&timetravel_hash;
gate->set_target = (void*)&timetravel_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

algo/x11/timetravel10-4way.c Normal file

@@ -0,0 +1,316 @@
#include "timetravel10-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
} tt10_4way_ctx_holder;
tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64)));
void init_tt10_4way_ctx()
{
blake512_4way_init( &tt10_4way_ctx.blake );
bmw512_4way_init( &tt10_4way_ctx.bmw );
init_groestl( &tt10_4way_ctx.groestl, 64 );
skein512_4way_init( &tt10_4way_ctx.skein );
jh512_4way_init( &tt10_4way_ctx.jh );
keccak512_4way_init( &tt10_4way_ctx.keccak );
init_luffa( &tt10_4way_ctx.luffa, 512 );
cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &tt10_4way_ctx.shavite );
init_sd( &tt10_4way_ctx.simd, 512 );
};
void timetravel10_4way_hash(void *output, const void *input)
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
uint64_t *vhashA, *vhashB;
tt10_4way_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64;
int i;
memcpy( &ctx, &tt10_4way_ctx, sizeof(tt10_4way_ctx) );
for ( i = 0; i < TT10_FUNC_COUNT; i++ )
{
if (i == 0)
{
dataLen = 80;
vhashA = (uint64_t*)input;
vhashB = vhashX;
}
else
{
dataLen = 64;
if ( i % 2 == 0 )
{
vhashA = vhashY;
vhashB = vhashX;
}
else
{
vhashA = vhashX;
vhashB = vhashY;
}
}
switch ( permutation[i] )
{
case 0:
blake512_4way( &ctx.blake, vhashA, dataLen );
blake512_4way_close( &ctx.blake, vhashB );
if ( i == 9 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhashA, dataLen );
bmw512_4way_close( &ctx.bmw, vhashB );
if ( i == 9 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 2:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, dataLen<<3 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, dataLen<<3 );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
if ( i == 9 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhashA, dataLen );
jh512_4way_close( &ctx.jh, vhashB );
if ( i == 9 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhashA, dataLen );
keccak512_4way_close( &ctx.keccak, vhashB );
if ( i == 9 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashB, dataLen<<3 );
break;
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence *)hash0, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 7:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*)hash0, dataLen );
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*)hash1, dataLen );
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*)hash2, dataLen );
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*)hash3, dataLen );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 8:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
if ( i != 9 )
mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 );
break;
default:
applog(LOG_ERR,"SWERR: timetravel10 invalid permutation");
break;
}
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
int scanhash_timetravel10_4way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
volatile uint8_t *restart = &(work_restart[thr_id].restart);
int i;
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime )
{
const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP )
% TT10_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < TT10_FUNC_COUNT; i++ )
permutation[i] = i;
for ( i = 0; i < steps; i++ )
tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT );
s_ntime = timestamp;
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
timetravel10_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x11/timetravel10-gate.c Normal file

@@ -0,0 +1,78 @@
#include "timetravel10-gate.h"
void tt10_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel10_algo( algo_gate_t* gate )
{
#ifdef TIMETRAVEL10_4WAY
init_tt10_4way_ctx();
gate->scanhash = (void*)&scanhash_timetravel10_4way;
gate->hash = (void*)&timetravel10_4way_hash;
#else
init_tt10_ctx();
gate->scanhash = (void*)&scanhash_timetravel10;
gate->hash = (void*)&timetravel10_hash;
#endif
gate->set_target = (void*)&tt10_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};
inline void tt10_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt10_swap( pbegin, pend );
pbegin++;
}
}
void tt10_next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt10_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}

algo/x11/timetravel10-gate.h Normal file

@@ -0,0 +1,39 @@
#ifndef TIMETRAVEL10_GATE_H__
#define TIMETRAVEL10_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define TIMETRAVEL10_4WAY
#endif
// BitCore Genesis Timestamp
#define TT10_FUNC_BASE_TIMESTAMP 1492973331U
#define TT10_FUNC_COUNT 10
#define TT10_FUNC_COUNT_PERMUTATIONS 40320
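// Note: kept at 40320 (8!) to match the original implementation, even
// though TT10_FUNC_COUNT is 10.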
void tt10_next_permutation( int *pbegin, int *pend );
bool register_timetravel10_algo( algo_gate_t* gate );
#if defined(TIMETRAVEL10_4WAY)
void timetravel10_4way_hash( void *state, const void *input );
int scanhash_timetravel10_4way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done );
void init_tt10_4way_ctx();
#endif
void timetravel10_hash( void *state, const void *input );
int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_tt10_ctx();
#endif

algo/x11/timetravel10.c

@@ -1,11 +1,8 @@
#include "algo-gate-api.h" #include "timetravel10-gate.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>
#include "avxdefs.h"
#include "algo/blake/sph_blake.h" #include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h" #include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h" #include "algo/jh/sph_jh.h"
@@ -22,68 +19,8 @@
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#endif #endif
// BitCore Genesis Timestamp
#define HASH_FUNC_BASE_TIMESTAMP 1492973331U
#define HASH_FUNC_COUNT 10
#define HASH_FUNC_COUNT_PERMUTATIONS 40320
static __thread uint32_t s_ntime = UINT32_MAX; static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[HASH_FUNC_COUNT] = { 0 }; static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
inline void tt10_swap( int *a, int *b )
{
int c = *a;
*a = *b;
*b = c;
}
inline void reverse( int *pbegin, int *pend )
{
while ( (pbegin != pend) && (pbegin != --pend) )
{
tt10_swap( pbegin, pend );
pbegin++;
}
}
static void next_permutation( int *pbegin, int *pend )
{
if ( pbegin == pend )
return;
int *i = pbegin;
++i;
if ( i == pend )
return;
i = pend;
--i;
while (1)
{
int *j = i;
--i;
if ( *i < *j )
{
int *k = pend;
while ( !(*i < *--k) ) /* do nothing */ ;
tt10_swap( i, k );
reverse(j, pend);
return; // true
}
if ( i == pbegin )
{
reverse(pbegin, pend);
return; // false
}
// else?
}
}
typedef struct { typedef struct {
sph_blake512_context blake; sph_blake512_context blake;
@@ -125,7 +62,7 @@ void init_tt10_ctx()
void timetravel10_hash(void *output, const void *input) void timetravel10_hash(void *output, const void *input)
{ {
uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64))); uint32_t hash[ 16 * TT10_FUNC_COUNT ] __attribute__ ((aligned (64)));
uint32_t *hashA, *hashB; uint32_t *hashA, *hashB;
tt10_ctx_holder ctx __attribute__ ((aligned (64))); tt10_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t dataLen = 64; uint32_t dataLen = 64;
@@ -136,7 +73,7 @@ void timetravel10_hash(void *output, const void *input)
memcpy( &ctx, &tt10_ctx, sizeof(tt10_ctx) ); memcpy( &ctx, &tt10_ctx, sizeof(tt10_ctx) );
for ( i = 0; i < HASH_FUNC_COUNT; i++ ) for ( i = 0; i < TT10_FUNC_COUNT; i++ )
{ {
if (i == 0) if (i == 0)
{ {
@@ -302,7 +239,7 @@ void timetravel10_hash(void *output, const void *input)
} }
} }
memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32); memcpy(output, &hash[16 * (TT10_FUNC_COUNT - 1)], 32);
} }
int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce, int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
@@ -328,12 +265,12 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t timestamp = endiandata[17]; const uint32_t timestamp = endiandata[17];
if ( timestamp != s_ntime ) if ( timestamp != s_ntime )
{ {
const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP ) const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP )
% HASH_FUNC_COUNT_PERMUTATIONS; % TT10_FUNC_COUNT_PERMUTATIONS;
for ( i = 0; i < HASH_FUNC_COUNT; i++ ) for ( i = 0; i < TT10_FUNC_COUNT; i++ )
permutation[i] = i; permutation[i] = i;
for ( i = 0; i < steps; i++ ) for ( i = 0; i < steps; i++ )
next_permutation( permutation, permutation + HASH_FUNC_COUNT ); tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT );
s_ntime = timestamp; s_ntime = timestamp;
// do midstate precalc for first function // do midstate precalc for first function
@@ -398,6 +335,7 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
{ {
work_set_target_ratio( work, hash ); work_set_target_ratio( work, hash );
pdata[19] = nonce; pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce; *hashes_done = pdata[19] - first_nonce;
return 1; return 1;
} }
@@ -409,20 +347,3 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1; *hashes_done = pdata[19] - first_nonce + 1;
return 0; return 0;
} }
void timetravel10_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_timetravel10_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_tt10_ctx();
gate->scanhash = (void*)&scanhash_timetravel10;
gate->hash = (void*)&timetravel10_hash;
gate->set_target = (void*)&timetravel10_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

algo/x11/x11.c

@@ -179,6 +179,7 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
       if ( fulltest( hash64, ptarget ) )
       {
          *hashes_done = n - first_nonce + 1;
+         work_set_target_ratio( work, hash64 );
          return true;
       }
    }
@@ -189,14 +190,3 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
    pdata[19] = n;
    return 0;
 }
-/*
-bool register_x11_algo( algo_gate_t* gate )
-{
-   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-   init_x11_ctx();
-   gate->scanhash = (void*)&scanhash_x11;
-   gate->hash = (void*)&x11_hash;
-   gate->get_max64 = (void*)&get_max64_0x3ffff;
-   return true;
-};
-*/

algo/x11/x11evo-4way.c Normal file

@@ -0,0 +1,340 @@
#include "cpuminer-config.h"
#include "x11evo-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <string.h>
#include <stdint.h>
#include <compat/portable_endian.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} x11evo_4way_ctx_holder;
static x11evo_4way_ctx_holder x11evo_4way_ctx __attribute__ ((aligned (64)));
void init_x11evo_4way_ctx()
{
blake512_4way_init( &x11evo_4way_ctx.blake );
bmw512_4way_init( &x11evo_4way_ctx.bmw );
init_groestl( &x11evo_4way_ctx.groestl, 64 );
skein512_4way_init( &x11evo_4way_ctx.skein );
jh512_4way_init( &x11evo_4way_ctx.jh );
keccak512_4way_init( &x11evo_4way_ctx.keccak );
init_luffa( &x11evo_4way_ctx.luffa, 512 );
cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11evo_4way_ctx.shavite );
init_sd( &x11evo_4way_ctx.simd, 512 );
init_echo( &x11evo_4way_ctx.echo, 512 );
}
static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
static __thread uint32_t s_ntime = UINT32_MAX;
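// hashOrder holds the current function sequence as a string of hex digits
// ('0'-'9', 'A' = 10), recomputed whenever the day sequence advances.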
void x11evo_4way_hash( void *state, const void *input )
{
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
x11evo_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &x11evo_4way_ctx, sizeof(x11evo_4way_ctx) );
if ( s_seq == -1 )
{
uint32_t *data = (uint32_t*) input;
const uint32_t ntime = data[17];
evo_twisted_code( ntime, hashOrder );
}
int i;
int len = strlen( hashOrder );
for ( i = 0; i < len; i++ )
{
char elem = hashOrder[i];
uint8_t idx;
if ( elem >= 'A' )
idx = elem - 'A' + 10;
else
idx = elem - '0';
// int size = 64;
switch ( idx )
{
case 0:
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 2:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
if ( i >= len-1 )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
break;
case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 7:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*) hash1, 64 );
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*) hash2, 64 );
memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*) hash3, 64 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 8:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
case 10:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
if ( i < len-1 )
mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 );
break;
}
}
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
//static const uint32_t diff1targ = 0x0000ffff;
int scanhash_x11evo_4way( int thr_id, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
swab32_array( endiandata, pdata, 20 );
int ntime = endiandata[17];
if ( ntime != s_ntime || s_seq == -1 )
{
evo_twisted_code( ntime, hashOrder );
s_ntime = ntime;
}
uint32_t hmask = 0xFFFFFFFF;
if ( Htarg > 0 )
{
if ( Htarg <= 0xF )
hmask = 0xFFFFFFF0;
else if ( Htarg <= 0xFF )
hmask = 0xFFFFFF00;
else if ( Htarg <= 0xFFF )
hmask = 0xFFFFF000;
else if ( Htarg <= 0xFFFF )
hmask = 0xFFFF0000;
}
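// hmask is a cheap pre-filter: only hashes whose masked high word is zero
// are run through the full target comparison below.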
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x11evo_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & hmask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & hmask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & hmask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & hmask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x11/x11evo-gate.c Normal file

@@ -0,0 +1,95 @@
#include "x11evo-gate.h"
int s_seq = -1;
static inline int getCurrentAlgoSeq( uint32_t current_time )
{
// change once per day
return (int) (current_time - X11EVO_INITIAL_DATE) / (60 * 60 * 24);
}
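// Example: an ntime 10 days after X11EVO_INITIAL_DATE gives seq 10, so the
// function order is advanced 10 permutations from the identity order.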
// swap_vars doesn't work here
void evo_swap( uint8_t *a, uint8_t *b )
{
uint8_t __tmp = *a;
*a = *b;
*b = __tmp;
}
void initPerm( uint8_t n[], uint8_t count )
{
int i;
for ( i = 0; i<count; i++ )
n[i] = i;
}
int nextPerm( uint8_t n[], uint32_t count )
{
uint32_t tail = 0, i = 0, j = 0;
if (unlikely( count <= 1 ))
return 0;
for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
tail = i;
if ( tail > 0 )
for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
evo_swap( &n[tail - 1], &n[j] );
for ( i = tail, j = count - 1; i<j; i++, j-- )
evo_swap( &n[i], &n[j] );
return ( tail != 0 );
}
void getAlgoString( char *str, uint32_t count )
{
uint8_t algoList[X11EVO_FUNC_COUNT];
char *sptr;
int j;
int k;
initPerm( algoList, X11EVO_FUNC_COUNT );
for ( k = 0; k < count; k++ )
nextPerm( algoList, X11EVO_FUNC_COUNT );
sptr = str;
for ( j = 0; j < X11EVO_FUNC_COUNT; j++ )
{
if ( algoList[j] >= 10 )
sprintf( sptr, "%c", 'A' + (algoList[j] - 10) );
else
sprintf( sptr, "%u", algoList[j] );
sptr++;
}
*sptr = 0;
//applog(LOG_DEBUG, "nextPerm %s", str);
}
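// Example: count 0 leaves the identity order and produces "0123456789A";
// each increment advances one lexicographic permutation.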
void evo_twisted_code( uint32_t ntime, char *permstr )
{
int seq = getCurrentAlgoSeq( ntime );
if ( s_seq != seq )
{
getAlgoString( permstr, seq );
s_seq = seq;
}
}
bool register_x11evo_algo( algo_gate_t* gate )
{
#if defined (X11EVO_4WAY)
init_x11evo_4way_ctx();
gate->scanhash = (void*)&scanhash_x11evo_4way;
gate->hash = (void*)&x11evo_4way_hash;
#else
init_x11evo_ctx();
gate->scanhash = (void*)&scanhash_x11evo;
gate->hash = (void*)&x11evo_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};

algo/x11/x11evo-gate.h Normal file

@@ -0,0 +1,39 @@
#ifndef X11EVO_GATE_H__
#define X11EVO_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X11EVO_4WAY
#endif
#define X11EVO_INITIAL_DATE 1462060800
#define X11EVO_FUNC_COUNT 11
extern int s_seq;
bool register_x11evo_algo( algo_gate_t* gate );
#if defined(X11EVO_4WAY)
void x11evo_4way_hash( void *state, const void *input );
int scanhash_x11evo_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11evo_4way_ctx();
#endif
void x11evo_hash( void *state, const void *input );
int scanhash_x11evo( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11evo_ctx();
void evo_twisted_code( uint32_t ntime, char *permstr );
#endif

algo/x11/x11evo.c

@@ -1,5 +1,5 @@
#include "cpuminer-config.h" #include "cpuminer-config.h"
#include "algo-gate-api.h" #include "x11evo-gate.h"
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
@@ -26,9 +26,6 @@
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/sse2/nist.h"
#define INITIAL_DATE 1462060800
#define HASH_FUNC_COUNT 11
typedef struct { typedef struct {
#ifdef NO_AES_NI #ifdef NO_AES_NI
sph_groestl512_context groestl; sph_groestl512_context groestl;
@@ -70,94 +67,10 @@ void init_x11evo_ctx()
sph_shavite512_init( &x11evo_ctx.shavite ); sph_shavite512_init( &x11evo_ctx.shavite );
} }
/* static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
uint32_t getCurrentAlgoSeq(uint32_t current_time, uint32_t base_time)
{
return (current_time - base_time) / (60 * 60 * 24);
}
*/
static inline int getCurrentAlgoSeq( uint32_t current_time )
{
// change once per day
return (int) (current_time - INITIAL_DATE) / (60 * 60 * 24);
}
// swap_vars doesn't work here
void evo_swap( uint8_t *a, uint8_t *b )
{
uint8_t __tmp = *a;
*a = *b;
*b = __tmp;
}
void initPerm( uint8_t n[], uint8_t count )
{
int i;
for ( i = 0; i<count; i++ )
n[i] = i;
}
int nextPerm( uint8_t n[], uint32_t count )
{
uint32_t tail = 0, i = 0, j = 0;
if (unlikely( count <= 1 ))
return 0;
for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
tail = i;
if ( tail > 0 )
for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
evo_swap( &n[tail - 1], &n[j] );
for ( i = tail, j = count - 1; i<j; i++, j-- )
evo_swap( &n[i], &n[j] );
return ( tail != 0 );
}
void getAlgoString( char *str, uint32_t count )
{
uint8_t algoList[HASH_FUNC_COUNT];
char *sptr;
int j;
int k;
initPerm( algoList, HASH_FUNC_COUNT );
for ( k = 0; k < count; k++ )
nextPerm( algoList, HASH_FUNC_COUNT );
sptr = str;
for ( j = 0; j < HASH_FUNC_COUNT; j++ )
{
if ( algoList[j] >= 10 )
sprintf( sptr, "%c", 'A' + (algoList[j] - 10) );
else
sprintf( sptr, "%u", algoList[j] );
sptr++;
}
*sptr = 0;
//applog(LOG_DEBUG, "nextPerm %s", str);
}
static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
static __thread uint32_t s_ntime = UINT32_MAX; static __thread uint32_t s_ntime = UINT32_MAX;
static int s_seq = -1;
static void evo_twisted_code(uint32_t ntime, char *permstr) void x11evo_hash( void *state, const void *input )
{
int seq = getCurrentAlgoSeq(ntime);
if (s_seq != seq)
{
getAlgoString(permstr, seq);
s_seq = seq;
}
}
static inline void x11evo_hash( void *state, const void *input )
{ {
uint32_t hash[16] __attribute__ ((aligned (64))); uint32_t hash[16] __attribute__ ((aligned (64)));
x11evo_ctx_holder ctx __attribute__ ((aligned (64))); x11evo_ctx_holder ctx __attribute__ ((aligned (64)));
@@ -242,10 +155,10 @@ static inline void x11evo_hash( void *state, const void *input )
memcpy( state, hash, 32 ); memcpy( state, hash, 32 );
} }
static const uint32_t diff1targ = 0x0000ffff; //static const uint32_t diff1targ = 0x0000ffff;
int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce, int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
unsigned long *hashes_done ) uint64_t *hashes_done )
{ {
uint32_t endiandata[20] __attribute__((aligned(64))); uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64))); uint32_t hash64[8] __attribute__((aligned(64)));
@@ -281,12 +194,13 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
{ {
pdata[19] = ++n; pdata[19] = ++n;
be32enc( &endiandata[19], n ); be32enc( &endiandata[19], n );
x11evo_hash( hash64, &endiandata ); x11evo_hash( hash64, endiandata );
if ( ( hash64[7] & hmask ) == 0 ) if ( ( hash64[7] & hmask ) == 0 )
{ {
if ( fulltest( hash64, ptarget ) ) if ( fulltest( hash64, ptarget ) )
{ {
*hashes_done = n - first_nonce + 1; *hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true; return true;
} }
} }
@@ -296,13 +210,3 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
pdata[19] = n; pdata[19] = n;
return 0; return 0;
} }
bool register_x11evo_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_x11evo;
gate->hash = (void*)&x11evo_hash;
init_x11evo_ctx();
return true;
};

algo/x11/x11gost.c

@@ -161,6 +161,7 @@ int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
 if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
    pdata[19] = nonce;
    *hashes_done = pdata[19] - first_nonce;
+   work_set_target_ratio( work, hash );
    return 1;
 }
 nonce++;

phi1612.c

@@ -116,6 +116,7 @@ int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
 if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
    pdata[19] = nonce;
+   work_set_target_ratio( work, hash );
    *hashes_done = pdata[19] - first_nonce;
    return 1;
 }

skunk.c

@@ -70,6 +70,7 @@ int scanhash_skunk( int thr_id, struct work *work, uint32_t max_nonce,
 {
    pdata[19] = nonce;
    *hashes_done = pdata[19] - first_nonce;
+   work_set_target_ratio( work, hash );
    return 1;
 }
 nonce++;

algo/x13/x13.c

@@ -234,6 +234,7 @@ int scanhash_x13(int thr_id, struct work *work, uint32_t max_nonce,
 if (!(hash64[7] & mask)) {
    printf("[%d]",thr_id);
    if (fulltest(hash64, ptarget)) {
+      work_set_target_ratio( work, hash );
       *hashes_done = n - first_nonce + 1;
       return true;
    }

x13sm3-4way.c

@@ -17,7 +17,7 @@
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/sm3/sph_sm3.h" #include "algo/sm3/sm3-hash-4way.h"
#include "algo/hamsi/sph_hamsi.h" #include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
@@ -33,7 +33,7 @@ typedef struct {
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; hashState_sd simd;
hashState_echo echo; hashState_echo echo;
sm3_ctx_t sm3; sm3_4way_ctx_t sm3;
sph_hamsi512_context hamsi; sph_hamsi512_context hamsi;
sph_fugue512_context fugue; sph_fugue512_context fugue;
} x13sm3_4way_ctx_holder; } x13sm3_4way_ctx_holder;
@@ -54,7 +54,7 @@ void init_x13sm3_4way_ctx()
sph_shavite512_init( &x13sm3_4way_ctx.shavite ); sph_shavite512_init( &x13sm3_4way_ctx.shavite );
init_sd( &x13sm3_4way_ctx.simd, 512 ); init_sd( &x13sm3_4way_ctx.simd, 512 );
init_echo( &x13sm3_4way_ctx.echo, 512 ); init_echo( &x13sm3_4way_ctx.echo, 512 );
sm3_init( &x13sm3_4way_ctx.sm3 ); sm3_4way_init( &x13sm3_4way_ctx.sm3 );
sph_hamsi512_init( &x13sm3_4way_ctx.hamsi ); sph_hamsi512_init( &x13sm3_4way_ctx.hamsi );
sph_fugue512_init( &x13sm3_4way_ctx.fugue ); sph_fugue512_init( &x13sm3_4way_ctx.fugue );
}; };
@@ -85,14 +85,11 @@ void x13sm3_4way_hash( void *state, const void *input )
// Groestl // Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl, reinit_groestl( &ctx.groestl );
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl, reinit_groestl( &ctx.groestl );
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl, reinit_groestl( &ctx.groestl );
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way // Parallel 4way
@@ -178,6 +175,8 @@ void x13sm3_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 ); (const BitSequence *) hash3, 512 );
// SM3 // SM3
uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
memset( sm3_vhash, 0, sizeof sm3_vhash );
uint32_t sm3_hash0[32] __attribute__ ((aligned (32))); uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
memset( sm3_hash0, 0, sizeof sm3_hash0 ); memset( sm3_hash0, 0, sizeof sm3_hash0 );
uint32_t sm3_hash1[32] __attribute__ ((aligned (32))); uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
@@ -187,17 +186,11 @@ void x13sm3_4way_hash( void *state, const void *input )
uint32_t sm3_hash3[32] __attribute__ ((aligned (32))); uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
memset( sm3_hash3, 0, sizeof sm3_hash3 ); memset( sm3_hash3, 0, sizeof sm3_hash3 );
sph_sm3( &ctx.sm3, hash0, 64 ); mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sph_sm3_close( &ctx.sm3, sm3_hash0 ); sm3_4way( &ctx.sm3, vhash, 64 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) ); sm3_4way_close( &ctx.sm3, sm3_vhash );
sph_sm3( &ctx.sm3, hash1, 64 ); mm_deinterleave_4x32( sm3_hash0, sm3_hash1, sm3_hash2, sm3_hash3,
sph_sm3_close( &ctx.sm3, sm3_hash1 ); sm3_vhash, 1024 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash2, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash2 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash3, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash3 );
// Hamsi // Hamsi
sph_hamsi512( &ctx.hamsi, sm3_hash0, 64 ); sph_hamsi512( &ctx.hamsi, sm3_hash0, 64 );

x13sm3.c

@@ -224,6 +224,7 @@ int scanhash_x13sm3( int thr_id, struct work *work,
 if (!(hash64[7] & mask)) {
    printf("[%d]",thr_id);
    if (fulltest(hash64, ptarget)) {
+      work_set_target_ratio( work, hash64 );
       *hashes_done = n - first_nonce + 1;
       return true;
    }

axiom.c

@@ -65,6 +65,7 @@ int scanhash_axiom(int thr_id, struct work *work,
 if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
    *hashes_done = n - first_nonce + 1;
    pdata[19] = n;
+   work_set_target_ratio( work, hash64 );
    return true;
 }
 n++;

polytimos-gate.h

@@ -4,7 +4,7 @@
#include "algo-gate-api.h" #include "algo-gate-api.h"
#include <stdint.h> #include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__) #if defined(__AVX2__) && defined(__AES__)
#define POLYTIMOS_4WAY #define POLYTIMOS_4WAY
#endif #endif

x14.c

@@ -233,6 +233,7 @@ int scanhash_x14(int thr_id, struct work *work,
 if (!(hash64[7] & mask)) {
    printf("[%d]",thr_id);
    if (fulltest(hash64, ptarget)) {
+      work_set_target_ratio( work, hash64 );
       *hashes_done = n - first_nonce + 1;
       return true;
    }

x15-4way.c

@@ -47,7 +47,6 @@ void init_x15_4way_ctx()
 {
    blake512_4way_init( &x15_4way_ctx.blake );
    bmw512_4way_init( &x15_4way_ctx.bmw );
-   sph_bmw512_init( &x15_4way_ctx.bmw );
    init_groestl( &x15_4way_ctx.groestl, 64 );
    skein512_4way_init( &x15_4way_ctx.skein );
    jh512_4way_init( &x15_4way_ctx.jh );

x15.c

@@ -245,6 +245,7 @@ int scanhash_x15(int thr_id, struct work *work,
 if (!(hash64[7] & mask)) {
    printf("[%d]",thr_id);
    if (fulltest(hash64, ptarget)) {
+      work_set_target_ratio( work, hash64 );
       *hashes_done = n - first_nonce + 1;
       return true;
    }

x17.c

@@ -266,6 +266,7 @@ int scanhash_x17(int thr_id, struct work *work,
 if (!(hash64[7] & mask)) {
    printf("[%d]",thr_id);
    if (fulltest(hash64, ptarget)) {
+      work_set_target_ratio( work, hash64 );
       *hashes_done = n - first_nonce + 1;
       return true;
    }
@@ -281,13 +282,3 @@ int scanhash_x17(int thr_id, struct work *work,
    pdata[19] = n;
    return 0;
 }
-/*
-bool register_x17_algo( algo_gate_t* gate )
-{
-   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-   init_x17_ctx();
-   gate->scanhash = (void*)&scanhash_x17;
-   gate->hash = (void*)&x17hash;
-   return true;
-};
-*/

avxdefs.h

@@ -35,10 +35,18 @@
 #define mm_one_64 _mm_set1_epi64x( 1ULL )
 #define mm_one_32 _mm_set1_epi32( 1UL )
 #define mm_one_16 _mm_set1_epi16( 1U )
+#define mm_one_8  _mm_set1_epi8( 1U )
 // Constant minus 1
 #define mm_neg1 _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
+// Lane index, useful for byte rotate using shuffle
+#define mm_lanex_64 _mm_set_epi64x( 1ULL, 0ULL )
+#define mm_lanex_32 _mm_set_epi32( 3UL, 2UL, 1UL, 0UL )
+#define mm_lanex_16 _mm_set_epi16( 7U, 6U, 5U, 4U, 3U, 2U, 1U, 0U )
+#define mm_lanex_8  _mm_set_epi8( 15U, 14U, 13U, 12U, 11U, 10U, 9U, 8U, \
+                                   7U,  6U,  5U,  4U,  3U,  2U, 1U, 0U )
 //
 // Basic operations without equivalent SIMD intrinsic
@@ -327,6 +335,16 @@ inline __m128i mm_byteswap_16( __m128i x )
 // Constant minus 1
 #define mm256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
+// Lane index, useful for rotate using permutevar
+#define mm256_lane_64 _mm256_set_epi64x( 3ULL, 2ULL, 1ULL, 0ULL )
+#define mm256_lane_32 _mm256_set_epi32( 7UL, 6UL, 5UL, 4UL, 3UL, 2UL, 1UL, 0UL )
+#define mm256_lane_16 _mm256_set_epi16( 15U, 14U, 13U, 12U, 11U, 10U, 9U, 8U, \
+                                         7U,  6U,  5U,  4U,  3U,  2U, 1U, 0U )
+#define mm256_lane_8 _mm256_set_epi8( 31U, 30U, 29U, 28U, 27U, 26U, 25U, 24U, \
+                                      23U, 22U, 21U, 20U, 19U, 18U, 17U, 16U, \
+                                      15U, 14U, 13U, 12U, 11U, 10U,  9U,  8U, \
+                                       7U,  6U,  5U,  4U,  3U,  2U,  1U,  0U )
 //
 // Basic operations without SIMD equivalent
@@ -1109,7 +1127,7 @@ inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1,
 }
 // Can't do it in place
-inline void mm256_reinterleave_4x64x( void *dst, void *src, int bit_len )
+inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len )
 {
    __m256i* d = (__m256i*)dst;
    uint32_t *s = (uint32_t*)src;
@@ -1146,7 +1164,8 @@ inline void mm256_reinterleave_4x64x( void *dst, void *src, int bit_len )
 // likely of no use.
 // convert 4x32 byte (128 bit) vectors to 4x64 (256 bit) vectors for AVX2
 // bit_len must be multiple of 64
-inline void mm256_reinterleave_4x64( uint64_t *dst, uint32_t *src,
+// broken
+inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src,
                                      int bit_len )
 {
    uint32_t *d = (uint32_t*)dst;
@@ -1200,6 +1219,7 @@ inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len )
 // bit_len == 1024
 }
+// not used
 inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len )
 {
    uint32_t *d = (uint32_t*)dst;

configure vendored

@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.9.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.10.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.7.9'
-PACKAGE_STRING='cpuminer-opt 3.7.9'
+PACKAGE_VERSION='3.7.10'
+PACKAGE_STRING='cpuminer-opt 3.7.10'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
 # Omit some internal or obsolete options to make the list less imposing.
 # This message is too long to be a string in the A/UX 3.1 sh.
 cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.7.9 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.7.10 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-    short | recursive ) echo "Configuration of cpuminer-opt 3.7.9:";;
+    short | recursive ) echo "Configuration of cpuminer-opt 3.7.10:";;
   esac
   cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-cpuminer-opt configure 3.7.9
+cpuminer-opt configure 3.7.10
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by cpuminer-opt $as_me 3.7.9, which was
+It was created by cpuminer-opt $as_me 3.7.10, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2981,7 +2981,7 @@ fi
 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
-VERSION='3.7.9'
+VERSION='3.7.10'
 
 cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.7.9, which was
+This file was extended by cpuminer-opt $as_me 3.7.10, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.7.9
+cpuminer-opt config.status 3.7.10
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"

configure.ac

@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.7.9])
+AC_INIT([cpuminer-opt], [3.7.10])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM

cpu-miner.c

@@ -346,6 +346,7 @@ static bool work_decode( const json_t *val, struct work *work )
    work->targetdiff = target_to_diff( work->target );
    // for api stats, on longpoll pools
    stratum_diff = work->targetdiff;
+   work->sharediff = 0;
    algo_gate.display_extra_data( work, &net_blocks );
    return true;
 }
@@ -755,6 +756,7 @@ static int share_result( int result, struct work *work, const char *reason )
    uint32_t total_submits;
    float rate;
    char rate_s[8] = {0};
+   double sharediff = work ? work->sharediff : stratum.sharediff;
    int i;
 
    pthread_mutex_lock(&stats_lock);
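The sharediff value reported here is produced elsewhere in the stratum/work
path. For reference, a hedged sketch of how a share's difficulty is
conventionally derived in cpuminer-family code; the function name and the
little-endian 8-word hash layout are the editor's assumptions, not this
commit's code.

   #include <stdint.h>

   // Difficulty = value of the difficulty-1 target / value of the hash.
   static double share_diff_sketch( const uint32_t *hash )
   {
      // 0x00000000FFFF0000...0000 (the standard diff-1 target) as a double.
      const double truediffone = 2.695953529101131e67;
      double h = 0.0;
      for ( int i = 7; i >= 0; i-- )   // hash[7] is the most significant word
         h = h * 4294967296.0 + hash[i];
      return h > 0.0 ? truediffone / h : 0.0;
   }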
@@ -814,6 +816,8 @@ static int share_result( int result, struct work *work, const char *reason )
       sprintf(hr, "%.2f", hashrate );
    }
 
+   if ( sharediff == 0 )
+   {
 #if ((defined(_WIN64) || defined(__WINDOWS__)))
    applog( LOG_NOTICE, "%s %lu/%lu (%s%%), %s %sH, %s %sH/s",
            sres, ( result ? accepted_count : rejected_count ),
@@ -824,6 +828,20 @@ static int share_result( int result, struct work *work, const char *reason )
            total_submits, rate_s, hc, hc_units, hr, hr_units,
            (uint32_t)cpu_temp(0) );
 #endif
+   }
+   else
+   {
+#if ((defined(_WIN64) || defined(__WINDOWS__)))
+      applog( LOG_NOTICE, "%s %lu/%lu (%s%%), diff %.3g, %s %sH/s",
+              sres, ( result ? accepted_count : rejected_count ),
+              total_submits, rate_s, sharediff, hr, hr_units );
+#else
+      applog( LOG_NOTICE, "%s %lu/%lu (%s%%), diff %.3g, %s %sH/s, %dC",
+              sres, ( result ? accepted_count : rejected_count ),
+              total_submits, rate_s, sharediff, hr, hr_units,
+              (uint32_t)cpu_temp(0) );
+#endif
+   }
 
    if (reason)
    {
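The new messages print the share difficulty with "%.3g": three significant
digits, switching to exponent form for very large or very small diffs. A
standalone check of the formatting:

   #include <stdio.h>

   int main()
   {
      printf( "%.3g\n", 0.00123456 );   // prints 0.00123
      printf( "%.3g\n", 1.23456 );      // prints 1.23
      printf( "%.3g\n", 123456.0 );     // prints 1.23e+05
      return 0;
   }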
@@ -1026,6 +1044,7 @@ static bool submit_upstream_work( CURL *curl, struct work *work )
    }
    if ( have_stratum )
    {
+      stratum.sharediff = work->sharediff;
       algo_gate.build_stratum_request( req, work, &stratum );
       if ( unlikely( !stratum_send_line( &stratum, req ) ) )
       {
@@ -2984,25 +3003,22 @@ bool check_cpu_capability ()
    }
 
    if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) )
    {
-      if ( sw_has_4way && algo_has_4way )
-         printf( "A CPU with AES and AVX2 is required to use 4way!\n" );
-      else if ( algo_has_avx2 )
-         printf( "A CPU with AES and AVX2 is required!\n" );
+      printf( "The SW build requires a CPU with AES and AVX2!\n" );
       return false;
    }
-   if ( sw_has_avx && !( cpu_has_avx && cpu_has_aes ) )
+   if ( sw_has_avx && !cpu_has_avx )
    {
-      printf( "A CPU with AES and AVX2 is required!\n" );
+      printf( "The SW build requires a CPU with AVX!\n" );
       return false;
    }
-   if ( sw_has_aes && algo_has_aes && !cpu_has_aes )
+   if ( sw_has_aes && !cpu_has_aes )
    {
-      printf( "A CPU with AES is required!\n" );
+      printf( "The SW build requires a CPU with AES!\n" );
       return false;
    }
-   if ( sw_has_sha && algo_has_sha && !cpu_has_sha )
+   if ( sw_has_sha && !cpu_has_sha )
    {
-      printf( "A CPU with SHA is required!\n" );
+      printf( "The SW build requires a CPU with SHA!\n" );
       return false;
    }
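The simplified checks above compare what the software build was compiled for
(sw_has_*) against what the CPU reports at run time (cpu_has_*). One common
way to obtain such runtime flags is the gcc/clang builtin shown below; this is
an assumption about technique, not necessarily how this project detects
features (it may drive cpuid directly).

   #include <stdbool.h>

   // __builtin_cpu_supports probes cpuid at run time (recent gcc or clang).
   static bool cpu_has_avx2_and_aes( void )
   {
      return __builtin_cpu_supports( "avx2" ) &&
             __builtin_cpu_supports( "aes" );
   }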
@@ -3187,6 +3203,9 @@ int main(int argc, char *argv[])
    }
 #endif
 
+   if ( num_cpus != opt_n_threads )
+      applog( LOG_INFO, "%u CPU cores available, %u miner threads selected.",
+              num_cpus, opt_n_threads );
    if ( opt_affinity != -1 )
    {
       if ( num_cpus > 64 )


@@ -736,7 +736,7 @@ Options:\n\
                           whirlpool\n\
                           whirlpoolx\n\
                           x11        Dash\n\
-                          x11evo     Revolvercoin\n\
+                          x11evo     Revolvercoin (XRE)\n\
                           x11gost    sib (SibCoin)\n\
                           x13        X13\n\
                           x13sm3     hsr (Hshare)\n\