v3.21.3 Unreleased

This commit is contained in:
Jay D Dee
2023-03-13 03:20:13 -04:00
parent b339450898
commit c6bc9d67fb
49 changed files with 1126 additions and 1111 deletions

View File

@@ -1,6 +1,5 @@
#include "blake2s-gate.h"
#include "blake2s-hash-4way.h"
//#include "sph-blake2s.h"
#include <string.h>
#include <stdint.h>
@@ -8,43 +7,6 @@
static __thread blake2s_16way_state blake2s_16w_ctx;
/*
static blake2s_16way_state blake2s_16w_ctx;
static uint32_t blake2s_16way_vdata[20*16] __attribute__ ((aligned (64)));
*/
/*
int blake2s_16way_prehash( struct work *work )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, work->data );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h, 256 );
intrlv_16x32( blake2s_16way_vdata, edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
blake2s_16w_ctx.t[0] = 64;
return 1;
}
*/
/*
int blake2s_16way_prehash( struct work *work )
{
mm512_bswap32_intrlv80_16x32( blake2s_16way_vdata, work->data );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_update( &blake2s_16w_ctx, blake2s_16way_vdata, 64 );
return 1;
}
*/
void blake2s_16way_hash( void *output, const void *input )
{
blake2s_16way_state ctx;
@@ -68,40 +30,10 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
int thr_id = mythr->id;
/*
// pthread_rwlock_rdlock( &g_work_lock );
memcpy( (__m512i*)vdata +16, (__m512i*)blake2s_16way_vdata +16, 3*4*16 );
// casti_m512i( vdata, 16 ) = casti_m512i( blake2s_16way_vdata, 16 );
// casti_m512i( vdata, 17 ) = casti_m512i( blake2s_16way_vdata, 17 );
// casti_m512i( vdata, 18 ) = casti_m512i( blake2s_16way_vdata, 18 );
// pthread_rwlock_unlock( &g_work_lock );
*/
/*
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, pdata );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h, 256 );
intrlv_16x32( blake2s_16way_blake2s_16way_vdata, edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
blake2s_16w_ctx.t[0] = 64;
*/
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
do {
*noncev = mm512_bswap_32( _mm512_set_epi32(
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
@@ -131,36 +63,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
static __thread blake2s_8way_state blake2s_8w_ctx;
/*
static blake2s_8way_state blake2s_8w_ctx;
static uint32_t blake2s_8way_vdata[20*8] __attribute__ ((aligned (32)));
int blake2s_8way_prehash( struct work *work )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, work->data );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
for ( int i = 0; i < 8; i++ )
casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
casti_m256i( blake2s_8way_vdata, 16 ) = _mm256_set1_epi32( edata[16] );
casti_m256i( blake2s_8way_vdata, 17 ) = _mm256_set1_epi32( edata[17] );
casti_m256i( blake2s_8way_vdata, 18 ) = _mm256_set1_epi32( edata[18] );
// intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
// ctx.h, ctx.h, ctx.h, ctx.h, 256 );
// intrlv_8x32( blake2s_8way_vdata, edata, edata, edata, edata,
// edata, edata, edata, edata, 640 );
blake2s_8w_ctx.t[0] = 64;
}
*/
void blake2s_8way_hash( void *output, const void *input )
{
blake2s_8way_state ctx;
@@ -184,41 +86,10 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
int thr_id = mythr->id;
/*
// pthread_rwlock_rdlock( &g_work_lock );
memcpy( &vdata[16*8], &blake2s_8way_vdata[16*8], 3*4*8 );
// pthread_rwlock_unlock( &g_work_lock );
*/
/*
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, pdata );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
for ( int i = 0; i < 8; i++ )
casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
casti_m256i( vdata, 16 ) = _mm256_set1_epi32( edata[16] );
casti_m256i( vdata, 17 ) = _mm256_set1_epi32( edata[17] );
casti_m256i( vdata, 18 ) = _mm256_set1_epi32( edata[18] );
// intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
// ctx.h, ctx.h, ctx.h, ctx.h, 256 );
// intrlv_8x32( vdata, edata, edata, edata, edata,
// edata, edata, edata, edata, 640 );
blake2s_8w_ctx.t[0] = 64;
*/
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
@@ -246,25 +117,7 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
#elif defined(BLAKE2S_4WAY)
static __thread blake2s_4way_state blake2s_4w_ctx;
/*
static blake2s_4way_state blake2s_4w_ctx;
static uint32_t blake2s_4way_vdata[20*4] __attribute__ ((aligned (32)));
int blake2s_4way_prehash( struct work *work )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, work->data );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
intrlv_4x32( blake2s_4w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h, 256 );
intrlv_4x32( blake2s_4way_vdata, edata, edata, edata, edata, 640 );
blake2s_4w_ctx.t[0] = 64;
}
*/
void blake2s_4way_hash( void *output, const void *input )
{
blake2s_4way_state ctx;
@@ -287,15 +140,11 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
__m128i *noncev = (__m128i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
/*
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, blake2s_4way_vdata, sizeof vdata );
pthread_rwlock_unlock( &g_work_lock );
*/
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
pdata[19] = n;

View File

@@ -5,15 +5,13 @@ bool register_blake2s_algo( algo_gate_t* gate )
#if defined(BLAKE2S_16WAY)
gate->scanhash = (void*)&scanhash_blake2s_16way;
gate->hash = (void*)&blake2s_16way_hash;
// gate->prehash = (void*)&blake2s_16way_prehash;
#elif defined(BLAKE2S_8WAY)
//#if defined(BLAKE2S_8WAY)
gate->scanhash = (void*)&scanhash_blake2s_8way;
gate->hash = (void*)&blake2s_8way_hash;
// gate->prehash = (void*)&blake2s_8way_prehash;
#elif defined(BLAKE2S_4WAY)
gate->scanhash = (void*)&scanhash_blake2s_4way;
gate->hash = (void*)&blake2s_4way_hash;
// gate->prehash = (void*)&blake2s_4way_prehash;
#else
gate->scanhash = (void*)&scanhash_blake2s;
gate->hash = (void*)&blake2s_hash;

View File

@@ -23,22 +23,18 @@ bool register_blake2s_algo( algo_gate_t* gate );
void blake2s_16way_hash( void *state, const void *input );
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int blake2s_16way_prehash( struct work * );
#elif defined (BLAKE2S_8WAY)
void blake2s_8way_hash( void *state, const void *input );
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int blake2s_8way_prehash( struct work * );
#elif defined (BLAKE2S_4WAY)
void blake2s_4way_hash( void *state, const void *input );
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int blake2s_4way_prehash( struct work * );
#else
void blake2s_hash( void *state, const void *input );

View File

@@ -105,8 +105,8 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
#define G4W( sigma0, sigma1, a, b, c, d ) \
do { \
const uint8_t s0 = sigma0; \
const uint8_t s1 = sigma1; \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
@@ -120,7 +120,7 @@ do { \
#define ROUND4W(r) \
do { \
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
G4W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
G4W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
G4W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
@@ -317,8 +317,8 @@ do { \
#define G8W( sigma0, sigma1, a, b, c, d) \
do { \
const uint8_t s0 = sigma0; \
const uint8_t s1 = sigma1; \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
@@ -331,7 +331,7 @@ do { \
#define ROUND8W(r) \
do { \
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
G8W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
G8W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
G8W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
@@ -529,8 +529,8 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
#define G16W( sigma0, sigma1, a, b, c, d) \
do { \
const uint8_t s0 = sigma0; \
const uint8_t s1 = sigma1; \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi32( c, d ); \
@@ -543,7 +543,7 @@ do { \
#define ROUND16W(r) \
do { \
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \

View File

@@ -20,7 +20,6 @@
#include <stddef.h>
#include <stdint.h>
//#include "sph-blake2s.h"
#if defined(_MSC_VER)
#include <inttypes.h>
@@ -34,7 +33,7 @@
#if defined(__cplusplus)
extern "C" {
#endif
/*
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
@@ -43,13 +42,6 @@ enum blake2s_constant
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
*/
#define BLAKE2S_BLOCKBYTES 64
#define BLAKE2S_OUTBYTES 32
#define BLAKE2S_KEYBYTES 32
#define BLAKE2S_SALTBYTES 8
#define BLAKE2S_PERSONALBYTES 8
#pragma pack(push, 1)
typedef struct __blake2s_nway_param

View File

@@ -8,6 +8,8 @@
#include "sph-blake2s.h"
static __thread blake2s_state blake2s_ctx;
//static __thread blake2s_state s_ctx;
#define MIDLEN 76
void blake2s_hash( void *output, const void *input )
{
@@ -17,27 +19,37 @@ void blake2s_hash( void *output, const void *input )
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
blake2s_update( &ctx, input+64, 16 );
// blake2s_init(&ctx, BLAKE2S_OUTBYTES);
// blake2s_update(&ctx, input, 80);
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
memcpy(output, hash, 32);
}
/*
static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
{
s_ctx.buflen = MIDLEN;
memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
}
*/
int scanhash_blake2s( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) hash64[8];
uint32_t _ALIGN(64) endiandata[20];
int thr_id = mythr->id;
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
swab32_array( endiandata, pdata, 20 );
swab32_array( endiandata, pdata, 20 );
// midstate
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
@@ -46,12 +58,11 @@ int scanhash_blake2s( struct work *work,
do {
be32enc(&endiandata[19], n);
blake2s_hash( hash64, endiandata );
if (hash64[7] <= Htarg )
if ( fulltest(hash64, ptarget) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);

View File

@@ -17,7 +17,6 @@
#include "algo/sha/sph_types.h"
#include "sph-blake2s.h"
#include "simd-utils.h"
static const uint32_t blake2s_IV[8] =
{
@@ -226,71 +225,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
v[13] = S->t[1] ^ blake2s_IV[5];
v[14] = S->f[0] ^ blake2s_IV[6];
v[15] = S->f[1] ^ blake2s_IV[7];
#if 0
//#if defined(__SSE2__) // always true
The only application for this is to do a prehash for the blake2s algorithm.
SSE2 also supports 4 way parallel hashing so that is preferred in most cases.
Testing has found that using this serial SIMD code for prehash is slower than
doing a parallel hash. A parallel hash has more instructions and uses more
data. The serial hash uses fewer instructions and data and only needs to
interleave the final hash into parallel streams. This has shown negligible
improvement on other algos, notably blake256 which is almost identical.
Considering the low frequency of prehash no statistically valid change
was expected. It was simply better on paper.
Furthermore, simply defining this macro has an additional negative effect on
blake2s as a whole. There are no references to this macro, blake2s-4way does
not include it in any header files, it's just another unused macro which should
have no effect beyond the preprocessor. But just being visible to the compiler
changes things in a dramatic way.
These 2 things combined reduced the hash rate for blake2s by more than 5% when
using serial SIMD for the blake2s prehash over 16way parallel prehash.
16way parallel hashing was used in the high frequency nonce loop in both cases.
Considering the prehash represents 50% of the algorithm and is done once vs
the high frequency second half that is done mega, maybe giga, times more it's
hard to imagine that big of an effect in either direction.
#define ROUND( r ) \
{ \
__m128i *V = (__m128i*)v; \
const uint8_t *sigma = blake2s_sigma[r]; \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
_mm_set_epi32( m[ sigma[ 6 ] ], m[ sigma[ 4 ] ], \
m[ sigma[ 2 ] ], m[ sigma[ 0 ] ] ) ) ); \
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
_mm_set_epi32( m[ sigma[ 7 ] ], m[ sigma[ 5 ] ], \
m[ sigma[ 3 ] ], m[ sigma[ 1 ] ] ) ) ); \
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
V[3] = mm128_shufll_32( V[3] ); \
V[2] = mm128_swap_64( V[2] ); \
V[1] = mm128_shuflr_32( V[1] ); \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
_mm_set_epi32( m[ sigma[14] ], m[ sigma[12] ], \
m[ sigma[10] ], m[ sigma[ 8] ] ) ) ); \
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
_mm_set_epi32( m[ sigma[15] ], m[ sigma[13] ], \
m[ sigma[11] ], m[ sigma[ 9] ] ) ) ); \
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
V[3] = mm128_shuflr_32( V[3] ); \
V[2] = mm128_swap_64( V[2] ); \
V[1] = mm128_shufll_32( V[1] ); \
}
#else
#define G(r,i,a,b,c,d) \
do { \
a = a + b + m[blake2s_sigma[r][2*i+0]]; \
@@ -313,10 +247,7 @@ hard to imagine that big of an effect in either direction.
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
} while(0)
#endif
ROUND( 0 );
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );

View File

@@ -91,7 +91,6 @@ static inline void secure_zero_memory(void *v, size_t n)
extern "C" {
#endif
/*
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
@@ -100,13 +99,6 @@ extern "C" {
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
*/
#define BLAKE2S_BLOCKBYTES 64
#define BLAKE2S_OUTBYTES 32
#define BLAKE2S_KEYBYTES 32
#define BLAKE2S_SALTBYTES 8
#define BLAKE2S_PERSONALBYTES 8
#pragma pack(push, 1)
typedef struct __blake2s_param

View File

@@ -103,16 +103,16 @@
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
V2 = mm128_alignr_64( V[3], V[2] ); \
V3 = mm128_alignr_64( V[2], V[3] ); \
V6 = mm128_alignr_64( V[6], V[7] ); \
V7 = mm128_alignr_64( V[7], V[6] ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_alignr_64( V2, V3, 1 ); \
V[3] = mm128_alignr_64( V3, V2, 1 ); \
V[6] = mm128_alignr_64( V7, V6, 1 ); \
V[7] = mm128_alignr_64( V6, V7, 1 ); \
V[2] = mm128_alignr_64( V2, V3 ); \
V[3] = mm128_alignr_64( V3, V2 ); \
V[6] = mm128_alignr_64( V7, V6 ); \
V[7] = mm128_alignr_64( V6, V7 ); \
}
#else

View File

@@ -73,11 +73,11 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
be32enc(&endiandata[19], nonce);
myriad_hash(hash, endiandata);
if (hash[7] <= Htarg )
if ( fulltest(hash, ptarget) && !opt_benchmark )
if (hash[7] <= Htarg && fulltest(hash, ptarget))
{
pdata[19] = nonce;
submit_solution( work, hash, mythr );
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;

View File

@@ -19,34 +19,26 @@
*/
#include <string.h>
#include <emmintrin.h>
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#if defined(__AVX512VL__)
#define MULT2( a0, a1 ) \
{ \
__m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
a0 = _mm_alignr_epi32( a1, b, 1 ); \
a1 = _mm_alignr_epi32( b, a1, 1 ); \
}
#elif defined(__SSE4_1__)
#if defined(__SSE4_1__)
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#else
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 16 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#endif

View File

@@ -24,45 +24,6 @@ typedef union {
#endif
} allium_16way_ctx_holder;
static uint32_t allium_16way_midstate_vars[16*16] __attribute__ ((aligned (64)));
static __m512i allium_16way_block0_hash[8] __attribute__ ((aligned (64)));
static __m512i allium_16way_block_buf[16] __attribute__ ((aligned (64)));
int allium_16way_prehash( struct work *work )
{
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
// Prehash first block.
blake256_transform_le( phash, pdata, 512, 0 );
// Interleave hash for second block prehash.
allium_16way_block0_hash[0] = _mm512_set1_epi32( phash[0] );
allium_16way_block0_hash[1] = _mm512_set1_epi32( phash[1] );
allium_16way_block0_hash[2] = _mm512_set1_epi32( phash[2] );
allium_16way_block0_hash[3] = _mm512_set1_epi32( phash[3] );
allium_16way_block0_hash[4] = _mm512_set1_epi32( phash[4] );
allium_16way_block0_hash[5] = _mm512_set1_epi32( phash[5] );
allium_16way_block0_hash[6] = _mm512_set1_epi32( phash[6] );
allium_16way_block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave 12 of the last 16 bytes of data,
// excluding the nonce.
allium_16way_block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
allium_16way_block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
allium_16way_block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( allium_16way_midstate_vars,
allium_16way_block0_hash, allium_16way_block_buf );
return 1;
}
static void allium_16way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
@@ -239,6 +200,11 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
__m512i block0_hash[8] __attribute__ ((aligned (64)));
__m512i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -250,19 +216,31 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
pthread_rwlock_rdlock( &g_work_lock );
// Prehash first block.
blake256_transform_le( phash, pdata, 512, 0 );
memcpy( midstate_vars, allium_16way_midstate_vars, sizeof midstate_vars );
memcpy( block0_hash, allium_16way_block0_hash, sizeof block0_hash );
memcpy( block_buf, allium_16way_block_buf, sizeof block_buf );
// Interleave hash for second block prehash.
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
block0_hash[2] = _mm512_set1_epi32( phash[2] );
block0_hash[3] = _mm512_set1_epi32( phash[3] );
block0_hash[4] = _mm512_set1_epi32( phash[4] );
block0_hash[5] = _mm512_set1_epi32( phash[5] );
block0_hash[6] = _mm512_set1_epi32( phash[6] );
block0_hash[7] = _mm512_set1_epi32( phash[7] );
pthread_rwlock_unlock( &g_work_lock );
// fill in the nonces
block_buf[3] =
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -293,44 +271,6 @@ typedef union {
#endif
} allium_8way_ctx_holder;
static uint32_t allium_8way_midstate_vars[16*8] __attribute__ ((aligned (64)));
static __m256i allium_8way_block0_hash[8] __attribute__ ((aligned (64)));
static __m256i allium_8way_block_buf[16] __attribute__ ((aligned (64)));
int allium_8way_prehash ( struct work *work )
{
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
allium_8way_block0_hash[0] = _mm256_set1_epi32( phash[0] );
allium_8way_block0_hash[1] = _mm256_set1_epi32( phash[1] );
allium_8way_block0_hash[2] = _mm256_set1_epi32( phash[2] );
allium_8way_block0_hash[3] = _mm256_set1_epi32( phash[3] );
allium_8way_block0_hash[4] = _mm256_set1_epi32( phash[4] );
allium_8way_block0_hash[5] = _mm256_set1_epi32( phash[5] );
allium_8way_block0_hash[6] = _mm256_set1_epi32( phash[6] );
allium_8way_block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave 12 of the last 16 bytes,
// excluding the nonces.
allium_8way_block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
allium_8way_block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
allium_8way_block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( allium_8way_midstate_vars,
allium_8way_block0_hash, allium_8way_block_buf );
return 1;
}
static void allium_8way_hash( void *hash, const void *midstate_vars,
const void *midhash, const void *block )
{
@@ -446,6 +386,11 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
__m256i block0_hash[8] __attribute__ ((aligned (64)));
__m256i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint64_t *ptarget = (uint64_t*)work->target;
const uint32_t first_nonce = pdata[19];
@@ -455,17 +400,29 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
pthread_rwlock_rdlock( &g_work_lock );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
memcpy( midstate_vars, allium_8way_midstate_vars, sizeof midstate_vars );
memcpy( block0_hash, allium_8way_block0_hash, sizeof block0_hash );
memcpy( block_buf, allium_8way_block_buf, sizeof block_buf );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
pthread_rwlock_unlock( &g_work_lock );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -481,7 +438,6 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
n += 8;
block_buf[ 3] = _mm256_add_epi32( block_buf[ 3], eight );
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;

View File

@@ -131,12 +131,10 @@ bool register_lyra2z_algo( algo_gate_t* gate )
{
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->prehash = (void*)&lyra2z_16way_prehash;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
// gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->prehash = (void*)&lyra2z_8way_prehash;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
// gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
@@ -177,10 +175,8 @@ bool register_lyra2h_algo( algo_gate_t* gate )
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_16WAY)
gate->prehash = (void*)&allium_16way_prehash;
gate->scanhash = (void*)&scanhash_allium_16way;
#elif defined (ALLIUM_8WAY)
gate->prehash = (void*)&allium_8way_prehash;
gate->scanhash = (void*)&scanhash_allium_8way;
#else
gate->miner_thread_init = (void*)&init_allium_ctx;

View File

@@ -5,6 +5,7 @@
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2REV3_16WAY 1
#elif defined(__AVX2__)
@@ -101,7 +102,6 @@ bool init_lyra2rev2_ctx();
//void lyra2z_16way_hash( void *state, const void *input );
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int lyra2z_16way_prehash ( struct work *work );
bool lyra2z_16way_thread_init();
#elif defined(LYRA2Z_8WAY)
@@ -110,7 +110,6 @@ bool lyra2z_16way_thread_init();
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_8way_thread_init();
int lyra2z_8way_prehash ( struct work *work );
#elif defined(LYRA2Z_4WAY)
@@ -166,13 +165,11 @@ bool register_allium_algo( algo_gate_t* gate );
int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int allium_16way_prehash ( struct work *work );
#elif defined(ALLIUM_8WAY)
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int allium_8way_prehash ( struct work *work );
#else

View File

@@ -14,44 +14,6 @@ bool lyra2z_16way_thread_init()
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
}
static uint32_t lyra2z_16way_midstate_vars[16*16] __attribute__ ((aligned (64)));
static __m512i lyra2z_16way_block0_hash[8] __attribute__ ((aligned (64)));
static __m512i lyra2z_16way_block_buf[16] __attribute__ ((aligned (64)));
int lyra2z_16way_prehash ( struct work *work )
{
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
lyra2z_16way_block0_hash[0] = _mm512_set1_epi32( phash[0] );
lyra2z_16way_block0_hash[1] = _mm512_set1_epi32( phash[1] );
lyra2z_16way_block0_hash[2] = _mm512_set1_epi32( phash[2] );
lyra2z_16way_block0_hash[3] = _mm512_set1_epi32( phash[3] );
lyra2z_16way_block0_hash[4] = _mm512_set1_epi32( phash[4] );
lyra2z_16way_block0_hash[5] = _mm512_set1_epi32( phash[5] );
lyra2z_16way_block0_hash[6] = _mm512_set1_epi32( phash[6] );
lyra2z_16way_block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave 12 of the last 16 bytes of data,
// excluding the nonce.
lyra2z_16way_block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
lyra2z_16way_block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
lyra2z_16way_block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( lyra2z_16way_midstate_vars,
lyra2z_16way_block0_hash, lyra2z_16way_block_buf );
return 1;
}
static void lyra2z_16way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
@@ -129,6 +91,11 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
__m512i block0_hash[8] __attribute__ ((aligned (64)));
__m512i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (64))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -140,18 +107,30 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
pthread_rwlock_rdlock( &g_work_lock );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
memcpy( midstate_vars, lyra2z_16way_midstate_vars, sizeof midstate_vars );
memcpy( block0_hash, lyra2z_16way_block0_hash, sizeof block0_hash );
memcpy( block_buf, lyra2z_16way_block_buf, sizeof block_buf );
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
block0_hash[2] = _mm512_set1_epi32( phash[2] );
block0_hash[3] = _mm512_set1_epi32( phash[3] );
block0_hash[4] = _mm512_set1_epi32( phash[4] );
block0_hash[5] = _mm512_set1_epi32( phash[5] );
block0_hash[6] = _mm512_set1_epi32( phash[6] );
block0_hash[7] = _mm512_set1_epi32( phash[7] );
pthread_rwlock_unlock( &g_work_lock );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[ 0] = _mm512_set1_epi32( pdata[16] );
block_buf[ 1] = _mm512_set1_epi32( pdata[17] );
block_buf[ 2] = _mm512_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -178,44 +157,6 @@ bool lyra2z_8way_thread_init()
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static uint32_t lyra2z_8way_midstate_vars[16*8] __attribute__ ((aligned (64)));
static __m256i lyra2z_8way_block0_hash[8] __attribute__ ((aligned (64)));
static __m256i lyra2z_8way_block_buf[16] __attribute__ ((aligned (64)));
// Precompute the lyra2z 8-way midstate once per work unit: hash the first
// 64-byte header block scalar, broadcast the resulting chaining values and
// the non-nonce words of the second block across all 8 lanes, then run the
// partial round-0 prehash. Nonces (word 19) are filled in per scan iteration.
int lyra2z_8way_prehash ( struct work *work )
{
   // Blake-256 IV; overwritten in place by blake256_transform_le.
   uint32_t midhash[8] __attribute__ ((aligned (32))) =
   {
      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
   };
   uint32_t *pdata = work->data;

   // Prehash first block (scalar, shared by every lane).
   blake256_transform_le( midhash, pdata, 512, 0 );

   // Broadcast the 8 chaining words to all lanes.
   for ( int i = 0; i < 8; i++ )
      lyra2z_8way_block0_hash[i] = _mm256_set1_epi32( midhash[i] );

   // Build vectored second block: words 16..18 are identical in every lane;
   // the nonce word is deliberately left untouched here.
   lyra2z_8way_block_buf[0] = _mm256_set1_epi32( pdata[16] );
   lyra2z_8way_block_buf[1] = _mm256_set1_epi32( pdata[17] );
   lyra2z_8way_block_buf[2] = _mm256_set1_epi32( pdata[18] );

   // Partially prehash the second block without touching nonces.
   blake256_8way_round0_prehash_le( lyra2z_8way_midstate_vars,
                         lyra2z_8way_block0_hash, lyra2z_8way_block_buf );
   return 1;
}
static void lyra2z_8way_hash( void *state, const void *midstate_vars,
const void *midhash, const void *block )
{
@@ -260,6 +201,11 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
__m256i block0_hash[8] __attribute__ ((aligned (64)));
__m256i block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint64_t *ptarget = (uint64_t*)work->target;
const uint32_t first_nonce = pdata[19];
@@ -269,14 +215,23 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
pthread_rwlock_rdlock( &g_work_lock );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
memcpy( midstate_vars, lyra2z_8way_midstate_vars, sizeof midstate_vars );
memcpy( block0_hash, lyra2z_8way_block0_hash, sizeof block0_hash );
memcpy( block_buf, lyra2z_8way_block_buf, sizeof block_buf );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
pthread_rwlock_unlock( &g_work_lock );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[ 0] = _mm256_set1_epi32( pdata[16] );
block_buf[ 1] = _mm256_set1_epi32( pdata[17] );
block_buf[ 2] = _mm256_set1_epi32( pdata[18] );
block_buf[ 3] =
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );

View File

@@ -146,25 +146,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
__m128i t; \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
t = mm128_alignr_64( s7, s6, 1 ); \
s6 = mm128_alignr_64( s6, s7, 1 ); \
s7 = t; \
t = mm128_alignr_64( s2, s3, 1 ); \
s2 = mm128_alignr_64( s3, s2, 1 ); \
s3 = t; \
mm128_vrol256_64( s6, s7 ); \
mm128_vror256_64( s2, s3 ); \
G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \
t = mm128_alignr_64( s6, s7, 1 ); \
s6 = mm128_alignr_64( s7, s6, 1 ); \
s7 = t; \
t = mm128_alignr_64( s3, s2, 1 ); \
s2 = mm128_alignr_64( s2, s3, 1 ); \
s3 = t; \
}
mm128_vror256_64( s6, s7 ); \
mm128_vrol256_64( s2, s3 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

View File

@@ -7,16 +7,8 @@
#if defined (SKEIN_8WAY)
static skein512_8way_context skein512_8way_ctx
static __thread skein512_8way_context skein512_8way_ctx
__attribute__ ((aligned (64)));
static uint32_t skein_8way_vdata[20*8] __attribute__ ((aligned (64)));
// Stage the 80-byte header once per work unit: byte-swap and interleave it
// into the shared 8-lane buffer, then precompute the Skein-512 midstate over
// the first 64 bytes so scan iterations only have to finalize with nonces.
int skein_8way_prehash( struct work *work )
{
   uint32_t *const vbuf = skein_8way_vdata;
   mm512_bswap32_intrlv80_8x64( vbuf, work->data );
   skein512_8way_prehash64( &skein512_8way_ctx, vbuf );
   return 1;
}
void skeinhash_8way( void *state, const void *input )
{
@@ -37,27 +29,25 @@ void skeinhash_8way( void *state, const void *input )
int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (128)));
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash_d7 = &(hash[7*8]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t targ_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, skein_8way_vdata, sizeof vdata );
pthread_rwlock_unlock( &g_work_lock );
uint32_t vdata[20*8] __attribute__ ((aligned (128)));
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash_d7 = &(hash[7*8]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t targ_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
skein512_8way_prehash64( &skein512_8way_ctx, vdata );
do
{
skeinhash_8way( hash, vdata );
@@ -84,16 +74,8 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
#elif defined (SKEIN_4WAY)
static skein512_4way_context skein512_4way_ctx
static __thread skein512_4way_context skein512_4way_ctx
__attribute__ ((aligned (64)));
static uint32_t skein_4way_vdata[20*4] __attribute__ ((aligned (64)));
// 4-lane (AVX2) counterpart of skein_8way_prehash: interleave the
// byte-swapped header into the shared buffer and precompute the Skein-512
// midstate over its first 64 bytes.
int skein_4way_prehash( struct work *work )
{
   uint32_t *const vbuf = skein_4way_vdata;
   mm256_bswap32_intrlv80_4x64( vbuf, work->data );
   skein512_4way_prehash64( &skein512_4way_ctx, vbuf );
   return 1;
}
void skeinhash_4way( void *state, const void *input )
{
@@ -136,24 +118,23 @@ void skeinhash_4way( void *state, const void *input )
int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash_d7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t targ_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash_d7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t targ_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &skein512_4way_ctx, vdata );
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, skein_4way_vdata, sizeof vdata );
pthread_rwlock_unlock( &g_work_lock );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do

View File

@@ -7,12 +7,10 @@ bool register_skein_algo( algo_gate_t* gate )
#if defined (SKEIN_8WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->scanhash = (void*)&scanhash_skein_8way;
gate->prehash = (void*)&skein_8way_prehash;
gate->hash = (void*)&skeinhash_8way;
#elif defined (SKEIN_4WAY)
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->prehash = (void*)&skein_4way_prehash;
gate->hash = (void*)&skeinhash_4way;
#else
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
@@ -27,12 +25,10 @@ bool register_skein2_algo( algo_gate_t* gate )
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined (SKEIN_8WAY)
gate->scanhash = (void*)&scanhash_skein2_8way;
// gate->hash = (void*)&skein2hash_8way;
gate->prehash = (void*)&skein2_8way_prehash;
gate->hash = (void*)&skein2hash_8way;
#elif defined (SKEIN_4WAY)
gate->scanhash = (void*)&scanhash_skein2_4way;
// gate->hash = (void*)&skein2hash_4way;
gate->prehash = (void*)&skein2_4way_prehash;
gate->hash = (void*)&skein2hash_4way;
#else
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;

View File

@@ -14,24 +14,20 @@
void skeinhash_8way( void *output, const void *input );
int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int skein_8way_prehash( struct work * );
void skein2hash_8way( void *output, const void *input );
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
int skein2_8way_prehash( struct work * );
#elif defined(SKEIN_4WAY)
void skeinhash_4way( void *output, const void *input );
int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int skein_4way_prehash( struct work * );
void skein2hash_4way( void *output, const void *input );
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t* hashes_done, struct thr_info *mythr );
int skein2_4way_prehash( struct work * );
#else

View File

@@ -31,19 +31,18 @@ int scanhash_skein( struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id;
int thr_id = mythr->id; // thr_id arg is deprecated
swab32_array( endiandata, pdata, 20 );
do {
be32enc(&endiandata[19], n);
skeinhash(hash64, endiandata);
if (hash64[7] <= Htarg )
if ( fulltest(hash64, ptarget) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);

View File

@@ -5,17 +5,9 @@
#if defined(SKEIN_8WAY)
static skein512_8way_context skein512_8way_ctx __attribute__ ((aligned (64)));
static uint32_t skein2_8way_vdata[20*8] __attribute__ ((aligned (64)));
static __thread skein512_8way_context skein512_8way_ctx
__attribute__ ((aligned (64)));
// skein2 8-way prehash: byte-swap and interleave the 80-byte header into the
// shared lane buffer, then precompute the Skein-512 midstate on the first
// 64 bytes. Scanhash copies this state and only reprocesses the nonce tail.
int skein2_8way_prehash( struct work *work )
{
   uint32_t *const vbuf = skein2_8way_vdata;
   mm512_bswap32_intrlv80_8x64( vbuf, work->data );
   skein512_8way_prehash64( &skein512_8way_ctx, vbuf );
   return 1;
}
/* not used
void skein2hash_8way( void *output, const void *input )
{
uint64_t hash[16*8] __attribute__ ((aligned (128)));
@@ -25,7 +17,6 @@ void skein2hash_8way( void *output, const void *input )
skein512_8way_final16( &ctx, hash, input + (64*8) );
skein512_8way_full( &ctx, output, hash, 64 );
}
*/
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
@@ -45,14 +36,11 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
skein512_8way_context ctx;
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, skein2_8way_vdata, sizeof vdata );
memcpy( &ctx, &skein512_8way_ctx, sizeof ctx );
pthread_rwlock_unlock( &g_work_lock );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
skein512_8way_prehash64( &ctx, vdata );
do
{
skein512_8way_final16( &ctx, hash, vdata + (16*8) );
@@ -79,18 +67,10 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
}
#elif defined(SKEIN_4WAY)
static skein512_4way_context skein512_4way_ctx __attribute__ ((aligned (64)));
static uint32_t skein2_4way_vdata[20*4] __attribute__ ((aligned (64)));
// skein2 4-way (AVX2) prehash: interleave the byte-swapped header and
// precompute the Skein-512 midstate over its first 64 bytes.
int skein2_4way_prehash( struct work *work )
{
   uint32_t *const vbuf = skein2_4way_vdata;
   mm256_bswap32_intrlv80_4x64( vbuf, work->data );
   skein512_4way_prehash64( &skein512_4way_ctx, vbuf );
   return 1;
}
/* not used
static __thread skein512_4way_context skein512_4way_ctx
__attribute__ ((aligned (64)));
void skein2hash_4way( void *output, const void *input )
{
skein512_4way_context ctx;
@@ -100,7 +80,6 @@ void skein2hash_4way( void *output, const void *input )
skein512_4way_final16( &ctx, hash, input + (64*4) );
skein512_4way_full( &ctx, output, hash, 64 );
}
*/
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
@@ -120,11 +99,8 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
skein512_4way_context ctx;
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, skein2_4way_vdata, sizeof vdata );
memcpy( &ctx, &skein512_4way_ctx, sizeof ctx );
pthread_rwlock_unlock( &g_work_lock );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &ctx, vdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do

View File

@@ -34,31 +34,31 @@ void skein2hash(void *output, const void *input)
sph_skein512_close(&ctx_skein, hash);
memcpy(output, hash, 32);
}
int scanhash_skein2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t hash64[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id;
int thr_id = mythr->id; // thr_id arg is deprecated
swab32_array( endiandata, pdata, 20 );
swab32_array( endiandata, pdata, 20 );
do {
be32enc(&endiandata[19], n);
skein2hash(hash64, endiandata);
if (hash64[7] <= Htarg )
if ( fulltest(hash64, ptarget) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);

View File

@@ -25,7 +25,7 @@ static void hex_getAlgoString(const uint32_t* prevblock, char *output)
static __thread x16r_context_overlay hex_ctx;
int hex_hash( void* output, const void* input, const int thrid )
int hex_hash( void* output, const void* input, int thrid )
{
uint32_t _ALIGN(128) hash[16];
x16r_context_overlay ctx;

View File

@@ -72,7 +72,7 @@ struct TortureGarden
// Get a 64-byte hash for given 64-byte input, using given TortureGarden contexts and given algo index
static int get_hash( void *output, const void *input, TortureGarden *garden,
unsigned int algo, const int thr_id )
unsigned int algo, int thr_id )
{
unsigned char hash[64] __attribute__ ((aligned (64)));
int rc = 1;
@@ -233,7 +233,7 @@ bool initialize_torture_garden()
}
// Produce a 32-byte hash from 80-byte input data
int minotaur_hash( void *output, const void *input, const int thr_id )
int minotaur_hash( void *output, const void *input, int thr_id )
{
unsigned char hash[64] __attribute__ ((aligned (64)));
int rc = 1;

View File

@@ -19,7 +19,7 @@
// Perform midstate prehash of hash functions with block size <= 72 bytes,
// 76 bytes for hash functions that operate on 32 bit data.
void x16r_8way_do_prehash( void *vdata, const void *pdata )
void x16r_8way_prehash( void *vdata, void *pdata )
{
uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
@@ -106,18 +106,11 @@ void x16r_8way_do_prehash( void *vdata, const void *pdata )
}
}
// Per-work-unit prehash for the x16r family (8-way): first resolve the hash
// function order for this block via the registered gate callback (the order
// is derived from the work data), then run the vectored midstate prehash.
// Order matters: do_prehash depends on the freshly computed hash order.
int x16r_8way_prehash( struct work *work )
{
   x16r_gate_get_hash_order( work, x16r_hash_order );
   x16r_8way_do_prehash( x16r_8way_vdata, work->data );
   return 1;  // prehash never fails
}
// Perform the full x16r hash and returns 512 bit intermediate hash.
// Called by wrapper hash function to optionally continue hashing and
// convert to final hash.
int x16r_8way_hash_generic( void* output, const void* input, const int thrid )
int x16r_8way_hash_generic( void* output, const void* input, int thrid )
{
uint32_t vhash[20*8] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (16)));
@@ -478,7 +471,7 @@ int x16r_8way_hash_generic( void* output, const void* input, const int thrid )
// x16-r,-s,-rt wrapper called directly by scanhash to repackage 512 bit
// hash to 256 bit final hash.
int x16r_8way_hash( void* output, const void* input, const int thrid )
int x16r_8way_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*8] __attribute__ ((aligned (128)));
if ( !x16r_8way_hash_generic( hash, input, thrid ) )
@@ -502,6 +495,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -514,16 +508,27 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, x16r_8way_vdata, sizeof vdata );
pthread_rwlock_unlock( &g_work_lock );
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
}
x16r_8way_prehash( vdata, pdata );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if( algo_gate.hash( hash, vdata, thr_id ) );
if( x16r_8way_hash( hash, vdata, thr_id ) );
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -541,7 +546,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
#elif defined (X16R_4WAY)
void x16r_4way_do_prehash( void *vdata, const void *pdata )
void x16r_4way_prehash( void *vdata, void *pdata )
{
uint32_t vdata2[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
@@ -622,14 +627,7 @@ void x16r_4way_do_prehash( void *vdata, const void *pdata )
}
}
// 4-way counterpart of x16r_8way_prehash: resolve the per-block hash
// function order, then run the vectored midstate prehash. The hash order
// must be computed before do_prehash, which relies on it.
int x16r_4way_prehash( struct work *work )
{
   x16r_gate_get_hash_order( work, x16r_hash_order );
   x16r_4way_do_prehash( x16r_4way_vdata, work->data );
   return 1;  // prehash never fails
}
int x16r_4way_hash_generic( void* output, const void* input, const int thrid )
int x16r_4way_hash_generic( void* output, const void* input, int thrid )
{
uint32_t vhash[20*4] __attribute__ ((aligned (128)));
uint32_t hash0[20] __attribute__ ((aligned (32)));
@@ -637,14 +635,13 @@ int x16r_4way_hash_generic( void* output, const void* input, const int thrid )
uint32_t hash2[20] __attribute__ ((aligned (32)));
uint32_t hash3[20] __attribute__ ((aligned (32)));
x16r_4way_context_overlay ctx;
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
int size = 80;
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
for ( int i = 0; i < 16; i++ )
@@ -908,7 +905,7 @@ int x16r_4way_hash_generic( void* output, const void* input, const int thrid )
return 1;
}
int x16r_4way_hash( void* output, const void* input, const int thrid )
int x16r_4way_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64*4] __attribute__ ((aligned (64)));
if ( !x16r_4way_hash_generic( hash, input, thrid ) )
@@ -927,6 +924,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -939,15 +937,25 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0cff;
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, x16r_4way_vdata, sizeof vdata );
pthread_rwlock_unlock( &g_work_lock );
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "Hash order %s Ntime %08x", x16r_hash_order, ntime );
}
x16r_4way_prehash( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( algo_gate.hash( hash, vdata, thr_id ) );
if ( x16r_4way_hash( hash, vdata, thr_id ) );
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

View File

@@ -1,44 +1,26 @@
#include "x16r-gate.h"
#include "algo/sha/sha256d.h"
char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = {0};
__thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 };
void (*x16r_gate_get_hash_order) ( const struct work *, char * ) = NULL;
void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ) = NULL;
#if defined (X16R_8WAY)
x16r_8way_context_overlay x16r_ctx;
uint32_t x16r_8way_vdata[24*8] __attribute__ ((aligned (64)));
__thread x16r_8way_context_overlay x16r_ctx;
#elif defined (X16R_4WAY)
x16r_4way_context_overlay x16r_ctx;
uint32_t x16r_4way_vdata[24*4] __attribute__ ((aligned (64)));
__thread x16r_4way_context_overlay x16r_ctx;
#endif
#if defined (X16RV2_8WAY)
__thread x16r_context_overlay x16_ctx;
x16rv2_8way_context_overlay x16rv2_ctx;
#elif defined (X16RV2_4WAY)
x16rv2_4way_context_overlay x16rv2_ctx;
#endif
x16r_context_overlay x16_ctx;
uint32_t x16r_edata[24] __attribute__ ((aligned (32)));
void x16r_get_hash_order( const struct work *work, char *hash_order )
void x16r_getAlgoString( const uint8_t* prevblock, char *output )
{
char *sptr = hash_order;
const uint32_t *pdata = work->data;
uint8_t prevblock[16];
((uint32_t*)prevblock)[0] = bswap_32( pdata[1] );
((uint32_t*)prevblock)[1] = bswap_32( pdata[2] );
char *sptr = output;
for ( int j = 0; j < X16R_HASH_FUNC_COUNT; j++ )
{
uint8_t b = (15 - j) >> 1; // 16 first ascii hex chars (lsb in uint256)
@@ -50,51 +32,38 @@ void x16r_get_hash_order( const struct work *work, char *hash_order )
sptr++;
}
*sptr = '\0';
if ( !opt_quiet )
applog( LOG_INFO, "Hash order %s", x16r_hash_order );
}
void x16s_get_hash_order( const struct work *work, char *hash_order )
void x16s_getAlgoString( const uint8_t* prevblock, char *output )
{
const uint32_t *pdata = work->data;
uint8_t prevblock[16];
((uint32_t*)prevblock)[0] = bswap_32( pdata[1] );
((uint32_t*)prevblock)[1] = bswap_32( pdata[2] );
strcpy( hash_order, "0123456789ABCDEF" );
strcpy( output, "0123456789ABCDEF" );
for ( int i = 0; i < 16; i++ )
{
uint8_t b = (15 - i) >> 1; // 16 ascii hex chars, reversed
uint8_t algoDigit = (i & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
int offset = algoDigit;
// insert the nth character at the front
char oldVal = hash_order[ offset ];
char oldVal = output[offset];
for( int j = offset; j-- > 0; )
hash_order[ j+1 ] = hash_order[ j ];
hash_order[ 0 ] = oldVal;
output[j+1] = output[j];
output[0] = oldVal;
}
if ( !opt_quiet )
applog( LOG_INFO, "Hash order %s", x16r_hash_order );
}
bool register_x16r_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->prehash = (void*)&x16r_8way_prehash;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->prehash = (void*)&x16r_4way_prehash;
gate->hash = (void*)&x16r_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->prehash = (void*)&x16r_prehash;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
x16r_gate_get_hash_order = (void*)&x16r_get_hash_order;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
};
@@ -102,20 +71,17 @@ bool register_x16r_algo( algo_gate_t* gate )
bool register_x16rv2_algo( algo_gate_t* gate )
{
#if defined (X16RV2_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->prehash = (void*)&x16rv2_8way_prehash;
gate->scanhash = (void*)&scanhash_x16rv2_8way;
gate->hash = (void*)&x16rv2_8way_hash;
#elif defined (X16RV2_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->prehash = (void*)&x16rv2_4way_prehash;
gate->scanhash = (void*)&scanhash_x16rv2_4way;
gate->hash = (void*)&x16rv2_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->prehash = (void*)&x16rv2_prehash;
gate->scanhash = (void*)&scanhash_x16rv2;
gate->hash = (void*)&x16rv2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
x16r_gate_get_hash_order = (void*)&x16r_get_hash_order;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
};
@@ -124,19 +90,16 @@ bool register_x16s_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->prehash = (void*)&x16r_8way_prehash;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->prehash = (void*)&x16r_4way_prehash;
gate->hash = (void*)&x16r_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->prehash = (void*)&x16r_prehash;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
x16r_gate_get_hash_order = (void*)&x16s_get_hash_order;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;
};
@@ -145,33 +108,30 @@ bool register_x16s_algo( algo_gate_t* gate )
//
// X16RT
void x16rt_get_hash_order( const struct work * work, char * hash_order )
{
uint32_t _ALIGN(64) timehash[8*8];
const uint32_t ntime = bswap_32( work->data[17] );
const int32_t masked_ntime = ntime & 0xffffff80;
uint8_t* data = (uint8_t*)timehash;
char *sptr = hash_order;
sha256d( (unsigned char*)timehash, (const unsigned char*)( &masked_ntime ),
sizeof( masked_ntime ) );
// x16rt derives its hash order from a time window: clear the low 7 bits of
// the timestamp (128-second granularity) and double-SHA256 the masked value
// into timeHash (caller provides a buffer large enough for a 32-byte digest).
void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash )
{
   const int32_t window = timeStamp & 0xffffff80;
   sha256d( (unsigned char*)timeHash, (const unsigned char*)&window,
            sizeof window );
}
for ( uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++ )
{
void x16rt_getAlgoString( const uint32_t *timeHash, char *output)
{
char *sptr = output;
uint8_t* data = (uint8_t*)timeHash;
for (uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++) {
uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed
uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4;
if ( algoDigit >= 10 )
sprintf( sptr, "%c", 'A' + (algoDigit - 10) );
if (algoDigit >= 10)
sprintf(sptr, "%c", 'A' + (algoDigit - 10));
else
sprintf( sptr, "%u", (uint32_t) algoDigit );
sprintf(sptr, "%u", (uint32_t) algoDigit);
sptr++;
}
*sptr = '\0';
if ( !opt_quiet )
applog( LOG_INFO, "Hash order %s, ntime %08x, time hash %08x",
hash_order, ntime, timehash );
}
void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
@@ -262,19 +222,15 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_x16rt_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->prehash = (void*)&x16r_8way_prehash;
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->prehash = (void*)&x16r_4way_prehash;
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16r_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->prehash = (void*)&x16r_prehash;
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16r_hash;
#endif
x16r_gate_get_hash_order = (void*)&x16rt_get_hash_order;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
opt_target_factor = 256.0;
return true;
@@ -283,20 +239,16 @@ bool register_x16rt_algo( algo_gate_t* gate )
bool register_x16rt_veil_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->prehash = (void*)&x16r_8way_prehash;
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->prehash = (void*)&x16r_4way_prehash;
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16r_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->prehash = (void*)&x16r_prehash;
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
x16r_gate_get_hash_order = (void*)&x16rt_get_hash_order;
gate->build_extraheader = (void*)&veil_build_extraheader;
opt_target_factor = 256.0;
return true;
@@ -323,23 +275,20 @@ bool register_hex_algo( algo_gate_t* gate )
bool register_x21s_algo( algo_gate_t* gate )
{
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->prehash = (void*)&x16r_8way_prehash;
gate->scanhash = (void*)&scanhash_x21s_8way;
gate->hash = (void*)&x21s_8way_hash;
gate->miner_thread_init = (void*)&x21s_8way_thread_init;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->prehash = (void*)&x16r_4way_prehash;
gate->scanhash = (void*)&scanhash_x21s_4way;
gate->hash = (void*)&x21s_4way_hash;
gate->miner_thread_init = (void*)&x21s_4way_thread_init;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->prehash = (void*)&x16r_prehash;
gate->scanhash = (void*)&scanhash_x21s;
gate->hash = (void*)&x21s_hash;
gate->miner_thread_init = (void*)&x21s_thread_init;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
x16r_gate_get_hash_order = (void*)&x16s_get_hash_order;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;
};

View File

@@ -21,7 +21,6 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/tiger/sph_tiger.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -58,11 +57,13 @@
#define X16R_8WAY 1
#define X16RV2_8WAY 1
#define X16RT_8WAY 1
#define X21S_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16RV2_4WAY 1
#define X16RT_4WAY 1
#define X21S_4WAY 1
#define X16R_4WAY 1
@@ -88,29 +89,23 @@ enum x16r_Algo {
X16R_HASH_FUNC_COUNT
};
extern __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];
//extern __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];
extern char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];
extern void (*x16r_gate_get_hash_order) ( const struct work *, char * );
// x16r, x16rv2
void x16r_get_hash_order( const struct work *, char * );
// x16s, x21s
void x16s_get_hash_order( const struct work *, char * );
// x16rt
void x16rt_get_hash_order( const struct work *, char * );
extern void (*x16_r_s_getAlgoString) ( const uint8_t*, char* );
void x16r_getAlgoString( const uint8_t *prevblock, char *output );
void x16s_getAlgoString( const uint8_t *prevblock, char *output );
void x16rt_getAlgoString( const uint32_t *timeHash, char *output );
void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash );
bool register_x16r_algo( algo_gate_t* gate );
bool register_x16rv2_algo( algo_gate_t* gate );
bool register_x16s_algo( algo_gate_t* gate );
bool register_x16rt_algo( algo_gate_t* gate );
bool register_hex_algo( algo_gate_t* gate );
bool register_x21s_algo( algo_gate_t* gate );
bool register_hex__algo( algo_gate_t* gate );
bool register_x21s__algo( algo_gate_t* gate );
// x16r, x16s, x16rt
// x16r, x16s
#if defined(X16R_8WAY)
union _x16r_8way_context_overlay
@@ -141,15 +136,15 @@ union _x16r_8way_context_overlay
typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
extern x16r_8way_context_overlay x16r_ctx;
extern uint32_t x16r_8way_vdata[24*8] __attribute__ ((aligned (64)));
extern __thread x16r_8way_context_overlay x16r_ctx;
void x16r_8way_do_prehash( void *, const void * );
int x16r_8way_prehash( struct work * );
int x16r_8way_hash_generic( void *, const void *, const int );
int x16r_8way_hash( void *, const void *, const int );
void x16r_8way_prehash( void *, void * );
int x16r_8way_hash_generic( void *, const void *, int );
int x16r_8way_hash( void *, const void *, int );
int scanhash_x16r_8way( struct work *, uint32_t ,
uint64_t *, struct thr_info * );
extern __thread x16r_8way_context_overlay x16r_ctx;
#elif defined(X16R_4WAY)
@@ -182,15 +177,14 @@ union _x16r_4way_context_overlay
typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
extern x16r_4way_context_overlay x16r_ctx;
extern uint32_t x16r_4way_vdata[24*4] __attribute__ ((aligned (64)));
extern __thread x16r_4way_context_overlay x16r_ctx;
void x16r_4way_do_prehash( void *, const void * );
int x16r_4way_prehash( struct work * );
int x16r_4way_hash_generic( void *, const void *, const int );
int x16r_4way_hash( void *, const void *, const int );
void x16r_4way_prehash( void *, void * );
int x16r_4way_hash_generic( void *, const void *, int );
int x16r_4way_hash( void *, const void *, int );
int scanhash_x16r_4way( struct work *, uint32_t,
uint64_t *, struct thr_info * );
extern __thread x16r_4way_context_overlay x16r_ctx;
#endif
@@ -223,113 +217,80 @@ union _x16r_context_overlay
typedef union _x16r_context_overlay x16r_context_overlay;
extern x16r_context_overlay x16_ctx;
extern uint32_t x16r_edata[24] __attribute__ ((aligned (32)));
extern __thread x16r_context_overlay x16_ctx;
void x16r_do_prehash( const void * );
int x16r_prehash( const struct work * );
int x16r_hash_generic( void *, const void *, const int );
int x16r_hash( void *, const void *, const int );
void x16r_prehash( void *, void * );
int x16r_hash_generic( void *, const void *, int );
int x16r_hash( void *, const void *, int );
int scanhash_x16r( struct work *, uint32_t, uint64_t *, struct thr_info * );
// x16Rv2
#if defined(X16RV2_8WAY)
union _x16rv2_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cubehashParam cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
sph_tiger_context tiger;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
echo_4way_context echo;
#else
hashState_groestl groestl;
shavite512_context shavite;
hashState_echo echo;
#endif
} __attribute__ ((aligned (64)));
typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay;
extern x16rv2_8way_context_overlay x16rv2_ctx;
int x16rv2_8way_prehash( struct work * );
int x16rv2_8way_hash( void *state, const void *input, const int thrid );
//int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr );
int x16rv2_8way_hash( void *state, const void *input, int thrid );
int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16RV2_4WAY)
union _x16rv2_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
shavite512_2way_context shavite;
echo_2way_context echo;
int x16rv2_4way_hash( void *state, const void *input, int thrid );
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
hashState_groestl groestl;
shavite512_context shavite;
hashState_echo echo;
int x16rv2_hash( void *state, const void *input, int thr_id );
int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
sph_tiger_context tiger;
};
typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
extern x16rv2_4way_context_overlay x16rv2_ctx;
// x16rt, veil
#if defined(X16R_8WAY)
int x16rv2_4way_hash( void *state, const void *input, const int thrid );
int x16rv2_4way_prehash( struct work * );
//void x16rt_8way_hash( void *state, const void *input );
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16R_4WAY)
//void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
int x16rv2_hash( void *state, const void *input, const int thr_id );
int x16rv2_prehash( const struct work * );
//void x16rt_hash( void *state, const void *input );
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
// x21s
#if defined(X16R_8WAY)
int x21s_8way_hash( void *state, const void *input, const int thrid );
int x21s_8way_hash( void *state, const void *input, int thrid );
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_8way_thread_init();
#elif defined(X16R_4WAY)
int x21s_4way_hash( void *state, const void *input, const int thrid );
int x21s_4way_hash( void *state, const void *input, int thrid );
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_4way_thread_init();
#else
int x21s_hash( void *state, const void *input, const int thr_id );
int x21s_hash( void *state, const void *input, int thr_id );
int scanhash_x21s( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_thread_init();
#endif
//void hex_hash( void *state, const void *input );
int scanhash_hex( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -10,7 +10,7 @@
#include <stdlib.h>
#include <string.h>
void x16r_do_prehash( const void *edata )
void x16r_prehash( void *edata, void *pdata )
{
const char elem = x16r_hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
@@ -48,7 +48,7 @@ void x16r_do_prehash( const void *edata )
}
}
int x16r_hash_generic( void* output, const void* input, const int thrid )
int x16r_hash_generic( void* output, const void* input, int thrid )
{
uint32_t _ALIGN(128) hash[16];
x16r_context_overlay ctx;
@@ -192,15 +192,7 @@ int x16r_hash_generic( void* output, const void* input, const int thrid )
return true;
}
int x16r_prehash( const struct work *work )
{
mm128_bswap32_80( x16r_edata, work->data );
x16r_gate_get_hash_order( work, x16r_hash_order );
x16r_do_prehash( x16r_edata );
return 1;
}
int x16r_hash( void* output, const void* input, const int thrid )
int x16r_hash( void* output, const void* input, int thrid )
{
uint8_t hash[64] __attribute__ ((aligned (64)));
if ( !x16r_hash_generic( hash, input, thrid ) )
@@ -213,8 +205,8 @@ int x16r_hash( void* output, const void* input, const int thrid )
int scanhash_x16r( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(32) hash32[8];
uint32_t _ALIGN(32) edata[20];
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -224,14 +216,24 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
pthread_rwlock_rdlock( &g_work_lock );
memcpy( edata, x16r_edata, sizeof edata );
pthread_rwlock_unlock( &g_work_lock );
mm128_bswap32_80( edata, pdata );
static __thread uint32_t s_ntime = UINT32_MAX;
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
}
x16r_prehash( edata, pdata );
do
{
edata[19] = nonce;
if ( algo_gate.hash( hash32, edata, thr_id ) )
if ( x16r_hash( hash32, edata, thr_id ) )
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( nonce );

113
algo/x16/x16rt-4way.c Normal file
View File

@@ -0,0 +1,113 @@
#include "x16r-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if defined (X16R_8WAY)
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) timeHash[8*8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
if ( s_ntime != masked_ntime )
{
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
}
x16r_8way_prehash( vdata, pdata );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_8way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (X16R_4WAY)
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) timeHash[4*8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
if ( s_ntime != masked_ntime )
{
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "Hash order %s, Ntime %08x, time hash %08x",
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
}
x16r_4way_prehash( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_4way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
n += 4;
} while ( ( n < last_nonce ) && !(*restart) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

53
algo/x16/x16rt.c Normal file
View File

@@ -0,0 +1,53 @@
#include "x16r-gate.h"
#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) edata[20];
uint32_t _ALIGN(64) timeHash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int thr_id = mythr->id;
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
mm128_bswap32_80( edata, pdata );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80;
if ( s_ntime != masked_ntime )
{
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
x16r_hash_order, swab32( pdata[17] ), timeHash );
}
x16r_prehash( edata, pdata );
do
{
edata[19] = nonce;
if ( x16r_hash( hash32, edata, thr_id ) )
if ( valid_hash( hash32, ptarget ) && !bench )
{
pdata[19] = bswap_32( nonce );
submit_solution( work, hash32, mythr );
}
nonce++;
} while ( nonce < max_nonce && !(*restart) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 0;
}
#endif // !defined(X16R_8WAY) && !defined(X16R_4WAY)

View File

@@ -12,73 +12,37 @@
#if defined (X16RV2_8WAY)
void x16rv2_8way_do_prehash( void *vdata, void *pdata )
union _x16rv2_8way_context_overlay
{
uint32_t vdata32[20*8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cubehashParam cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
sph_tiger_context tiger;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
echo_4way_context echo;
#else
hashState_groestl groestl;
shavite512_context shavite;
hashState_echo echo;
#endif
} __attribute__ ((aligned (64)));
const char elem = x16r_hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay;
static __thread x16rv2_8way_context_overlay x16rv2_ctx;
switch ( algo )
{
case JH:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
jh512_8way_init( &x16rv2_ctx.jh );
jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
case SHA_512:
mm128_bswap32_80( edata, pdata );
sph_tiger_init( &x16rv2_ctx.tiger );
sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
case SKEIN:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
skein512_8way_init( &x16rv2_ctx.skein );
skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 );
break;
case CUBEHASH:
mm128_bswap32_80( edata, pdata );
cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16rv2_ctx.hamsi );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata32, pdata );
shabal512_8way_init( &x16rv2_ctx.shabal );
shabal512_8way_update( &x16rv2_ctx.shabal, vdata32, 64 );
rintrlv_8x32_8x64( vdata, vdata32, 640 );
break;
case WHIRLPOOL:
mm128_bswap32_80( edata, pdata );
sph_whirlpool_init( &x16rv2_ctx.whirlpool );
sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
default:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
}
}
int x16rv2_8way_prehash( struct work *work )
{
x16r_gate_get_hash_order( work, x16r_hash_order );
x16rv2_8way_do_prehash( x16r_8way_vdata, work->data );
return 1;
}
int x16rv2_8way_hash( void* output, const void* input, const int thrid )
int x16rv2_8way_hash( void* output, const void* input, int thrid )
{
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
uint32_t hash0[24] __attribute__ ((aligned (32)));
@@ -593,28 +557,50 @@ int x16rv2_8way_hash( void* output, const void* input, const int thrid )
return 1;
}
#elif defined (X16RV2_4WAY)
// Pad the 24 bytes tiger hash to 64 bytes
inline void padtiger512( uint32_t* hash )
int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
for ( int i = 6; i < 16; i++ ) hash[i] = 0;
}
void x16rv2_4way_do_prehash( void *vdata, void *pdata )
{
uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
const char elem = x16r_hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
jh512_4way_init( &x16rv2_ctx.jh );
jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
jh512_8way_init( &x16rv2_ctx.jh );
jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
@@ -622,45 +608,100 @@ void x16rv2_4way_do_prehash( void *vdata, void *pdata )
mm128_bswap32_80( edata, pdata );
sph_tiger_init( &x16rv2_ctx.tiger );
sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
case SKEIN:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
skein512_8way_init( &x16rv2_ctx.skein );
skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 );
break;
case CUBEHASH:
mm128_bswap32_80( edata, pdata );
cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16rv2_ctx.hamsi );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16rv2_ctx.hamsi );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 );
break;
case SHABAL:
mm128_bswap32_intrlv80_4x32( vdata32, pdata );
shabal512_4way_init( &x16rv2_ctx.shabal );
shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 );
rintrlv_4x32_4x64( vdata, vdata32, 640 );
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
shabal512_8way_init( &x16rv2_ctx.shabal );
shabal512_8way_update( &x16rv2_ctx.shabal, vdata2, 64 );
rintrlv_8x32_8x64( vdata, vdata2, 640 );
break;
case WHIRLPOOL:
mm128_bswap32_80( edata, pdata );
sph_whirlpool_init( &x16rv2_ctx.whirlpool );
sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
default:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
}
}
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x16rv2_8way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
int x16rv2_4way_prehash( struct work *work )
#elif defined (X16RV2_4WAY)
union _x16rv2_4way_context_overlay
{
x16r_gate_get_hash_order( work, x16r_hash_order );
x16rv2_4way_do_prehash( x16r_4way_vdata, work->data );
return 1;
blake512_4way_context blake;
bmw512_4way_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
shavite512_2way_context shavite;
echo_2way_context echo;
#else
hashState_groestl groestl;
shavite512_context shavite;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
sph_tiger_context tiger;
};
typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
static __thread x16rv2_4way_context_overlay x16rv2_ctx;
// Pad the 24 bytes tiger hash to 64 bytes
inline void padtiger512( uint32_t* hash )
{
for ( int i = 6; i < 16; i++ ) hash[i] = 0;
}
int x16rv2_4way_hash( void* output, const void* input, int thrid )
@@ -1007,4 +1048,107 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
return 1;
}
int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20];
uint32_t bedata1[2];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
__m256i *noncev = (__m256i*)vdata + 9;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0fff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
const uint32_t ntime = bswap_32(pdata[17]);
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
const char elem = x16r_hash_order[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
jh512_4way_init( &x16rv2_ctx.jh );
jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
case SHA_512:
mm128_bswap32_80( edata, pdata );
sph_tiger_init( &x16rv2_ctx.tiger );
sph_tiger( &x16rv2_ctx.tiger, edata, 64 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
case SKEIN:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
break;
case CUBEHASH:
mm128_bswap32_80( edata, pdata );
cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 );
cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16rv2_ctx.hamsi );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 );
break;
case SHABAL:
mm128_bswap32_intrlv80_4x32( vdata32, pdata );
shabal512_4way_init( &x16rv2_ctx.shabal );
shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 );
rintrlv_4x32_4x64( vdata, vdata32, 640 );
break;
case WHIRLPOOL:
mm128_bswap32_80( edata, pdata );
sph_whirlpool_init( &x16rv2_ctx.whirlpool );
sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
default:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
}
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x16rv2_4way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -43,16 +43,9 @@ inline void padtiger512(uint32_t* hash) {
for (int i = (24/4); i < (64/4); i++) hash[i] = 0;
}
// no prehash
int x16rv2_prehash( const struct work *work )
int x16rv2_hash( void* output, const void* input, int thrid )
{
x16r_gate_get_hash_order( work, x16r_hash_order );
return 1;
}
int x16rv2_hash( void* output, const void* input, const int thrid )
{
uint32_t _ALIGN(32) hash[16];
uint32_t _ALIGN(128) hash[16];
x16rv2_context_overlay ctx;
void *in = (void*) input;
int size = 80;
@@ -177,4 +170,52 @@ int x16rv2_hash( void* output, const void* input, const int thrid )
return 1;
}
int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int thr_id = mythr->id;
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
static __thread uint32_t s_ntime = UINT32_MAX;
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*) (&edata[1]), x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)",
x16r_hash_order, ntime );
}
if ( bench ) ptarget[7] = 0x0cff;
do
{
edata[19] = nonce;
if ( x16rv2_hash( hash32, edata, thr_id ) )
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( nonce );
submit_solution( work, hash32, mythr );
}
nonce++;
} while ( nonce < max_nonce && !(*restart) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 0;
}
#endif

View File

@@ -30,7 +30,7 @@ union _x21s_8way_context_overlay
typedef union _x21s_8way_context_overlay x21s_8way_context_overlay;
int x21s_8way_hash( void* output, const void* input, const int thrid )
int x21s_8way_hash( void* output, const void* input, int thrid )
{
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
uint8_t shash[64*8] __attribute__ ((aligned (64)));
@@ -129,6 +129,66 @@ int x21s_8way_hash( void* output, const void* input, const int thrid )
return 1;
}
int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &hash[7<<3];
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
const int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
x16r_8way_prehash( vdata, pdata );
*noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x21s_8way_hash( hash, vdata, thr_id ) )
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
bool x21s_8way_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
@@ -155,7 +215,7 @@ union _x21s_4way_context_overlay
typedef union _x21s_4way_context_overlay x21s_4way_context_overlay;
int x21s_4way_hash( void* output, const void* input, const int thrid )
int x21s_4way_hash( void* output, const void* input, int thrid )
{
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
uint8_t shash[64*4] __attribute__ ((aligned (64)));
@@ -231,6 +291,58 @@ int x21s_4way_hash( void* output, const void* input, const int thrid )
return 1;
}
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( bench ) ptarget[7] = 0x0cff;
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
}
x16r_4way_prehash( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x21s_4way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+i );
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
bool x21s_4way_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols

View File

@@ -27,7 +27,7 @@ union _x21s_context_overlay
};
typedef union _x21s_context_overlay x21s_context_overlay;
int x21s_hash( void* output, const void* input, const int thrid )
int x21s_hash( void* output, const void* input, int thrid )
{
uint32_t _ALIGN(128) hash[16];
x21s_context_overlay ctx;
@@ -57,6 +57,50 @@ int x21s_hash( void* output, const void* input, const int thrid )
return 1;
}
// Scalar x21s nonce scanner: hashes one candidate per iteration.
// Always returns 0; any share found is reported via submit_solution().
int scanhash_x21s( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t _ALIGN(128) hash32[8];
   uint32_t _ALIGN(128) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const int thr_id = mythr->id;
   uint32_t n = first_nonce;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   const bool bench = opt_benchmark;

   if ( bench ) ptarget[7] = 0x0cff;

   mm128_bswap32_80( edata, pdata );

   // Recompute the x16r hash-function order whenever ntime changes.
   static __thread uint32_t s_ntime = UINT32_MAX;
   if ( s_ntime != pdata[17] )
   {
      uint32_t ntime = swab32( pdata[17] );
      x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
          applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

   x16r_prehash( edata, pdata );

   do
   {
      edata[19] = n;
      if ( x21s_hash( hash32, edata, thr_id )
        && unlikely( valid_hash( hash32, ptarget ) && !bench ) )
      {
         pdata[19] = bswap_32( n );
         submit_solution( work, hash32, mythr );
      }
      n++;
   } while ( n < max_nonce && !(*restart) );

   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
}
bool x21s_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols

View File

@@ -31,26 +31,8 @@
yespower_params_t yespower_params;
// master g_work
sha256_context yespower_sha256_prehash_ctx;
uint32_t _ALIGN(64) yespower_endiandata[20];
// local work
__thread sha256_context sha256_prehash_ctx;
// Master-thread prehash for yespower: byte-swaps the block header into
// big-endian order and absorbs its first 64 bytes into a shared SHA-256
// context (yespower_sha256_prehash_ctx) for worker threads to copy.
int yespower_sha256_prehash( struct work *work )
{
// Swap words 0..18 only; word 19 is the nonce, filled in per scan iteration.
uint32_t *pdata = work->data;
for ( int k = 0; k < 19; k++ )
be32enc( &yespower_endiandata[k], pdata[k] );
sha256_ctx_init( &yespower_sha256_prehash_ctx );
// Only the first 64 bytes (words 0..15) are hashed here; the remaining
// 16 bytes of the 80-byte header are processed later with the nonce.
sha256_update( &yespower_sha256_prehash_ctx, yespower_endiandata, 64 );
return 1;
}
// YESPOWER
int yespower_hash( const char *input, char *output, uint32_t len, int thrid )
@@ -71,15 +53,14 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
// pthread_rwlock_rdlock( &g_work_lock );
memcpy( endiandata, yespower_endiandata, sizeof endiandata );
memcpy( &sha256_prehash_ctx, &yespower_sha256_prehash_ctx, sizeof sha256_prehash_ctx );
// pthread_rwlock_unlock( &g_work_lock );
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
endiandata[19] = n;
// do sha256 prehash
sha256_ctx_init( &sha256_prehash_ctx );
sha256_update( &sha256_prehash_ctx, endiandata, 64 );
do {
if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) )
if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark )
@@ -159,7 +140,6 @@ bool register_yespower_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->prehash = (void*)&yespower_sha256_prehash;
gate->hash = (void*)&yespower_hash;
opt_target_factor = 65536.0;
return true;
@@ -174,7 +154,6 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
yespower_params.perslen = 0;
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->prehash = (void*)&yespower_sha256_prehash;
gate->hash = (void*)&yespower_hash;
opt_target_factor = 65536.0;
return true;
@@ -186,7 +165,6 @@ bool register_yescrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->prehash = (void*)&yespower_sha256_prehash;
yespower_params.version = YESPOWER_0_5;
opt_target_factor = 65536.0;
@@ -220,7 +198,6 @@ bool register_yescryptr8_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->prehash = (void*)&yespower_sha256_prehash;
yespower_params.version = YESPOWER_0_5;
yespower_params.N = 2048;
yespower_params.r = 8;
@@ -234,7 +211,6 @@ bool register_yescryptr16_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->prehash = (void*)&yespower_sha256_prehash;
yespower_params.version = YESPOWER_0_5;
yespower_params.N = 4096;
yespower_params.r = 16;
@@ -248,7 +224,6 @@ bool register_yescryptr32_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
gate->prehash = (void*)&yespower_sha256_prehash;
yespower_params.version = YESPOWER_0_5;
yespower_params.N = 4096;
yespower_params.r = 32;

View File

@@ -80,8 +80,6 @@ extern yespower_params_t yespower_params;
extern __thread sha256_context sha256_prehash_ctx;
int yespower_sha256_prehash( struct work *work );
/**
* yespower_init_local(local):
* Initialize the thread-local (RAM) data structure. Actual memory allocation