This commit is contained in:
Jay D Dee
2019-06-18 13:15:45 -04:00
parent 71d6b97ee8
commit d6e8d7a46e
75 changed files with 1790 additions and 781 deletions

View File

@@ -42,9 +42,6 @@ openssl 1.1.0e or higher. Add one of the following, depending on the
compiler version, to CFLAGS:
"-march=native" or "-march=znver1" or "-msha".
Due to poor AVX2 performance on Ryzen, users should add -DRYZEN_ to CFLAGS
to override multiway AVX2 on algos with sha256, and use SHA instead.
Additional instructions for static compilation can be found here:
https://lxadm.com/Static_compilation_of_cpuminer
Static builds should only be considered in a homogeneous HW and SW environment.

View File

@@ -131,6 +131,7 @@ cpuminer_SOURCES = \
algo/lyra2/lyra2h-4way.c \
algo/lyra2/allium-4way.c \
algo/lyra2/allium.c \
algo/lyra2/phi2-4way.c \
algo/lyra2/phi2.c \
algo/m7m.c \
algo/neoscrypt/neoscrypt.c \
@@ -147,6 +148,9 @@ cpuminer_SOURCES = \
algo/quark/anime-gate.c \
algo/quark/anime.c \
algo/quark/anime-4way.c \
algo/quark/hmq1725-gate.c \
algo/quark/hmq1725-4way.c \
algo/quark/hmq1725.c \
algo/qubit/qubit-gate.c \
algo/qubit/qubit.c \
algo/qubit/qubit-2way.c \
@@ -257,7 +261,6 @@ cpuminer_SOURCES = \
algo/x17/xevan-gate.c \
algo/x17/xevan.c \
algo/x17/xevan-4way.c \
algo/x17/hmq1725.c \
algo/x17/sonoa-gate.c \
algo/x17/sonoa-4way.c \
algo/x17/sonoa.c \

View File

@@ -29,7 +29,7 @@ cpuminer-sse2.exe "-msse2" Core2, Nehalem
cpuminer-aes-sse42.exe "-march=westmere" Westmere
cpuminer-avx.exe "-march=corei7-avx" Sandy-Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell, Sky-Kaby-Coffeelake
cpuminer-zen "-march=znver1 -DRYZEN_" Ryzen
cpuminer-zen "-march=znver1" AMD Ryzen, Threadripper
If you like this software feel free to donate:

View File

@@ -38,9 +38,17 @@ supported.
Change Log
----------
v3.9.4
Faster AVX2 for lyra2v3, quark, anime.
Fixed skein AVX2 regression (invalid shares since v3.9.0) and faster.
Faster skein2 with 4way AVX2 enabled.
Automatic SHA override on Ryzen CPUs, no need for the -DRYZEN_ compile flag.
Ongoing restructuring.
v3.9.3.1
Skippped v3.9.3 due to misidentification of v3.9.2.5 as v3.9.3.
Skipped v3.9.3 due to misidentification of v3.9.2.5 as v3.9.3.
Fixed x16r algo 25% invalid share reject rate. The bug may have also
affected other algos.
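The automatic SHA override noted for v3.9.4 implies a runtime CPUID check rather than the old -DRYZEN_ compile flag. A minimal sketch of such a check, assuming GCC/Clang's <cpuid.h> (helper name hypothetical, not code from this commit):

#include <stdbool.h>
#include <cpuid.h>

// Hypothetical helper: true when the CPU advertises SHA extensions
// (CPUID leaf 7, sub-leaf 0, EBX bit 29), letting sha256-based algos
// prefer SHA-NI over multiway AVX2 on Ryzen at runtime.
static bool cpu_has_sha( void )
{
    unsigned int eax, ebx, ecx, edx;
    if ( !__get_cpuid_count( 7, 0, &eax, &ebx, &ecx, &edx ) )
        return false;
    return ( ebx >> 29 ) & 1;
}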

View File

@@ -112,7 +112,7 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
void free_memory(const argon2_context *context, uint8_t *memory,
size_t num, size_t size) {
size_t memory_size = num*size;
clear_internal_memory(memory, memory_size);
// clear_internal_memory(memory, memory_size);
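// (wiping disabled here: for mining workloads these buffers hold public
// block data, so the clear is presumably pure memory-bandwidth overhead)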
if (context->free_cbk) {
(context->free_cbk)(memory, memory_size);
} else {
@@ -137,7 +137,7 @@ void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) {
int FLAG_clear_internal_memory = 0;
void clear_internal_memory(void *v, size_t n) {
if (FLAG_clear_internal_memory && v) {
secure_wipe_memory(v, n);
// secure_wipe_memory(v, n);
}
}
@@ -559,7 +559,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
context->pwdlen);
if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
secure_wipe_memory(context->pwd, context->pwdlen);
// secure_wipe_memory(context->pwd, context->pwdlen);
context->pwdlen = 0;
}
}
@@ -580,7 +580,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
context->secretlen);
if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
secure_wipe_memory(context->secret, context->secretlen);
// secure_wipe_memory(context->secret, context->secretlen);
context->secretlen = 0;
}
}

View File

@@ -537,6 +537,8 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
}
}
sc->ptr = ptr;
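// h1 ping-pongs with a temp buffer during compression; copy back
// only if the final state ended up in the temp buffer.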
if ( h1 != sc->H )
memcpy_128( sc->H, h1, 16 );
}
@@ -571,6 +573,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_small( buf, (__m128i*)final_s, h1 );
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
@@ -1041,22 +1044,22 @@ static const __m256i final_s8[16] =
void bmw256_8way_init( bmw256_8way_context *ctx )
{
ctx->H[ 0] = _mm256_set1_epi64x( IV256[ 0] );
ctx->H[ 1] = _mm256_set1_epi64x( IV256[ 1] );
ctx->H[ 2] = _mm256_set1_epi64x( IV256[ 2] );
ctx->H[ 3] = _mm256_set1_epi64x( IV256[ 3] );
ctx->H[ 4] = _mm256_set1_epi64x( IV256[ 4] );
ctx->H[ 5] = _mm256_set1_epi64x( IV256[ 5] );
ctx->H[ 6] = _mm256_set1_epi64x( IV256[ 6] );
ctx->H[ 7] = _mm256_set1_epi64x( IV256[ 7] );
ctx->H[ 8] = _mm256_set1_epi64x( IV256[ 8] );
ctx->H[ 9] = _mm256_set1_epi64x( IV256[ 9] );
ctx->H[10] = _mm256_set1_epi64x( IV256[10] );
ctx->H[11] = _mm256_set1_epi64x( IV256[11] );
ctx->H[12] = _mm256_set1_epi64x( IV256[12] );
ctx->H[13] = _mm256_set1_epi64x( IV256[13] );
ctx->H[14] = _mm256_set1_epi64x( IV256[14] );
ctx->H[15] = _mm256_set1_epi64x( IV256[15] );
ctx->H[ 0] = _mm256_set1_epi32( IV256[ 0] );
ctx->H[ 1] = _mm256_set1_epi32( IV256[ 1] );
ctx->H[ 2] = _mm256_set1_epi32( IV256[ 2] );
ctx->H[ 3] = _mm256_set1_epi32( IV256[ 3] );
ctx->H[ 4] = _mm256_set1_epi32( IV256[ 4] );
ctx->H[ 5] = _mm256_set1_epi32( IV256[ 5] );
ctx->H[ 6] = _mm256_set1_epi32( IV256[ 6] );
ctx->H[ 7] = _mm256_set1_epi32( IV256[ 7] );
ctx->H[ 8] = _mm256_set1_epi32( IV256[ 8] );
ctx->H[ 9] = _mm256_set1_epi32( IV256[ 9] );
ctx->H[10] = _mm256_set1_epi32( IV256[10] );
ctx->H[11] = _mm256_set1_epi32( IV256[11] );
ctx->H[12] = _mm256_set1_epi32( IV256[12] );
ctx->H[13] = _mm256_set1_epi32( IV256[13] );
ctx->H[14] = _mm256_set1_epi32( IV256[14] );
ctx->H[15] = _mm256_set1_epi32( IV256[15] );
ctx->ptr = 0;
ctx->bit_count = 0;
@@ -1076,14 +1079,15 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
ptr = ctx->ptr;
h1 = ctx->H;
h2 = htmp;
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( buf + (ptr>>3), vdata, clen >> 3 );
vdata = vdata + (clen>>3);
memcpy_256( buf + (ptr>>2), vdata, clen >> 2 );
vdata = vdata + (clen>>2);
len -= clen;
ptr += clen;
if ( ptr == buf_size )
@@ -1097,6 +1101,7 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
}
}
ctx->ptr = ptr;
if ( h1 != ctx->H )
memcpy_256( ctx->H, h1, 16 );
}
@@ -1106,24 +1111,26 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
__m256i *buf;
__m256i h1[16], h2[16], *h;
size_t ptr, u, v;
// unsigned z;
const int buf_size = 64; // bytes of one lane, compatible with len
buf = ctx->buf;
ptr = ctx->ptr;
buf[ ptr>>3 ] = _mm256_set1_epi32( 0x80 );
ptr += 8;
buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
ptr += 4;
h = ctx->H;
if ( ptr > (buf_size - 8) )
if ( ptr > (buf_size - 4) )
{
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
memset_zero_256( buf + (ptr>>2), (buf_size - ptr) >> 2 );
compress_small_8way( buf, h, h1 );
ptr = 0;
h = h1;
}
memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( ctx->bit_count );
memset_zero_256( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = _mm256_set1_epi32( ctx->bit_count );
buf[ (buf_size - 4) >> 2 ] = m256_zero;
compress_small_8way( buf, h, h2 );
for ( u = 0; u < 16; u ++ )
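The epi64x to epi32 and >>3 to >>2 changes in the hunks above follow from BMW-256 8-way packing eight 32-bit lanes into each __m256i. A scalar model of the 8x32 interleaved layout this relies on (helper names hypothetical):

#include <stdint.h>

// 8x32 interleaving: 32-bit word w of lane l lives at v[w*8 + l].
// A byte offset `ptr` within one lane therefore maps to vector word
// ptr/4 (ptr>>2), not ptr/8 (ptr>>3); hence the shift fixes above.
static inline uint32_t get_word_8x32( const uint32_t *v, int w, int l )
{
    return v[ (w << 3) + l ];
}
static inline void put_word_8x32( uint32_t *v, int w, int l, uint32_t x )
{
    v[ (w << 3) + l ] = x;
}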

View File

@@ -47,7 +47,9 @@ bool lyra2rev3_thread_init()
int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v3_wholeMatrix = _mm_malloc( size, 64 );
#if defined (LYRA2REV3_4WAY)
#if defined (LYRA2REV3_8WAY)
init_lyra2rev3_8way_ctx();
#elif defined (LYRA2REV3_4WAY)
init_lyra2rev3_4way_ctx();
#else
init_lyra2rev3_ctx();
@@ -57,7 +59,10 @@ bool lyra2rev3_thread_init()
bool register_lyra2rev3_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV3_4WAY)
#if defined (LYRA2REV3_8WAY)
gate->scanhash = (void*)&scanhash_lyra2rev3_8way;
gate->hash = (void*)&lyra2rev3_8way_hash;
#elif defined (LYRA2REV3_4WAY)
gate->scanhash = (void*)&scanhash_lyra2rev3_4way;
gate->hash = (void*)&lyra2rev3_4way_hash;
#else
@@ -203,13 +208,18 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_phi2_algo( algo_gate_t* gate )
{
init_phi2_ctx();
// init_phi2_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->get_work_data_size = (void*)&phi2_get_work_data_size;
gate->decode_extra_data = (void*)&phi2_decode_extra_data;
gate->build_extraheader = (void*)&phi2_build_extraheader;
gate->set_target = (void*)&alt_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
#if defined(PHI2_4WAY)
gate->scanhash = (void*)&scanhash_phi2_4way;
#else
init_phi2_ctx();
gate->scanhash = (void*)&scanhash_phi2;
#endif
return true;
}

View File

@@ -5,7 +5,9 @@
#include <stdint.h>
#include "lyra2.h"
//#if defined(__AVX2__)
#if defined(__AVX2__)
#define LYRA2REV3_8WAY
#endif
#if defined(__SSE2__)
#define LYRA2REV3_4WAY
@@ -14,8 +16,14 @@
extern __thread uint64_t* l2v3_wholeMatrix;
bool register_lyra2rev3_algo( algo_gate_t* gate );
#if defined(LYRA2REV3_8WAY)
#if defined(LYRA2REV3_4WAY)
void lyra2rev3_8way_hash( void *state, const void *input );
int scanhash_lyra2rev3_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev3_8way_ctx();
#elif defined(LYRA2REV3_4WAY)
void lyra2rev3_4way_hash( void *state, const void *input );
int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -142,15 +150,29 @@ bool init_allium_ctx();
/////////////////////////////////////////
#if defined(__AVX2__) && defined(__AES__)
// #define PHI2_4WAY
#endif
bool phi2_has_roots;
bool register_phi2_algo( algo_gate_t* gate );
#if defined(PHI2_4WAY)
void phi2_hash_4way( void *state, const void *input );
int scanhash_phi2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
//void init_phi2_ctx();
#else
void phi2_hash( void *state, const void *input );
int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_phi2_ctx();
#endif
#endif // LYRA2_GATE_H__

View File

@@ -1,12 +1,138 @@
#include "lyra2-gate.h"
#include <memory.h>
#if defined (LYRA2REV3_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined (LYRA2REV3_8WAY)
typedef struct {
blake256_8way_context blake;
cubehashParam cube;
bmw256_8way_context bmw;
} lyra2v3_8way_ctx_holder;
static lyra2v3_8way_ctx_holder l2v3_8way_ctx;
bool init_lyra2rev3_8way_ctx()
{
blake256_8way_init( &l2v3_8way_ctx.blake );
cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 );
bmw256_8way_init( &l2v3_8way_ctx.bmw );
return true;
}
void lyra2rev3_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (32)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
blake256_8way( &ctx.blake, input, 80 );
blake256_8way_close( &ctx.blake, vhash );
mm256_dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash4, (const byte*) hash4, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash5, (const byte*) hash5, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash6, (const byte*) hash6, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash7, (const byte*) hash7, 32 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
mm256_intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
bmw256_8way( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev3_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
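// 8x32 interleaved output: word w of lane l sits at hash[w*8 + l],
// so hash7 = &hash[7*8] and hash7[lane] is each lane's target word 7.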
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap_intrlv80_8x32( vdata, pdata );
do
{
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
lyra2rev3_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
{
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif
#if defined (LYRA2REV3_4WAY)
typedef struct {
blake256_4way_context blake;
cubehashParam cube;

233
algo/lyra2/phi2-4way.c Normal file
View File

@@ -0,0 +1,233 @@
/**
* Phi-2 algo Implementation
*/
#include "lyra2-gate.h"
#if defined(PHI2_4WAY)
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/gost/sph_gost.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
cubehashParam cube;
jh512_4way_context jh;
hashState_echo echo;
// hashState_echo echo2;
sph_gost512_context gost;
skein512_4way_context skein;
} phi2_ctx_holder;
/*
phi2_ctx_holder phi2_ctx;
void init_phi2_ctx()
{
cubehashInit( &phi2_ctx.cube, 512, 16, 32 );
sph_jh512_init(&phi2_ctx.jh);
init_echo( &phi2_ctx.echo1, 512 );
init_echo( &phi2_ctx.echo2, 512 );
sph_gost512_init(&phi2_ctx.gost);
sph_skein512_init(&phi2_ctx.skein);
};
*/
void phi2_hash_4way( void *state, const void *input )
{
uint32_t hash[4][16] __attribute__ ((aligned (64)));
uint32_t hashA[4][16] __attribute__ ((aligned (64)));
uint32_t hashB[4][16] __attribute__ ((aligned (64)));
uint32_t vhash[4*16] __attribute__ ((aligned (64)));
// unsigned char _ALIGN(128) hash[64];
// unsigned char _ALIGN(128) hashA[64];
// unsigned char _ALIGN(128) hashB[64];
phi2_ctx_holder ctx __attribute__ ((aligned (64)));
// memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB[0], (const byte*)input,
phi2_has_roots ? 144 : 80 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB[1], (const byte*)input+144,
phi2_has_roots ? 144 : 80 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB[2], (const byte*)input+288,
phi2_has_roots ? 144 : 80 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB[3], (const byte*)input+432,
phi2_has_roots ? 144 : 80 );
LYRA2RE( &hashA[0][0], 32, &hashB[0][0], 32, &hashB[0][0], 32, 1, 8, 8 );
LYRA2RE( &hashA[0][8], 32, &hashB[0][8], 32, &hashB[0][8], 32, 1, 8, 8 );
LYRA2RE( &hashA[1][0], 32, &hashB[1][0], 32, &hashB[1][0], 32, 1, 8, 8 );
LYRA2RE( &hashA[1][8], 32, &hashB[1][8], 32, &hashB[1][8], 32, 1, 8, 8 );
LYRA2RE( &hashA[2][0], 32, &hashB[2][0], 32, &hashB[2][0], 32, 1, 8, 8 );
LYRA2RE( &hashA[2][8], 32, &hashB[2][8], 32, &hashB[2][8], 32, 1, 8, 8 );
LYRA2RE( &hashA[3][0], 32, &hashB[3][0], 32, &hashB[3][0], 32, 1, 8, 8 );
LYRA2RE( &hashA[3][8], 32, &hashB[3][8], 32, &hashB[3][8], 32, 1, 8, 8 );
mm256_intrlv_4x64( vhash, hashA[0], hashA[1], hashA[2], hashA[3], 512 );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
mm256_dintrlv_4x64( hash[0], hash[1], hash[2], hash[3], vhash, 512 );
if ( hash[0][0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash[0], 64 );
sph_gost512_close( &ctx.gost, (void*)hash[0] );
}
else
{
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[0],
(const BitSequence *)hash[0], 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[0],
(const BitSequence *)hash[0], 512 );
}
if ( hash[1][0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash[1], 64 );
sph_gost512_close( &ctx.gost, (void*)hash[1] );
}
else
{
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[1],
(const BitSequence *)hash[1], 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[1],
(const BitSequence *)hash[1], 512 );
}
if ( hash[2][0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash[2], 64 );
sph_gost512_close( &ctx.gost, (void*)hash[2] );
}
else
{
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[2],
(const BitSequence *)hash[2], 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[2],
(const BitSequence *)hash[2], 512 );
}
if ( hash[3][0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash[3], 64 );
sph_gost512_close( &ctx.gost, (void*)hash[3] );
}
else
{
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[3],
(const BitSequence *)hash[3], 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[3],
(const BitSequence *)hash[3], 512 );
}
mm256_intrlv_4x64( vhash, hash[0], hash[1], hash[2], hash[3], 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
for (int i=0; i<4; i++)
{
( (uint64_t*)vhash )[i] ^= ( (uint64_t*)vhash )[i+4];
( (uint64_t*)vhash+ 8 )[i] ^= ( (uint64_t*)vhash+ 8 )[i+4];
( (uint64_t*)vhash+16 )[i] ^= ( (uint64_t*)vhash+16 )[i+4];
( (uint64_t*)vhash+24 )[i] ^= ( (uint64_t*)vhash+24 )[i+4];
}
// for ( int i = 0; i < 4; i++ )
// casti_m256i( vhash, i ) = _mm256_xor_si256( casti_m256i( vhash, i ),
// casti_m256i( vhash, i+4 ) );
memcpy( state, vhash, 128 );
}
int scanhash_phi2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8*4];   // 4 interleaved 256-bit lane results (128 bytes)
uint32_t _ALIGN(128) edata[36];
uint32_t vdata[4][36] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
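// 4x64 interleaved: 64-bit word w of lane l is at index w*4 + l, so the
// high 32 bits of word 3 (uint32 word 7) of lane l land at 25 + 2*l,
// matching the hash7[ lane<<1 ] test below.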
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if(opt_benchmark){
ptarget[7] = 0x00ff;
}
// Data is not interleaved, but hash is.
// any non-zero data at index 20 or above sets roots true.
// Split up the operations, bswap first, then set roots.
phi2_has_roots = false;
for ( int i=0; i < 36; i++ )
{
be32enc(&edata[i], pdata[i]);
if (i >= 20 && pdata[i]) phi2_has_roots = true;
}
/*
casti_m256i( vdata[0], 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( vdata[0], 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m256i( vdata[0], 2 ) = mm256_bswap_32( casti_m256i( pdata, 2 ) );
casti_m256i( vdata[0], 3 ) = mm256_bswap_32( casti_m256i( pdata, 3 ) );
casti_m128i( vdata[0], 8 ) = mm128_bswap_32( casti_m128i( pdata, 8 ) );
phi2_has_roots = mm128_anybits1( casti_m128i( vdata[0], 5 ) ) ||
mm128_anybits1( casti_m128i( vdata[0], 6 ) ) ||
mm128_anybits1( casti_m128i( vdata[0], 7 ) ) ||
mm128_anybits1( casti_m128i( vdata[0], 8 ) );
*/
memcpy( vdata[0], edata, 144 );
memcpy( vdata[1], edata, 144 );
memcpy( vdata[2], edata, 144 );
memcpy( vdata[3], edata, 144 );
do {
be32enc( &vdata[0][19], n );
be32enc( &vdata[1][19], n+1 );
be32enc( &vdata[2][19], n+2 );
be32enc( &vdata[3][19], n+3 );
phi2_hash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[ lane<<1 ] < Htarg )
{
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif // PHI2_4WAY

View File

@@ -144,8 +144,8 @@ void init_m7m_ctx()
#define NM7M 5
#define SW_DIVS 5
#define M7_MIDSTATE_LEN 76
int scanhash_m7m_hash( int thr_id, struct work* work,
uint64_t max_nonce, unsigned long *hashes_done )
int scanhash_m7m_hash( int thr_id, struct work* work, uint64_t max_nonce,
unsigned long *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -154,6 +154,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
uint32_t hash[8] __attribute__((aligned(64)));
uint8_t bhash[7][64] __attribute__((aligned(64)));
uint32_t n = pdata[19] - 1;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint32_t usw_, mpzscale;
const uint32_t first_nonce = pdata[19];
char data_str[161], hash_str[65], target_str[65];

View File

@@ -445,7 +445,7 @@ void pluck_hash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const i
}
int scanhash_pluck(int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -454,6 +454,8 @@ int scanhash_pluck(int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
volatile uint8_t *restart = &(work_restart[thr_id].restart);
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0ffff;

View File

@@ -48,8 +48,8 @@ void anime_4way_hash( void *state, const void *input )
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
const uint32_t mask = 8;
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
int i;
anime_4way_ctx_holder ctx;
memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );
@@ -62,27 +62,44 @@ void anime_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
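// vh_mask lanes are all-ones where (word0 & 8) == 0; the blend below
// takes the skein result (B) for those lanes and serial groestl (A)
// for the rest.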
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
if ( hash0[0] & mask )
{
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
}
if ( hash1[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
}
if ( hash2[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
}
if ( hash3[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
}
mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
if ( mm256_anybits0( vh_mask ) )
{
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
}
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
@@ -91,7 +108,8 @@ void anime_4way_hash( void *state, const void *input )
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
@@ -99,16 +117,20 @@ void anime_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
if ( mm256_anybits1( vh_mask ) )
{
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
}
if ( mm256_anybits0( vh_mask ) )
{
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
}
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
@@ -120,33 +142,35 @@ void anime_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
if ( mm256_anybits1( vh_mask ) )
{
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
}
if ( mm256_anybits0( vh_mask ) )
{
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
}
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
mm256_dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = {
0,
@@ -165,10 +189,7 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
0
};
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
@@ -177,30 +198,26 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
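// The nonces occupy the odd 32-bit slots (high halves) of the 4x64
// interleaved header word 9; bswap to big-endian and blend into vdata.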
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
anime_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
&& fulltest( hash+(i<<3), ptarget ) )
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -13,19 +13,15 @@ bool register_anime_algo( algo_gate_t* gate );
#if defined(ANIME_4WAY)
void anime_4way_hash( void *state, const void *input );
int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_anime_4way_ctx();
#endif
void anime_hash( void *state, const void *input );
int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_anime_ctx();
#endif

View File

@@ -46,20 +46,6 @@ void init_anime_ctx()
void anime_hash( void *state, const void *input )
{
unsigned char hash[128] __attribute__ ((aligned (32)));
/*
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
*/
uint32_t mask = 8;
anime_ctx_holder ctx;
memcpy( &ctx, &anime_ctx, sizeof(anime_ctx) );
@@ -134,7 +120,7 @@ void anime_hash( void *state, const void *input )
}
int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
@@ -142,6 +128,7 @@ int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = {
0,

618
algo/quark/hmq1725-4way.c Normal file
View File

@@ -0,0 +1,618 @@
#include "hmq1725-gate.h"
#if defined(HMQ1725_4WAY)
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
union _hmq1725_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
sph_fugue512_context fugue;
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
};
typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;
extern void hmq1725_4way_hash(void *state, const void *input)
{
// Why so big? Only 8 words are really needed; the haval stage uses 16.
uint32_t hash0 [32] __attribute__ ((aligned (64)));
uint32_t hash1 [32] __attribute__ ((aligned (64)));
uint32_t hash2 [32] __attribute__ ((aligned (64)));
uint32_t hash3 [32] __attribute__ ((aligned (64)));
uint32_t vhash [32<<2] __attribute__ ((aligned (64)));
uint32_t vhashA[32<<2] __attribute__ ((aligned (64)));
uint32_t vhashB[32<<2] __attribute__ ((aligned (64)));
hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
__m256i vh_mask;
const __m256i vmask = _mm256_set1_epi64x( 24 );
const uint32_t mask = 24;
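// 24 = 0x18: each hmq1725 fork tests bits 3-4 of a lane's leading word.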
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, input, 80 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// first fork, A is groestl serial, B is skein parallel.
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
m256_zero );
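// Both fork branches are computed for every lane; mm256_blend_hash_4x64
// then selects per lane (all-ones = (word0 & 24) == 0 = branch B). The
// commented-out per-lane skips trade redundant work for branch-free flow.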
// A
// if ( hash0[0] & mask )
// {
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
// }
// if ( hash1[0] & mask )
// {
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
// }
// if ( hash2[0] & mask )
// {
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
// }
// if ( hash3[0] & mask )
// {
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
// }
mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
// B
// if ( mm256_any_clr_256( vh_mask ) )
// {
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
// }
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// second fork, A = blake parallel, B= bmw parallel.
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
m256_zero );
// if ( mm256_any_set_256( vh_mask ) )
// {
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
// }
// if ( mm256_any_clr_256( vh_mask ) )
// {
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
// }
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash0,
(const BitSequence *)hash0, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash1,
(const BitSequence *)hash1, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash2,
(const BitSequence *)hash2, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash3,
(const BitSequence *)hash3, 64 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// A= keccak parallel, B= jh parallel
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
m256_zero );
// if ( mm256_any_set_256( vh_mask ) )
// {
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
// }
// if ( mm256_any_clr_256( vh_mask ) )
// {
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
// }
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512 ( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512 ( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512 ( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512 ( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// A is whirlpool serial, B is haval parallel.
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
m256_zero );
// A
// if ( hash0[0] & mask )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
// }
// if ( hash1[0] & mask )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
// }
// if ( hash2[0] & mask )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
// }
// if ( hash3[0] & mask )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// }
mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
// B
// if ( mm256_any_clr_256( vh_mask ) )
// {
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhash, 64 );
haval256_5_4way_close( &ctx.haval, vhashB );
memset( &vhashB[8<<2], 0, 32<<2);
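// haval256 yields only 256 bits per lane; zero the upper halves of the
// 512-bit interleaved lanes before blending (likewise for vhashA below).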
// }
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhash );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// shavite & luffa, both serial, select individually.
if ( hash0[0] & mask )
{
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, 64 ); //
sph_shavite512_close( &ctx.shavite, hash0 ); //8
}
else
{
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence *)hash0,
(const BitSequence *)hash0, 64 );
}
if ( hash1[0] & mask )
{
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, 64 ); //
sph_shavite512_close( &ctx.shavite, hash1 ); //8
}
else
{
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence *)hash1,
(const BitSequence *)hash1, 64 );
}
if ( hash2[0] & mask )
{
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, 64 ); //
sph_shavite512_close( &ctx.shavite, hash2 ); //8
}
else
{
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence *)hash2,
(const BitSequence *)hash2, 64 );
}
if ( hash3[0] & mask )
{
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, 64 ); //
sph_shavite512_close( &ctx.shavite, hash3 ); //8
}
else
{
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence *)hash3,
(const BitSequence *)hash3, 64 );
}
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// A echo, B sd both serial
if ( hash0[0] & mask ) //4
{
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
}
else
{
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
}
if ( hash1[0] & mask ) //4
{
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
}
else
{
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
}
if ( hash2[0] & mask ) //4
{
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
}
else
{
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
}
if ( hash3[0] & mask ) //4
{
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
}
else
{
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
}
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// A = fugue serial, B = sha512 parallel
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
m256_zero );
// if ( hash0[0] & mask )
// {
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
// }
// if ( hash1[0] & mask )
// {
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
// }
// if ( hash2[0] & mask )
// {
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
// }
// if ( hash3[0] & mask )
// {
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// }
mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
// if ( mm256_any_clr_256( vh_mask ) )
// {
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhashB );
// }
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
// A = haval parallel, B = Whirlpool serial
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
m256_zero );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// if ( mm256_any_set_256( vh_mask ) ) //4
// {
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhash, 64 );
haval256_5_4way_close( &ctx.haval, vhashA );
memset( &vhashA[8<<2], 0, 32<<2 );
// }
// if ( !( hash0[0] & mask ) )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
// }
// if ( !( hash1[0] & mask ) )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
// }
// if ( !( hash2[0] & mask ) )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
// }
// if ( !( hash3[0] & mask ) )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// }
mm256_intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, 512 );
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
memcpy(state, vhash, 32<<2 );
}
int scanhash_hmq1725_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
// uint32_t *hash7 = &(hash[7<<2]);
// uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
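// Pick the loosest mask consistent with Htarg so the cheap
// (hash & mask) == 0 test filters lanes before the full target check.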
mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[ m ];
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
hmq1725_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( ( (hash+(i<<3))[7] & mask ) == 0 )
{
if ( fulltest( (hash+(i<<3)), ptarget ) && !opt_benchmark )
{
pdata[19] = n + i;
submit_solution( work, (hash+(i<<3)), mythr, i );
}
}
n += 4;
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif // HMQ1725_4WAY

17
algo/quark/hmq1725-gate.c Normal file
View File

@@ -0,0 +1,17 @@
#include "hmq1725-gate.h"
bool register_hmq1725_algo( algo_gate_t* gate )
{
#if defined(HMQ1725_4WAY)
gate->scanhash = (void*)&scanhash_hmq1725_4way;
gate->hash = (void*)&hmq1725_4way_hash;
#else
init_hmq1725_ctx();
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
#endif
gate->set_target = (void*)&scrypt_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
return true;
};

28
algo/quark/hmq1725-gate.h Normal file
View File

@@ -0,0 +1,28 @@
#ifndef HMQ1725_GATE_H__
#define HMQ1725_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
// #define HMQ1725_4WAY
#endif
bool register_hmq1725_algo( algo_gate_t* gate );
#if defined(HMQ1725_4WAY)
void hmq1725_4way_hash( void *state, const void *input );
int scanhash_hmq1725_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void hmq1725hash( void *state, const void *input );
int scanhash_hmq1725( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_hmq1725_ctx();
#endif
#endif // HMQ1725_GATE_H__

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "hmq1725-gate.h"
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake.h"
@@ -298,10 +298,11 @@ extern void hmq1725hash(void *state, const void *input)
memcpy(state, hashA, 32);
}
int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
int scanhash_hmq1725( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[32] __attribute__((aligned(64)));
// uint32_t endiandata[32] __attribute__((aligned(64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -311,7 +312,8 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
//const uint32_t Htarg = ptarget[7];
//we need bigendian data...
for (int k = 0; k < 32; k++)
// for (int k = 0; k < 32; k++)
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
hmq_bmw512_midstate( endiandata );
@@ -407,14 +409,14 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
pdata[19] = n;
return 0;
}
/*
bool register_hmq1725_algo( algo_gate_t* gate )
{
init_hmq1725_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->set_target = (void*)&scrypt_set_target;
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
return true;
};
*/

View File

@@ -48,9 +48,10 @@ void quark_4way_hash( void *state, const void *input )
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
int i;
quark_4way_ctx_holder ctx;
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
const uint32_t mask = 8;
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
@@ -62,27 +63,44 @@ void quark_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
if ( hash0[0] & mask )
{
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
}
if ( hash1[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
}
if ( hash2[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
}
if ( hash3[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
}
mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
if ( mm256_anybits0( vh_mask ) )
{
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
}
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
@@ -91,7 +109,8 @@ void quark_4way_hash( void *state, const void *input )
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
@@ -99,16 +118,21 @@ void quark_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
if ( mm256_anybits1( vh_mask ) )
{
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
}
if ( mm256_anybits0( vh_mask ) )
{
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
}
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
@@ -120,63 +144,65 @@ void quark_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
if ( mm256_anybits1( vh_mask ) )
{
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
}
if ( mm256_anybits0( vh_mask ) )
{
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
}
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
// Final blend, directly to state, only need 32 bytes.
casti_m256i( state, 0 ) = _mm256_blendv_epi8( vhA[0], vhB[0], vh_mask );
casti_m256i( state, 1 ) = _mm256_blendv_epi8( vhA[1], vhB[1], vh_mask );
casti_m256i( state, 2 ) = _mm256_blendv_epi8( vhA[2], vhB[2], vh_mask );
casti_m256i( state, 3 ) = _mm256_blendv_epi8( vhA[3], vhB[3], vh_mask );
}
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
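// 4x64 interleaved: &hash[25] is uint32 word 7 of lane 0; per-lane
// stride is 2, hence hash7[ i<<1 ] below.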
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
__m256i *noncev = (__m256i*)vdata + 9; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
mm256_bswap_intrlv80_4x64( vdata, pdata );
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
quark_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( ( ( (hash+(i<<3))[7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash+(i<<3), ptarget ) )
if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 )
{
mm256_extract_lane_4x64( lane_hash, hash, i, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, lane_hash, mythr, i );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -13,19 +13,15 @@ bool register_quark_algo( algo_gate_t* gate );
#if defined(QUARK_4WAY)
void quark_4way_hash( void *state, const void *input );
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_quark_4way_ctx();
#endif
void quark_hash( void *state, const void *input );
int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_quark_ctx();
#endif

View File

@@ -173,7 +173,7 @@ void quark_hash(void *state, const void *input)
}
int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(32)));
@@ -181,6 +181,7 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
swab32_array( endiandata, pdata, 20 );

View File

@@ -64,7 +64,7 @@ void deep_2way_hash( void *output, const void *input )
}
int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -73,17 +73,17 @@ int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 32+3; // 4*8 + 3
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
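// pick the first mask wide enough for the target so the cheap
// hash[7] & mask test screens candidates before calling fulltest.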
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
@@ -102,23 +102,24 @@ int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
deep_2way_hash( hash, vdata );
pdata[19] = n;
if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
if ( !( hash[7] & mask ) )
if ( fulltest( hash, ptarget) && !opt_benchmark )
{
nonces[ num_found++ ] = n;
work_set_target_ratio( work, hash );
pdata[19] = n;
submit_solution( work, hash, mythr, 0 );
}
if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
if ( !( (hash+8)[7] & mask ) )
if ( fulltest( hash+8, ptarget) && !opt_benchmark )
{
nonces[ num_found++ ] = n+1;
work_set_target_ratio( work, hash+8 );
pdata[19] = n+1;
submit_solution( work, hash+8, mythr, 1 );
}
n += 2;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -13,19 +13,15 @@ bool register_deep_algo( algo_gate_t* gate );
#if defined(DEEP_2WAY)
void deep_2way_hash( void *state, const void *input );
int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_deep_2way_ctx();
#endif
void deep_hash( void *state, const void *input );
int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_deep_ctx();
#endif

View File

@@ -72,7 +72,7 @@ void deep_hash(void *output, const void *input)
}
int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(32)));
@@ -80,6 +80,7 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };

View File

@@ -70,7 +70,7 @@ void qubit_2way_hash( void *output, const void *input )
}
int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -79,17 +79,17 @@ int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 32+3; // 4*8 + 3
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
@@ -107,25 +107,24 @@ int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
qubit_2way_hash( hash, vdata );
pdata[19] = n;
if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
if ( !( hash[7] & mask ) )
if ( fulltest( hash, ptarget) && !opt_benchmark )
{
nonces[ num_found++ ] = n;
work_set_target_ratio( work, hash );
pdata[19] = n;
submit_solution( work, hash, mythr, 0 );
}
if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
if ( !( (hash+8)[7] & mask ) )
if ( fulltest( hash+8, ptarget) && !opt_benchmark )
{
pdata[19] = n+1;
nonces[ num_found++ ] = n+1;
work_set_target_ratio( work, hash+8 );
submit_solution( work, hash+8, mythr, 1 );
}
n += 2;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -13,19 +13,15 @@ bool register_qubit_algo( algo_gate_t* gate );
#if defined(QUBIT_2WAY)
void qubit_2way_hash( void *state, const void *input );
int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_qubit_2way_ctx();
#endif
void qubit_hash( void *state, const void *input );
int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_qubit_ctx();
#endif

View File

@@ -83,8 +83,8 @@ void qubit_hash(void *output, const void *input)
memcpy(output, hash, 32);
}
int scanhash_qubit(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
@@ -92,6 +92,7 @@ int scanhash_qubit(int thr_id, struct work *work,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };

View File

@@ -40,9 +40,9 @@ void lbry_8way_hash( void* output, const void* input )
sha256_8way_close( &ctx_sha256, vhashA );
// reinterleave to do sha512 4-way 64 bit twice.
mm256_deinterleave_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
mm256_interleave_4x64( vhashA, h0, h1, h2, h3, 256 );
mm256_interleave_4x64( vhashB, h4, h5, h6, h7, 256 );
mm256_dintrlv_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
mm256_intrlv_4x64( vhashA, h0, h1, h2, h3, 256 );
mm256_intrlv_4x64( vhashB, h4, h5, h6, h7, 256 );
sha512_4way_init( &ctx_sha512 );
sha512_4way( &ctx_sha512, vhashA, 32 );
@@ -53,9 +53,9 @@ void lbry_8way_hash( void* output, const void* input )
sha512_4way_close( &ctx_sha512, vhashB );
// back to 8-way 32 bit
mm256_deinterleave_4x64( h0, h1, h2, h3, vhashA, 512 );
mm256_deinterleave_4x64( h4, h5, h6, h7, vhashB, 512 );
mm256_interleave_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
mm256_dintrlv_4x64( h0, h1, h2, h3, vhashA, 512 );
mm256_dintrlv_4x64( h4, h5, h6, h7, vhashB, 512 );
mm256_intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
ripemd160_8way_init( &ctx_ripemd );
ripemd160_8way( &ctx_ripemd, vhashA, 32 );
@@ -72,27 +72,24 @@ void lbry_8way_hash( void* output, const void* input )
sha256_8way_init( &ctx_sha256 );
sha256_8way( &ctx_sha256, vhashA, 32 );
sha256_8way_close( &ctx_sha256, vhashA );
mm256_deinterleave_8x32( output, output+ 32, output+ 64, output+ 96,
output+128, output+160, output+192, output+224,
vhashA, 256 );
sha256_8way_close( &ctx_sha256, output );
}
int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[32*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<3]);
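// 8x32 interleave: u32 word 7 of lane i lands at index 7*8 + i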
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[27];
const uint32_t first_nonce = pdata[27];
const uint32_t Htarg = ptarget[7];
uint32_t edata[32] __attribute__ ((aligned (64)));
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 216; // 27*8
__m256i *noncev = (__m256i*)vdata + 27; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
@@ -100,8 +97,11 @@ int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
0xFFFFF000, 0xFFFF0000, 0 };
// we need bigendian data...
swab32_array( edata, pdata, 32 );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m256i( edata, 2 ) = mm256_bswap_32( casti_m256i( pdata, 2 ) );
casti_m256i( edata, 3 ) = mm256_bswap_32( casti_m256i( pdata, 3 ) );
mm256_intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );
sha256_8way_init( &sha256_8w_mid );
sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
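// precompute the sha256 midstate over the header's constant leading
// LBRY_MIDSTATE bytes once per job; the scan loop hashes only the tail.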
@@ -111,136 +111,26 @@ int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
be32enc( noncep+4, n+4 );
be32enc( noncep+5, n+5 );
be32enc( noncep+6, n+6 );
be32enc( noncep+7, n+7 );
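// in 8x32 interleave the nonce word occupies one whole vector, so all 8
// big-endian nonces can be written with a single store.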
*noncev = mm256_bswap_32( _mm256_set_epi32(
n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
lbry_8way_hash( hash, vdata );
for ( int i = 0; i < 8; i++ )
if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) )
for ( int i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
{
// deinterleave hash for lane
mm256_extract_lane_8x32( lane_hash, hash, i, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[27] = n + i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, lane_hash, mythr, i );
}
}
n += 8;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce;
return num_found;
}
#elif defined(LBRY_4WAY)
static __thread sha256_4way_context sha256_mid;
void lbry_4way_hash( void* output, const void* input )
{
sha256_4way_context ctx_sha256 __attribute__ ((aligned (64)));
sha512_4way_context ctx_sha512;
ripemd160_4way_context ctx_ripemd;
uint32_t _ALIGN(64) vhashA[16<<2];
uint32_t _ALIGN(64) vhashB[16<<2];
uint32_t _ALIGN(64) vhashC[16<<2];
memcpy( &ctx_sha256, &sha256_mid, sizeof(ctx_sha256) );
sha256_4way( &ctx_sha256, input + (LBRY_MIDSTATE<<2), LBRY_TAIL );
sha256_4way_close( &ctx_sha256, vhashA );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhashA, 32 );
sha256_4way_close( &ctx_sha256, vhashA );
// sha512 64 bit data, 64 byte output
mm256_reinterleave_4x64( vhashB, vhashA, 256 );
sha512_4way_init( &ctx_sha512 );
sha512_4way( &ctx_sha512, vhashB, 32 );
sha512_4way_close( &ctx_sha512, vhashB );
mm256_reinterleave_4x32( vhashA, vhashB, 512 );
ripemd160_4way_init( &ctx_ripemd );
ripemd160_4way( &ctx_ripemd, vhashA, 32 );
ripemd160_4way_close( &ctx_ripemd, vhashB );
ripemd160_4way_init( &ctx_ripemd );
ripemd160_4way( &ctx_ripemd, vhashA+(8<<2), 32 );
ripemd160_4way_close( &ctx_ripemd, vhashC );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhashB, 20 );
sha256_4way( &ctx_sha256, vhashC, 20 );
sha256_4way_close( &ctx_sha256, vhashA );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhashA, 32 );
sha256_4way_close( &ctx_sha256, vhashA );
mm128_deinterleave_4x32( output, output+32, output+64, output+96,
vhashA, 256 );
}
int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[32*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[27];
const uint32_t first_nonce = pdata[27];
const uint32_t Htarg = ptarget[7];
uint32_t edata[32] __attribute__ ((aligned (64)));
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 108; // 27*4
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// we need bigendian data...
swab32_array( edata, pdata, 32 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 1024 );
sha256_4way_init( &sha256_mid );
sha256_4way( &sha256_mid, vdata, LBRY_MIDSTATE );
for ( int m = 0; m < sizeof(masks); m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
lbry_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) )
{
pdata[27] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n+=4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce;
return num_found;
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

View File

@@ -4,9 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
// Override multi way on ryzen, SHA is better.
#if !defined(RYZEN_)
// need sha512 2 way AVX x2 or 1 way scalar x4 to support 4way AVX.
#if !defined(__SHA__)
#if defined(__AVX2__)
#define LBRY_8WAY
#endif
@@ -24,17 +22,18 @@ bool register_lbry_algo( algo_gate_t* gate );
void lbry_8way_hash( void *state, const void *input );
int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
/*
#elif defined(LBRY_4WAY)
void lbry_4way_hash( void *state, const void *input );
int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
*/
#else
void lbry_hash( void *state, const void *input );
int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif

View File

@@ -48,13 +48,14 @@ void lbry_hash(void* output, const void* input)
}
int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[27] - 1;
const uint32_t first_nonce = pdata[27];
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t endiandata[32] __attribute__ ((aligned (64)));

View File

@@ -696,7 +696,7 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input,
#endif /* HAVE_SCRYPT_6WAY */
extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -704,6 +704,7 @@ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t midstate[8];
uint32_t n = pdata[19] - 1;
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
int throughput = scrypt_best_throughput();
int i;

View File

@@ -5,12 +5,11 @@
#include "algo-gate-api.h"
// Override multi way on ryzen, SHA is better.
#if !defined(RYZEN_)
#if defined(__SSE2__)
#define SHA256T_4WAY
#endif
#if !defined(__SHA__)
#if defined(__AVX2__)
#define SHA256T_8WAY
#elif defined(__SSE2__)
#define SHA256T_4WAY
#endif
#endif

View File

@@ -6,6 +6,12 @@
#if defined (__AVX2__)
union _m256_v16 {
uint16_t u16[16];
__m256i v256;
};
typedef union _m256_v16 m256_v16;
// imported from simd_iv.h
uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc,

View File

@@ -2,7 +2,11 @@
#include <string.h>
#include <stdint.h>
#include "skein-hash-4way.h"
#if defined(__SHA__)
#include <openssl/sha.h>
#else
#include "algo/sha/sha2-hash-4way.h"
#endif
#if defined (SKEIN_4WAY)
@@ -11,53 +15,69 @@ void skeinhash_4way( void *state, const void *input )
uint64_t vhash64[8*4] __attribute__ ((aligned (64)));
uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
skein512_4way_context ctx_skein;
#if defined(__SHA__)
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
SHA256_CTX ctx_sha256;
#else
sha256_4way_context ctx_sha256;
#endif
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, input, 80 );
skein512_4way_close( &ctx_skein, vhash64 );
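// on CPUs with SHA extensions the lanes are deinterleaved and finished
// one at a time with OpenSSL sha256; otherwise sha256 stays 4-way below.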
#if defined(__SHA__)
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
mm128_intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
#else
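// re-interleave skein's 4x64 output directly as 4x32 for 32-bit sha256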
mm256_rintrlv_4x64_4x32( vhash32, vhash64, 512 );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhash32, 64 );
sha256_4way_close( &ctx_sha256, state );
mm128_dintrlv_4x32( state, state+32, state+64, state+96,
vhash32, 256 );
#endif
}
int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8];
uint32_t *hash7 = &(hash[7<<2]);
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
// hash is returned deinterleaved
uint32_t *nonces = work->nonces;
int num_found = 0;
// data is 80 bytes, 20 u32 or 10 u64.
swab32_array( edata, pdata, 20 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
mm256_bswap_intrlv80_4x64( vdata, pdata );
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
skeinhash_4way( hash, vdata );
@@ -68,16 +88,14 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -3,24 +3,21 @@
#include <stdint.h>
#include "algo-gate-api.h"
// Override multi way on ryzen, SHA is better.
#if !defined(RYZEN_)
#if defined(__AVX2__)
#define SKEIN_4WAY
#endif
#endif
#if defined(SKEIN_4WAY)
void skeinhash_4way( void *output, const void *input );
int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void skeinhash( void *output, const void *input );
int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -21,8 +21,8 @@ void skeinhash(void *state, const void *input)
memcpy(state, hash, 32);
}
int scanhash_skein(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -31,6 +31,7 @@ int scanhash_skein(int thr_id, struct work *work,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
swab32_array( endiandata, pdata, 20 );

View File

@@ -20,55 +20,43 @@ void skein2hash_4way( void *output, const void *input )
}
int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint64_t *edata = (uint64_t*)endiandata;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
swab32_array( endiandata, pdata, 20 );
mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
mm256_bswap_intrlv80_4x64( vdata, pdata );
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
skein2hash( hash, vdata );
skein2hash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane ] <= Htarg )
if ( hash7[ lane<<1 ] <= Htarg )
{
// deinterleave hash for lane
uint32_t lane_hash[8];
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -10,10 +10,9 @@ int64_t skein2_get_max64 ()
bool register_skein2_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT;
#if defined (FOUR_WAY) && defined (__AVX2__)
#if defined (SKEIN2_4WAY)
gate->scanhash = (void*)&scanhash_skein2_4way;
gate->hash = (void*)&skein2hash_4way;
four_way_not_tested();
#else
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;

View File

@@ -10,11 +10,11 @@
#if defined(SKEIN2_4WAY)
void skein2hash_4way( void *output, const void *input );
int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t* hashes_done );
uint64_t* hashes_done, struct thr_info *mythr );
#endif
void skein2hash( void *output, const void *input );
int scanhash_skein2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -34,8 +34,8 @@ void skein2hash(void *output, const void *input)
}
int scanhash_skein2(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_skein2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -44,6 +44,7 @@ int scanhash_skein2(int thr_id, struct work *work,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
swab32_array( endiandata, pdata, 20 );

View File

@@ -44,12 +44,13 @@ void axiomhash(void *output, const void *input)
}
int scanhash_axiom(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) hash64[8];
uint32_t _ALIGN(64) endiandata[20];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];

View File

@@ -13,26 +13,16 @@
#include "algo/gost/sph_gost.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
union _poly_4way_context_overlay
{
skein512_4way_context skein;
shabal512_4way_context shabal;
hashState_echo echo;
luffa_2way_context luffa;
sph_fugue512_context fugue;
sph_gost512_context gost;
} poly_4way_ctx_holder;
poly_4way_ctx_holder poly_4way_ctx;
void init_polytimos_4way_ctx()
{
skein512_4way_init( &poly_4way_ctx.skein );
shabal512_4way_init( &poly_4way_ctx.shabal );
init_echo( &poly_4way_ctx.echo, 512 );
luffa_2way_init( &poly_4way_ctx.luffa, 512 );
sph_fugue512_init( &poly_4way_ctx.fugue );
sph_gost512_init( &poly_4way_ctx.gost );
}
};
typedef union _poly_4way_context_overlay poly_4way_context_overlay;
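// the overlay union lets the sequential stages share one stack allocation;
// each stage re-inits its member immediately before use, so no persistent
// prototype contexts or memcpy re-inits are needed.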
void polytimos_4way_hash( void *output, const void *input )
{
@@ -41,51 +31,57 @@ void polytimos_4way_hash( void *output, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
poly_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &poly_4way_ctx, sizeof(poly_4way_ctx) );
poly_4way_context_overlay ctx;
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
// Need to convert from 64 bit interleaved to 32 bit interleaved.
uint32_t vhash32[16*4];
mm256_rintrlv_4x64_4x32( vhash32, vhash, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash32, 64 );
shabal512_4way_close( &ctx.shabal, vhash32 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
sph_gost512_init( &ctx.gost );
@@ -104,51 +100,43 @@ void polytimos_4way_hash( void *output, const void *input )
memcpy( output+96, hash3, 32 );
}
int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done )
int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
do {
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
polytimos_4way_hash(hash, vdata);
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg )
if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart));
} while ( ( n < max_nonce-4 ) && !(*restart));
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -4,7 +4,6 @@ bool register_polytimos_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
#ifdef POLYTIMOS_4WAY
init_polytimos_4way_ctx();
gate->scanhash = (void*)&scanhash_polytimos_4way;
gate->hash = (void*)&polytimos_4way_hash;
#else

View File

@@ -13,19 +13,14 @@ bool register_polytimos_algo( algo_gate_t* gate );
#if defined(POLYTIMOS_4WAY)
void polytimos_4way_hash( void *state, const void *input );
int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_polytimos_4way_ctx();
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void polytimos_hash( void *state, const void *input );
int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_polytimos_ctx();
#endif

View File

@@ -76,13 +76,14 @@ void polytimos_hash(void *output, const void *input)
memcpy(output, hashA, 32);
}
int scanhash_polytimos(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;

View File

@@ -40,7 +40,7 @@ void veltor_4way_hash( void *output, const void *input )
skein512_4way( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
@@ -54,10 +54,10 @@ void veltor_4way_hash( void *output, const void *input )
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
@@ -78,7 +78,7 @@ void veltor_4way_hash( void *output, const void *input )
}
int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -91,6 +91,7 @@ int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
@@ -101,7 +102,7 @@ int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
be32enc( noncep, n );

View File

@@ -15,7 +15,7 @@ bool register_veltor_algo( algo_gate_t* gate );
void veltor_4way_hash( void *state, const void *input );
int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_veltor_4way_ctx();
@@ -24,7 +24,7 @@ void init_veltor_4way_ctx();
void veltor_hash( void *state, const void *input );
int scanhash_veltor( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_veltor_ctx();

View File

@@ -61,12 +61,14 @@ void veltor_hash(void *output, const void *input)
memcpy(output, hashB, 32);
}
int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_veltor( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];

View File

@@ -78,7 +78,7 @@ void x14_4way_hash( void *state, const void *input )
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -90,7 +90,7 @@ void x14_4way_hash( void *state, const void *input )
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
@@ -105,16 +105,16 @@ void x14_4way_hash( void *state, const void *input )
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -142,13 +142,13 @@ void x14_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -164,10 +164,10 @@ void x14_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -183,14 +183,14 @@ void x14_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 32 bit
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, state );
}
int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -203,6 +203,7 @@ int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
@@ -212,7 +213,7 @@ int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )

View File

@@ -13,19 +13,15 @@ bool register_x14_algo( algo_gate_t* gate );
#if defined(X14_4WAY)
void x14_4way_hash( void *state, const void *input );
int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_x14_4way_ctx();
#endif
void x14hash( void *state, const void *input );
int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_x14_ctx();
#endif

View File

@@ -180,8 +180,8 @@ void x14hash(void *output, const void *input)
memcpy(output, hash, 32);
}
int scanhash_x14(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
@@ -190,6 +190,7 @@ int scanhash_x14(int thr_id, struct work *work,
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = {
0,

View File

@@ -81,7 +81,7 @@ void x15_4way_hash( void *state, const void *input )
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -93,7 +93,7 @@ void x15_4way_hash( void *state, const void *input )
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
@@ -108,16 +108,16 @@ void x15_4way_hash( void *state, const void *input )
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -145,13 +145,13 @@ void x15_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -167,10 +167,10 @@ void x15_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -186,10 +186,10 @@ void x15_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 32 bit
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// 15 Whirlpool
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -214,7 +214,7 @@ void x15_4way_hash( void *state, const void *input )
}
int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -227,6 +227,7 @@ int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
@@ -236,7 +237,7 @@ int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )

View File

@@ -13,19 +13,15 @@ bool register_x15_algo( algo_gate_t* gate );
#if defined(X15_4WAY)
void x15_4way_hash( void *state, const void *input );
int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_x15_4way_ctx();
#endif
void x15hash( void *state, const void *input );
int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_x15_ctx();
#endif

View File

@@ -186,8 +186,8 @@ void x15hash(void *output, const void *input)
memcpy(output, hashB, 32);
}
int scanhash_x15(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
@@ -196,6 +196,7 @@ int scanhash_x15(int thr_id, struct work *work,
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = {
0,

View File

@@ -32,8 +32,8 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
typedef struct {
union _x16r_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_echo echo;
@@ -50,16 +50,8 @@ typedef struct {
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
} x16r_4way_ctx_holder;
x16r_4way_ctx_holder x16r_4way_ctx __attribute__ ((aligned (64)));
// Cube needs one full init so fast reinits can be done in the hash loop.
void init_x16r_4way_ctx()
{
cubehashInit( &x16r_4way_ctx.cube, 512, 16, 32 );
};
typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
void x16r_4way_hash( void* output, const void* input )
{
@@ -68,14 +60,14 @@ void x16r_4way_hash( void* output, const void* input )
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t vhash[24*4] __attribute__ ((aligned (64)));
x16r_4way_ctx_holder ctx;
x16r_4way_context_overlay ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
int size = 80;
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, input, 640 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
if ( s_ntime == UINT32_MAX )
{
@@ -104,11 +96,11 @@ void x16r_4way_hash( void* output, const void* input )
blake512_4way( &ctx.blake, input, size );
else
{
mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
blake512_4way( &ctx.blake, vhash, size );
}
blake512_4way_close( &ctx.blake, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case BMW:
bmw512_4way_init( &ctx.bmw );
@@ -116,11 +108,11 @@ void x16r_4way_hash( void* output, const void* input )
bmw512_4way( &ctx.bmw, input, size );
else
{
mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
bmw512_4way( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
@@ -142,11 +134,11 @@ void x16r_4way_hash( void* output, const void* input )
skein512_4way( &ctx.skein, input, size );
else
{
mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way( &ctx.skein, vhash, size );
}
skein512_4way_close( &ctx.skein, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case JH:
jh512_4way_init( &ctx.jh );
@@ -154,11 +146,11 @@ void x16r_4way_hash( void* output, const void* input )
jh512_4way( &ctx.jh, input, size );
else
{
mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
jh512_4way( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case KECCAK:
keccak512_4way_init( &ctx.keccak );
@@ -166,21 +158,21 @@ void x16r_4way_hash( void* output, const void* input )
keccak512_4way( &ctx.keccak, input, size );
else
{
mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
keccak512_4way( &ctx.keccak, vhash, size );
}
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case LUFFA:
mm256_interleave_2x128( vhash, in0, in1, size<<3 );
mm256_intrlv_2x128( vhash, in0, in1, size<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, in2, in3, size<<3 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, in2, in3, size<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
break;
case CUBEHASH:
cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -211,14 +203,14 @@ void x16r_4way_hash( void* output, const void* input )
sph_shavite512_close( &ctx.shavite, hash3 );
break;
case SIMD:
mm256_interleave_2x128( vhash, in0, in1, size<<3 );
mm256_intrlv_2x128( vhash, in0, in1, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, in2, in3, size<<3 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, in2, in3, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
@@ -235,11 +227,11 @@ void x16r_4way_hash( void* output, const void* input )
(const BitSequence*)in3, size<<3 );
break;
case HAMSI:
mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, size );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
@@ -256,11 +248,11 @@ void x16r_4way_hash( void* output, const void* input )
sph_fugue512_close( &ctx.fugue, hash3 );
break;
case SHABAL:
mm128_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 );
mm128_intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
@@ -277,11 +269,11 @@ void x16r_4way_hash( void* output, const void* input )
sph_whirlpool_close( &ctx.whirlpool, hash3 );
break;
case SHA_512:
mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, size );
sha512_4way_close( &ctx.sha512, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
}
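// after the first function in the chain every input is a 64-byte hash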
size = 64;
@@ -304,15 +296,13 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);
for ( int k=0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
// if ( s_ntime != pdata[17] )
if ( s_ntime != endiandata[17] )
{
uint32_t ntime = swab32(pdata[17]);
@@ -326,30 +316,27 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
ptarget[7] = 0x0cff;
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x16r_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg )
if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
} while ( ( n < max_nonce ) && !(*restart) );
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -35,11 +35,9 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )
bool register_x16r_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
// init_x16r_4way_ctx();
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#else
init_x16r_ctx();
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
@@ -52,11 +50,9 @@ bool register_x16r_algo( algo_gate_t* gate )
bool register_x16s_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
// init_x16r_4way_ctx();
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#else
init_x16r_ctx();
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif

View File

@@ -39,20 +39,14 @@ bool register_x16s_algo( algo_gate_t* gate );
#if defined(X16R_4WAY)
void x16r_4way_hash( void *state, const void *input );
int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x16r_4way_ctx();
#endif
void x16r_hash( void *state, const void *input );
int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x16r_ctx();
#endif

View File

@@ -33,7 +33,8 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
typedef struct {
union _x16r_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
@@ -55,19 +56,13 @@ typedef struct {
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
} x16r_ctx_holder;
x16r_ctx_holder x16r_ctx __attribute__ ((aligned (64)));
void init_x16r_ctx()
{
cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
};
typedef union _x16r_context_overlay x16r_context_overlay;
void x16r_hash( void* output, const void* input )
{
uint32_t _ALIGN(128) hash[16];
x16r_ctx_holder ctx;
x16r_context_overlay ctx;
void *in = (void*) input;
int size = 80;
@@ -126,7 +121,7 @@ void x16r_hash( void* output, const void* input )
(const BitSequence*)in, size );
break;
case CUBEHASH:
memcpy( &ctx.cube, &x16r_ctx.cube, sizeof(cubehashParam) );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)in, size );
break;
@@ -196,13 +191,12 @@ int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
for ( int k=0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
// This code is suspicious. s_ntime is saved after byteswapping pdata[17]
// but is tested vs unswapped pdata[17]. This should result in calling
// getAlgoString every pass, but that doesn't seem to be the case.
// It appears to be working correctly as is.
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);

View File

@@ -3,13 +3,9 @@
bool register_x17_algo( algo_gate_t* gate )
{
#if defined (X17_4WAY)
printf("register x17 4way\n");
// init_x17_4way_ctx();
gate->scanhash = (void*)&scanhash_x17_4way;
gate->hash = (void*)&x17_4way_hash;
#else
printf("register x17 no 4way\n");
init_x17_ctx();
gate->scanhash = (void*)&scanhash_x17;
gate->hash = (void*)&x17_hash;
#endif

View File

@@ -13,20 +13,14 @@ bool register_x17_algo( algo_gate_t* gate );
#if defined(X17_4WAY)
void x17_4way_hash( void *state, const void *input );
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
//void init_x17_4way_ctx();
#endif
void x17_hash( void *state, const void *input );
int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x17_ctx();
#endif

View File

@@ -34,7 +34,8 @@
#include "algo/echo/sph_echo.h"
#endif
typedef struct {
union _x17_context_overlay
{
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
@@ -43,7 +44,7 @@ typedef struct {
sph_echo512_context echo;
#endif
hashState_luffa luffa;
cubehashParam cubehash;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
sph_hamsi512_context hamsi;
@@ -52,38 +53,14 @@ typedef struct {
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
sph_haval256_5_context haval;
} x17_ctx_holder;
x17_ctx_holder x17_ctx __attribute__ ((aligned (64)));
void init_x17_ctx()
{
#if defined(__AES__)
init_groestl( &x17_ctx.groestl, 64 );
init_echo( &x17_ctx.echo, 512 );
#else
sph_groestl512_init(&x17_ctx.groestl );
sph_echo512_init(&x17_ctx.echo);
#endif
init_luffa( &x17_ctx.luffa, 512 );
cubehashInit( &x17_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x17_ctx.shavite );
init_sd( &x17_ctx.simd, 512 );
sph_hamsi512_init( &x17_ctx.hamsi );
sph_fugue512_init( &x17_ctx.fugue );
sph_shabal512_init( &x17_ctx.shabal );
sph_whirlpool_init( &x17_ctx.whirlpool );
SHA512_Init( &x17_ctx.sha512 );
sph_haval256_5_init(&x17_ctx.haval);
};
typedef union _x17_context_overlay x17_context_overlay;
void x17_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (64)));
#define hashB hash+64
x17_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &x17_ctx, sizeof(x17_ctx) );
x17_context_overlay ctx;
unsigned char hashbuf[128];
size_t hashptr;
@@ -115,9 +92,11 @@ void x17_hash(void *output, const void *input)
//---groestl----
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash, 64 );
sph_groestl512_close( &ctx.groestl, hash );
#endif
@@ -142,50 +121,62 @@ void x17_hash(void *output, const void *input)
KEC_C;
//--- luffa7
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
// 8 Cube
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)hash, 64 );
// 9 Shavite
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
// 10 Simd
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence*)hash,
(const BitSequence*)hash, 512 );
//11---echo---
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash,
(const BitSequence*)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash, 64 );
sph_echo512_close( &ctx.echo, hash );
#endif
// X13 algos
// 12 Hamsi
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash, 64 );
sph_hamsi512_close( &ctx.hamsi, hash );
// 13 Fugue
sph_fugue512_init( &ctx.fugue );
sph_fugue512(&ctx.fugue, hash, 64 );
sph_fugue512_close(&ctx.fugue, hash );
// X14 Shabal
sph_shabal512_init( &ctx.shabal );
sph_shabal512(&ctx.shabal, hash, 64);
sph_shabal512_close( &ctx.shabal, hash );
// X15 Whirlpool
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash );
SHA512_Init( &ctx.sha512 );
SHA512_Update( &ctx.sha512, hash, 64 );
SHA512_Final( (unsigned char*)hash, &ctx.sha512 );
sph_haval256_5_init(&ctx.haval);
sph_haval256_5( &ctx.haval, (const void*)hash, 64 );
sph_haval256_5_close( &ctx.haval, output );
}

View File

@@ -55,6 +55,7 @@ static void *alloc_region(yespower_region_t *region, size_t size)
base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0);
if (base != MAP_FAILED) {
base_size = new_size;
} else if (flags & MAP_HUGETLB) {
flags &= ~MAP_HUGETLB;
base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
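// The pattern above as a standalone sketch (Linux-specific, assumes
// <sys/mman.h>; a real caller rounds size up to the huge page size
// first, as alloc_region does with new_size):
static void *map_prefer_huge( size_t size )
{
    int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
    void *p = mmap( NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0 );
    if ( p == MAP_FAILED )   // no huge pages reserved, or no permission
        p = mmap( NULL, size, PROT_READ | PROT_WRITE,
                  flags & ~MAP_HUGETLB, -1, 0 );
    return p == MAP_FAILED ? NULL : p;
}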

View File

@@ -7,16 +7,6 @@
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -msha -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-avx2-sha.exe
strip -s cpuminer
mv cpuminer cpuminer-avx2-sha
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=skylake-avx512 -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
@@ -26,7 +16,6 @@ mv cpuminer cpuminer-avx512
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
@@ -36,7 +25,6 @@ mv cpuminer cpuminer-avx2
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
@@ -46,7 +34,6 @@ mv cpuminer cpuminer-aes-avx
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
@@ -56,7 +43,6 @@ mv cpuminer cpuminer-aes-sse42
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
@@ -66,7 +52,6 @@ mv cpuminer cpuminer-sse42
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
@@ -76,7 +61,6 @@ mv cpuminer cpuminer-ssse3
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -msse2 -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
@@ -86,8 +70,7 @@ mv cpuminer cpuminer-sse2
make clean || echo done
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=znver1 -DRYZEN_ -Wall" ./configure --with-curl
CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-zen.exe
@@ -96,13 +79,8 @@ mv cpuminer cpuminer-zen
make clean || echo done
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
#mv cpuminer.exe cpuminer-native.exe
strip -s cpuminer
#mv cpuminer cpuminer-native
#make clean || echo done

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.3.1.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.4.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.9.3.1'
PACKAGE_STRING='cpuminer-opt 3.9.3.1'
PACKAGE_VERSION='3.9.4'
PACKAGE_STRING='cpuminer-opt 3.9.4'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.9.3.1 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.9.4 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.9.3.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.9.4:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.9.3.1
cpuminer-opt configure 3.9.4
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.9.3.1, which was
It was created by cpuminer-opt $as_me 3.9.4, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.9.3.1'
VERSION='3.9.4'
cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.9.3.1, which was
This file was extended by cpuminer-opt $as_me 3.9.4, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.9.3.1
cpuminer-opt config.status 3.9.4
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.9.3.1])
AC_INIT([cpuminer-opt], [3.9.4])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1851,14 +1851,14 @@ static void *miner_thread( void *userdata )
{
#if AFFINITY_USES_UINT128
// Default affinity
if ( (opt_affinity == i128_neg1 ) && opt_n_threads > 1 )
if ( (opt_affinity == (uint128_t)(-1) ) && opt_n_threads > 1 )
{
if ( opt_debug )
applog( LOG_DEBUG, "Binding thread %d to cpu %d.",
thr_id, thr_id % num_cpus,
u128_hi64( (uint128_t)1ULL << (thr_id % num_cpus) ),
u128_lo64( (uint128_t)1ULL << (thr_id % num_cpus) ) );
affine_to_cpu_mask( thr_id, (uint128_t)1ULL << (thr_id % num_cpus) );
u128_hi64( (uint128_t)1 << (thr_id % num_cpus) ),
u128_lo64( (uint128_t)1 << (thr_id % num_cpus) ) );
affine_to_cpu_mask( thr_id, (uint128_t)1 << (thr_id % num_cpus) );
}
#else
if ( (opt_affinity == -1LL) && opt_n_threads > 1 )
@@ -2322,7 +2322,7 @@ bool jr2_stratum_handle_response( json_t *val )
static bool stratum_handle_response( char *buf )
{
json_t *val, *id_val;
json_t *val, *id_val, *res_val;
json_error_t err;
bool ret = false;
@@ -2332,7 +2332,9 @@ static bool stratum_handle_response( char *buf )
applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text);
goto out;
}
json_object_get( val, "result" );
res_val = json_object_get( val, "result" );
if ( !res_val ) { /* now what? */ }
id_val = json_object_get( val, "id" );
if ( !id_val || json_is_null(id_val) )
goto out;
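// One hedged possibility for the open question above (a policy
// assumption, not part of this patch): treat a missing result as
// unconfirmed and log it before the id check.
//
//   if ( !res_val || json_is_null( res_val ) )
//      applog( LOG_WARNING, "Stratum response has no result" );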

View File

@@ -80,6 +80,7 @@
// the element size.
// - there is a subset of some functions for scalar data. They may have
// no prefix nor vec-size, just one size, the size of the data.
// - Some integer functions are also defined which use a similar notation.
//
// Function names follow this pattern:
//
@@ -89,7 +90,7 @@
// are some examples:
//
// u64: unsigned 64 bit integer function
// i128: signed 128 bit integer function
// i128: signed 128 bit integer function (rarely used)
// m128: 128 bit vector identifier
// mm128: 128 bit vector function
//
@@ -137,14 +138,32 @@
// improve high level code readability without the penalty of function
// overhead.
//
// A major restructuring is taking place, shifting the focus from pointers
// to registers. Previously pointer casting used memory to provide
// transparency, leaving it up to the compiler to manage everything, and
// it does a very good job. The focus has shifted to register arguments
// for more control over the actual instructions, assuming the data is in
// a register and the compiler just needs to manage the registers.
//
// Rather than use pointers to provide type transparency, specific
// instructions are used to access specific data as specific types.
// Previously pointers were cast and the compiler was left to find a way
// to get the data from wherever it happened to be to the correct registers.
//
// The utilities defined here make use of features like register aliasing
// to optimize operations. Many operations have specialized versions as
// well as more generic versions. It is preferable to use a specialized
// version whenever possible, as they can take advantage of certain
// optimizations not available to the generic version. Specifically, the
// generic version usually has a second argument used in some extra
// calculations.
//
///////////////////////////////////////////////////////
#include <inttypes.h>
#include <x86intrin.h>
#include <memory.h>
#include <stdlib.h>
#include <stdbool.h>
// byteswap.h doesn't exist on Windows, find alternative
//#include <byteswap.h>
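// A small illustration of the register style described above (a sketch,
// not part of this header, requires AVX2): helpers take and return
// vector values, so the compiler only has to allocate registers, with
// no pointer aliasing to reason about.
static inline __m256i example_negate_add_64( __m256i a, __m256i b )
{
   return _mm256_sub_epi64( _mm256_setzero_si256(),
                            _mm256_add_epi64( a, b ) );
}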
// Various types and overlays
#include "simd-utils/simd-types.h"
@@ -157,6 +176,7 @@
// 64 bit vectors
#include "simd-utils/simd-mmx.h"
#include "simd-utils/intrlv-mmx.h"
#if defined(__SSE2__)
// 128 bit vectors

View File

@@ -384,6 +384,7 @@ static inline void mm256_intrlv_8x32( void *d, const void *s0,
// bit_len == 1024
}
// A couple of mining specific functions.
// Interleave 80 bytes of 32 bit data for 8 lanes.
static inline void mm256_bswap_intrlv80_8x32( void *d, const void *s )
@@ -469,6 +470,20 @@ static inline void mm256_bswap_intrlv80_4x64( void *d, const void *s )
mm256_bswap_intrlv_4x64_128( d+256, casti_m128i( s, 4 ) );
}
// Blend 32 byte lanes of hash from 2 sources according to control mask.
// macro due to 256 bit value arg.
#define mm256_blend_hash_4x64( dst, a, b, mask ) \
do { \
dst[0] = _mm256_blendv_epi8( a[0], b[0], mask ); \
dst[1] = _mm256_blendv_epi8( a[1], b[1], mask ); \
dst[2] = _mm256_blendv_epi8( a[2], b[2], mask ); \
dst[3] = _mm256_blendv_epi8( a[3], b[3], mask ); \
dst[4] = _mm256_blendv_epi8( a[4], b[4], mask ); \
dst[5] = _mm256_blendv_epi8( a[5], b[5], mask ); \
dst[6] = _mm256_blendv_epi8( a[6], b[6], mask ); \
dst[7] = _mm256_blendv_epi8( a[7], b[7], mask ); \
} while(0)
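// Usage sketch (hypothetical wrapper): an all ones 64 bit element in
// the mask takes that lane from b, zero keeps a, per the semantics of
// _mm256_blendv_epi8. Here lanes 1 and 3 come from b.
static inline void blend_lanes_1_3( __m256i *dst, const __m256i *a,
                                    const __m256i *b )
{
   const __m256i mask = _mm256_set_epi64x( -1, 0, -1, 0 );
   mm256_blend_hash_4x64( dst, a, b, mask );
}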
// Deinterleave 4 buffers of 64 bit data from the source buffer.
// bit_len must be 256, 512, 640 or 1024 bits.
// Requires overrun padding for 640 bit len.

View File

@@ -103,6 +103,29 @@
#define mm128_extr_lo128_256( a ) _mm256_castsi256_si128( a )
#define mm128_extr_hi128_256( a ) _mm256_extracti128_si256( a, 1 )
// Extract 4 u64 from 256 bit vector.
#define mm256_extr_4x64( a0, a1, a2, a3, src ) \
do { \
__m128i hi = _mm256_extracti128_si256( src, 1 ); \
a0 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 0 ); \
a1 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \
a2 = _mm_extract_epi64( hi, 0 ); \
a3 = _mm_extract_epi64( hi, 1 ); \
} while(0)
#define mm256_extr_8x32( a0, a1, a2, a3, a4, a5, a6, a7, src ) \
do { \
__m128i hi = _mm256_extracti128_si256( src, 1 ); \
a0 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 0 ); \
a1 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 1 ); \
a2 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 2 ); \
a3 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 3 ); \
a4 = _mm_extract_epi32( hi, 0 ); \
a5 = _mm_extract_epi32( hi, 1 ); \
a6 = _mm_extract_epi32( hi, 2 ); \
a7 = _mm_extract_epi32( hi, 3 ); \
} while(0)
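// Usage sketch (hypothetical helper): horizontal sum of the four
// 64 bit elements via the extract macro above.
static inline uint64_t hsum_4x64( __m256i v )
{
   uint64_t a0, a1, a2, a3;
   mm256_extr_4x64( a0, a1, a2, a3, v );
   return a0 + a1 + a2 + a3;
}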
// input __m128i, returns __m256i
// To build a 256 bit vector from 2 128 bit vectors lo must be done first.
// lo alone leaves hi undefined, hi alone leaves lo unchanged.
@@ -111,10 +134,24 @@
#define mm256_ins_lo128_256( a, b ) _mm256_inserti128_si256( a, b, 0 )
#define mm256_ins_hi128_256( a, b ) _mm256_inserti128_si256( a, b, 1 )
// concatenate two 128 bit vectors into one 256 bit vector
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
mm256_ins_hi128_256( _mm256_castsi128_si256( lo ), hi )
// Horizontal vector testing
// Bit-wise test of entire vector, useful to test results of cmp.
// anybits1 is nonzero if any bit is set, anybits0 if any bit is clear.
#define mm256_anybits0( a ) \
( ( (uint128_t)mm128_extr_hi128_256( a ) + 1 ) \
| ( (uint128_t)mm128_extr_lo128_256( a ) + 1 ) )
#define mm256_anybits1( a ) \
( (uint128_t)mm128_extr_hi128_256( a ) \
| (uint128_t)mm128_extr_lo128_256( a ) )
#define mm256_allbits0_256( a ) ( !mm256_anybits1(a) )
#define mm256_allbits1_256( a ) ( !mm256_anybits0(a) )
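// Usage sketch (hypothetical wrapper): full width equality test, true
// only when the compare result is all ones.
static inline bool mm256_eq_64( __m256i a, __m256i b )
{
   return mm256_allbits1_256( _mm256_cmpeq_epi64( a, b ) );
}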
// Parallel AES, for when x is expected to be in a 256 bit register.
#define mm256_aesenc_2x128( x ) \
mm256_concat_128( \

View File

@@ -3,9 +3,15 @@
///////////////////////////////////
//
// Integers up to 64 bits.
// Integers up to 128 bits.
//
// These utilities enhance support for integers up to 128 bits.
// All standard operations are supported on 128 bit integers except
// numeric constant representation and IO. 128 bit integers must be built
// and displayed as 2 64 bit halves, just like the old times.
//
// Some utilities are also provided for smaller integers, most notably
// bit rotation.
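// For example (a sketch, assumes <stdio.h>): output must also be split
// into 64 bit halves, since printf has no 128 bit conversion.
static inline void u128_print( uint128_t x )
{
   printf( "0x%016llx%016llx\n", (unsigned long long)(x >> 64),
                                 (unsigned long long)x );
}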
// MMX has no extract instruction for 32 bit elements, so do it this way:
// lo is trivial, hi is a simple shift.
@@ -17,7 +23,6 @@
#define u64_extr_16( a, n ) ( (uint16_t)( (a) >> ( ( 4-(n)) <<4 ) ) )
#define u64_extr_8( a, n ) ( (uint8_t) ( (a) >> ( ( 8-(n)) <<3 ) ) )
// Rotate bits in various sized integers.
#define u64_ror_64( x, c ) \
(uint64_t)( ( (uint64_t)(x) >> (c) ) | ( (uint64_t)(x) << (64-(c)) ) )
@@ -36,6 +41,9 @@
#define u8_rol_8( x, c ) \
(uint8_t) ( ( (uint8_t) (x) << (c) ) | ( (uint8_t) (x) >> ( 8-(c)) ) )
// Endian byte swap
#define bswap_64( a ) __builtin_bswap64( a )
#define bswap_32( a ) __builtin_bswap32( a )
// 64 bit mem functions use integral sizes instead of bytes, data must
// be aligned to 64 bits. Mostly for scaled indexing convenience.
@@ -56,21 +64,20 @@ static inline void memset_64( uint64_t *dst, const uint64_t a, int n )
//
// No real need or use.
#define i128_neg1 ((uint128_t)(-1LL))
//#define u128_neg1 ((uint128_t)(-1))
// Extract specified 64 bit half of 128 bit integer.
// typecast should work for lo: (uint64_t)(x), test it!
// Extracting the low bits is a trivial cast.
// These specialized functions are optimized while providing a
// consistent interface.
#define u128_hi64( x ) ( (uint64_t)( (uint128_t)(x) >> 64 ) )
#define u128_lo64( x ) ( (uint64_t)( (uint128_t)(x) << 64 >> 64 ) )
// #define i128_lo64( x ) ((uint64_t)(x))
#define u128_lo64( x ) ( (uint64_t)(x) )
// Generic extract,
// Generic extract, don't use for extracting low bits, cast instead.
#define u128_extr_64( a, n ) ( (uint64_t)( (a) >> ( ( 2-(n)) <<6 ) ) )
#define u128_extr_32( a, n ) ( (uint32_t)( (a) >> ( ( 4-(n)) <<5 ) ) )
#define u128_extr_16( a, n ) ( (uint16_t)( (a) >> ( ( 8-(n)) <<4 ) ) )
#define u128_extr_8( a, n ) ( (uint8_t) ( (a) >> ( (16-(n)) <<3 ) ) )
// Not much need for this but it fills a gap.
#define u128_ror_128( x, c ) \
( ( (uint128_t)(x) >> (c) ) | ( (uint128_t)(x) << (128-(c)) ) )
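// Sketch (hypothetical helper): cpu-miner.c builds its 128 bit CPU
// affinity masks this way before splitting them with u128_hi64/lo64
// for logging.
static inline uint128_t cpu_mask_128( int cpu )
{
   return (uint128_t)1 << ( cpu & 127 );   // keep the shift in range
}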

View File

@@ -111,8 +111,8 @@
#if defined(__SSSE3__)
// An SSE2 versin of this would be monstrous, shifting, masking and oring
// each byte individually.
// An SSE2 or MMX version of this would be monstrous, shifting, masking and
// oring each byte individually.
#define mm64_invert_8( v ) \
_mm_shuffle_pi8( (__m64)v, _mm_set_pi8( 0,1,2,3,4,5,6,7 ) );

View File

@@ -10,11 +10,21 @@
// SSE2 is generally required for full 128 bit support. Some functions
// are also optimized with SSSE3 or SSE4.1.
//
// Do not call _mm_extract directly; it isn't supported in SSE2.
// Use mm128_extr instead; it will select the appropriate implementation.
//
// 128 bit operations are enhanced with uint128_t, which adds 128 bit
// integer support for arithmetic and other operations. Casting to
// uint128_t is not free; it requires a move from vector registers to
// GPRs, but it is often the only way, or the more efficient way, to do
// certain operations.
// Compile time constant initializers are type agnostic and can have
// a pointer handle of almost any type. All arguments must be scalar
// constants up to 64 bits. These initializers should only be used at
// compile time to initialize vector arrays. All data reside in memory.
//
// They are of limited use; it is often simpler to use uint64_t arrays
// and cast as required.
#define mm128_const_64( x1, x0 ) {{ x1, x0 }}
#define mm128_const1_64( x ) {{ x, x }}
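// Sketch of compile time initialization; the overlay type here is
// hypothetical, any union with a 2x64 bit member will do.
typedef union { uint64_t u64[2]; __m128i v128; } m128_v64_example;
static const m128_v64_example k_example[2] __attribute__ ((aligned (16))) =
   { mm128_const_64( 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL ),
     mm128_const1_64( 0x8080808080808080ULL ) };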
@@ -80,6 +90,28 @@
#define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v )
#define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v )
// Use uint128_t for most arithmetic, bit shift, and comparison
// operations spanning all 128 bits. Some extractions are also more
// efficient casting __m128i as uint128_t and using standard operators.
// This isn't cheap and is not suitable for bulk usage.
#define mm128_extr_4x32( a0, a1, a2, a3, src ) \
do { \
a0 = _mm_extract_epi32( src, 0 ); \
a1 = _mm_extract_epi32( src, 1 ); \
a2 = _mm_extract_epi32( src, 2 ); \
a3 = _mm_extract_epi32( src, 3 ); \
} while(0)
// Horizontal vector testing
// Bit-wise test of entire vector, useful to test results of cmp.
#define mm128_anybits0( a ) (((uint128_t)(a))+1)
#define mm128_anybits1( a ) (uint128_t)(a)
#define mm128_allbits0( a ) ( !mm128_anybits1(a) )
#define mm128_allbits1( a ) ( !mm128_anybits0(a) )
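// Usage sketch (hypothetical wrapper), mirroring the 256 bit version:
// true only when the compare result is all ones.
static inline bool mm128_eq_32( __m128i a, __m128i b )
{
   return mm128_allbits1( _mm_cmpeq_epi32( a, b ) );
}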
//
// Vector pointer cast

View File

@@ -93,6 +93,7 @@
// my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );
// Compiler check for __int128 support
// Configure also has a test for int128.
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
#define GCC_INT128 1
#endif
@@ -386,13 +387,3 @@ typedef union _regarray_v256 regarray_v256;
#define u8_1e u8_._1e
#define u8_1f u8_._1f
// This is in use by, coincidentally, simd hash.
union _m256_v16 {
uint16_t u16[16];
__m256i v256;
};
typedef union _m256_v16 m256_v16;

View File

@@ -34,7 +34,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=znver1 -DRYZEN_ -Wall" ./configure $CONFIGURE_ARGS
CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-zen.exe