Compare commits

..

3 Commits

Author SHA1 Message Date
Jay D Dee
d6e8d7a46e v3.9.4 2019-06-18 13:15:45 -04:00
Jay D Dee
71d6b97ee8 v3.9.3.1 2019-06-13 21:15:58 -04:00
Jay D Dee
b2331375a3 v3.9.2.5 2019-06-13 11:20:27 -04:00
126 changed files with 6175 additions and 5098 deletions

View File

@@ -42,9 +42,6 @@ openssl 1.1.0e or higher. Add one of the following, depending on the
compiler version, to CFLAGS:
"-march=native" or "-march=znver1" or "-msha".
Due to poor AVX2 performance on Ryzen users should add -DRYZEN_ to CFLAGS
to override multiway AVX2 on algos with sha256, and use SHA instead.
Additional instructions for static compilalation can be found here:
https://lxadm.com/Static_compilation_of_cpuminer
Static builds should only considered in a homogeneous HW and SW environment.

View File

@@ -131,6 +131,7 @@ cpuminer_SOURCES = \
algo/lyra2/lyra2h-4way.c \
algo/lyra2/allium-4way.c \
algo/lyra2/allium.c \
algo/lyra2/phi2-4way.c \
algo/lyra2/phi2.c \
algo/m7m.c \
algo/neoscrypt/neoscrypt.c \
@@ -147,6 +148,9 @@ cpuminer_SOURCES = \
algo/quark/anime-gate.c \
algo/quark/anime.c \
algo/quark/anime-4way.c \
algo/quark/hmq1725-gate.c \
algo/quark/hmq1725-4way.c \
algo/quark/hmq1725.c \
algo/qubit/qubit-gate.c \
algo/qubit/qubit.c \
algo/qubit/qubit-2way.c \
@@ -257,7 +261,6 @@ cpuminer_SOURCES = \
algo/x17/xevan-gate.c \
algo/x17/xevan.c \
algo/x17/xevan-4way.c \
algo/x17/hmq1725.c \
algo/x17/sonoa-gate.c \
algo/x17/sonoa-4way.c \
algo/x17/sonoa.c \

View File

@@ -29,7 +29,7 @@ cpuminer-sse2.exe "-msse2" Core2, Nehalem
cpuminer-aes-sse42.exe "-march=westmere" Westmere
cpuminer-avx.exe "-march=corei7-avx" Sandy-Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell, Sky-Kaby-Coffeelake
cpuminer-zen "-march=znver1 -DRYZEN_" Ryzen
cpuminer-zen "-march=znver1" AMD Ryzen, Threadripper
If you like this software feel free to donate:

View File

@@ -38,6 +38,25 @@ supported.
Change Log
----------
v3.9.4
Faster AVX2 for lyra2v3, quark, anime.
Fixed skein AVX2 regression (invalid shares since v3.9.0) and faster.
Faster skein2 with 4way AVX2 enabled.
Automatic SHA override on Ryzen CPUs, no need for -DRYZEN compile flag.
Ongoing restructuring.
v3.9.3.1
Skipped v3.9.3 due to misidentification of v3.9.2.5 as v3.9.3.
Fixed x16r algo 25% invalid share reject rate. The bug may have also
affected other algos.
v3.9.2.5
Fixed 2 regressions: hodl AES detection, x16r invalid shares with AVX2.
More restructuring.
v3.9.2.4
Yet another affinity fix. Hopefully the last one.

View File

@@ -2,8 +2,7 @@
#include <stdbool.h>
#include <stdint.h>
#include "miner.h"
#include "avxdefs.h"
#include "interleave.h"
#include "simd-utils.h"
/////////////////////////////
////

View File

@@ -112,7 +112,7 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
void free_memory(const argon2_context *context, uint8_t *memory,
size_t num, size_t size) {
size_t memory_size = num*size;
clear_internal_memory(memory, memory_size);
// clear_internal_memory(memory, memory_size);
if (context->free_cbk) {
(context->free_cbk)(memory, memory_size);
} else {
@@ -137,7 +137,7 @@ void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) {
int FLAG_clear_internal_memory = 0;
void clear_internal_memory(void *v, size_t n) {
if (FLAG_clear_internal_memory && v) {
secure_wipe_memory(v, n);
// secure_wipe_memory(v, n);
}
}
@@ -559,7 +559,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
context->pwdlen);
if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
secure_wipe_memory(context->pwd, context->pwdlen);
// secure_wipe_memory(context->pwd, context->pwdlen);
context->pwdlen = 0;
}
}
@@ -580,7 +580,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
context->secretlen);
if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
secure_wipe_memory(context->secret, context->secretlen);
// secure_wipe_memory(context->secret, context->secretlen);
context->secretlen = 0;
}
}

View File

@@ -45,7 +45,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#define SPH_SIZE_blake256 256

View File

@@ -16,7 +16,7 @@
#if defined(__SSE4_2__)
#include "avxdefs.h"
#include "simd-utils.h"
#include <stddef.h>
#include <stdint.h>

View File

@@ -43,7 +43,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#define SPH_SIZE_bmw256 256

View File

@@ -537,6 +537,8 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
}
}
sc->ptr = ptr;
if ( h1 != sc->H )
memcpy_128( sc->H, h1, 16 );
}
@@ -571,6 +573,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_small( buf, (__m128i*)final_s, h1 );
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
@@ -1041,22 +1044,22 @@ static const __m256i final_s8[16] =
void bmw256_8way_init( bmw256_8way_context *ctx )
{
ctx->H[ 0] = _mm256_set1_epi64x( IV256[ 0] );
ctx->H[ 1] = _mm256_set1_epi64x( IV256[ 1] );
ctx->H[ 2] = _mm256_set1_epi64x( IV256[ 2] );
ctx->H[ 3] = _mm256_set1_epi64x( IV256[ 3] );
ctx->H[ 4] = _mm256_set1_epi64x( IV256[ 4] );
ctx->H[ 5] = _mm256_set1_epi64x( IV256[ 5] );
ctx->H[ 6] = _mm256_set1_epi64x( IV256[ 6] );
ctx->H[ 7] = _mm256_set1_epi64x( IV256[ 7] );
ctx->H[ 8] = _mm256_set1_epi64x( IV256[ 8] );
ctx->H[ 9] = _mm256_set1_epi64x( IV256[ 9] );
ctx->H[10] = _mm256_set1_epi64x( IV256[10] );
ctx->H[11] = _mm256_set1_epi64x( IV256[11] );
ctx->H[12] = _mm256_set1_epi64x( IV256[12] );
ctx->H[13] = _mm256_set1_epi64x( IV256[13] );
ctx->H[14] = _mm256_set1_epi64x( IV256[14] );
ctx->H[15] = _mm256_set1_epi64x( IV256[15] );
ctx->H[ 0] = _mm256_set1_epi32( IV256[ 0] );
ctx->H[ 1] = _mm256_set1_epi32( IV256[ 1] );
ctx->H[ 2] = _mm256_set1_epi32( IV256[ 2] );
ctx->H[ 3] = _mm256_set1_epi32( IV256[ 3] );
ctx->H[ 4] = _mm256_set1_epi32( IV256[ 4] );
ctx->H[ 5] = _mm256_set1_epi32( IV256[ 5] );
ctx->H[ 6] = _mm256_set1_epi32( IV256[ 6] );
ctx->H[ 7] = _mm256_set1_epi32( IV256[ 7] );
ctx->H[ 8] = _mm256_set1_epi32( IV256[ 8] );
ctx->H[ 9] = _mm256_set1_epi32( IV256[ 9] );
ctx->H[10] = _mm256_set1_epi32( IV256[10] );
ctx->H[11] = _mm256_set1_epi32( IV256[11] );
ctx->H[12] = _mm256_set1_epi32( IV256[12] );
ctx->H[13] = _mm256_set1_epi32( IV256[13] );
ctx->H[14] = _mm256_set1_epi32( IV256[14] );
ctx->H[15] = _mm256_set1_epi32( IV256[15] );
ctx->ptr = 0;
ctx->bit_count = 0;
@@ -1076,14 +1079,15 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
ptr = ctx->ptr;
h1 = ctx->H;
h2 = htmp;
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( buf + (ptr>>3), vdata, clen >> 3 );
vdata = vdata + (clen>>3);
memcpy_256( buf + (ptr>>2), vdata, clen >> 2 );
vdata = vdata + (clen>>2);
len -= clen;
ptr += clen;
if ( ptr == buf_size )
@@ -1097,6 +1101,7 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
}
}
ctx->ptr = ptr;
if ( h1 != ctx->H )
memcpy_256( ctx->H, h1, 16 );
}
@@ -1106,24 +1111,26 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
__m256i *buf;
__m256i h1[16], h2[16], *h;
size_t ptr, u, v;
// unsigned z;
const int buf_size = 64; // bytes of one lane, compatible with len
buf = ctx->buf;
ptr = ctx->ptr;
buf[ ptr>>3 ] = _mm256_set1_epi32( 0x80 );
ptr += 8;
buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
ptr += 4;
h = ctx->H;
if ( ptr > (buf_size - 8) )
if ( ptr > (buf_size - 4) )
{
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
memset_zero_256( buf + (ptr>>2), (buf_size - ptr) >> 2 );
compress_small_8way( buf, h, h1 );
ptr = 0;
h = h1;
}
memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( ctx->bit_count );
memset_zero_256( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = _mm256_set1_epi32( ctx->bit_count );
buf[ (buf_size - 4) >> 2 ] = m256_zero;
compress_small_8way( buf, h, h2 );
for ( u = 0; u < 16; u ++ )

View File

@@ -4,7 +4,7 @@
#if defined(__AVX2__)
#include <stdint.h>
#include "avxdefs.h"
#include "simd-utils.h"
// 2x128, 2 way parallel SSE2

View File

@@ -13,7 +13,7 @@
#include <stdbool.h>
#include <unistd.h>
#include <memory.h>
#include "avxdefs.h"
#include "simd-utils.h"
#include <stdio.h>
// The result of hashing 10 rounds of initial data which is params and

View File

@@ -12,7 +12,7 @@
#include <memory.h>
#include "hash-groestl.h"
#include "miner.h"
#include "avxdefs.h"
#include "simd-utils.h"
#ifndef NO_AES_NI

View File

@@ -9,7 +9,7 @@
#include <memory.h>
#include "hash-groestl256.h"
#include "miner.h"
#include "avxdefs.h"
#include "simd-utils.h"
#ifndef NO_AES_NI

View File

@@ -40,7 +40,7 @@
#if defined (__AVX2__)
#include "avxdefs.h"
#include "simd-utils.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -69,7 +69,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#define SPH_SIZE_haval256_5 256

View File

@@ -156,7 +156,7 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
bool register_hodl_algo( algo_gate_t* gate )
{
#if defined(__AES__)
#if !defined(__AES__)
applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
return false;
#endif

View File

@@ -44,7 +44,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#define SPH_SIZE_jh256 256

View File

@@ -44,7 +44,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#define SPH_SIZE_keccak256 256

View File

@@ -24,7 +24,7 @@
#if defined(__AVX2__)
#include "avxdefs.h"
#include "simd-utils.h"
#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
0UL, 0UL, 0UL, 0xffffffffUL )

View File

@@ -24,7 +24,7 @@
#include <immintrin.h>
#include "algo/sha/sha3-defs.h"
#include "avxdefs.h"
#include "simd-utils.h"
/* The length of digests*/
#define DIGEST_BIT_LEN_224 224

View File

@@ -20,7 +20,7 @@
#include <string.h>
#include <emmintrin.h>
#include "avxdefs.h"
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#define MULT2(a0,a1) do \

View File

@@ -44,10 +44,11 @@ void allium_4way_hash( void *state, const void *input )
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash32 );
mm256_reinterleave_4x64( vhash64, vhash32, 256 );
mm256_rintrlv_4x32_4x64( vhash64, vhash32, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
@@ -67,26 +68,23 @@ void allium_4way_hash( void *state, const void *input )
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
mm256_intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
}
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -94,7 +92,6 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -106,13 +103,7 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &allium_4way_ctx.blake );
blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
@@ -124,7 +115,7 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
{
if ( fulltest( hash+(lane<<3), ptarget ) )
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, hash+(lane<<3), mythr, lane );

View File

@@ -47,7 +47,9 @@ bool lyra2rev3_thread_init()
int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v3_wholeMatrix = _mm_malloc( size, 64 );
#if defined (LYRA2REV3_4WAY)
#if defined (LYRA2REV3_8WAY)
init_lyra2rev3_8way_ctx();;
#elif defined (LYRA2REV3_4WAY)
init_lyra2rev3_4way_ctx();;
#else
init_lyra2rev3_ctx();
@@ -57,7 +59,10 @@ bool lyra2rev3_thread_init()
bool register_lyra2rev3_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV3_4WAY)
#if defined (LYRA2REV3_8WAY)
gate->scanhash = (void*)&scanhash_lyra2rev3_8way;
gate->hash = (void*)&lyra2rev3_8way_hash;
#elif defined (LYRA2REV3_4WAY)
gate->scanhash = (void*)&scanhash_lyra2rev3_4way;
gate->hash = (void*)&lyra2rev3_4way_hash;
#else
@@ -203,13 +208,18 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_phi2_algo( algo_gate_t* gate )
{
init_phi2_ctx();
// init_phi2_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->get_work_data_size = (void*)&phi2_get_work_data_size;
gate->decode_extra_data = (void*)&phi2_decode_extra_data;
gate->build_extraheader = (void*)&phi2_build_extraheader;
gate->set_target = (void*)&alt_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
#if defined(PHI2_4WAY)
gate->scanhash = (void*)&scanhash_phi2_4way;
#else
init_phi2_ctx();
gate->scanhash = (void*)&scanhash_phi2;
#endif
return true;
}

View File

@@ -6,14 +6,24 @@
#include "lyra2.h"
#if defined(__AVX2__)
#define LYRA2REV3_8WAY
#endif
#if defined(__SSE2__)
#define LYRA2REV3_4WAY
#endif
extern __thread uint64_t* l2v3_wholeMatrix;
bool register_lyra2rev3_algo( algo_gate_t* gate );
#if defined(LYRA2REV3_8WAY)
#if defined(LYRA2REV3_4WAY)
void lyra2rev3_8way_hash( void *state, const void *input );
int scanhash_lyra2rev3_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev3_8way_ctx();
#elif defined(LYRA2REV3_4WAY)
void lyra2rev3_4way_hash( void *state, const void *input );
int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -140,15 +150,29 @@ bool init_allium_ctx();
/////////////////////////////////////////
#if defined(__AVX2__) && defined(__AES__)
// #define PHI2_4WAY
#endif
bool phi2_has_roots;
bool register_phi2_algo( algo_gate_t* gate );
#if defined(PHI2_4WAY)
void phi2_hash_4way( void *state, const void *input );
int scanhash_phi2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
//void init_phi2_ctx();
#else
void phi2_hash( void *state, const void *input );
int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_phi2_ctx();
#endif
#endif // LYRA2_GATE_H__

View File

@@ -566,7 +566,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
#if defined(__AVX2__)
memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
#elif defined(__SSE4_2__)
#elif defined(__SSE2__)
memset_zero_128( (__m128i*)wholeMatrix, i>>4 );
#else
memset( wholeMatrix, 0, i );

View File

@@ -36,17 +36,16 @@ void lyra2h_4way_hash( void *state, const void *input )
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
LYRA2Z( lyra2h_4way_matrix, state, 32, hash0, 32, hash0, 32,
16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, state+32, 32, hash1, 32, hash1,
32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, state+64, 32, hash2, 32, hash2,
32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, state+96, 32, hash3, 32, hash3,
32, 16, 16, 16 );
}
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -54,49 +53,36 @@ int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep= vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 20; i++ )
be32enc( &edata[i], pdata[i] );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
lyra2h_4way_midstate( vdata );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
be32enc( &edata[19], n );
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2h_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -6,7 +6,7 @@
#include "algo/keccak/sph_keccak.h"
#include "lyra2.h"
#include "algo-gate-api.h"
#include "avxdefs.h"
#include "simd-utils.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl256.h"
#endif

View File

@@ -42,10 +42,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash );
mm256_reinterleave_4x64( vhash64, vhash, 256 );
mm256_rintrlv_4x32_4x64( vhash64, vhash, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
@@ -60,10 +62,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
mm256_intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
@@ -74,11 +78,10 @@ void lyra2rev2_4way_hash( void *state, const void *input )
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, vhash );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -86,49 +89,44 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2rev2_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -1,12 +1,138 @@
#include "lyra2-gate.h"
#include <memory.h>
#if defined (LYRA2REV3_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined (LYRA2REV3_8WAY)
typedef struct {
blake256_8way_context blake;
cubehashParam cube;
bmw256_8way_context bmw;
} lyra2v3_8way_ctx_holder;
static lyra2v3_8way_ctx_holder l2v3_8way_ctx;
bool init_lyra2rev3_8way_ctx()
{
blake256_8way_init( &l2v3_8way_ctx.blake );
cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 );
bmw256_8way_init( &l2v3_8way_ctx.bmw );
return true;
}
void lyra2rev3_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (32)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
blake256_8way( &ctx.blake, input, 80 );
blake256_8way_close( &ctx.blake, vhash );
mm256_dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash4, (const byte*) hash4, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash5, (const byte*) hash5, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash6, (const byte*) hash6, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash7, (const byte*) hash7, 32 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
mm256_intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
bmw256_8way( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev3_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap_intrlv80_8x32( vdata, pdata );
do
{
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
lyra2rev3_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
{
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif
#if defined (LYRA2REV3_4WAY)
typedef struct {
blake256_4way_context blake;
cubehashParam cube;
@@ -35,7 +161,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
blake256_4way( &ctx.blake, input, 80 );
blake256_4way_close( &ctx.blake, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
@@ -55,10 +181,9 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -66,7 +191,6 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
@@ -80,15 +204,7 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
// Need big endian data
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -99,16 +215,14 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
}
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}

View File

@@ -36,17 +36,12 @@ void lyra2z_4way_hash( void *state, const void *input )
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
LYRA2Z( lyra2z_4way_matrix, state , 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+32, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+64, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+96, 32, hash3, 32, hash3, 32, 8, 8, 8 );
}
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -54,7 +49,6 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -66,13 +60,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
lyra2z_4way_midstate( vdata );
do {
@@ -82,16 +70,11 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, i );
else
applog( LOG_WARNING, "Failed to submit share." );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
@@ -136,8 +119,8 @@ void lyra2z_8way_hash( void *state, const void *input )
blake256_8way( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
mm256_deinterleave_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
mm256_dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
@@ -163,7 +146,6 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -175,13 +157,7 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_8x32( vdata, pdata );
lyra2z_8way_midstate( vdata );
do {
@@ -191,7 +167,8 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
submit_solution( work, hash+(i<<3), mythr, i );

View File

@@ -3,7 +3,7 @@
#include "lyra2-gate.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "avxdefs.h"
#include "simd-utils.h"
__thread uint64_t* lyra2z_matrix;

View File

@@ -1,7 +1,7 @@
#include <memory.h>
#include "algo-gate-api.h"
#include "lyra2.h"
#include "avxdefs.h"
#include "simd-utils.h"
__thread uint64_t* lyra2z330_wholeMatrix;
@@ -30,14 +30,17 @@ int scanhash_lyra2z330( int thr_id, struct work *work, uint32_t max_nonce,
if (opt_benchmark)
ptarget[7] = 0x0000ff;
for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
do
{
be32enc(&endiandata[19], nonce);
lyra2z330_hash( hash, endiandata, work->height );
if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
if ( hash[7] <= Htarg && fulltest(hash, ptarget) && !opt_benchmark )
{
work_set_target_ratio(work, hash);
pdata[19] = nonce;

233
algo/lyra2/phi2-4way.c Normal file
View File

@@ -0,0 +1,233 @@
/**
* Phi-2 algo Implementation
*/
#include "lyra2-gate.h"
#if defined(PHI2_4WAY)
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/gost/sph_gost.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
cubehashParam cube;
jh512_4way_context jh;
hashState_echo echo;
// hashState_echo echo2;
sph_gost512_context gost;
skein512_4way_context skein;
} phi2_ctx_holder;
/*
phi2_ctx_holder phi2_ctx;
void init_phi2_ctx()
{
cubehashInit( &phi2_ctx.cube, 512, 16, 32 );
sph_jh512_init(&phi2_ctx.jh);
init_echo( &phi2_ctx.echo1, 512 );
init_echo( &phi2_ctx.echo2, 512 );
sph_gost512_init(&phi2_ctx.gost);
sph_skein512_init(&phi2_ctx.skein);
};
*/
void phi2_hash_4way( void *state, const void *input )
{
uint32_t hash[4][16] __attribute__ ((aligned (64)));
uint32_t hashA[4][16] __attribute__ ((aligned (64)));
uint32_t hashB[4][16] __attribute__ ((aligned (64)));
uint32_t vhash[4*16] __attribute__ ((aligned (64)));
// unsigned char _ALIGN(128) hash[64];
// unsigned char _ALIGN(128) hashA[64];
// unsigned char _ALIGN(128) hashB[64];
phi2_ctx_holder ctx __attribute__ ((aligned (64)));
// memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB[0], (const byte*)input,
phi2_has_roots ? 144 : 80 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB[1], (const byte*)input+144,
phi2_has_roots ? 144 : 80 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB[2], (const byte*)input+288,
phi2_has_roots ? 144 : 80 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB[3], (const byte*)input+432,
phi2_has_roots ? 144 : 80 );
LYRA2RE( &hashA[0][0], 32, &hashB[0][0], 32, &hashB[0][0], 32, 1, 8, 8 );
LYRA2RE( &hashA[0][8], 32, &hashB[0][8], 32, &hashB[0][8], 32, 1, 8, 8 );
LYRA2RE( &hashA[1][0], 32, &hashB[1][0], 32, &hashB[1][0], 32, 1, 8, 8 );
LYRA2RE( &hashA[1][8], 32, &hashB[1][8], 32, &hashB[1][8], 32, 1, 8, 8 );
LYRA2RE( &hashA[2][0], 32, &hashB[2][0], 32, &hashB[2][0], 32, 1, 8, 8 );
LYRA2RE( &hashA[2][8], 32, &hashB[2][8], 32, &hashB[2][8], 32, 1, 8, 8 );
LYRA2RE( &hashA[3][0], 32, &hashB[3][0], 32, &hashB[3][0], 32, 1, 8, 8 );
LYRA2RE( &hashA[3][8], 32, &hashB[3][8], 32, &hashB[3][8], 32, 1, 8, 8 );
mm256_intrlv_4x64( vhash, hashA[0], hashA[1], hashA[2], hashA[3], 512 );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
mm256_dintrlv_4x64( hash[0], hash[1], hash[2], hash[3], vhash, 512 );
if ( hash[0][0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash[0], 64 );
sph_gost512_close( &ctx.gost, (void*)hash[0] );
}
else
{
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[0],
(const BitSequence *)hash[0], 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[0],
(const BitSequence *)hash[0], 512 );
}
if ( hash[1][0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash[1], 64 );
sph_gost512_close( &ctx.gost, (void*)hash[1] );
}
else
{
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[1],
(const BitSequence *)hash[1], 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[1],
(const BitSequence *)hash[1], 512 );
}
if ( hash[2][0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash[2], 64 );
sph_gost512_close( &ctx.gost, (void*)hash[2] );
}
else
{
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[2],
(const BitSequence *)hash[2], 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[2],
(const BitSequence *)hash[2], 512 );
}
if ( hash[3][0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash[3], 64 );
sph_gost512_close( &ctx.gost, (void*)hash[3] );
}
else
{
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[3],
(const BitSequence *)hash[3], 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash[3],
(const BitSequence *)hash[3], 512 );
}
mm256_intrlv_4x64( vhash, hash[0], hash[1], hash[2], hash[3], 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
for (int i=0; i<4; i++)
{
( (uint64_t*)vhash )[i] ^= ( (uint64_t*)vhash )[i+4];
( (uint64_t*)vhash+ 8 )[i] ^= ( (uint64_t*)vhash+ 8 )[i+4];
( (uint64_t*)vhash+16 )[i] ^= ( (uint64_t*)vhash+16 )[i+4];
( (uint64_t*)vhash+24 )[i] ^= ( (uint64_t*)vhash+24 )[i+4];
}
// for ( int i = 0; i < 4; i++ )
// casti_m256i( vhash, i ) = _mm256_xor_si256( casti_m256i( vhash, i ),
// casti_m256i( vhash, i+4 ) );
memcpy( state, vhash, 128 );
}
int scanhash_phi2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) edata[36];
uint32_t vdata[4][36] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if(opt_benchmark){
ptarget[7] = 0x00ff;
}
// Data is not interleaved, but hash is.
// any non-zero data at index 20 or above sets roots true.
// Split up the operations, bswap first, then set roots.
phi2_has_roots = false;
for ( int i=0; i < 36; i++ )
{
be32enc(&edata[i], pdata[i]);
if (i >= 20 && pdata[i]) phi2_has_roots = true;
}
/*
casti_m256i( vdata[0], 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( vdata[0], 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m256i( vdata[0], 2 ) = mm256_bswap_32( casti_m256i( pdata, 2 ) );
casti_m256i( vdata[0], 3 ) = mm256_bswap_32( casti_m256i( pdata, 3 ) );
casti_m128i( vdata[0], 8 ) = mm128_bswap_32( casti_m128i( pdata, 8 ) );
phi2_has_roots = mm128_anybits1( casti_m128i( vdata[0], 5 ) ) ||
mm128_anybits1( casti_m128i( vdata[0], 6 ) ) ||
mm128_anybits1( casti_m128i( vdata[0], 7 ) ) ||
mm128_anybits1( casti_m128i( vdata[0], 8 ) );
*/
memcpy( vdata[0], edata, 144 );
memcpy( vdata[1], edata, 144 );
memcpy( vdata[2], edata, 144 );
memcpy( vdata[3], edata, 144 );
do {
be32enc( &vdata[0][19], n );
be32enc( &vdata[1][19], n+1 );
be32enc( &vdata[2][19], n+2 );
be32enc( &vdata[3][19], n+3 );
phi2_hash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[ lane<<1 ] < Htarg )
{
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif // PHI2_4WAY

View File

@@ -50,11 +50,11 @@ void phi2_hash(void *state, const void *input)
unsigned char _ALIGN(128) hashA[64];
unsigned char _ALIGN(128) hashB[64];
phi2_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
phi2_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
phi2_has_roots ? 144 : 80 );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
phi2_has_roots ? 144 : 80 );
LYRA2RE( &hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8 );
LYRA2RE( &hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8 );
@@ -63,17 +63,17 @@ void phi2_hash(void *state, const void *input)
sph_jh512_close( &ctx.jh, (void*)hash );
if ( hash[0] & 1 )
{
sph_gost512( &ctx.gost, (const void*)hash, 64 );
{
sph_gost512( &ctx.gost, (const void*)hash, 64 );
sph_gost512_close( &ctx.gost, (void*)hash );
}
else
{
else
{
#if defined(__AES__)
update_final_echo ( &ctx.echo1, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo2, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo1, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo2, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512( &ctx.echo1, (const void*)hash, 64 );
sph_echo512_close( &ctx.echo1, (void*)hash );

View File

@@ -51,7 +51,7 @@ inline void initState( uint64_t State[/*16*/] )
state[3] = _mm256_set_epi64x( blake2b_IV[7], blake2b_IV[6],
blake2b_IV[5], blake2b_IV[4] );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
@@ -137,7 +137,7 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
//Squeezes remaining bytes
memcpy_256( out, state, ( len_m256i % BLOCK_LEN_M256I ) );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
const int len_m128i = len / 16;
const int fullBlocks = len_m128i / BLOCK_LEN_M128I;
@@ -205,7 +205,7 @@ inline void absorbBlock( uint64_t *State, const uint64_t *In )
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)In;
@@ -273,7 +273,7 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)In;
@@ -355,7 +355,7 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i state0 = _mm_load_si128( state );
@@ -494,7 +494,7 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i state0 = _mm_load_si128( state );
@@ -694,7 +694,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* in = (__m128i*)rowIn;
__m128i* inout = (__m128i*)rowInOut;
@@ -713,9 +713,9 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
__m128i* state = (__m128i*)State;
// For the last round in this function not optimized for AVX
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
// uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
// uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
// uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
for ( i = 0; i < nCols; i++ )
{
@@ -750,6 +750,28 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
out[4] = _mm_xor_si128( state[4], in[4] );
out[5] = _mm_xor_si128( state[5], in[5] );
__m128i t0, t1;
t0 = _mm_srli_si128( state[0], 8 );
t1 = _mm_srli_si128( state[1], 8 );
inout[0] = _mm_xor_si128( inout[0],
_mm_or_si128( _mm_slli_si128( state[0], 8 ),
_mm_srli_si128( state[5], 8 ) ) );
inout[1] = _mm_xor_si128( inout[1],
_mm_or_si128( _mm_slli_si128( state[1], 8 ), t0 ) );
t0 = _mm_srli_si128( state[2], 8 );
inout[2] = _mm_xor_si128( inout[2],
_mm_or_si128( _mm_slli_si128( state[2], 8 ), t1 ) );
t1 = _mm_srli_si128( state[3], 8 );
inout[3] = _mm_xor_si128( inout[3],
_mm_or_si128( _mm_slli_si128( state[3], 8 ), t0 ) );
t0 = _mm_srli_si128( state[4], 8 );
inout[4] = _mm_xor_si128( inout[4],
_mm_or_si128( _mm_slli_si128( state[4], 8 ), t1 ) );
inout[5] = _mm_xor_si128( inout[5],
_mm_or_si128( _mm_slli_si128( state[5], 8 ), t0 ) );
/*
ptrWordInOut[0] ^= State[11];
ptrWordInOut[1] ^= State[0];
ptrWordInOut[2] ^= State[1];
@@ -768,7 +790,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
ptrWordIn += BLOCK_LEN_INT64;
//Output: goes to previous column
ptrWordOut -= BLOCK_LEN_INT64;
*/
inout += BLOCK_LEN_M128I;
in += BLOCK_LEN_M128I;
out -= BLOCK_LEN_M128I;
@@ -930,7 +952,7 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined(__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)rowIn;

View File

@@ -23,7 +23,7 @@
#define SPONGE_H_
#include <stdint.h>
#include "avxdefs.h"
#include "simd-utils.h"
#if defined(__GNUC__)
#define ALIGN __attribute__ ((aligned(32)))
@@ -59,7 +59,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, updates all args
#define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_ror_64( _mm256_xor_si256( d, a), 32 ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
a = _mm256_add_epi64( a, b ); \

View File

@@ -144,8 +144,8 @@ void init_m7m_ctx()
#define NM7M 5
#define SW_DIVS 5
#define M7_MIDSTATE_LEN 76
int scanhash_m7m_hash( int thr_id, struct work* work,
uint64_t max_nonce, unsigned long *hashes_done )
int scanhash_m7m_hash( int thr_id, struct work* work, uint64_t max_nonce,
unsigned long *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -154,6 +154,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
uint32_t hash[8] __attribute__((aligned(64)));
uint8_t bhash[7][64] __attribute__((aligned(64)));
uint32_t n = pdata[19] - 1;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint32_t usw_, mpzscale;
const uint32_t first_nonce = pdata[19];
char data_str[161], hash_str[65], target_str[65];

View File

@@ -445,7 +445,7 @@ void pluck_hash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const i
}
int scanhash_pluck(int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -454,6 +454,8 @@ int scanhash_pluck(int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
volatile uint8_t *restart = &(work_restart[thr_id].restart);
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0ffff;

View File

@@ -48,8 +48,8 @@ void anime_4way_hash( void *state, const void *input )
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
const uint32_t mask = 8;
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
int i;
anime_4way_ctx_holder ctx;
memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );
@@ -62,27 +62,44 @@ void anime_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
if ( hash0[0] & mask )
{
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
}
if ( hash1[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
}
if ( hash2[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
}
if ( hash3[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
}
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
if ( mm256_anybits0( vh_mask ) )
{
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
@@ -91,7 +108,8 @@ void anime_4way_hash( void *state, const void *input )
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
@@ -99,16 +117,20 @@ void anime_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
if ( mm256_anybits1( vh_mask ) )
{
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
}
if ( mm256_anybits0( vh_mask ) )
{
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
}
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
@@ -120,33 +142,35 @@ void anime_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
if ( mm256_anybits1( vh_mask ) )
{
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
}
if ( mm256_anybits0( vh_mask ) )
{
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
}
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
mm256_dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = {
0,
@@ -165,10 +189,7 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
0
};
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
@@ -177,30 +198,26 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
anime_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
&& fulltest( hash+(i<<3), ptarget ) )
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -13,19 +13,15 @@ bool register_anime_algo( algo_gate_t* gate );
#if defined(ANIME_4WAY)
void anime_4way_hash( void *state, const void *input );
int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_anime_4way_ctx();
#endif
void anime_hash( void *state, const void *input );
int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_anime_ctx();
#endif

View File

@@ -46,20 +46,6 @@ void init_anime_ctx()
void anime_hash( void *state, const void *input )
{
unsigned char hash[128] __attribute__ ((aligned (32)));
/*
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
*/
uint32_t mask = 8;
anime_ctx_holder ctx;
memcpy( &ctx, &anime_ctx, sizeof(anime_ctx) );
@@ -134,7 +120,7 @@ void anime_hash( void *state, const void *input )
}
int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
@@ -142,6 +128,7 @@ int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = {
0,

618
algo/quark/hmq1725-4way.c Normal file
View File

@@ -0,0 +1,618 @@
#include "hmq1725-gate.h"
#if defined(HMQ1725_4WAY)
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
union _hmq1725_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
sph_fugue512_context fugue;
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
};
typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;
extern void hmq1725_4way_hash(void *state, const void *input)
{
// why so big? only really need 8, haval thing uses 16.
uint32_t hash0 [32] __attribute__ ((aligned (64)));
uint32_t hash1 [32] __attribute__ ((aligned (64)));
uint32_t hash2 [32] __attribute__ ((aligned (64)));
uint32_t hash3 [32] __attribute__ ((aligned (64)));
uint32_t vhash [32<<2] __attribute__ ((aligned (64)));
uint32_t vhashA[32<<2] __attribute__ ((aligned (64)));
uint32_t vhashB[32<<2] __attribute__ ((aligned (64)));
hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
__m256i vh_mask;
const __m256i vmask = _mm256_set1_epi64x( 24 );
const uint32_t mask = 24;
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, input, 80 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// first fork, A is groestl serial, B is skein parallel.
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
m256_zero );
// A
// if ( hash0[0] & mask )
// {
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
// }
// if ( hash1[0] & mask )
// {
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
// }
// if ( hash2[0] & mask )
// {
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
// }
// if ( hash3[0] & mask )
// {
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
// }
mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
// B
// if ( mm256_any_clr_256( vh_mask ) )
// {
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
// }
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// second fork, A = blake parallel, B= bmw parallel.
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
m256_zero );
// if ( mm256_any_set_256( vh_mask ) )
// {
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
// }
// if ( mm256_any_clr_256( vh_mask ) )
// {
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
// }
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash0,
(const BitSequence *)hash0, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash1,
(const BitSequence *)hash1, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash2,
(const BitSequence *)hash2, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash3,
(const BitSequence *)hash3, 64 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// A= keccak parallel, B= jh parallel
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
m256_zero );
// if ( mm256_any_set_256( vh_mask ) )
// {
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
// }
// if ( mm256_any_clr_256( vh_mask ) )
// {
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
// }
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512 ( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512 ( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512 ( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512 ( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// A is whirlpool serial, B is haval parallel.
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
m256_zero );
// A
// if ( hash0[0] & mask )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
// }
// if ( hash1[0] & mask )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
// }
// if ( hash2[0] & mask )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
// }
// if ( hash3[0] & mask )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// }
mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
// B
// if ( mm256_any_clr_256( vh_mask ) )
// {
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhash, 64 );
haval256_5_4way_close( &ctx.haval, vhashB );
memset( &vhashB[8<<2], 0, 32<<2);
// }
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhash );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// shavite & luffa, both serial, select individually.
if ( hash0[0] & mask )
{
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, 64 ); //
sph_shavite512_close( &ctx.shavite, hash0 ); //8
}
else
{
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence *)hash0,
(const BitSequence *)hash0, 64 );
}
if ( hash1[0] & mask )
{
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, 64 ); //
sph_shavite512_close( &ctx.shavite, hash1 ); //8
}
else
{
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence *)hash1,
(const BitSequence *)hash1, 64 );
}
if ( hash2[0] & mask )
{
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, 64 ); //
sph_shavite512_close( &ctx.shavite, hash2 ); //8
}
else
{
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence *)hash2,
(const BitSequence *)hash2, 64 );
}
if ( hash3[0] & mask )
{
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, 64 ); //
sph_shavite512_close( &ctx.shavite, hash3 ); //8
}
else
{
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence *)hash3,
(const BitSequence *)hash3, 64 );
}
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// A echo, B sd both serial
if ( hash0[0] & mask ) //4
{
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
}
else
{
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
}
if ( hash1[0] & mask ) //4
{
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
}
else
{
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
}
if ( hash2[0] & mask ) //4
{
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
}
else
{
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
}
if ( hash3[0] & mask ) //4
{
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
}
else
{
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
}
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// A = fugue serial, B = sha512 prarallel
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
m256_zero );
// if ( hash0[0] & mask )
// {
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
// }
// if ( hash1[0] & mask )
// {
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
// }
// if ( hash2[0] & mask )
// {
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
// }
// if ( hash3[0] & mask )
// {
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// }
mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
// if ( mm256_any_clr_256( vh_mask ) )
// {
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhashB );
// }
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
// A = haval parallel, B = Whirlpool serial
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
m256_zero );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// if ( mm256_any_set_256( vh_mask ) ) //4
// {
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhash, 64 );
haval256_5_4way_close( &ctx.haval, vhashA );
memset( &vhashA[8<<2], 0, 32<<2 );
// }
// if ( !( hash0[0] & mask ) )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
// }
// if ( !( hash2[0] & mask ) )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
// }
// if ( !( hash2[0] & mask ) )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
// }
// if ( !( hash3[0] & mask ) )
// {
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// }
mm256_intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, 512 );
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
memcpy(state, vhash, 32<<2 );
}
int scanhash_hmq1725_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
// uint32_t *hash7 = &(hash[7<<2]);
// uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[ m ];
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
hmq1725_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( ( (hash+(i<<3))[7] & mask ) == 0 )
{
if ( fulltest( (hash+(i<<3)), ptarget ) && !opt_benchmark )
{
pdata[19] = n + i;
submit_solution( work, (hash+(i<<3)), mythr, i );
}
}
n += 4;
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif // HMQ1725_4WAY

17
algo/quark/hmq1725-gate.c Normal file
View File

@@ -0,0 +1,17 @@
#include "hmq1725-gate.h"
bool register_hmq1725_algo( algo_gate_t* gate )
{
#if defined(HMQ1725_4WAY)
gate->scanhash = (void*)&scanhash_hmq1725_4way;
gate->hash = (void*)&hmq1725_4way_hash;
#else
init_hmq1725_ctx();
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
#endif
gate->set_target = (void*)&scrypt_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
return true;
};

28
algo/quark/hmq1725-gate.h Normal file
View File

@@ -0,0 +1,28 @@
#ifndef HMQ1725_GATE_H__
#define HMQ1725_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
// #define HMQ1725_4WAY
#endif
bool register_hmq1725_algo( algo_gate_t* gate );
#if defined(HMQ1725_4WAY)
void hmq1725_4way_hash( void *state, const void *input );
int scanhash_hmq1725_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void hmq1725hash( void *state, const void *input );
int scanhash_hmq1725( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_hmq1725_ctx();
#endif
#endif // HMQ1725_GATE_H__

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "hmq1725-gate.h"
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake.h"
@@ -298,19 +298,22 @@ extern void hmq1725hash(void *state, const void *input)
memcpy(state, hashA, 32);
}
int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
uint64_t *hashes_done )
int scanhash_hmq1725( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[32] __attribute__((aligned(64)));
// uint32_t endiandata[32] __attribute__((aligned(64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
//const uint32_t Htarg = ptarget[7];
//we need bigendian data...
for (int k = 0; k < 32; k++)
// for (int k = 0; k < 32; k++)
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
hmq_bmw512_midstate( endiandata );
@@ -406,14 +409,14 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
pdata[19] = n;
return 0;
}
/*
bool register_hmq1725_algo( algo_gate_t* gate )
{
init_hmq1725_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->set_target = (void*)&scrypt_set_target;
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
return true;
};
*/

View File

@@ -48,9 +48,10 @@ void quark_4way_hash( void *state, const void *input )
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
int i;
quark_4way_ctx_holder ctx;
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
const uint32_t mask = 8;
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
@@ -62,27 +63,44 @@ void quark_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
if ( hash0[0] & mask )
{
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
}
if ( hash1[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
}
if ( hash2[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
}
if ( hash3[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
}
mm256_intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
if ( mm256_anybits0( vh_mask ) )
{
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
}
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
@@ -91,7 +109,8 @@ void quark_4way_hash( void *state, const void *input )
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
@@ -99,16 +118,21 @@ void quark_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
if ( mm256_anybits1( vh_mask ) )
{
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
}
if ( mm256_anybits0( vh_mask ) )
{
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
}
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
@@ -120,63 +144,65 @@ void quark_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
if ( mm256_anybits1( vh_mask ) )
{
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
}
if ( mm256_anybits0( vh_mask ) )
{
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
}
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
// Final blend, directly to state, only need 32 bytes.
casti_m256i( state, 0 ) = _mm256_blendv_epi8( vhA[0], vhB[0], vh_mask );
casti_m256i( state, 1 ) = _mm256_blendv_epi8( vhA[1], vhB[1], vh_mask );
casti_m256i( state, 2 ) = _mm256_blendv_epi8( vhA[2], vhB[2], vh_mask );
casti_m256i( state, 3 ) = _mm256_blendv_epi8( vhA[3], vhB[3], vh_mask );
}
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
__m256i *noncev = (__m256i*)vdata + 9; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
mm256_bswap_intrlv80_4x64( vdata, pdata );
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
quark_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( ( ( (hash+(i<<3))[7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash+(i<<3), ptarget ) )
if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
mm256_extract_lane_4x64( lane_hash, hash, i, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_solution( work, lane_hash, mythr, i );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -13,19 +13,15 @@ bool register_quark_algo( algo_gate_t* gate );
#if defined(QUARK_4WAY)
void quark_4way_hash( void *state, const void *input );
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_quark_4way_ctx();
#endif
void quark_hash( void *state, const void *input );
int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_quark_ctx();
#endif

View File

@@ -173,16 +173,17 @@ void quark_hash(void *state, const void *input)
}
int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
swab32_array( endiandata, pdata, 20 );
swab32_array( endiandata, pdata, 20 );
do {
pdata[19] = ++n;

View File

@@ -64,7 +64,7 @@ void deep_2way_hash( void *output, const void *input )
}
int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -73,17 +73,17 @@ int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 32+3; // 4*8 + 3
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
@@ -102,23 +102,24 @@ int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
deep_2way_hash( hash, vdata );
pdata[19] = n;
if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
if ( !( hash[7] & mask ) )
if ( fulltest( hash, ptarget) && !opt_benchmark )
{
nonces[ num_found++ ] = n;
work_set_target_ratio( work, hash );
pdata[19] = n;
submit_solution( work, hash, mythr, 0 );
}
if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
if ( !( (hash+8)[7] & mask ) )
if ( fulltest( hash+8, ptarget) && !opt_benchmark )
{
nonces[ num_found++ ] = n+1;
work_set_target_ratio( work, hash+8 );
pdata[19] = n+1;
submit_solution( work, hash+8, mythr, 1 );
}
n += 2;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -13,19 +13,15 @@ bool register_deep_algo( algo_gate_t* gate );
#if defined(DEEP_2WAY)
void deep_2way_hash( void *state, const void *input );
int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_deep_2way_ctx();
#endif
void deep_hash( void *state, const void *input );
int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_deep_ctx();
#endif

View File

@@ -72,14 +72,15 @@ void deep_hash(void *output, const void *input)
}
int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };

View File

@@ -70,7 +70,7 @@ void qubit_2way_hash( void *output, const void *input )
}
int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -79,17 +79,17 @@ int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 32+3; // 4*8 + 3
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
@@ -107,25 +107,24 @@ int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
qubit_2way_hash( hash, vdata );
pdata[19] = n;
if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
if ( !( hash[7] & mask ) )
if ( fulltest( hash, ptarget) && !opt_benchmark )
{
nonces[ num_found++ ] = n;
work_set_target_ratio( work, hash );
pdata[19] = n;
submit_solution( work, hash, mythr, 0 );
}
if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
if ( !( (hash+8)[7] & mask ) )
if ( fulltest( hash+8, ptarget) && !opt_benchmark )
{
pdata[19] = n+1;
nonces[ num_found++ ] = n+1;
work_set_target_ratio( work, hash+8 );
submit_solution( work, hash+8, mythr, 1 );
}
n += 2;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -13,19 +13,15 @@ bool register_qubit_algo( algo_gate_t* gate );
#if defined(QUBIT_2WAY)
void qubit_2way_hash( void *state, const void *input );
int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_qubit_2way_ctx();
#endif
void qubit_hash( void *state, const void *input );
int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_qubit_ctx();
#endif

View File

@@ -83,15 +83,16 @@ void qubit_hash(void *output, const void *input)
memcpy(output, hash, 32);
}
int scanhash_qubit(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };

View File

@@ -40,9 +40,9 @@ void lbry_8way_hash( void* output, const void* input )
sha256_8way_close( &ctx_sha256, vhashA );
// reinterleave to do sha512 4-way 64 bit twice.
mm256_deinterleave_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
mm256_interleave_4x64( vhashA, h0, h1, h2, h3, 256 );
mm256_interleave_4x64( vhashB, h4, h5, h6, h7, 256 );
mm256_dintrlv_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
mm256_intrlv_4x64( vhashA, h0, h1, h2, h3, 256 );
mm256_intrlv_4x64( vhashB, h4, h5, h6, h7, 256 );
sha512_4way_init( &ctx_sha512 );
sha512_4way( &ctx_sha512, vhashA, 32 );
@@ -53,9 +53,9 @@ void lbry_8way_hash( void* output, const void* input )
sha512_4way_close( &ctx_sha512, vhashB );
// back to 8-way 32 bit
mm256_deinterleave_4x64( h0, h1, h2, h3, vhashA, 512 );
mm256_deinterleave_4x64( h4, h5, h6, h7, vhashB, 512 );
mm256_interleave_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
mm256_dintrlv_4x64( h0, h1, h2, h3, vhashA, 512 );
mm256_dintrlv_4x64( h4, h5, h6, h7, vhashB, 512 );
mm256_intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
ripemd160_8way_init( &ctx_ripemd );
ripemd160_8way( &ctx_ripemd, vhashA, 32 );
@@ -72,27 +72,24 @@ void lbry_8way_hash( void* output, const void* input )
sha256_8way_init( &ctx_sha256 );
sha256_8way( &ctx_sha256, vhashA, 32 );
sha256_8way_close( &ctx_sha256, vhashA );
mm256_deinterleave_8x32( output, output+ 32, output+ 64, output+ 96,
output+128, output+160, output+192, output+224,
vhashA, 256 );
sha256_8way_close( &ctx_sha256, output );
}
int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[32*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[27];
const uint32_t first_nonce = pdata[27];
const uint32_t Htarg = ptarget[7];
uint32_t edata[32] __attribute__ ((aligned (64)));
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 216; // 27*8
__m256i *noncev = (__m256i*)vdata + 27; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
@@ -100,9 +97,12 @@ int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
0xFFFFF000, 0xFFFF0000, 0 };
// we need bigendian data...
swab32_array( edata, pdata, 32 );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );
casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m256i( edata, 2 ) = mm256_bswap_32( casti_m256i( pdata, 2 ) );
casti_m256i( edata, 3 ) = mm256_bswap_32( casti_m256i( pdata, 3 ) );
mm256_intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );
sha256_8way_init( &sha256_8w_mid );
sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
@@ -111,136 +111,26 @@ int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
be32enc( noncep+4, n+4 );
be32enc( noncep+5, n+5 );
be32enc( noncep+6, n+6 );
be32enc( noncep+7, n+7 );
*noncev = mm256_bswap_32( _mm256_set_epi32(
n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
lbry_8way_hash( hash, vdata );
for ( int i = 0; i < 8; i++ )
if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) )
for ( int i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
{
pdata[27] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
// deinterleave hash for lane
mm256_extract_lane_8x32( lane_hash, hash, i, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[27] = n + i;
submit_solution( work, lane_hash, mythr, i );
}
}
n+=8;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
n += 8;
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce;
return num_found;
}
#elif defined(LBRY_4WAY)
static __thread sha256_4way_context sha256_mid;
void lbry_4way_hash( void* output, const void* input )
{
sha256_4way_context ctx_sha256 __attribute__ ((aligned (64)));
sha512_4way_context ctx_sha512;
ripemd160_4way_context ctx_ripemd;
uint32_t _ALIGN(64) vhashA[16<<2];
uint32_t _ALIGN(64) vhashB[16<<2];
uint32_t _ALIGN(64) vhashC[16<<2];
memcpy( &ctx_sha256, &sha256_mid, sizeof(ctx_sha256) );
sha256_4way( &ctx_sha256, input + (LBRY_MIDSTATE<<2), LBRY_TAIL );
sha256_4way_close( &ctx_sha256, vhashA );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhashA, 32 );
sha256_4way_close( &ctx_sha256, vhashA );
// sha512 64 bit data, 64 byte output
mm256_reinterleave_4x64( vhashB, vhashA, 256 );
sha512_4way_init( &ctx_sha512 );
sha512_4way( &ctx_sha512, vhashB, 32 );
sha512_4way_close( &ctx_sha512, vhashB );
mm256_reinterleave_4x32( vhashA, vhashB, 512 );
ripemd160_4way_init( &ctx_ripemd );
ripemd160_4way( &ctx_ripemd, vhashA, 32 );
ripemd160_4way_close( &ctx_ripemd, vhashB );
ripemd160_4way_init( &ctx_ripemd );
ripemd160_4way( &ctx_ripemd, vhashA+(8<<2), 32 );
ripemd160_4way_close( &ctx_ripemd, vhashC );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhashB, 20 );
sha256_4way( &ctx_sha256, vhashC, 20 );
sha256_4way_close( &ctx_sha256, vhashA );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhashA, 32 );
sha256_4way_close( &ctx_sha256, vhashA );
mm128_deinterleave_4x32( output, output+32, output+64, output+96,
vhashA, 256 );
}
int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[32*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[27];
const uint32_t first_nonce = pdata[27];
const uint32_t Htarg = ptarget[7];
uint32_t edata[32] __attribute__ ((aligned (64)));
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 108; // 27*4
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// we need bigendian data...
swab32_array( edata, pdata, 32 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 1024 );
sha256_4way_init( &sha256_mid );
sha256_4way( &sha256_mid, vdata, LBRY_MIDSTATE );
for ( int m = 0; m < sizeof(masks); m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
lbry_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) )
{
pdata[27] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n+=4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce;
return num_found;
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

View File

@@ -4,12 +4,10 @@
#include "algo-gate-api.h"
#include <stdint.h>
// Overide multi way on ryzen, SHA is better.
#if !defined(RYZEN_)
// need sha512 2 way AVX x2 or 1 way scalar x4 to support 4way AVX.
#if defined(__AVX2__)
#if !defined(__SHA__)
#if defined(__AVX2__)
#define LBRY_8WAY
#endif
#endif
#endif
#define LBRY_NTIME_INDEX 25
@@ -24,17 +22,18 @@ bool register_lbry_algo( algo_gate_t* gate );
void lbry_8way_hash( void *state, const void *input );
int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
/*
#elif defined(LBRY_4WAY)
void lbry_4way_hash( void *state, const void *input );
int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
*/
#else
void lbry_hash( void *state, const void *input );
int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif

View File

@@ -48,13 +48,14 @@ void lbry_hash(void* output, const void* input)
}
int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[27] - 1;
const uint32_t first_nonce = pdata[27];
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t endiandata[32] __attribute__ ((aligned (64)));

View File

@@ -6,7 +6,7 @@
#if defined(__SSE4_2__)
#include "avxdefs.h"
#include "simd-utils.h"
typedef struct
{

View File

@@ -696,7 +696,7 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input,
#endif /* HAVE_SCRYPT_6WAY */
extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -704,6 +704,7 @@ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t midstate[8];
uint32_t n = pdata[19] - 1;
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
int throughput = scrypt_best_throughput();
int i;

View File

@@ -42,7 +42,7 @@
#include <stddef.h>
#include "sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#if defined(__SSE2__)
//#if defined(__SSE4_2__)

View File

@@ -1,3 +1,4 @@
#if 0
#include <stddef.h>
#include <string.h>
@@ -65,7 +66,7 @@ static const uint32_t K256[64] =
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,2), mm64_ror_32(x,13) ), _mm_srli_pi32(x,22) )
#define BSG2_0z(x) ( ror_32(x,2) ^ ror_32(x,13) ^ ((x)>>22) )
#define BSG2_0z(x) ( u32_ror_32(x,2) ^ u32_ror_32(x,13) ^ ((x)>>22) )
#define BSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
@@ -75,7 +76,7 @@ static const uint32_t K256[64] =
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,6), mm64_ror_32(x,11) ), _mm_srli_pi32(x,25) )
#define BSG2_1z(x) ( ror_32(x,6) ^ ror_32(x,11) ^ ((x)>>25) )
#define BSG2_1z(x) ( u32_ror_32(x,6) ^ u32_ror_32(x,11) ^ ((x)>>25) )
#define SSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
@@ -85,7 +86,7 @@ static const uint32_t K256[64] =
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm_srli_pi32(x,3) )
#define SSG2_0z(x) (( ror_32(x,7) ^ ror_32(x,18) ) ^ ((x)>>3) )
#define SSG2_0z(x) (( u32_ror_32(x,7) ^ u32_ror_32(x,18) ) ^ ((x)>>3) )
#define SSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
@@ -95,7 +96,7 @@ static const uint32_t K256[64] =
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,17), mm64_ror_32(x,19) ), _mm_srli_pi32(x,10) )
#define SSG2_1z(x) ( ror_32(x,17) ^ ror_32(x,19) ^ ((x)>>10) )
#define SSG2_1z(x) ( u32_ror_32(x,17) ^ u32_ror_32(x,19) ^ ((x)>>10) )
#define SHA2x_MEXP( a, b, c, d ) \
_mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
@@ -449,7 +450,7 @@ void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
if ( clen > len )
clen = len;
memcpy_256( ctx->bufx + (ptr>>2), vdatax + (ptr>>2), clen>>2 );
memcpy_64 ( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 );
memcpy_m64( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 );
memcpy ( ctx->bufz + ptr, idataz + ptr, clen );
ptr += clen;
len -= clen;
@@ -486,19 +487,19 @@ void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
if ( ptr > pad )
{
memset_zero_256( ctx->bufx + (ptr>>2), (buf_size - ptr) >> 2 );
memset_zero_64( ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 );
memset_zero_m64( ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 );
memset( ctx->bufz + (ptr>>2), 0, (buf_size - ptr) >> 2 );
sha256_11way_round( ctx->bufx, ctx->valx,
ctx->bufy, ctx->valy,
ctx->bufz, ctx->valz );
memset_zero_256( ctx->bufx, pad >> 2 );
memset_zero_64( ctx->bufy, pad >> 2 );
memset_zero_m64( ctx->bufy, pad >> 2 );
memset( ctx->bufz, 0, pad >> 2 );
}
else
{
memset_zero_256( ctx->bufx + (ptr>>2), (pad - ptr) >> 2 );
memset_zero_64( ctx->bufy + (ptr>>2), (pad - ptr) >> 2 );
memset_zero_m64( ctx->bufy + (ptr>>2), (pad - ptr) >> 2 );
memset( ctx->bufz + (ptr>>2), 0, (pad - ptr) >> 2 );
}
@@ -534,3 +535,4 @@ void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
}
#endif
#endif // 0

View File

@@ -36,7 +36,6 @@ int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t edata[20] __attribute__ ((aligned (32)));;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -59,12 +58,7 @@ int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
0 };
// Need big endian data
casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_8x32( vdata, pdata );
sha256_8way_init( &sha256_ctx8 );
sha256_8way( &sha256_ctx8, vdata, 64 );
@@ -73,11 +67,10 @@ int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
pdata[19] = n;
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
pdata[19] = n;
sha256q_8way_hash( hash, vdata );
uint32_t *hash7 = &(hash[7<<3]);
@@ -86,27 +79,19 @@ int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
if ( !( hash7[ lane ] & mask ) )
{
// deinterleave hash for lane
uint32_t lane_hash[8];
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
uint32_t lane_hash[8];
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}
@@ -146,7 +131,6 @@ int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t edata[20] __attribute__ ((aligned (32)));;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -168,13 +152,7 @@ int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
0xFFFF0000,
0 };
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way( &sha256_ctx4, vdata, 64 );
@@ -183,7 +161,7 @@ int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
pdata[19] = n;
pdata[19] = n;
sha256q_4way_hash( hash, vdata );
@@ -192,25 +170,16 @@ int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
n += 4;
} while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}

View File

@@ -72,7 +72,7 @@ int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
casti_m256i( dataz, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( dataz, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm256_interleave_8x32( datax, dataz, dataz, dataz, dataz,
mm256_intrlv_8x32( datax, dataz, dataz, dataz, dataz,
dataz, dataz, dataz, dataz, 640 );
mm64_interleave_2x32( datay, dataz, dataz, 640 );
@@ -156,15 +156,15 @@ void sha256t_8way_hash( void* output, const void* input )
sha256_8way_init( &ctx );
sha256_8way( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
}
int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t edata[20] __attribute__ ((aligned (32)));;
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -187,12 +187,7 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
0 };
// Need big endian data
casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_8x32( vdata, pdata );
sha256_8way_init( &sha256_ctx8 );
sha256_8way( &sha256_ctx8, vdata, 64 );
@@ -201,29 +196,22 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
*noncev = mm256_bswap_32( _mm256_set_epi32(
n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
pdata[19] = n;
sha256t_8way_hash( hash, vdata );
uint32_t *hash7 = &(hash[7<<3]);
for ( int lane = 0; lane < 8; lane++ )
if ( !( hash7[ lane ] & mask ) )
{
// deinterleave hash for lane
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
}
n += 8;
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
break;
}
@@ -253,7 +241,6 @@ void sha256t_4way_hash( void* output, const void* input )
sha256_4way_init( &ctx );
sha256_4way( &ctx, vhash, 32 );
sha256_4way_close( &ctx, output );
}
int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -262,7 +249,6 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (32)));;
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -278,20 +264,14 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
0xFFF,
0xFFFF,
0x10000000 };
const uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
const uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way( &sha256_ctx4, vdata, 64 );
@@ -300,7 +280,7 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
pdata[19] = n;
pdata[19] = n;
sha256t_4way_hash( hash, vdata );
@@ -308,15 +288,13 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( !( hash7[ lane ] & mask ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
}
}
n += 4;
} while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
break;
}

View File

@@ -2,11 +2,7 @@
bool register_sha256t_algo( algo_gate_t* gate )
{
#if defined(SHA256T_11WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t_11way;
gate->hash = (void*)&sha256t_11way_hash;
#elif defined(SHA256T_8WAY)
#if defined(SHA256T_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t_8way;
gate->hash = (void*)&sha256t_8way_hash;
@@ -25,11 +21,7 @@ gate->optimizations = SHA_OPT;
bool register_sha256q_algo( algo_gate_t* gate )
{
#if defined(SHA256T_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q_8way;
gate->hash = (void*)&sha256q_8way_hash;
#elif defined(SHA256T_4WAY)
#if defined(SHA256T_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q_4way;
gate->hash = (void*)&sha256q_4way_hash;

View File

@@ -5,30 +5,17 @@
#include "algo-gate-api.h"
// Override multi way on ryzen, SHA is better.
#if !defined(RYZEN_)
#if defined(__SSE2__)
#define SHA256T_4WAY
#endif
#if defined(__AVX2__)
#if !defined(__SHA__)
#if defined(__AVX2__)
#define SHA256T_8WAY
// #define SHA256T_11WAY
#endif
#elif defined(__SSE2__)
#define SHA256T_4WAY
#endif
#endif
bool register_sha256t_algo( algo_gate_t* gate );
bool register_sha256q_algo( algo_gate_t* gate );
#if defined(SHA256T_11WAY)
void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
const void *inpy, const void *inpz );
int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
//void sha256q_8way_hash( void *output, const void *input );
//int scanhash_sha256q_11way( int thr_id, struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256T_8WAY)
void sha256t_8way_hash( void *output, const void *input );

View File

@@ -40,7 +40,7 @@
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -3,7 +3,7 @@
#if defined(__AVX2__)
#include "avxdefs.h"
#include "simd-utils.h"
typedef struct {
unsigned char buf[128<<1];

View File

@@ -36,7 +36,7 @@
#ifdef __AES__
#include "sph_shavite.h"
#include "avxdefs.h"
#include "simd-utils.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -6,6 +6,12 @@
#if defined (__AVX2__)
union _m256_v16 {
uint16_t u16[16];
__m256i v256;
};
typedef union _m256_v16 m256_v16;
// imported from simd_iv.h
uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc,

View File

@@ -5,7 +5,7 @@
#if defined(__AVX2__)
#include "avxdefs.h"
#include "simd-utils.h"
typedef struct {
uint32_t A[ 32*2 ] __attribute__((aligned(64)));

View File

@@ -2,7 +2,11 @@
#include <string.h>
#include <stdint.h>
#include "skein-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
#if defined(__SHA__)
#include <openssl/sha.h>
#else
#include "algo/sha/sha2-hash-4way.h"
#endif
#if defined (SKEIN_4WAY)
@@ -11,53 +15,69 @@ void skeinhash_4way( void *state, const void *input )
uint64_t vhash64[8*4] __attribute__ ((aligned (64)));
uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
skein512_4way_context ctx_skein;
#if defined(__SHA__)
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
SHA256_CTX ctx_sha256;
#else
sha256_4way_context ctx_sha256;
#endif
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, input, 80 );
skein512_4way_close( &ctx_skein, vhash64 );
mm256_reinterleave_4x32( vhash32, vhash64, 512 );
#if defined(__SHA__)
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
mm128_intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
#else
mm256_rintrlv_4x64_4x32( vhash32, vhash64, 512 );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhash32, 64 );
sha256_4way_close( &ctx_sha256, state );
mm128_deinterleave_4x32( state, state+32, state+64, state+96,
vhash32, 256 );
#endif
}
int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8];
uint32_t *hash7 = &(hash[7<<2]);
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
// hash is returned deinterleaved
uint32_t *nonces = work->nonces;
int num_found = 0;
// data is 80 bytes, 20 u32 or 4 u64.
swab32_array( edata, pdata, 20 );
mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
mm256_bswap_intrlv80_4x64( vdata, pdata );
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
skeinhash_4way( hash, vdata );
@@ -68,16 +88,14 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -3,24 +3,21 @@
#include <stdint.h>
#include "algo-gate-api.h"
// Override multi way on ryzen, SHA is better.
#if !defined(RYZEN_)
#if defined(__AVX2__)
#define SKEIN_4WAY
#endif
#endif
#if defined(SKEIN_4WAY)
void skeinhash_4way( void *output, const void *input );
int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void skeinhash( void *output, const void *input );
int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -49,7 +49,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
// Output size in bits
#define SPH_SIZE_skein256 256

View File

@@ -21,8 +21,8 @@ void skeinhash(void *state, const void *input)
memcpy(state, hash, 32);
}
int scanhash_skein(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -31,7 +31,8 @@ int scanhash_skein(int thr_id, struct work *work,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
swab32_array( endiandata, pdata, 20 );
do {

View File

@@ -20,55 +20,43 @@ void skein2hash_4way( void *output, const void *input )
}
int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint64_t *edata = (uint64_t*)endiandata;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
swab32_array( endiandata, pdata, 20 );
mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
mm256_bswap_intrlv80_4x64( vdata, pdata );
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
skein2hash( hash, vdata );
skein2hash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane ] <= Htarg )
if ( hash7[ lane<<1 ] <= Htarg )
{
// deinterleave hash for lane
uint32_t lane_hash[8];
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -10,10 +10,9 @@ int64_t skein2_get_max64 ()
bool register_skein2_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT;
#if defined (FOUR_WAY) && defined (__AVX2__)
#if defined (SKEIN2_4WAY)
gate->scanhash = (void*)&scanhash_skein2_4way;
gate->hash = (void*)&skein2hash_4way;
four_way_not_tested();
#else
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;

View File

@@ -10,11 +10,11 @@
#if defined(SKEIN2_4WAY)
void skein2hash_4way( void *output, const void *input );
int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t* hashes_done );
uint64_t* hashes_done, struct thr_info *mythr );
#endif
void skein2hash( void *output, const void *input );
int scanhash_skein2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -34,8 +34,8 @@ void skein2hash(void *output, const void *input)
}
int scanhash_skein2(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_skein2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -44,6 +44,7 @@ int scanhash_skein2(int thr_id, struct work *work,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
swab32_array( endiandata, pdata, 20 );

View File

@@ -59,7 +59,7 @@
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "avxdefs.h"
#include "simd-utils.h"
#ifdef __cplusplus
extern "C" {

View File

@@ -52,7 +52,7 @@
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
/**
* Output size (in bits) for WHIRLPOOL.

View File

@@ -44,12 +44,13 @@ void axiomhash(void *output, const void *input)
}
int scanhash_axiom(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) hash64[8];
uint32_t _ALIGN(64) endiandata[20];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];

View File

@@ -13,26 +13,16 @@
#include "algo/gost/sph_gost.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
union _poly_4way_context_overlay
{
skein512_4way_context skein;
shabal512_4way_context shabal;
hashState_echo echo;
luffa_2way_context luffa;
sph_fugue512_context fugue;
sph_gost512_context gost;
} poly_4way_ctx_holder;
poly_4way_ctx_holder poly_4way_ctx;
void init_polytimos_4way_ctx()
{
skein512_4way_init( &poly_4way_ctx.skein );
shabal512_4way_init( &poly_4way_ctx.shabal );
init_echo( &poly_4way_ctx.echo, 512 );
luffa_2way_init( &poly_4way_ctx.luffa, 512 );
sph_fugue512_init( &poly_4way_ctx.fugue );
sph_gost512_init( &poly_4way_ctx.gost );
}
};
typedef union _poly_4way_context_overlay poly_4way_context_overlay;
void polytimos_4way_hash( void *output, const void *input )
{
@@ -41,51 +31,57 @@ void polytimos_4way_hash( void *output, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
poly_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &poly_4way_ctx, sizeof(poly_4way_ctx) );
poly_4way_context_overlay ctx;
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
// Need to convert from 64 bit interleaved to 32 bit interleaved.
uint32_t vhash32[16*4];
mm256_reinterleave_4x32( vhash32, vhash, 512 );
mm256_rintrlv_4x64_4x32( vhash32, vhash, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash32, 64 );
shabal512_4way_close( &ctx.shabal, vhash32 );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
sph_gost512_init( &ctx.gost );
@@ -104,51 +100,43 @@ void polytimos_4way_hash( void *output, const void *input )
memcpy( output+96, hash3, 32 );
}
int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done )
int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
do {
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
polytimos_4way_hash(hash, vdata);
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg )
if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart));
} while ( ( n < max_nonce-4 ) && !(*restart));
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -4,7 +4,6 @@ bool register_polytimos_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
#ifdef POLYTIMOS_4WAY
init_polytimos_4way_ctx();
gate->scanhash = (void*)&scanhash_polytimos_4way;
gate->hash = (void*)&polytimos_4way_hash;
#else

View File

@@ -13,19 +13,14 @@ bool register_polytimos_algo( algo_gate_t* gate );
#if defined(POLYTIMOS_4WAY)
void polytimos_4way_hash( void *state, const void *input );
int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_polytimos_4way_ctx();
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void polytimos_hash( void *state, const void *input );
int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_polytimos_ctx();
#endif

View File

@@ -76,13 +76,14 @@ void polytimos_hash(void *output, const void *input)
memcpy(output, hashA, 32);
}
int scanhash_polytimos(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;

View File

@@ -40,7 +40,7 @@ void veltor_4way_hash( void *output, const void *input )
skein512_4way( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
@@ -54,10 +54,10 @@ void veltor_4way_hash( void *output, const void *input )
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
@@ -78,7 +78,7 @@ void veltor_4way_hash( void *output, const void *input )
}
int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -91,6 +91,7 @@ int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
@@ -101,7 +102,7 @@ int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
}
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
be32enc( noncep, n );

View File

@@ -15,7 +15,7 @@ bool register_veltor_algo( algo_gate_t* gate );
void veltor_4way_hash( void *state, const void *input );
int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_veltor_4way_ctx();
@@ -24,7 +24,7 @@ void init_veltor_4way_ctx();
void veltor_hash( void *state, const void *input );
int scanhash_veltor( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_veltor_ctx();

View File

@@ -61,12 +61,14 @@ void veltor_hash(void *output, const void *input)
memcpy(output, hashB, 32);
}
int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_veltor( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];

View File

@@ -78,7 +78,7 @@ void x14_4way_hash( void *state, const void *input )
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -90,7 +90,7 @@ void x14_4way_hash( void *state, const void *input )
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
@@ -105,16 +105,16 @@ void x14_4way_hash( void *state, const void *input )
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -142,13 +142,13 @@ void x14_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -164,10 +164,10 @@ void x14_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -183,14 +183,14 @@ void x14_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 32 bit
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, state );
}
int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -203,6 +203,7 @@ int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
@@ -212,7 +213,7 @@ int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )

View File

@@ -13,19 +13,15 @@ bool register_x14_algo( algo_gate_t* gate );
#if defined(X14_4WAY)
void x14_4way_hash( void *state, const void *input );
int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_x14_4way_ctx();
#endif
void x14hash( void *state, const void *input );
int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_x14_ctx();
#endif

View File

@@ -180,8 +180,8 @@ void x14hash(void *output, const void *input)
memcpy(output, hash, 32);
}
int scanhash_x14(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
@@ -190,6 +190,7 @@ int scanhash_x14(int thr_id, struct work *work,
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = {
0,

View File

@@ -81,7 +81,7 @@ void x15_4way_hash( void *state, const void *input )
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -93,7 +93,7 @@ void x15_4way_hash( void *state, const void *input )
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
@@ -108,16 +108,16 @@ void x15_4way_hash( void *state, const void *input )
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -145,13 +145,13 @@ void x15_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -167,10 +167,10 @@ void x15_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -186,10 +186,10 @@ void x15_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 32 bit
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// 15 Whirlpool
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -214,7 +214,7 @@ void x15_4way_hash( void *state, const void *input )
}
int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -227,6 +227,7 @@ int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
@@ -236,7 +237,7 @@ int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )

View File

@@ -13,19 +13,15 @@ bool register_x15_algo( algo_gate_t* gate );
#if defined(X15_4WAY)
void x15_4way_hash( void *state, const void *input );
int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_x15_4way_ctx();
#endif
void x15hash( void *state, const void *input );
int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_x15_ctx();
#endif

View File

@@ -186,8 +186,8 @@ void x15hash(void *output, const void *input)
memcpy(output, hashB, 32);
}
int scanhash_x15(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
@@ -196,6 +196,7 @@ int scanhash_x15(int thr_id, struct work *work,
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = {
0,

View File

@@ -32,8 +32,8 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
typedef struct {
union _x16r_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_echo echo;
@@ -50,16 +50,8 @@ typedef struct {
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
} x16r_4way_ctx_holder;
x16r_4way_ctx_holder x16r_4way_ctx __attribute__ ((aligned (64)));
// Cube needs one full init so fast reinits can be done in the hash loop.
void init_x16r_4way_ctx()
{
cubehashInit( &x16r_4way_ctx.cube, 512, 16, 32 );
};
typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
void x16r_4way_hash( void* output, const void* input )
{
@@ -68,14 +60,14 @@ void x16r_4way_hash( void* output, const void* input )
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t vhash[24*4] __attribute__ ((aligned (64)));
x16r_4way_ctx_holder ctx;
x16r_4way_context_overlay ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
int size = 80;
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, input, 640 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
if ( s_ntime == UINT32_MAX )
{
@@ -104,11 +96,11 @@ void x16r_4way_hash( void* output, const void* input )
blake512_4way( &ctx.blake, input, size );
else
{
mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
blake512_4way( &ctx.blake, vhash, size );
}
blake512_4way_close( &ctx.blake, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case BMW:
bmw512_4way_init( &ctx.bmw );
@@ -116,11 +108,11 @@ void x16r_4way_hash( void* output, const void* input )
bmw512_4way( &ctx.bmw, input, size );
else
{
mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
bmw512_4way( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
@@ -142,11 +134,11 @@ void x16r_4way_hash( void* output, const void* input )
skein512_4way( &ctx.skein, input, size );
else
{
mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way( &ctx.skein, vhash, size );
}
skein512_4way_close( &ctx.skein, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case JH:
jh512_4way_init( &ctx.jh );
@@ -154,11 +146,11 @@ void x16r_4way_hash( void* output, const void* input )
jh512_4way( &ctx.jh, input, size );
else
{
mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
jh512_4way( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case KECCAK:
keccak512_4way_init( &ctx.keccak );
@@ -166,21 +158,21 @@ void x16r_4way_hash( void* output, const void* input )
keccak512_4way( &ctx.keccak, input, size );
else
{
mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
keccak512_4way( &ctx.keccak, vhash, size );
}
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case LUFFA:
mm256_interleave_2x128( vhash, in0, in1, size<<3 );
mm256_intrlv_2x128( vhash, in0, in1, size<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, in2, in3, size<<3 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, in2, in3, size<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
break;
case CUBEHASH:
cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -194,7 +186,7 @@ void x16r_4way_hash( void* output, const void* input )
(const byte*)in2, size );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
(const byte*)in3, size );
(const byte*)in3, size );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
@@ -211,14 +203,14 @@ void x16r_4way_hash( void* output, const void* input )
sph_shavite512_close( &ctx.shavite, hash3 );
break;
case SIMD:
mm256_interleave_2x128( vhash, in0, in1, size<<3 );
mm256_intrlv_2x128( vhash, in0, in1, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, in2, in3, size<<3 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, in2, in3, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
@@ -235,11 +227,11 @@ void x16r_4way_hash( void* output, const void* input )
(const BitSequence*)in3, size<<3 );
break;
case HAMSI:
mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, size );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
@@ -256,11 +248,11 @@ void x16r_4way_hash( void* output, const void* input )
sph_fugue512_close( &ctx.fugue, hash3 );
break;
case SHABAL:
mm128_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 );
mm128_intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
@@ -277,11 +269,11 @@ void x16r_4way_hash( void* output, const void* input )
sph_whirlpool_close( &ctx.whirlpool, hash3 );
break;
case SHA_512:
mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
mm256_intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, size );
sha512_4way_close( &ctx.sha512, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
}
size = 64;
@@ -293,7 +285,7 @@ void x16r_4way_hash( void* output, const void* input )
}
int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -303,15 +295,14 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
__m256i *noncev = (__m256i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);
for ( int k=0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
// if ( s_ntime != pdata[17] )
if ( s_ntime != endiandata[17] )
{
uint32_t ntime = swab32(pdata[17]);
@@ -325,30 +316,27 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
ptarget[7] = 0x0cff;
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x16r_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg )
if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
} while ( ( n < max_nonce ) && !(*restart) );
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -35,11 +35,9 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )
bool register_x16r_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
init_x16r_4way_ctx();
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#else
init_x16r_ctx();
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
@@ -52,11 +50,9 @@ bool register_x16r_algo( algo_gate_t* gate )
bool register_x16s_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
init_x16r_4way_ctx();
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#else
init_x16r_ctx();
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif

View File

@@ -2,7 +2,7 @@
#define X16R_GATE_H__ 1
#include "algo-gate-api.h"
#include "avxdefs.h"
#include "simd-utils.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
@@ -39,20 +39,14 @@ bool register_x16s_algo( algo_gate_t* gate );
#if defined(X16R_4WAY)
void x16r_4way_hash( void *state, const void *input );
int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x16r_4way_ctx();
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void x16r_hash( void *state, const void *input );
int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x16r_ctx();
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -33,7 +33,8 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
typedef struct {
union _x16r_context_overlay
{
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
@@ -55,19 +56,13 @@ typedef struct {
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
} x16r_ctx_holder;
x16r_ctx_holder x16r_ctx __attribute__ ((aligned (64)));
void init_x16r_ctx()
{
cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
};
typedef union _x16r_context_overlay x16r_context_overlay;
void x16r_hash( void* output, const void* input )
{
uint32_t _ALIGN(128) hash[16];
x16r_ctx_holder ctx;
x16r_context_overlay ctx;
void *in = (void*) input;
int size = 80;
@@ -126,7 +121,7 @@ void x16r_hash( void* output, const void* input )
(const BitSequence*)in, size );
break;
case CUBEHASH:
memcpy( &ctx.cube, &x16r_ctx.cube, sizeof(cubehashParam) );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)in, size );
break;
@@ -184,7 +179,7 @@ void x16r_hash( void* output, const void* input )
}
int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) endiandata[20];
@@ -192,16 +187,16 @@ int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
for ( int k=0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
// This code is suspicious. s_ntime is saved after byteswapping pdata[17]
// but is tested vs unswapped pdata[17]. This should result in calling
// getAlgoString every pass, but that doesn't seem to be the case.
// It appears to be working correctly as is.
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);

View File

@@ -47,30 +47,6 @@ union _sonoa_4way_context_overlay
};
typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;
/*
sonoa_4way_ctx_holder sonoa_4way_ctx __attribute__ ((aligned (64)));
void init_sonoa_4way_ctx()
{
blake512_4way_init( &sonoa_4way_ctx.blake );
bmw512_4way_init( &sonoa_4way_ctx.bmw );
init_groestl( &sonoa_4way_ctx.groestl, 64 );
skein512_4way_init( &sonoa_4way_ctx.skein );
jh512_4way_init( &sonoa_4way_ctx.jh );
keccak512_4way_init( &sonoa_4way_ctx.keccak );
luffa_2way_init( &sonoa_4way_ctx.luffa, 512 );
cube_2way_init( &sonoa_4way_ctx.cube, 512, 16, 32 );
shavite512_2way_init( &sonoa_4way_ctx.shavite );
simd_2way_init( &sonoa_4way_ctx.simd, 512 );
init_echo( &sonoa_4way_ctx.echo, 512 );
hamsi512_4way_init( &sonoa_4way_ctx.hamsi );
sph_fugue512_init( &sonoa_4way_ctx.fugue );
shabal512_4way_init( &sonoa_4way_ctx.shabal );
sph_whirlpool_init( &sonoa_4way_ctx.whirlpool );
sha512_4way_init( &sonoa_4way_ctx.sha512 );
haval256_5_4way_init( &sonoa_4way_ctx.haval );
};
*/
void sonoa_4way_hash( void *state, const void *input )
{
@@ -82,8 +58,6 @@ void sonoa_4way_hash( void *state, const void *input )
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
sonoa_4way_context_overlay ctx;
// sonoa_4way_ctx_holder ctx __attribute__ ((aligned (64)));
// memcpy( &ctx, &sonoa_4way_ctx, sizeof(sonoa_4way_ctx) );
// 1
@@ -95,7 +69,7 @@ void sonoa_4way_hash( void *state, const void *input )
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -106,7 +80,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -120,7 +94,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -142,8 +116,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -160,13 +134,13 @@ void sonoa_4way_hash( void *state, const void *input )
// 2
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -177,7 +151,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -191,7 +165,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -213,8 +187,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -229,7 +203,7 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
@@ -241,7 +215,7 @@ void sonoa_4way_hash( void *state, const void *input )
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -252,7 +226,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -266,7 +240,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -288,8 +262,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -304,13 +278,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -326,13 +300,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 4
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -343,7 +317,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -357,7 +331,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -379,8 +353,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -395,13 +369,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -416,19 +390,19 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm256_reinterleave_4x32_4x64( vhashB, vhash, 512 );
mm256_rintrlv_4x32_4x64( vhashB, vhash, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhashB, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -443,8 +417,8 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_2x128( vhashA, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
mm256_intrlv_2x128( vhashA, hash0, hash1, 512 );
mm256_intrlv_2x128( vhashB, hash2, hash3, 512 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
@@ -452,19 +426,19 @@ void sonoa_4way_hash( void *state, const void *input )
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
// 5
mm256_reinterleave_2x128_4x64( vhash, vhashA, vhashB, 512 );
mm256_rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_reinterleave_4x64_4x32( vhashB, vhash, 512 );
mm256_rintrlv_4x64_4x32( vhashB, vhash, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhashB, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -475,7 +449,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -489,7 +463,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -511,8 +485,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -527,13 +501,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -548,13 +522,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -571,13 +545,13 @@ void sonoa_4way_hash( void *state, const void *input )
// 6
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -588,7 +562,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -602,7 +576,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -624,8 +598,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -640,13 +614,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -661,13 +635,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -682,13 +656,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -705,13 +679,13 @@ void sonoa_4way_hash( void *state, const void *input )
// 7
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -722,7 +696,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -736,7 +710,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -758,8 +732,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -774,13 +748,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -795,13 +769,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -816,13 +790,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
mm256_reinterleave_4x64_4x32( vhashB, vhash, 512 );
mm256_rintrlv_4x64_4x32( vhashB, vhash, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashB, 64 );
@@ -836,7 +810,6 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -850,19 +823,13 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
0xFFFFF000, 0xFFFF0000, 0 };
// Need big endian data
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_interleave_blend_32( mm256_bswap_32(
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ),
*noncev );
sonoa_4way_hash( hash, vdata );
@@ -871,17 +838,10 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE,
"Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;

Some files were not shown because too many files have changed in this diff Show More