mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.7.5
This commit is contained in:
@@ -95,13 +95,13 @@ extern "C"{
|
||||
#define Sb(x0, x1, x2, x3, c) \
|
||||
do { \
|
||||
__m256i cc = _mm256_set_epi64x( c, c, c, c ); \
|
||||
x3 = mm256_bitnot( x3 ); \
|
||||
x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_bitnot( x2 ) ) ); \
|
||||
x3 = mm256_not( x3 ); \
|
||||
x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_not( x2 ) ) ); \
|
||||
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
|
||||
x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
|
||||
x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_bitnot( x1 ), x2 ) ); \
|
||||
x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_not( x1 ), x2 ) ); \
|
||||
x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
|
||||
x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_bitnot( x3 ) ) ); \
|
||||
x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_not( x3 ) ) ); \
|
||||
x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
|
||||
x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
|
||||
x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
|
||||
@@ -532,7 +532,7 @@ jh_4way_core( jh_4way_context *sc, const void *data, size_t len )
|
||||
|
||||
if ( len < (buf_size - ptr) )
|
||||
{
|
||||
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
|
||||
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
@@ -546,7 +546,7 @@ jh_4way_core( jh_4way_context *sc, const void *data, size_t len )
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
|
||||
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
|
||||
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata += (clen>>3);
|
||||
len -= clen;
|
||||
@@ -579,7 +579,7 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
|
||||
else
|
||||
numz = 112 - sc->ptr;
|
||||
|
||||
memset_zero_m256i( buf+1, (numz>>3) - 1 );
|
||||
memset_zero_256( buf+1, (numz>>3) - 1 );
|
||||
|
||||
l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
|
||||
l1 = SPH_T64(sc->block_count >> 55);
|
||||
@@ -593,7 +593,7 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
|
||||
for ( u=0; u < 8; u++ )
|
||||
buf[u] = sc->H[u+8];
|
||||
|
||||
memcpy_m256i( dst256, buf, 8 );
|
||||
memcpy_256( dst256, buf, 8 );
|
||||
}
|
||||
|
||||
void
|
||||
|
@@ -1,11 +1,12 @@
|
||||
#if defined(JHA_4WAY)
|
||||
|
||||
#include "jha-gate.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "avxdefs.h"
|
||||
//#include "avxdefs.h"
|
||||
|
||||
#if defined(JHA_4WAY)
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -15,19 +16,19 @@
|
||||
//static __thread keccak512_4way_context jha_kec_mid
|
||||
// __attribute__ ((aligned (64)));
|
||||
|
||||
void jha_hash_4way( void *output, const void *input )
|
||||
void jha_hash_4way( void *out, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhasha[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashb[8*4] __attribute__ ((aligned (64)));
|
||||
__m256i mask;
|
||||
__m256i* vh256 = (__m256i*)vhash;
|
||||
__m256i* vha256 = (__m256i*)vhasha;
|
||||
__m256i* vhb256 = (__m256i*)vhashb;
|
||||
uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
|
||||
__m256i mask0, mask1;
|
||||
__m256i* vh = (__m256i*)vhash;
|
||||
__m256i* vh0 = (__m256i*)vhash0;
|
||||
__m256i* vh1 = (__m256i*)vhash1;
|
||||
|
||||
blake512_4way_context ctx_blake;
|
||||
hashState_groestl ctx_groestl;
|
||||
@@ -40,21 +41,29 @@ void jha_hash_4way( void *output, const void *input )
|
||||
keccak512_4way_close( &ctx_keccak, vhash );
|
||||
|
||||
// memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
|
||||
// keccak512_4way( &ctx_keccak, input+64, 16 );
|
||||
// keccak512_4way( &ctx_keccak, input + (64<<2), 16 );
|
||||
// keccak512_4way_close( &ctx_keccak, vhash );
|
||||
|
||||
// Heavy & Light Pair Loop
|
||||
for ( int round = 0; round < 3; round++ )
|
||||
{
|
||||
memset_zero_m256i( vha256, 20 );
|
||||
memset_zero_m256i( vhb256, 20 );
|
||||
// memset_zero_256( vh0, 20 );
|
||||
// memset_zero_256( vh1, 20 );
|
||||
|
||||
mask = _mm256_sub_epi64( _mm256_and_si256( vh256[0],
|
||||
mm256_vec_epi64( 0x1 ) ), mm256_vec_epi64( 0x1 ) );
|
||||
// Positive logic: where mask<i> is all ones, vh<i> is selected in the merge.
|
||||
// mask0 is the negated low bit of each 64-bit lane of vh[0]: assuming
|
||||
// mm256_negate_64 is a two's-complement negate, a set test bit gives all
|
||||
// ones (take vh0) and a clear bit gives zero; mask1 = ~mask0 then takes vh1.
|
||||
mask0 = mm256_negate_64(
|
||||
_mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
|
||||
mask1 = mm256_not( mask0 );
|
||||
|
||||
// mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
|
||||
// _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );
|
||||
|
||||
// groestl (serial) v skein
|
||||
|
||||
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash0,
|
||||
@@ -71,58 +80,66 @@ void jha_hash_4way( void *output, const void *input )
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash3,
|
||||
(char*)hash3, 512 );
|
||||
|
||||
m256_interleave_4x64( vhasha, hash0, hash1, hash2, hash3, 512 );
|
||||
mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
// skein
|
||||
|
||||
skein512_4way_init( &ctx_skein );
|
||||
skein512_4way( &ctx_skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx_skein, vhashb );
|
||||
skein512_4way_close( &ctx_skein, vhash1 );
|
||||
|
||||
// merge vectored hash
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
{
|
||||
vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
|
||||
_mm256_and_si256( vh1[i], mask1 ) );
|
||||
/*
|
||||
vha256[i] = _mm256_maskload_epi64(
|
||||
vhasha + i*4, mm256_bitnot(mask ) );
|
||||
vhasha + i*4, mm256_not( mask ) );
|
||||
vhb256[i] = _mm256_maskload_epi64(
|
||||
vhashb + i*4, mask );
|
||||
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
|
||||
*/
|
||||
}
|
||||
|
||||
// blake v jh
|
||||
|
||||
blake512_4way_init( &ctx_blake );
|
||||
blake512_4way( &ctx_blake, vhash, 64 );
|
||||
blake512_4way_close( &ctx_blake, vhasha );
|
||||
blake512_4way_close( &ctx_blake, vhash0 );
|
||||
|
||||
jh512_4way_init( &ctx_jh );
|
||||
jh512_4way( &ctx_jh, vhash, 64 );
|
||||
jh512_4way_close( &ctx_jh, vhashb );
|
||||
jh512_4way_close( &ctx_jh, vhash1 );
|
||||
|
||||
// merge vectored hash
|
||||
// merge hash
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
{
|
||||
vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
|
||||
_mm256_and_si256( vh1[i], mask1 ) );
|
||||
/*
|
||||
vha256[i] = _mm256_maskload_epi64(
|
||||
vhasha + i*4, mm256_bitnot(mask ) );
|
||||
vhasha + i*4, mm256_not( mask ) );
|
||||
vhb256[i] = _mm256_maskload_epi64(
|
||||
vhashb + i*4, mask );
|
||||
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
|
||||
|
||||
memcpy( output, hash0, 32 );
|
||||
memcpy( output+32, hash1, 32 );
|
||||
memcpy( output+64, hash2, 32 );
|
||||
memcpy( output+96, hash3, 32 );
|
||||
// memcpy( output, hash0, 32 );
|
||||
// memcpy( output+32, hash1, 32 );
|
||||
// memcpy( output+64, hash2, 32 );
|
||||
// memcpy( output+96, hash3, 32 );
|
||||
|
||||
}
|
||||
|
||||
int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
@@ -160,7 +177,7 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
be32enc( &endiandata[i], pdata[i] );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
// precalc midstate for keccak
|
||||
// keccak512_4way_init( &jha_kec_mid );
|
||||
|
@@ -3,15 +3,16 @@
|
||||
|
||||
bool register_jha_algo( algo_gate_t* gate )
|
||||
{
|
||||
//#if defined (JHA_4WAY)
|
||||
// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
||||
// gate->scanhash = (void*)&scanhash_jha_4way;
|
||||
// gate->hash = (void*)&jha_hash_4way;
|
||||
//#else
|
||||
gate->optimizations = SSE2_OPT | AES_OPT;
|
||||
#if defined (JHA_4WAY)
|
||||
four_way_not_tested();
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
|
||||
gate->scanhash = (void*)&scanhash_jha_4way;
|
||||
gate->hash = (void*)&jha_hash_4way;
|
||||
#else
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
|
||||
gate->scanhash = (void*)&scanhash_jha;
|
||||
gate->hash = (void*)&jha_hash;
|
||||
//#endif
|
||||
#endif
|
||||
gate->set_target = (void*)&scrypt_set_target;
|
||||
return true;
|
||||
};
|
||||
|
@@ -9,19 +9,17 @@
|
||||
#define JHA_4WAY
|
||||
#endif
|
||||
|
||||
//#if defined JHA_4WAY
|
||||
//void jha_hash_4way( void *state, const void *input );
|
||||
#if defined JHA_4WAY
|
||||
void jha_hash_4way( void *state, const void *input );
|
||||
|
||||
//int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
// uint64_t *hashes_done );
|
||||
//#else
|
||||
int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
#endif
|
||||
|
||||
void jha_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_jha( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
//#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
Reference in New Issue
Block a user