mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.7.6
This commit is contained in:
@@ -32,12 +32,12 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[4*4] __attribute__ ((aligned (32)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
// uint32_t HTarget = ptarget[7];
|
||||
uint32_t _ALIGN(32) endiandata[20];
|
||||
uint32_t _ALIGN(32) edata[20];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
@@ -47,18 +47,17 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
// HTarget = 0x7f;
|
||||
|
||||
// we need big endian data...
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
swab32_array( edata, pdata, 20 );
|
||||
|
||||
mm_interleave_4x32( vdata, endiandata, endiandata, endiandata,
|
||||
endiandata, 640 );
|
||||
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
uint32_t *noncep = vdata + 76; // 19*4
|
||||
do {
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep +2, n+1 );
|
||||
be32enc( noncep +4, n+2 );
|
||||
be32enc( noncep +6, n+3 );
|
||||
be32enc( noncep +1, n+1 );
|
||||
be32enc( noncep +2, n+2 );
|
||||
be32enc( noncep +3, n+3 );
|
||||
|
||||
blakehash_4way( hash, vdata );
|
||||
|
||||
@@ -74,7 +73,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
if ( (hash+8)[7] == 0 )
|
||||
{
|
||||
if ( fulltest( hash, ptarget ) )
|
||||
if ( fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
@@ -83,7 +82,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
if ( (hash+16)[7] == 0 )
|
||||
{
|
||||
if ( fulltest( hash, ptarget ) )
|
||||
if ( fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
@@ -92,15 +91,14 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
if ( (hash+24)[7] == 0 )
|
||||
{
|
||||
if ( fulltest( hash, ptarget ) )
|
||||
if ( fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
}
|
||||
}
|
||||
|
||||
n += 4;
|
||||
n += 4;
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
|
||||
} while ( (num_found == 0) && (n < max_nonce)
|
||||
|
||||
@@ -17,7 +17,6 @@ bool register_blake_algo( algo_gate_t* gate )
|
||||
gate->optimizations = FOUR_WAY_OPT;
|
||||
gate->scanhash = (void*)&scanhash_blake_4way;
|
||||
gate->hash = (void*)&blakehash_4way;
|
||||
four_way_not_tested();
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_blake;
|
||||
gate->hash = (void*)&blakehash;
|
||||
|
||||
@@ -524,18 +524,18 @@ do { \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm_xor_si128( s0, _mmset_epi32( CS0, CS0, CS0, CS0 ) ); \
|
||||
V9 = _mm_xor_si128( s1, _mmset_epi32( CS1, CS1, CS1, CS1 ) ); \
|
||||
VA = _mm_xor_si128( s2, _mmset_epi32( CS2, CS2, CS2, CS2 ) ); \
|
||||
VB = _mm_xor_si128( s3, _mmset_epi32( CS3, CS3, CS3, CS3 ) ); \
|
||||
VC = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \
|
||||
_mmset_epi32( CS4, CS4, CS4, CS4 ) ); \
|
||||
VD = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \
|
||||
_mmset_epi32( CS5, CS5, CS5, CS5 ) ); \
|
||||
VE = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ) \
|
||||
, _mmset_epi32( CS6, CS6, CS6, CS6 ) ); \
|
||||
VF = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ), \
|
||||
_mmset_epi32( CS7, CS7, CS7, CS7 ) ); \
|
||||
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
|
||||
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
|
||||
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
|
||||
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
|
||||
VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
|
||||
_mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
|
||||
VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
|
||||
_mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
|
||||
VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ) \
|
||||
, _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
|
||||
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
|
||||
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
|
||||
M[0x0] = mm_byteswap_32( *(buf + 0) ); \
|
||||
M[0x1] = mm_byteswap_32( *(buf + 1) ); \
|
||||
M[0x2] = mm_byteswap_32( *(buf + 2) ); \
|
||||
@@ -710,18 +710,18 @@ do { \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_xor_si256( S0, _mm256_set_epi64( CB0, CB0, CB0, CB0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, _mm256_set_epi64( CB1, CB1, CB1, CB1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, _mm256_set_epi64( CB2, CB2, CB2, CB2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, _mm256_set_epi64( CB3, CB3, CB3, CB3 ) ); \
|
||||
VC = _mm256_xor_si128( _mm256_set_epi64( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64( CB4, CB4, CB4, CB4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set_epi64( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64( CB5, CB5, CB5, CB5 ) ); \
|
||||
VE = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
|
||||
_mm256_set256_epi64( CB6, CB6, CB6, CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
|
||||
_mm256_set256_epi64( CB7, CB7, CB7, CB7 ) ); \
|
||||
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
|
||||
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
|
||||
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
|
||||
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
|
||||
M[0x0] = mm256_byteswap_64( *(buf+0) ); \
|
||||
M[0x1] = mm256_byteswap_64( *(buf+1) ); \
|
||||
M[0x2] = mm256_byteswap_64( *(buf+2) ); \
|
||||
@@ -867,7 +867,6 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
|
||||
if ( len < buf_size - ptr )
|
||||
{
|
||||
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
|
||||
@@ -915,9 +914,10 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
unsigned z = 0x80 >> n;
|
||||
unsigned zz = ((ub & -z) | z) & 0xFF;
|
||||
u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz );
|
||||
// unsigned z = 0x80 >> n;
|
||||
// unsigned zz = ((ub & -z) | z) & 0xFF;
|
||||
// u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz );
|
||||
u.buf[ptr>>2] = _mm_set1_epi32( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
|
||||
@@ -934,9 +934,11 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
else
|
||||
sc->T0 -= 512 - bit_len;
|
||||
|
||||
if ( ptr <= 48 )
|
||||
// if ( ptr <= 48 )
|
||||
if ( ptr <= 52 )
|
||||
{
|
||||
memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
|
||||
memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
|
||||
// memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
|
||||
if (out_size_w32 == 8)
|
||||
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
|
||||
_mm_set_epi32( 0x010000000, 0x01000000,
|
||||
@@ -962,6 +964,7 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
out = (__m128i*)dst;
|
||||
for ( k = 0; k < out_size_w32; k++ )
|
||||
out[k] = mm_byteswap_32( sc->H[k] );
|
||||
// out[k] = sc->H[k];
|
||||
}
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
@@ -13,46 +13,35 @@ static __thread bool ctx_midstate_done = false;
|
||||
|
||||
void decred_hash_4way( void *state, const void *input )
|
||||
{
|
||||
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[4] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[4] __attribute__ ((aligned (32)));
|
||||
uint32_t hash2[4] __attribute__ ((aligned (32)));
|
||||
uint32_t hash3[4] __attribute__ ((aligned (32)));
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash3[8] __attribute__ ((aligned (32)));
|
||||
blake256_4way_context ctx __attribute__ ((aligned (64)));
|
||||
|
||||
sph_blake256_context ctx2 __attribute__ ((aligned (64)));
|
||||
uint32_t hash[16] __attribute__ ((aligned (64)));
|
||||
uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
|
||||
|
||||
mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );
|
||||
|
||||
void *tail = input + DECRED_MIDSTATE_LEN;
|
||||
void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
|
||||
int tail_len = 180 - DECRED_MIDSTATE_LEN;
|
||||
// #define MIDSTATE_LEN 128
|
||||
/*
|
||||
uint8_t *ending = (uint8_t*) input;
|
||||
ending += MIDSTATE_LEN;
|
||||
|
||||
if ( !ctx_midstate_done )
|
||||
{
|
||||
blake256_4way_init( &blake_mid );
|
||||
blake256_4way( &blake_mid, input, DECRED_MIDSTATE_LEN );
|
||||
ctx_midstate_done = true;
|
||||
}
|
||||
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
|
||||
|
||||
blake256_4way( &ctx, tail, tail_len );
|
||||
blake256_4way_close( &ctx, vhash );
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
sph_blake256_init( &ctx2 );
|
||||
sph_blake256( &ctx2, sin0, 180 );
|
||||
sph_blake256_close( &ctx2, hash );
|
||||
|
||||
*/
|
||||
/*
|
||||
blake256_4way_init( &ctx );
|
||||
blake256_4way( &ctx, input, 180 );
|
||||
blake256_4way_close( &ctx, vhash );
|
||||
|
||||
*/
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
/*
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
@@ -66,22 +55,21 @@ printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
|
||||
printf("\n");
|
||||
*/
|
||||
|
||||
// memcpy( state, hash0, 32 );
|
||||
// memcpy( state+32, hash1, 32 );
|
||||
// memcpy( state+64, hash1, 32 );
|
||||
// memcpy( state+96, hash1, 32 );
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
|
||||
memcpy( state, hash, 32 );
|
||||
// memcpy( state, hash, 32 );
|
||||
|
||||
}
|
||||
|
||||
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done)
|
||||
{
|
||||
uint32_t vdata[45*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[4*4] __attribute__ ((aligned (32)));
|
||||
uint32_t _ALIGN(64) endiandata[48];
|
||||
// uint32_t _ALIGN(64) hash32[8];
|
||||
uint32_t vdata[48*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t _ALIGN(64) edata[48];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
|
||||
@@ -91,28 +79,25 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
|
||||
// #define DCR_NONCE_OFT32 35
|
||||
|
||||
ctx_midstate_done = false;
|
||||
|
||||
// memcpy(endiandata, pdata, 180);
|
||||
memcpy( edata, pdata, 180 );
|
||||
|
||||
// use the old way until new way updated for size.
|
||||
mm_interleave_4x32x( vdata, pdata, pdata, pdata, pdata, 180*8 );
|
||||
mm_interleave_4x32( vdata, edata, edata, edata, edata, 180*8 );
|
||||
|
||||
blake256_4way_init( &blake_mid );
|
||||
blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
|
||||
|
||||
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
|
||||
do {
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
* noncep = n;
|
||||
*(noncep+2) = n+1;
|
||||
*(noncep+4) = n+2;
|
||||
*(noncep+6) = n+3;
|
||||
*(noncep+1) = n+1;
|
||||
*(noncep+2) = n+2;
|
||||
*(noncep+3) = n+3;
|
||||
|
||||
decred_hash_4way( hash, vdata );
|
||||
|
||||
// endiandata[DCR_NONCE_OFT32] = n;
|
||||
// decred_hash(hash32, endiandata);
|
||||
|
||||
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
|
||||
{
|
||||
work_set_target_ratio( work, hash );
|
||||
@@ -121,29 +106,47 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
nonces[0] = n;
|
||||
pdata[DECRED_NONCE_INDEX] = n;
|
||||
}
|
||||
/* if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
|
||||
/*
|
||||
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
printf("found 1\n");
|
||||
|
||||
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[8], hash[9], hash[10],hash[11] );
|
||||
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[12], hash[13], hash[14],hash[15] );
|
||||
printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
|
||||
printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
|
||||
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n;
|
||||
nonces[1] = n+1;
|
||||
}
|
||||
*/
|
||||
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n;
|
||||
nonces[2] = n+2;
|
||||
}
|
||||
/*
|
||||
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
printf("found 3\n");
|
||||
|
||||
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[0], hash[1], hash[2],hash[3] );
|
||||
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[4], hash[5], hash[6],hash[7] );
|
||||
printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
|
||||
printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
|
||||
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n;
|
||||
nonces[3] = n+3;
|
||||
}
|
||||
*/
|
||||
n += 4;
|
||||
n += 2;
|
||||
// n += 4;
|
||||
} while ( (num_found == 0) && (n < max_nonce)
|
||||
&& !work_restart[thr_id].restart );
|
||||
|
||||
|
||||
@@ -872,6 +872,7 @@ blake32_close(sph_blake_small_context *sc,
|
||||
} else {
|
||||
sc->T0 -= 512 - bit_len;
|
||||
}
|
||||
|
||||
if (bit_len <= 446) {
|
||||
memset(u.buf + ptr + 1, 0, 55 - ptr);
|
||||
if (out_size_w32 == 8)
|
||||
|
||||
Reference in New Issue
Block a user