This commit is contained in:
Jay D Dee
2019-06-13 11:20:27 -04:00
parent 7fec680835
commit b2331375a3
70 changed files with 4413 additions and 4360 deletions

View File

@@ -44,10 +44,11 @@ void allium_4way_hash( void *state, const void *input )
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash32 );
mm256_reinterleave_4x64( vhash64, vhash32, 256 );
mm256_rintrlv_4x32_4x64( vhash64, vhash32, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
@@ -67,26 +68,23 @@ void allium_4way_hash( void *state, const void *input )
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
mm256_intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
}
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -94,7 +92,6 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -106,13 +103,7 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &allium_4way_ctx.blake );
blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
@@ -124,7 +115,7 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
{
if ( fulltest( hash+(lane<<3), ptarget ) )
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, hash+(lane<<3), mythr, lane );

View File

@@ -5,7 +5,9 @@
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX2__)
//#if defined(__AVX2__)
#if defined(__SSE2__)
#define LYRA2REV3_4WAY
#endif

View File

@@ -566,7 +566,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
#if defined(__AVX2__)
memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
#elif defined(__SSE4_2__)
#elif defined(__SSE2__)
memset_zero_128( (__m128i*)wholeMatrix, i>>4 );
#else
memset( wholeMatrix, 0, i );

View File

@@ -36,17 +36,16 @@ void lyra2h_4way_hash( void *state, const void *input )
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
LYRA2Z( lyra2h_4way_matrix, state, 32, hash0, 32, hash0, 32,
16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, state+32, 32, hash1, 32, hash1,
32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, state+64, 32, hash2, 32, hash2,
32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, state+96, 32, hash3, 32, hash3,
32, 16, 16, 16 );
}
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -54,49 +53,36 @@ int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep= vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 20; i++ )
be32enc( &edata[i], pdata[i] );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
lyra2h_4way_midstate( vdata );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
be32enc( &edata[19], n );
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2h_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -6,7 +6,7 @@
#include "algo/keccak/sph_keccak.h"
#include "lyra2.h"
#include "algo-gate-api.h"
#include "avxdefs.h"
#include "simd-utils.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl256.h"
#endif

View File

@@ -42,10 +42,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash );
mm256_reinterleave_4x64( vhash64, vhash, 256 );
mm256_rintrlv_4x32_4x64( vhash64, vhash, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
@@ -60,10 +62,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
mm256_intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
@@ -74,11 +78,10 @@ void lyra2rev2_4way_hash( void *state, const void *input )
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, vhash );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -86,49 +89,44 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2rev2_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -35,7 +35,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
blake256_4way( &ctx.blake, input, 80 );
blake256_4way_close( &ctx.blake, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
@@ -55,10 +55,9 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -66,7 +65,6 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
@@ -80,15 +78,7 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
// Need big endian data
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -99,16 +89,14 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
}
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}

View File

@@ -36,17 +36,12 @@ void lyra2z_4way_hash( void *state, const void *input )
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
LYRA2Z( lyra2z_4way_matrix, state , 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+32, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+64, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+96, 32, hash3, 32, hash3, 32, 8, 8, 8 );
}
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -54,7 +49,6 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -66,13 +60,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
lyra2z_4way_midstate( vdata );
do {
@@ -82,16 +70,11 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, i );
else
applog( LOG_WARNING, "Failed to submit share." );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
@@ -136,8 +119,8 @@ void lyra2z_8way_hash( void *state, const void *input )
blake256_8way( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
mm256_deinterleave_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
mm256_dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
@@ -163,7 +146,6 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -175,13 +157,7 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_8x32( vdata, pdata );
lyra2z_8way_midstate( vdata );
do {
@@ -191,7 +167,8 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
submit_solution( work, hash+(i<<3), mythr, i );

View File

@@ -3,7 +3,7 @@
#include "lyra2-gate.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "avxdefs.h"
#include "simd-utils.h"
__thread uint64_t* lyra2z_matrix;

View File

@@ -1,7 +1,7 @@
#include <memory.h>
#include "algo-gate-api.h"
#include "lyra2.h"
#include "avxdefs.h"
#include "simd-utils.h"
__thread uint64_t* lyra2z330_wholeMatrix;
@@ -30,14 +30,17 @@ int scanhash_lyra2z330( int thr_id, struct work *work, uint32_t max_nonce,
if (opt_benchmark)
ptarget[7] = 0x0000ff;
for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
do
{
be32enc(&endiandata[19], nonce);
lyra2z330_hash( hash, endiandata, work->height );
if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
if ( hash[7] <= Htarg && fulltest(hash, ptarget) && !opt_benchmark )
{
work_set_target_ratio(work, hash);
pdata[19] = nonce;

View File

@@ -50,11 +50,11 @@ void phi2_hash(void *state, const void *input)
unsigned char _ALIGN(128) hashA[64];
unsigned char _ALIGN(128) hashB[64];
phi2_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
phi2_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
phi2_has_roots ? 144 : 80 );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
phi2_has_roots ? 144 : 80 );
LYRA2RE( &hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8 );
LYRA2RE( &hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8 );
@@ -63,17 +63,17 @@ void phi2_hash(void *state, const void *input)
sph_jh512_close( &ctx.jh, (void*)hash );
if ( hash[0] & 1 )
{
sph_gost512( &ctx.gost, (const void*)hash, 64 );
{
sph_gost512( &ctx.gost, (const void*)hash, 64 );
sph_gost512_close( &ctx.gost, (void*)hash );
}
else
{
else
{
#if defined(__AES__)
update_final_echo ( &ctx.echo1, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo2, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo1, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo2, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512( &ctx.echo1, (const void*)hash, 64 );
sph_echo512_close( &ctx.echo1, (void*)hash );

View File

@@ -51,7 +51,7 @@ inline void initState( uint64_t State[/*16*/] )
state[3] = _mm256_set_epi64x( blake2b_IV[7], blake2b_IV[6],
blake2b_IV[5], blake2b_IV[4] );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
@@ -137,7 +137,7 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
//Squeezes remaining bytes
memcpy_256( out, state, ( len_m256i % BLOCK_LEN_M256I ) );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
const int len_m128i = len / 16;
const int fullBlocks = len_m128i / BLOCK_LEN_M128I;
@@ -205,7 +205,7 @@ inline void absorbBlock( uint64_t *State, const uint64_t *In )
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)In;
@@ -273,7 +273,7 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)In;
@@ -355,7 +355,7 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i state0 = _mm_load_si128( state );
@@ -494,7 +494,7 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i state0 = _mm_load_si128( state );
@@ -694,7 +694,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* in = (__m128i*)rowIn;
__m128i* inout = (__m128i*)rowInOut;
@@ -713,9 +713,9 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
__m128i* state = (__m128i*)State;
// For the last round in this function not optimized for AVX
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
// uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
// uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
// uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
for ( i = 0; i < nCols; i++ )
{
@@ -750,6 +750,28 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
out[4] = _mm_xor_si128( state[4], in[4] );
out[5] = _mm_xor_si128( state[5], in[5] );
__m128i t0, t1;
t0 = _mm_srli_si128( state[0], 8 );
t1 = _mm_srli_si128( state[1], 8 );
inout[0] = _mm_xor_si128( inout[0],
_mm_or_si128( _mm_slli_si128( state[0], 8 ),
_mm_srli_si128( state[5], 8 ) ) );
inout[1] = _mm_xor_si128( inout[1],
_mm_or_si128( _mm_slli_si128( state[1], 8 ), t0 ) );
t0 = _mm_srli_si128( state[2], 8 );
inout[2] = _mm_xor_si128( inout[2],
_mm_or_si128( _mm_slli_si128( state[2], 8 ), t1 ) );
t1 = _mm_srli_si128( state[3], 8 );
inout[3] = _mm_xor_si128( inout[3],
_mm_or_si128( _mm_slli_si128( state[3], 8 ), t0 ) );
t0 = _mm_srli_si128( state[4], 8 );
inout[4] = _mm_xor_si128( inout[4],
_mm_or_si128( _mm_slli_si128( state[4], 8 ), t1 ) );
inout[5] = _mm_xor_si128( inout[5],
_mm_or_si128( _mm_slli_si128( state[5], 8 ), t0 ) );
/*
ptrWordInOut[0] ^= State[11];
ptrWordInOut[1] ^= State[0];
ptrWordInOut[2] ^= State[1];
@@ -768,7 +790,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
ptrWordIn += BLOCK_LEN_INT64;
//Output: goes to previous column
ptrWordOut -= BLOCK_LEN_INT64;
*/
inout += BLOCK_LEN_M128I;
in += BLOCK_LEN_M128I;
out -= BLOCK_LEN_M128I;
@@ -930,7 +952,7 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined(__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)rowIn;

View File

@@ -23,7 +23,7 @@
#define SPONGE_H_
#include <stdint.h>
#include "avxdefs.h"
#include "simd-utils.h"
#if defined(__GNUC__)
#define ALIGN __attribute__ ((aligned(32)))
@@ -59,7 +59,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, updates all args
#define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_ror_64( _mm256_xor_si256( d, a), 32 ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
a = _mm256_add_epi64( a, b ); \