mirror of https://github.com/JayDDee/cpuminer-opt.git
Commit: v3.8.3
@@ -101,12 +101,8 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
uint32_t *noncep = vdata + 76; // 19*4

if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -117,44 +113,21 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
blake256_4way( &allium_4way_ctx.blake, vdata, 64 );

do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );

allium_4way_hash( hash, vdata );
pdata[19] = n;
allium_4way_hash( hash, vdata );
pdata[19] = n;

if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
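Note on the nonce layout used throughout these scanhash hunks: vdata holds the 20-word block interleaved 4 ways by 32-bit word, so word j of lane k sits at vdata[ j*4 + k ]. The nonce is word 19, which is why the old per-lane pointers noncep0..noncep3 at vdata + 76..79 collapse into the single base pointer noncep = vdata + 76, with lane i's nonce written at noncep + i. A minimal sketch of that arithmetic (the helper name is illustrative, not part of the source):

// Assumed 4-way 32-bit interleave: word j of lane k lives at vdata[ j*4 + k ].
// The block nonce is word 19, so the four lane nonces occupy vdata[76..79].
static inline void set_lane_nonces( uint32_t *vdata, uint32_t n )
{
   uint32_t *noncep = vdata + 19*4;   // same base as vdata + 76 above
   for ( int i = 0; i < 4; i++ )
      be32enc( noncep + i, n + i );   // lane i scans nonce n+i
}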
@@ -61,12 +61,8 @@ int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
uint32_t *noncep = vdata + 76; // 19*4

if ( opt_benchmark )
ptarget[7] = 0x0000ff;
@@ -79,42 +75,19 @@ int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
lyra2h_4way_midstate( vdata );

do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );

be32enc( &edata[19], n );
lyra2h_4way_hash( hash, vdata );

if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
@@ -93,12 +93,8 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
uint32_t *noncep = vdata + 76; // 19*4

if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -111,42 +107,19 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );

do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );

lyra2rev2_4way_hash( hash, vdata );
pdata[19] = n;

if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
@@ -61,12 +61,8 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
uint32_t *noncep = vdata + 76; // 19*4

if ( opt_benchmark )
ptarget[7] = 0x0000ff;
@@ -79,42 +75,19 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
lyra2z_4way_midstate( vdata );

do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );

lyra2z_4way_hash( hash, vdata );
pdata[19] = n;

if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
@@ -126,3 +99,114 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,

#endif

#if defined(LYRA2Z_8WAY)

__thread uint64_t* lyra2z_8way_matrix;

bool lyra2z_8way_thread_init()
{
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}

static __thread blake256_8way_context l2z_8way_blake_mid;

void lyra2z_8way_midstate( const void* input )
{
blake256_8way_init( &l2z_8way_blake_mid );
blake256_8way( &l2z_8way_blake_mid, input, 64 );
}

void lyra2z_8way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_context ctx_blake __attribute__ ((aligned (64)));

memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
blake256_8way( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );

mm256_deinterleave_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );

LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );

memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
}
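For context: lyra2z_8way_hash de-interleaves the 8-way blake256 output into hash0..hash7, runs LYRA2Z once per lane, then packs the eight 32-byte results contiguously into state, lane k at byte offset 32*k, so the caller can address lane k as hash+(k<<3) 32-bit words. A minimal sketch of that packing (lane_hash[] is an illustrative stand-in for the separate hash0..hash7 buffers):

// Assumed layout: one 32-byte Lyra2Z result per lane, packed in lane order.
for ( int k = 0; k < 8; k++ )
   memcpy( (char*)state + 32*k, lane_hash[k], 32 );   // lane k goes to state + 32*k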
int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 152; // 19*8

if ( opt_benchmark )
ptarget[7] = 0x0000ff;

for ( int i=0; i < 19; i++ )
be32enc( &edata[i], pdata[i] );

mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );

lyra2z_8way_midstate( vdata );

do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
be32enc( noncep+4, n+4 );
be32enc( noncep+5, n+5 );
be32enc( noncep+6, n+6 );
be32enc( noncep+7, n+7 );

lyra2z_8way_hash( hash, vdata );
pdata[19] = n;

for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 8;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);

*hashes_done = n - first_nonce + 1;
return num_found;
}

#endif

@@ -8,7 +8,11 @@ void lyra2z_set_target( struct work* work, double job_diff )

bool register_lyra2z_algo( algo_gate_t* gate )
{
#ifdef LYRA2Z_4WAY
#if defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;
gate->hash = (void*)&lyra2z_4way_hash;
@@ -1,17 +1,29 @@
#ifndef LYRA2Z_GATE_H__
#define LYRA2Z_GATE_H__
#define LYRA2Z_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__AVX2__)
#if defined(__AVX__)
#define LYRA2Z_4WAY
#endif
#if defined(__AVX2__)
// #define LYRA2Z_8WAY
#endif


#define LYRA2Z_MATRIX_SIZE BLOCK_LEN_INT64 * 8 * 8 * 8

#if defined(LYRA2Z_4WAY)
#if defined(LYRA2Z_8WAY)

void lyra2z_8way_hash( void *state, const void *input );

int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );

bool lyra2z_8way_thread_init();

#elif defined(LYRA2Z_4WAY)

void lyra2z_4way_hash( void *state, const void *input );

@@ -20,7 +32,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,

bool lyra2z_4way_thread_init();

#endif
#else

void lyra2z_hash( void *state, const void *input );

@@ -31,3 +43,4 @@ bool lyra2z_thread_init();

#endif

#endif
@@ -42,7 +42,7 @@ inline void initState( uint64_t State[/*16*/] )
{
#if defined (__AVX2__)

__m256i *state = (__m256i*)State;
__m256i* state = (__m256i*)State;

state[0] = _mm256_setzero_si256();
state[1] = _mm256_setzero_si256();
@@ -53,7 +53,7 @@ inline void initState( uint64_t State[/*16*/] )

#elif defined (__AVX__)

__m128i *state = (__m128i*)State;
__m128i* state = (__m128i*)State;

state[0] = _mm_setzero_si128();
state[1] = _mm_setzero_si128();
@@ -123,8 +123,8 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )

const int len_m256i = len / 32;
const int fullBlocks = len_m256i / BLOCK_LEN_M256I;
__m256i *state = (__m256i*)State;
__m256i *out = (__m256i*)Out;
__m256i* state = (__m256i*)State;
__m256i* out = (__m256i*)Out;
int i;

//Squeezes full blocks
@@ -141,8 +141,8 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )

const int len_m128i = len / 16;
const int fullBlocks = len_m128i / BLOCK_LEN_M128I;
__m128i *state = (__m128i*)State;
__m128i *out = (__m128i*)Out;
__m128i* state = (__m128i*)State;
__m128i* out = (__m128i*)Out;
int i;

//Squeezes full blocks
@@ -186,27 +186,19 @@ inline void absorbBlock( uint64_t *State, const uint64_t *In )
{
#if defined (__AVX2__)

register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
const __m256i *in = (const __m256i*)In;
__m256i* state = (__m256i*)State;
__m256i* in = (__m256i*)In;

state0 = _mm256_xor_si256( state0, in[0] );
state1 = _mm256_xor_si256( state1, in[1] );
state2 = _mm256_xor_si256( state2, in[2] );
state[0] = _mm256_xor_si256( state[0], in[0] );
state[1] = _mm256_xor_si256( state[1], in[1] );
state[2] = _mm256_xor_si256( state[2], in[2] );

LYRA_12_ROUNDS_AVX2( state0, state1, state2, state3 );

_mm256_store_si256( casto_m256i( State, 0 ), state0 );
_mm256_store_si256( casto_m256i( State, 1 ), state1 );
_mm256_store_si256( casto_m256i( State, 2 ), state2 );
_mm256_store_si256( casto_m256i( State, 3 ), state3 );
LYRA_12_ROUNDS_AVX2( state[0], state[1], state[2], state[3] );

#elif defined (__AVX__)

__m128i *state = (__m128i*)State;
const __m128i *in = (const __m128i*)In;
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)In;

state[0] = _mm_xor_si128( state[0], in[0] );
state[1] = _mm_xor_si128( state[1], in[1] );
@@ -253,26 +245,18 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
//XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
#if defined (__AVX2__)

register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
const __m256i *in = (const __m256i*)In;
__m256i* state = (__m256i*)State;
__m256i* in = (__m256i*)In;

state0 = _mm256_xor_si256( state0, in[0] );
state1 = _mm256_xor_si256( state1, in[1] );
state[0] = _mm256_xor_si256( state[0], in[0] );
state[1] = _mm256_xor_si256( state[1], in[1] );

LYRA_12_ROUNDS_AVX2( state0, state1, state2, state3 );

_mm256_store_si256( casto_m256i( State, 0 ), state0 );
_mm256_store_si256( casto_m256i( State, 1 ), state1 );
_mm256_store_si256( casto_m256i( State, 2 ), state2 );
_mm256_store_si256( casto_m256i( State, 3 ), state3 );
LYRA_12_ROUNDS_AVX2( state[0], state[1], state[2], state[3] );

#elif defined (__AVX__)

__m128i *state = (__m128i*)State;
const __m128i *in = (const __m128i*)In;
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)In;

state[0] = _mm_xor_si128( state[0], in[0] );
state[1] = _mm_xor_si128( state[1], in[1] );
@@ -308,7 +292,7 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
* @param state The current state of the sponge
* @param rowOut Row to receive the data squeezed
*/
inline void reducedSqueezeRow0( uint64_t *State, uint64_t *rowOut,
inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
uint64_t nCols )
{
int i;
@@ -317,19 +301,24 @@ inline void reducedSqueezeRow0( uint64_t *State, uint64_t *rowOut,

#if defined (__AVX2__)

register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
__m256i *out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m256i* state = (__m256i*)State;
__m256i state0 = _mm256_load_si256( state );
__m256i state1 = _mm256_load_si256( &state[1] );
__m256i state2 = _mm256_load_si256( &state[2] );
__m256i state3 = _mm256_load_si256( &state[3] );

__builtin_prefetch( out, 1, 0 );
__builtin_prefetch( out -2, 1, 0 );
__builtin_prefetch( out -4, 1, 0 );
__m256i* out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );

for ( i = 0; i < 9; i += 3)
{
_mm_prefetch( out - i, _MM_HINT_T0 );
_mm_prefetch( out - i - 2, _MM_HINT_T0 );
}

for ( i = 0; i < nCols; i++ )
{
__builtin_prefetch( out -i-6, 1, 0 );
_mm_prefetch( out - 9, _MM_HINT_T0 );
_mm_prefetch( out - 11, _MM_HINT_T0 );

out[0] = state0;
out[1] = state1;
@@ -341,14 +330,15 @@ inline void reducedSqueezeRow0( uint64_t *State, uint64_t *rowOut,
LYRA_ROUND_AVX2( state0, state1, state2, state3 );
}

_mm256_store_si256( casto_m256i( State, 0 ), state0 );
_mm256_store_si256( casto_m256i( State, 1 ), state1 );
_mm256_store_si256( casto_m256i( State, 2 ), state2 );
_mm256_store_si256( casto_m256i( State, 3 ), state3 );
_mm256_store_si256( state, state0 );
_mm256_store_si256( &state[1], state1 );
_mm256_store_si256( &state[2], state2 );
_mm256_store_si256( &state[3], state3 );


#elif defined (__AVX__)

__m128i *state = (__m128i*)State;
__m128i* state = (__m128i*)State;
__m128i state0 = _mm_load_si128( state );
__m128i state1 = _mm_load_si128( &state[1] );
__m128i state2 = _mm_load_si128( &state[2] );
@@ -358,7 +348,7 @@ inline void reducedSqueezeRow0( uint64_t *State, uint64_t *rowOut,
__m128i state6 = _mm_load_si128( &state[6] );
__m128i state7 = _mm_load_si128( &state[7] );

__m128i *out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
__m128i* out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );

for ( i = 0; i < 6; i += 3)
{
@@ -397,7 +387,7 @@ inline void reducedSqueezeRow0( uint64_t *State, uint64_t *rowOut,

#else

uint64_t *ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]

for ( i = 0; i < nCols; i++ )
{
@@ -432,31 +422,37 @@ inline void reducedSqueezeRow0( uint64_t *State, uint64_t *rowOut,
* @param rowIn Row to feed the sponge
* @param rowOut Row to receive the sponge's output
*/
inline void reducedDuplexRow1( uint64_t *State, const uint64_t *rowIn,
inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
uint64_t *rowOut, uint64_t nCols )
{
int i;

#if defined (__AVX2__)

register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
const __m256i *in = (const __m256i*)rowIn;
__m256i *out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m256i* state = (__m256i*)State;
__m256i state0 = _mm256_load_si256( state );
__m256i state1 = _mm256_load_si256( &state[1] );
__m256i state2 = _mm256_load_si256( &state[2] );
__m256i state3 = _mm256_load_si256( &state[3] );

__builtin_prefetch( in, 0, 0 );
__builtin_prefetch( in +2, 0, 0 );
__builtin_prefetch( in +4, 0, 0 );
__builtin_prefetch( out, 1, 0 );
__builtin_prefetch( out -2, 1, 0 );
__builtin_prefetch( out -4, 1, 0 );
__m256i* in = (__m256i*)rowIn;
__m256i* out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );

for ( i = 0; i < 9; i += 3)
{
_mm_prefetch( in + i, _MM_HINT_T0 );
_mm_prefetch( in + i + 2, _MM_HINT_T0 );
_mm_prefetch( out - i, _MM_HINT_T0 );
_mm_prefetch( out - i - 2, _MM_HINT_T0 );
}

for ( i = 0; i < nCols; i++ )
{
__builtin_prefetch( in +i+6, 0, 0 );
__builtin_prefetch( out -i-6, 1, 0 );

_mm_prefetch( in + 9, _MM_HINT_T0 );
_mm_prefetch( in + 11, _MM_HINT_T0 );
_mm_prefetch( out - 9, _MM_HINT_T0 );
_mm_prefetch( out - 11, _MM_HINT_T0 );

state0 = _mm256_xor_si256( state0, in[0] );
state1 = _mm256_xor_si256( state1, in[1] );
@@ -474,14 +470,14 @@ inline void reducedDuplexRow1( uint64_t *State, const uint64_t *rowIn,
out -= BLOCK_LEN_M256I;
}

_mm256_store_si256( casto_m256i( State, 0 ), state0 );
_mm256_store_si256( casto_m256i( State, 1 ), state1 );
_mm256_store_si256( casto_m256i( State, 2 ), state2 );
_mm256_store_si256( casto_m256i( State, 3 ), state3 );
_mm256_store_si256( state, state0 );
_mm256_store_si256( &state[1], state1 );
_mm256_store_si256( &state[2], state2 );
_mm256_store_si256( &state[3], state3 );

#elif defined (__AVX__)

__m128i *state = (__m128i*)State;
__m128i* state = (__m128i*)State;
__m128i state0 = _mm_load_si128( state );
__m128i state1 = _mm_load_si128( &state[1] );
__m128i state2 = _mm_load_si128( &state[2] );
@@ -491,8 +487,8 @@ inline void reducedDuplexRow1( uint64_t *State, const uint64_t *rowIn,
__m128i state6 = _mm_load_si128( &state[6] );
__m128i state7 = _mm_load_si128( &state[7] );

const __m128i *in = (const __m128i*)rowIn;
__m128i *out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
__m128i* in = (__m128i*)rowIn;
__m128i* out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );

for ( i = 0; i < 6; i += 3)
{
@@ -544,8 +540,8 @@ inline void reducedDuplexRow1( uint64_t *State, const uint64_t *rowIn,

#else

const uint64_t *ptrWordIn = (const uint64_t*)rowIn; //In Lyra2: pointer to prev
uint64_t *ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row

for ( i = 0; i < nCols; i++ )
{
@@ -604,7 +600,7 @@ inline void reducedDuplexRow1( uint64_t *State, const uint64_t *rowIn,
* @param rowOut Row receiving the output
*
*/
inline void reducedDuplexRowSetup( uint64_t *State, const uint64_t *rowIn,
inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut,
uint64_t nCols )
{
@@ -612,30 +608,35 @@ inline void reducedDuplexRowSetup( uint64_t *State, const uint64_t *rowIn,

#if defined (__AVX2__)

register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
const __m256i *in = (const __m256i*)rowIn;
__m256i *inout = (__m256i*)rowInOut;
__m256i *out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m256i t0, t1, t2;
__m256i* state = (__m256i*)State;
__m256i state0 = _mm256_load_si256( state );
__m256i state1 = _mm256_load_si256( &state[1] );
__m256i state2 = _mm256_load_si256( &state[2] );
__m256i state3 = _mm256_load_si256( &state[3] );

__builtin_prefetch( in, 0, 0 );
__builtin_prefetch( in +2, 0, 0 );
__builtin_prefetch( in +4, 0, 0 );
__builtin_prefetch( inout, 1, 0 );
__builtin_prefetch( inout +2, 1, 0 );
__builtin_prefetch( inout +4, 1, 0 );
__builtin_prefetch( out, 1, 0 );
__builtin_prefetch( out -2, 1, 0 );
__builtin_prefetch( out -4, 1, 0 );
__m256i* in = (__m256i*)rowIn;
__m256i* inout = (__m256i*)rowInOut;
__m256i* out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m256i t0, t1, t2;

for ( i = 0; i < 9; i += 3)
{
_mm_prefetch( in + i, _MM_HINT_T0 );
_mm_prefetch( in + i + 2, _MM_HINT_T0 );
_mm_prefetch( inout + i, _MM_HINT_T0 );
_mm_prefetch( inout + i + 2, _MM_HINT_T0 );
_mm_prefetch( out - i, _MM_HINT_T0 );
_mm_prefetch( out - i - 2, _MM_HINT_T0 );
}

for ( i = 0; i < nCols; i++ )
{
__builtin_prefetch( in +i+6, 0, 0 );
__builtin_prefetch( inout +i+6, 1, 0 );
__builtin_prefetch( out -i-6, 1, 0 );
_mm_prefetch( in + 9, _MM_HINT_T0 );
_mm_prefetch( in + 11, _MM_HINT_T0 );
_mm_prefetch( inout + 9, _MM_HINT_T0 );
_mm_prefetch( inout + 11, _MM_HINT_T0 );
_mm_prefetch( out - 9, _MM_HINT_T0 );
_mm_prefetch( out - 11, _MM_HINT_T0 );

state0 = _mm256_xor_si256( state0,
_mm256_add_epi64( in[0], inout[0] ) );
@@ -669,16 +670,16 @@ inline void reducedDuplexRowSetup( uint64_t *State, const uint64_t *rowIn,
out -= BLOCK_LEN_M256I;
}

_mm256_store_si256( casto_m256i( State, 0 ), state0 );
_mm256_store_si256( casto_m256i( State, 1 ), state1 );
_mm256_store_si256( casto_m256i( State, 2 ), state2 );
_mm256_store_si256( casto_m256i( State, 3 ), state3 );
_mm256_store_si256( state, state0 );
_mm256_store_si256( &state[1], state1 );
_mm256_store_si256( &state[2], state2 );
_mm256_store_si256( &state[3], state3 );

#elif defined (__AVX__)

const __m128i *in = (const __m128i*)rowIn;
__m128i *inout = (__m128i*)rowInOut;
__m128i *out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
__m128i* in = (__m128i*)rowIn;
__m128i* inout = (__m128i*)rowInOut;
__m128i* out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );

for ( i = 0; i < 6; i += 3)
{
@@ -690,12 +691,12 @@ inline void reducedDuplexRowSetup( uint64_t *State, const uint64_t *rowIn,
_mm_prefetch( out - i - 2, _MM_HINT_T0 );
}

__m128i *state = (__m128i*)State;
__m128i* state = (__m128i*)State;

// For the last round in this function not optimized for AVX
const uint64_t *ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t *ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row

for ( i = 0; i < nCols; i++ )
{
@@ -756,9 +757,9 @@ inline void reducedDuplexRowSetup( uint64_t *State, const uint64_t *rowIn,

#else

const uint64_t *ptrWordIn = (const uint64_t*)rowIn; //In Lyra2: pointer to prev
uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t *ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row

for ( i = 0; i < nCols; i++ )
{
@@ -833,7 +834,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, const uint64_t *rowIn,
*
*/

inline void reducedDuplexRow( uint64_t *State, const uint64_t *rowIn,
inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut,
uint64_t nCols )
{
@@ -841,30 +842,35 @@ inline void reducedDuplexRow( uint64_t *State, const uint64_t *rowIn,

#if defined __AVX2__

register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
const __m256i* in = (const __m256i*)rowIn;
__m256i *inout = (__m256i*)rowInOut;
__m256i *out = (__m256i*)rowOut;
__m256i t0, t1, t2;
__m256i* state = (__m256i*)State;
__m256i state0 = _mm256_load_si256( state );
__m256i state1 = _mm256_load_si256( &state[1] );
__m256i state2 = _mm256_load_si256( &state[2] );
__m256i state3 = _mm256_load_si256( &state[3] );

__builtin_prefetch( in, 0, 0 );
__builtin_prefetch( in +2, 0, 0 );
__builtin_prefetch( in +4, 0, 0 );
__builtin_prefetch( inout, 1, 0 );
__builtin_prefetch( inout +2, 1, 0 );
__builtin_prefetch( inout +4, 1, 0 );
__builtin_prefetch( out, 1, 0 );
__builtin_prefetch( out +2, 1, 0 );
__builtin_prefetch( out +4, 1, 0 );
__m256i* in = (__m256i*)rowIn;
__m256i* inout = (__m256i*)rowInOut;
__m256i* out = (__m256i*)rowOut;
__m256i t0, t1, t2;

for ( i = 0; i < 9; i += 3)
{
_mm_prefetch( in + i, _MM_HINT_T0 );
_mm_prefetch( in + i + 2, _MM_HINT_T0 );
_mm_prefetch( out + i, _MM_HINT_T0 );
_mm_prefetch( out + i + 2, _MM_HINT_T0 );
_mm_prefetch( inout + i, _MM_HINT_T0 );
_mm_prefetch( inout + i + 2, _MM_HINT_T0 );
}

for ( i = 0; i < nCols; i++ )
{
__builtin_prefetch( in +i+6, 0, 0 );
__builtin_prefetch( inout +i+6, 1, 0 );
__builtin_prefetch( out +i+6, 1, 0 );
_mm_prefetch( in + 9, _MM_HINT_T0 );
_mm_prefetch( in + 11, _MM_HINT_T0 );
_mm_prefetch( out + 9, _MM_HINT_T0 );
_mm_prefetch( out + 11, _MM_HINT_T0 );
_mm_prefetch( inout + 9, _MM_HINT_T0 );
_mm_prefetch( inout + 11, _MM_HINT_T0 );

//Absorbing "M[prev] [+] M[row*]"
state0 = _mm256_xor_si256( state0,
@@ -900,17 +906,17 @@ inline void reducedDuplexRow( uint64_t *State, const uint64_t *rowIn,
inout += BLOCK_LEN_M256I;
}

_mm256_store_si256( casto_m256i( State, 0 ), state0 );
_mm256_store_si256( casto_m256i( State, 1 ), state1 );
_mm256_store_si256( casto_m256i( State, 2 ), state2 );
_mm256_store_si256( casto_m256i( State, 3 ), state3 );
_mm256_store_si256( state, state0 );
_mm256_store_si256( &state[1], state1 );
_mm256_store_si256( &state[2], state2 );
_mm256_store_si256( &state[3], state3 );

#elif defined __AVX__

__m128i *state = (__m128i*)State;
const __m128i *in = (const __m128i*)rowIn;
__m128i *inout = (__m128i*)rowInOut;
__m128i *out = (__m128i*)rowOut;
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)rowIn;
__m128i* inout = (__m128i*)rowInOut;
__m128i* out = (__m128i*)rowOut;

for ( i = 0; i < 6; i += 3)
{
@@ -923,9 +929,9 @@ inline void reducedDuplexRow( uint64_t *State, const uint64_t *rowIn,
}

// for the last round in this function that isn't optimized for AVX
uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
const uint64_t *ptrWordIn = (const uint64_t*)rowIn; //In Lyra2: pointer to prev
uint64_t *ptrWordOut = rowOut; //In Lyra2: pointer to row
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row

for ( i = 0; i < nCols; i++)
{
@@ -991,9 +997,9 @@ inline void reducedDuplexRow( uint64_t *State, const uint64_t *rowIn,

#else

uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
const uint64_t *ptrWordIn = (const uint64_t*)rowIn; //In Lyra2: pointer to prev
uint64_t *ptrWordOut = rowOut; //In Lyra2: pointer to row
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row

for ( i = 0; i < nCols; i++)
{
algo/lyra2/sponge.c.bak (new file, 1053 lines): diff suppressed because it is too large.
@@ -159,26 +159,23 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

//---- Housekeeping
void initState( uint64_t state[/*16*/] );
void initState(uint64_t state[/*16*/]);

//---- Squeezes
void squeeze( uint64_t *state, unsigned char *out, unsigned int len );
void reducedSqueezeRow0( uint64_t* state, uint64_t* row, uint64_t nCols );
void squeeze(uint64_t *state, unsigned char *out, unsigned int len);
void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols);

//---- Absorbs
void absorbBlock( uint64_t *state, const uint64_t *in );
void absorbBlockBlake2Safe( uint64_t *state, const uint64_t *in );
void absorbBlock(uint64_t *state, const uint64_t *in);
void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);

//---- Duplexes
void reducedDuplexRow1( uint64_t *state, const uint64_t *rowIn,
uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRowSetup( uint64_t *state, const uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
void reducedDuplexRow( uint64_t *state, const uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);

//---- Misc
//void printArray(unsigned char *array, unsigned int size, char *name);
void printArray(unsigned char *array, unsigned int size, char *name);

////////////////////////////////////////////////////////////////////////////////////////////////
algo/lyra2/sponge.h.bak (new file, 196 lines):
@@ -0,0 +1,196 @@
/**
* Header file for Blake2b's internal permutation in the form of a sponge.
* This code is based on the original Blake2b's implementation provided by
* Samuel Neves (https://blake2.net/)
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SPONGE_H_
#define SPONGE_H_

#include <stdint.h>
#include "avxdefs.h"

#if defined(__GNUC__)
#define ALIGN __attribute__ ((aligned(32)))
#elif defined(_MSC_VER)
#define ALIGN __declspec(align(32))
#else
#define ALIGN
#endif


/*Blake2b IV Array*/
static const uint64_t blake2b_IV[8] =
{
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};

/*Blake2b's rotation*/
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
#if defined __AVX2__
// only available with avx2

// process 4 columns in parallel
// returns void, updates all args
#define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_rotr_64( _mm256_xor_si256( d, a), 32 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_rotr_64( _mm256_xor_si256( b, c ), 24 ); \
a = _mm256_add_epi64( a, b ); \
d = mm256_rotr_64( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_rotr_64( _mm256_xor_si256( b, c ), 63 );

#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rotr256_1x64( s1); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rotl256_1x64( s3 ); \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rotl256_1x64( s1 ); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rotr256_1x64( s3 );

#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \

#else
// only available with avx

// process 2 columns in parallel
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = _mm_add_epi64( a, b ); \
d = mm_rotr_64( _mm_xor_si128( d, a), 32 ); \
c = _mm_add_epi64( c, d ); \
b = mm_rotr_64( _mm_xor_si128( b, c ), 24 ); \
a = _mm_add_epi64( a, b ); \
d = mm_rotr_64( _mm_xor_si128( d, a ), 16 ); \
c = _mm_add_epi64( c, d ); \
b = mm_rotr_64( _mm_xor_si128( b, c ), 63 );

#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm_rotl256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rotr256_1x64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm_rotr256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rotl256_1x64( s6, s7 );

#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \


#endif // AVX2
// Scalar
//Blake2b's G function
#define G(r,i,a,b,c,d) \
do { \
a = a + b; \
d = rotr64(d ^ a, 32); \
c = c + d; \
b = rotr64(b ^ c, 24); \
a = a + b; \
d = rotr64(d ^ a, 16); \
c = c + d; \
b = rotr64(b ^ c, 63); \
} while(0)


/*One Round of the Blake2b's compression function*/
#define ROUND_LYRA(r) \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
G(r,2,v[ 2],v[ 6],v[10],v[14]); \
G(r,3,v[ 3],v[ 7],v[11],v[15]); \
G(r,4,v[ 0],v[ 5],v[10],v[15]); \
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);


//---- Housekeeping
void initState( uint64_t state[/*16*/] );

//---- Squeezes
void squeeze( uint64_t *state, unsigned char *out, unsigned int len );
void reducedSqueezeRow0( uint64_t* state, uint64_t* row, uint64_t nCols );

//---- Absorbs
void absorbBlock( uint64_t *state, const uint64_t *in );
void absorbBlockBlake2Safe( uint64_t *state, const uint64_t *in );

//---- Duplexes
void reducedDuplexRow1( uint64_t *state, const uint64_t *rowIn,
uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRowSetup( uint64_t *state, const uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
void reducedDuplexRow( uint64_t *state, const uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );

//---- Misc
//void printArray(unsigned char *array, unsigned int size, char *name);

////////////////////////////////////////////////////////////////////////////////////////////////


////TESTS////
//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
/////////////


#endif /* SPONGE_H_ */