mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.7.5
This commit is contained in:
@@ -377,7 +377,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
|
||||
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
|
||||
if (wholeMatrix == NULL)
|
||||
return -1;
|
||||
|
||||
/*
|
||||
#if defined (__AVX2__)
|
||||
memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
|
||||
#elif defined(__AVX__)
|
||||
@@ -385,7 +385,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
|
||||
#else
|
||||
memset(wholeMatrix, 0, i);
|
||||
#endif
|
||||
|
||||
*/
|
||||
uint64_t *ptrWord = wholeMatrix;
|
||||
|
||||
//=== Getting the password + salt + basil padded with 10*1 ==========//
|
||||
|
||||
@@ -128,34 +128,10 @@ void lyra2re_set_target ( struct work* work, double job_diff )
|
||||
work_set_target(work, job_diff / (128.0 * opt_diff_factor) );
|
||||
}
|
||||
|
||||
/*
|
||||
bool lyra2re_thread_init()
|
||||
{
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
|
||||
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
|
||||
int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
|
||||
lyra2re_wholeMatrix = _mm_malloc( i, 64 );
|
||||
|
||||
if ( lyra2re_wholeMatrix == NULL )
|
||||
return false;
|
||||
|
||||
#if defined (__AVX2__)
|
||||
memset_zero_m256i( (__m256i*)lyra2re_wholeMatrix, i/32 );
|
||||
#elif defined(__AVX__)
|
||||
memset_zero_m128i( (__m128i*)lyra2re_wholeMatrix, i/16 );
|
||||
#else
|
||||
memset( lyra2re_wholeMatrix, 0, i );
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
||||
bool register_lyra2re_algo( algo_gate_t* gate )
|
||||
{
|
||||
init_lyra2re_ctx();
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
// gate->miner_thread_init = (void*)&lyra2re_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2re;
|
||||
gate->hash = (void*)&lyra2re_hash;
|
||||
gate->get_max64 = (void*)&lyra2re_get_max64;
|
||||
|
||||
@@ -132,23 +132,13 @@ bool lyra2rev2_thread_init()
|
||||
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
|
||||
l2v2_wholeMatrix = _mm_malloc( i, 64 );
|
||||
|
||||
if ( l2v2_wholeMatrix == NULL )
|
||||
return false;
|
||||
|
||||
#if defined (__AVX2__)
|
||||
memset_zero_m256i( (__m256i*)l2v2_wholeMatrix, i/32 );
|
||||
#elif defined (__AVX__)
|
||||
memset_zero_m128i( (__m128i*)l2v2_wholeMatrix, i/16 );
|
||||
#else
|
||||
memset( l2v2_wholeMatrix, 0, i );
|
||||
#endif
|
||||
return true;
|
||||
return l2v2_wholeMatrix;
|
||||
}
|
||||
|
||||
bool register_lyra2rev2_algo( algo_gate_t* gate )
|
||||
{
|
||||
init_lyra2rev2_ctx();
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->optimizations = AVX_OPT | AVX2_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2rev2;
|
||||
gate->hash = (void*)&lyra2rev2_hash;
|
||||
|
||||
168
algo/lyra2/lyra2z-4way.c
Normal file
168
algo/lyra2/lyra2z-4way.c
Normal file
@@ -0,0 +1,168 @@
|
||||
#include "lyra2z-gate.h"
|
||||
|
||||
#ifdef LYRA2Z_4WAY
|
||||
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
//#include "algo-gate-api.h"
|
||||
#include "lyra2.h"
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
//#include "avxdefs.h"
|
||||
|
||||
// same size, only difference is the name, lyra2 is done serially
|
||||
__thread uint64_t* lyra2z_4way_matrix;
|
||||
|
||||
bool lyra2z_4way_thread_init()
|
||||
{
|
||||
return ( lyra2z_4way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
|
||||
}
|
||||
|
||||
static __thread blake256_4way_context l2z_4way_blake_mid;
|
||||
|
||||
void lyra2z_4way_midstate( const void* input )
|
||||
{
|
||||
blake256_4way_init( &l2z_4way_blake_mid );
|
||||
blake256_4way( &l2z_4way_blake_mid, input, 64 );
|
||||
}
|
||||
|
||||
// block 2050 new algo, blake plus new lyra parms. new input
|
||||
// is power of 2 so normal lyra can be used
|
||||
//void zcoin_hash(void *state, const void *input, uint32_t height)
|
||||
void lyra2z_4way_hash( void *state, const void *input )
|
||||
{
|
||||
// uint32_t _ALIGN(64) hash[16];
|
||||
uint32_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
|
||||
|
||||
// memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
|
||||
// blake256_4way( &ctx_blake, input + (64*4), 16 );
|
||||
// blake256_4way_close( &ctx_blake, vhash );
|
||||
|
||||
blake256_4way_init( &ctx_blake );
|
||||
blake256_4way( &ctx_blake, input, 80 );
|
||||
blake256_4way_close( &ctx_blake, vhash );
|
||||
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
|
||||
LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
|
||||
// LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
|
||||
LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
|
||||
// LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
|
||||
// memcpy(state, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
// uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t _ALIGN(64) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 76; // 19*4
|
||||
uint32_t *noncep1 = vdata + 77;
|
||||
uint32_t *noncep2 = vdata + 78;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
|
||||
if ( opt_benchmark )
|
||||
ptarget[7] = 0x0000ff;
|
||||
|
||||
for ( int i=0; i < 19; i++ )
|
||||
be32enc( &edata[i], pdata[i] );
|
||||
|
||||
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
|
||||
|
||||
// lyra2z_4way_midstate( vdata );
|
||||
|
||||
do {
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
be32enc( &edata[19], n );
|
||||
lyra2z_4way_hash( hash, vdata );
|
||||
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
|
||||
{
|
||||
printf("found 0\n");
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = pdata[19] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
/* if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
printf("found 1\n");
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
*/
|
||||
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
printf("found 2\n");
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
/*
|
||||
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
printf("found 3\n");
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
*/
|
||||
n += 2;
|
||||
} while ( (num_found == 0) && (n < max_nonce-4)
|
||||
&& !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
work_set_target_ratio(work, hash);
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 1;
|
||||
}
|
||||
nonce++;
|
||||
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
28
algo/lyra2/lyra2z-gate.c
Normal file
28
algo/lyra2/lyra2z-gate.c
Normal file
@@ -0,0 +1,28 @@
|
||||
#include "lyra2z-gate.h"
|
||||
#include "lyra2.h"
|
||||
|
||||
void lyra2z_set_target( struct work* work, double job_diff )
|
||||
{
|
||||
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
|
||||
}
|
||||
|
||||
bool register_lyra2z_algo( algo_gate_t* gate )
|
||||
{
|
||||
#ifdef LYRA2Z_4WAY
|
||||
four_way_not_tested();
|
||||
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z_4way;
|
||||
gate->hash = (void*)&lyra2z_4way_hash;
|
||||
#else
|
||||
gate->optimizations = AVX_OPT | AVX2_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2z_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z;
|
||||
gate->hash = (void*)&lyra2z_hash;
|
||||
#endif
|
||||
|
||||
gate->get_max64 = (void*)&get_max64_0xffffLL;
|
||||
gate->set_target = (void*)&lyra2z_set_target;
|
||||
return true;
|
||||
};
|
||||
|
||||
33
algo/lyra2/lyra2z-gate.h
Normal file
33
algo/lyra2/lyra2z-gate.h
Normal file
@@ -0,0 +1,33 @@
|
||||
#ifndef LYRA2Z_GATE_H__
|
||||
#define LYRA2Z_GATE_H__
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(HASH_4WAY)
|
||||
#define LYRA2Z_4WAY
|
||||
#endif
|
||||
|
||||
|
||||
#define LYRA2Z_MATRIX_SIZE BLOCK_LEN_INT64 * 8 * 8 * 8
|
||||
|
||||
#if defined(LYRA2Z_4WAY)
|
||||
|
||||
void lyra2z_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
bool lyra2z_4way_thread_init();
|
||||
|
||||
#endif
|
||||
|
||||
void lyra2z_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
bool lyra2z_thread_init();
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,40 +1,49 @@
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "algo-gate-api.h"
|
||||
#include "lyra2z-gate.h"
|
||||
#include "lyra2.h"
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "avxdefs.h"
|
||||
|
||||
__thread uint64_t* zcoin_wholeMatrix;
|
||||
__thread uint64_t* lyra2z_matrix;
|
||||
|
||||
static __thread sph_blake256_context zcoin_blake_mid;
|
||||
|
||||
|
||||
void zcoin_midstate( const void* input )
|
||||
bool lyra2z_thread_init()
|
||||
{
|
||||
sph_blake256_init( &zcoin_blake_mid );
|
||||
sph_blake256( &zcoin_blake_mid, input, 64 );
|
||||
// const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
|
||||
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
// int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
|
||||
const int i = BLOCK_LEN_INT64 * 8 * 8 * 8;
|
||||
lyra2z_matrix = _mm_malloc( i, 64 );
|
||||
return lyra2z_matrix;
|
||||
}
|
||||
|
||||
static __thread sph_blake256_context lyra2z_blake_mid;
|
||||
|
||||
void lyra2z_midstate( const void* input )
|
||||
{
|
||||
sph_blake256_init( &lyra2z_blake_mid );
|
||||
sph_blake256( &lyra2z_blake_mid, input, 64 );
|
||||
}
|
||||
|
||||
// block 2050 new algo, blake plus new lyra parms. new input
|
||||
// is power of 2 so normal lyra can be used
|
||||
//void zcoin_hash(void *state, const void *input, uint32_t height)
|
||||
void zcoin_hash(void *state, const void *input )
|
||||
void lyra2z_hash( void *state, const void *input )
|
||||
{
|
||||
uint32_t _ALIGN(64) hash[16];
|
||||
|
||||
sph_blake256_context ctx_blake __attribute__ ((aligned (64)));
|
||||
|
||||
memcpy( &ctx_blake, &zcoin_blake_mid, sizeof zcoin_blake_mid );
|
||||
memcpy( &ctx_blake, &lyra2z_blake_mid, sizeof lyra2z_blake_mid );
|
||||
sph_blake256( &ctx_blake, input + 64, 16 );
|
||||
sph_blake256_close( &ctx_blake, hash );
|
||||
|
||||
LYRA2Z( zcoin_wholeMatrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);
|
||||
LYRA2Z( lyra2z_matrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);
|
||||
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t _ALIGN(64) hash[8];
|
||||
@@ -52,11 +61,11 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
}
|
||||
|
||||
zcoin_midstate( endiandata );
|
||||
lyra2z_midstate( endiandata );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], nonce);
|
||||
zcoin_hash( hash, endiandata );
|
||||
lyra2z_hash( hash, endiandata );
|
||||
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
work_set_target_ratio(work, hash);
|
||||
@@ -73,50 +82,41 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
//int64_t get_max64_0xffffLL() { return 0xffffLL; };
|
||||
|
||||
void zcoin_set_target( struct work* work, double job_diff )
|
||||
void lyra2z_set_target( struct work* work, double job_diff )
|
||||
{
|
||||
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
|
||||
}
|
||||
/*
|
||||
|
||||
bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
|
||||
{
|
||||
work->height = sctx->bloc_height;
|
||||
return false;
|
||||
}
|
||||
*/
|
||||
|
||||
bool zcoin_thread_init()
|
||||
|
||||
bool lyra2z_thread_init()
|
||||
{
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
|
||||
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
|
||||
int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
|
||||
zcoin_wholeMatrix = _mm_malloc( i, 64 );
|
||||
lyra2z_wholeMatrix = _mm_malloc( i, 64 );
|
||||
|
||||
if ( zcoin_wholeMatrix == NULL )
|
||||
return false;
|
||||
|
||||
#if defined (__AVX2__)
|
||||
memset_zero_m256i( (__m256i*)zcoin_wholeMatrix, i/32 );
|
||||
#elif defined(__AVX__)
|
||||
memset_zero_m128i( (__m128i*)zcoin_wholeMatrix, i/16 );
|
||||
#else
|
||||
memset( zcoin_wholeMatrix, 0, i );
|
||||
#endif
|
||||
return true;
|
||||
return lyra2z_wholeMatrix;
|
||||
}
|
||||
|
||||
bool register_zcoin_algo( algo_gate_t* gate )
|
||||
bool register_lyra2z_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->miner_thread_init = (void*)&zcoin_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_zcoin;
|
||||
gate->hash = (void*)&zcoin_hash;
|
||||
gate->miner_thread_init = (void*)&lyra2z_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z;
|
||||
gate->hash = (void*)&lyra2z_hash;
|
||||
gate->get_max64 = (void*)&get_max64_0xffffLL;
|
||||
gate->set_target = (void*)&zcoin_set_target;
|
||||
gate->set_target = (void*)&lyra2z_set_target;
|
||||
// gate->prevent_dupes = (void*)&zcoin_get_work_height;
|
||||
return true;
|
||||
};
|
||||
|
||||
*/
|
||||
@@ -64,22 +64,12 @@ bool lyra2z330_thread_init()
|
||||
int i = (int64_t)ROW_LEN_BYTES * 330; // nRows;
|
||||
lyra2z330_wholeMatrix = _mm_malloc( i, 64 );
|
||||
|
||||
if ( lyra2z330_wholeMatrix == NULL )
|
||||
return false;
|
||||
|
||||
#if defined (__AVX2__)
|
||||
memset_zero_m256i( (__m256i*)lyra2z330_wholeMatrix, i/32 );
|
||||
#elif defined(__AVX__)
|
||||
memset_zero_m128i( (__m128i*)lyra2z330_wholeMatrix, i/16 );
|
||||
#else
|
||||
memset( lyra2z330_wholeMatrix, 0, i );
|
||||
#endif
|
||||
return true;
|
||||
return lyra2z330_wholeMatrix;
|
||||
}
|
||||
|
||||
bool register_lyra2z330_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->optimizations = AVX_OPT | AVX2_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2z330_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z330;
|
||||
gate->hash = (void*)&lyra2z330_hash;
|
||||
|
||||
@@ -130,12 +130,12 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
|
||||
//Squeezes full blocks
|
||||
for ( i = 0; i < fullBlocks; i++ )
|
||||
{
|
||||
memcpy_m256i( out, state, BLOCK_LEN_M256I );
|
||||
memcpy_256( out, state, BLOCK_LEN_M256I );
|
||||
LYRA_ROUND_AVX2( state[0], state[1], state[2], state[3] );
|
||||
out += BLOCK_LEN_M256I;
|
||||
}
|
||||
//Squeezes remaining bytes
|
||||
memcpy_m256i( out, state, ( len_m256i % BLOCK_LEN_M256I ) );
|
||||
memcpy_256( out, state, ( len_m256i % BLOCK_LEN_M256I ) );
|
||||
|
||||
#elif defined (__AVX__)
|
||||
|
||||
@@ -148,13 +148,13 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
|
||||
//Squeezes full blocks
|
||||
for ( i = 0; i < fullBlocks; i++ )
|
||||
{
|
||||
memcpy_m128i( out, state, BLOCK_LEN_M128I );
|
||||
memcpy_128( out, state, BLOCK_LEN_M128I );
|
||||
LYRA_ROUND_AVX( state[0], state[1], state[2], state[3],
|
||||
state[4], state[5], state[6], state[7] );
|
||||
out += BLOCK_LEN_M128I;
|
||||
}
|
||||
//Squeezes remaining bytes
|
||||
memcpy_m128i( out, state, ( len_m128i % BLOCK_LEN_M128I ) );
|
||||
memcpy_128( out, state, ( len_m128i % BLOCK_LEN_M128I ) );
|
||||
|
||||
#else
|
||||
|
||||
|
||||
@@ -66,11 +66,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm256_rotl256_1x64( s1); \
|
||||
s2 = mm256_swap128( s2 ); \
|
||||
s2 = mm256_swap_128( s2 ); \
|
||||
s3 = mm256_rotr256_1x64( s3 ); \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm256_rotr256_1x64( s1 ); \
|
||||
s2 = mm256_swap128( s2 ); \
|
||||
s2 = mm256_swap_128( s2 ); \
|
||||
s3 = mm256_rotl256_1x64( s3 );
|
||||
|
||||
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
|
||||
@@ -105,14 +105,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
G_2X64( s0, s2, s4, s6 ); \
|
||||
G_2X64( s1, s3, s5, s7 ); \
|
||||
mm128_rotl256_1x64( s2, s3 ); \
|
||||
mm128_swap128( s4, s5 ); \
|
||||
mm128_rotr256_1x64( s6, s7 ); \
|
||||
mm_rotl256_1x64( s2, s3 ); \
|
||||
mm_swap_128( s4, s5 ); \
|
||||
mm_rotr256_1x64( s6, s7 ); \
|
||||
G_2X64( s0, s2, s4, s6 ); \
|
||||
G_2X64( s1, s3, s5, s7 ); \
|
||||
mm128_rotr256_1x64( s2, s3 ); \
|
||||
mm128_swap128( s4, s5 ); \
|
||||
mm128_rotl256_1x64( s6, s7 );
|
||||
mm_rotr256_1x64( s2, s3 ); \
|
||||
mm_swap_128( s4, s5 ); \
|
||||
mm_rotl256_1x64( s6, s7 );
|
||||
|
||||
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
|
||||
Reference in New Issue
Block a user