mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.21.3
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
#include "blake2s-gate.h"
|
||||
#include "blake2s-hash-4way.h"
|
||||
//#include "sph-blake2s.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
@@ -7,6 +8,43 @@
|
||||
|
||||
static __thread blake2s_16way_state blake2s_16w_ctx;
|
||||
|
||||
/*
|
||||
static blake2s_16way_state blake2s_16w_ctx;
|
||||
static uint32_t blake2s_16way_vdata[20*16] __attribute__ ((aligned (64)));
|
||||
*/
|
||||
/*
|
||||
int blake2s_16way_prehash( struct work *work )
|
||||
{
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
blake2s_state ctx;
|
||||
mm128_bswap32_80( edata, work->data );
|
||||
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
|
||||
ctx.buflen = ctx.t[0] = 64;
|
||||
blake2s_compress( &ctx, (const uint8_t*)edata );
|
||||
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
ctx.h, ctx.h, ctx.h, ctx.h, 256 );
|
||||
intrlv_16x32( blake2s_16way_vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata,
|
||||
edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
blake2s_16w_ctx.t[0] = 64;
|
||||
return 1;
|
||||
}
|
||||
*/
|
||||
/*
|
||||
int blake2s_16way_prehash( struct work *work )
|
||||
{
|
||||
mm512_bswap32_intrlv80_16x32( blake2s_16way_vdata, work->data );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_update( &blake2s_16w_ctx, blake2s_16way_vdata, 64 );
|
||||
return 1;
|
||||
}
|
||||
*/
|
||||
|
||||
void blake2s_16way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_16way_state ctx;
|
||||
@@ -30,10 +68,40 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
/*
|
||||
// pthread_rwlock_rdlock( &g_work_lock );
|
||||
memcpy( (__m512i*)vdata +16, (__m512i*)blake2s_16way_vdata +16, 3*4*16 );
|
||||
// casti_m512i( vdata, 16 ) = casti_m512i( blake2s_16way_vdata, 16 );
|
||||
// casti_m512i( vdata, 17 ) = casti_m512i( blake2s_16way_vdata, 17 );
|
||||
// casti_m512i( vdata, 18 ) = casti_m512i( blake2s_16way_vdata, 18 );
|
||||
|
||||
// pthread_rwlock_unlock( &g_work_lock );
|
||||
*/
|
||||
/*
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
blake2s_state ctx;
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
|
||||
ctx.buflen = ctx.t[0] = 64;
|
||||
blake2s_compress( &ctx, (const uint8_t*)edata );
|
||||
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
ctx.h, ctx.h, ctx.h, ctx.h, 256 );
|
||||
intrlv_16x32( blake2s_16way_blake2s_16way_vdata, edata, edata, edata, edata,
|
||||
edata, edata, edata, edata,
|
||||
edata, edata, edata, edata,
|
||||
edata, edata, edata, edata, 640 );
|
||||
blake2s_16w_ctx.t[0] = 64;
|
||||
*/
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
|
||||
|
||||
|
||||
do {
|
||||
*noncev = mm512_bswap_32( _mm512_set_epi32(
|
||||
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
@@ -63,6 +131,36 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
static __thread blake2s_8way_state blake2s_8w_ctx;
|
||||
|
||||
/*
|
||||
static blake2s_8way_state blake2s_8w_ctx;
|
||||
static uint32_t blake2s_8way_vdata[20*8] __attribute__ ((aligned (32)));
|
||||
|
||||
int blake2s_8way_prehash( struct work *work )
|
||||
{
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
blake2s_state ctx;
|
||||
mm128_bswap32_80( edata, work->data );
|
||||
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
|
||||
ctx.buflen = ctx.t[0] = 64;
|
||||
blake2s_compress( &ctx, (const uint8_t*)edata );
|
||||
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
|
||||
|
||||
casti_m256i( blake2s_8way_vdata, 16 ) = _mm256_set1_epi32( edata[16] );
|
||||
casti_m256i( blake2s_8way_vdata, 17 ) = _mm256_set1_epi32( edata[17] );
|
||||
casti_m256i( blake2s_8way_vdata, 18 ) = _mm256_set1_epi32( edata[18] );
|
||||
|
||||
// intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
// ctx.h, ctx.h, ctx.h, ctx.h, 256 );
|
||||
// intrlv_8x32( blake2s_8way_vdata, edata, edata, edata, edata,
|
||||
// edata, edata, edata, edata, 640 );
|
||||
blake2s_8w_ctx.t[0] = 64;
|
||||
}
|
||||
*/
|
||||
|
||||
void blake2s_8way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_8way_state ctx;
|
||||
@@ -86,10 +184,41 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
/*
|
||||
// pthread_rwlock_rdlock( &g_work_lock );
|
||||
memcpy( &vdata[16*8], &blake2s_8way_vdata[16*8], 3*4*8 );
|
||||
// pthread_rwlock_unlock( &g_work_lock );
|
||||
*/
|
||||
/*
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
blake2s_state ctx;
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
|
||||
ctx.buflen = ctx.t[0] = 64;
|
||||
blake2s_compress( &ctx, (const uint8_t*)edata );
|
||||
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
|
||||
|
||||
casti_m256i( vdata, 16 ) = _mm256_set1_epi32( edata[16] );
|
||||
casti_m256i( vdata, 17 ) = _mm256_set1_epi32( edata[17] );
|
||||
casti_m256i( vdata, 18 ) = _mm256_set1_epi32( edata[18] );
|
||||
|
||||
|
||||
// intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
|
||||
// ctx.h, ctx.h, ctx.h, ctx.h, 256 );
|
||||
// intrlv_8x32( vdata, edata, edata, edata, edata,
|
||||
// edata, edata, edata, edata, 640 );
|
||||
|
||||
blake2s_8w_ctx.t[0] = 64;
|
||||
*/
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
|
||||
|
||||
|
||||
do {
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
@@ -117,7 +246,25 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
|
||||
static __thread blake2s_4way_state blake2s_4w_ctx;
|
||||
/*
|
||||
static blake2s_4way_state blake2s_4w_ctx;
|
||||
static uint32_t blake2s_4way_vdata[20*4] __attribute__ ((aligned (32)));
|
||||
|
||||
int blake2s_4way_prehash( struct work *work )
|
||||
{
|
||||
uint32_t edata[20] __attribute__ ((aligned (64)));
|
||||
blake2s_state ctx;
|
||||
mm128_bswap32_80( edata, work->data );
|
||||
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
|
||||
ctx.buflen = ctx.t[0] = 64;
|
||||
blake2s_compress( &ctx, (const uint8_t*)edata );
|
||||
|
||||
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
|
||||
intrlv_4x32( blake2s_4w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h, 256 );
|
||||
intrlv_4x32( blake2s_4way_vdata, edata, edata, edata, edata, 640 );
|
||||
blake2s_4w_ctx.t[0] = 64;
|
||||
}
|
||||
*/
|
||||
void blake2s_4way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_4way_state ctx;
|
||||
@@ -140,11 +287,15 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
/*
|
||||
pthread_rwlock_rdlock( &g_work_lock );
|
||||
memcpy( vdata, blake2s_4way_vdata, sizeof vdata );
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
*/
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
|
||||
|
||||
|
||||
do {
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
@@ -5,13 +5,15 @@ bool register_blake2s_algo( algo_gate_t* gate )
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_16way;
|
||||
gate->hash = (void*)&blake2s_16way_hash;
|
||||
// gate->prehash = (void*)&blake2s_16way_prehash;
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
//#if defined(BLAKE2S_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_8way;
|
||||
gate->hash = (void*)&blake2s_8way_hash;
|
||||
// gate->prehash = (void*)&blake2s_8way_prehash;
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_4way;
|
||||
gate->hash = (void*)&blake2s_4way_hash;
|
||||
// gate->prehash = (void*)&blake2s_4way_prehash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_blake2s;
|
||||
gate->hash = (void*)&blake2s_hash;
|
||||
|
||||
@@ -23,18 +23,22 @@ bool register_blake2s_algo( algo_gate_t* gate );
|
||||
void blake2s_16way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
int blake2s_16way_prehash( struct work * );
|
||||
|
||||
#elif defined (BLAKE2S_8WAY)
|
||||
|
||||
void blake2s_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
int blake2s_8way_prehash( struct work * );
|
||||
|
||||
#elif defined (BLAKE2S_4WAY)
|
||||
|
||||
void blake2s_4way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
int blake2s_4way_prehash( struct work * );
|
||||
|
||||
#else
|
||||
|
||||
void blake2s_hash( void *state, const void *input );
|
||||
|
||||
@@ -105,8 +105,8 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
|
||||
|
||||
#define G4W( sigma0, sigma1, a, b, c, d ) \
|
||||
do { \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
const uint8_t s0 = sigma0; \
|
||||
const uint8_t s1 = sigma1; \
|
||||
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
|
||||
c = _mm_add_epi32( c, d ); \
|
||||
@@ -120,7 +120,7 @@ do { \
|
||||
|
||||
#define ROUND4W(r) \
|
||||
do { \
|
||||
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
|
||||
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
|
||||
G4W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G4W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G4W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
|
||||
@@ -317,8 +317,8 @@ do { \
|
||||
|
||||
#define G8W( sigma0, sigma1, a, b, c, d) \
|
||||
do { \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
const uint8_t s0 = sigma0; \
|
||||
const uint8_t s1 = sigma1; \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
@@ -331,7 +331,7 @@ do { \
|
||||
|
||||
#define ROUND8W(r) \
|
||||
do { \
|
||||
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
|
||||
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
|
||||
G8W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G8W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G8W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
|
||||
@@ -529,8 +529,8 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
|
||||
|
||||
#define G16W( sigma0, sigma1, a, b, c, d) \
|
||||
do { \
|
||||
uint8_t s0 = sigma0; \
|
||||
uint8_t s1 = sigma1; \
|
||||
const uint8_t s0 = sigma0; \
|
||||
const uint8_t s1 = sigma1; \
|
||||
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \
|
||||
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
|
||||
c = _mm512_add_epi32( c, d ); \
|
||||
@@ -543,7 +543,7 @@ do { \
|
||||
|
||||
#define ROUND16W(r) \
|
||||
do { \
|
||||
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
|
||||
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
|
||||
G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
|
||||
G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
|
||||
G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
//#include "sph-blake2s.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <inttypes.h>
|
||||
@@ -33,7 +34,7 @@
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
enum blake2s_constant
|
||||
{
|
||||
BLAKE2S_BLOCKBYTES = 64,
|
||||
@@ -42,6 +43,13 @@ enum blake2s_constant
|
||||
BLAKE2S_SALTBYTES = 8,
|
||||
BLAKE2S_PERSONALBYTES = 8
|
||||
};
|
||||
*/
|
||||
|
||||
#define BLAKE2S_BLOCKBYTES 64
|
||||
#define BLAKE2S_OUTBYTES 32
|
||||
#define BLAKE2S_KEYBYTES 32
|
||||
#define BLAKE2S_SALTBYTES 8
|
||||
#define BLAKE2S_PERSONALBYTES 8
|
||||
|
||||
#pragma pack(push, 1)
|
||||
typedef struct __blake2s_nway_param
|
||||
|
||||
@@ -8,8 +8,6 @@
|
||||
#include "sph-blake2s.h"
|
||||
|
||||
static __thread blake2s_state blake2s_ctx;
|
||||
//static __thread blake2s_state s_ctx;
|
||||
#define MIDLEN 76
|
||||
|
||||
void blake2s_hash( void *output, const void *input )
|
||||
{
|
||||
@@ -19,37 +17,27 @@ void blake2s_hash( void *output, const void *input )
|
||||
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
|
||||
blake2s_update( &ctx, input+64, 16 );
|
||||
|
||||
// blake2s_init(&ctx, BLAKE2S_OUTBYTES);
|
||||
// blake2s_update(&ctx, input, 80);
|
||||
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
/*
|
||||
static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
|
||||
{
|
||||
s_ctx.buflen = MIDLEN;
|
||||
memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
|
||||
blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
|
||||
blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_blake2s( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
|
||||
uint32_t _ALIGN(64) hash64[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
int thr_id = mythr->id;
|
||||
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
// midstate
|
||||
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
|
||||
@@ -58,11 +46,12 @@ int scanhash_blake2s( struct work *work,
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
blake2s_hash( hash64, endiandata );
|
||||
if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return true;
|
||||
}
|
||||
if (hash64[7] <= Htarg )
|
||||
if ( fulltest(hash64, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "sph-blake2s.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
static const uint32_t blake2s_IV[8] =
|
||||
{
|
||||
@@ -225,6 +226,71 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
|
||||
v[13] = S->t[1] ^ blake2s_IV[5];
|
||||
v[14] = S->f[0] ^ blake2s_IV[6];
|
||||
v[15] = S->f[1] ^ blake2s_IV[7];
|
||||
|
||||
#if 0
|
||||
//#if defined(__SSE2__) // always true
|
||||
|
||||
The only application for this is to do a prehash for the blake2s algorithm.
|
||||
SSE2 also supports 4 way parallel hashing so that is preferred in most cases.
|
||||
Testing has found that using this serial SIMD code for prehash is slower than
|
||||
doing a parallel hash. A parallel hash has more instructions and uses more
|
||||
data. The serial hash uses fewer instructions and data and only needs to
|
||||
interleave the final hash into parallel streams. This has shown negligible
|
||||
improvement on other algos, notably blake256 which is almost identical.
|
||||
Considering the low frequency of prehash no statistically valid change
|
||||
was expected. It was simply better on paper.
|
||||
|
||||
Furthermore, simply defining this macro has an additional negative effect on
|
||||
blake2s as a whole. There are no references to this macro, blake2s-4way does
|
||||
not include it in any header files, it's just another unused macro which should
|
||||
have no effect beyond the preprocessor. But just being visible to the compiler
|
||||
changes things in a dramatic way.
|
||||
|
||||
These 2 things combined reduced the hash rate for blake2s by more than 5% when
|
||||
using serial SIMD for the blake2s prehash over 16way parallel prehash.
|
||||
16way parallel hashing was used in the high frequency nonce loop in both cases.
|
||||
Considering the prehash represents 50% of the algorithm and is done once vs
|
||||
the high frequency second half that is done mega, maybe giga, times more it's
|
||||
hard to imagine that big of an effect in either direction.
|
||||
|
||||
#define ROUND( r ) \
|
||||
{ \
|
||||
__m128i *V = (__m128i*)v; \
|
||||
const uint8_t *sigma = blake2s_sigma[r]; \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
|
||||
_mm_set_epi32( m[ sigma[ 6 ] ], m[ sigma[ 4 ] ], \
|
||||
m[ sigma[ 2 ] ], m[ sigma[ 0 ] ] ) ) ); \
|
||||
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
|
||||
_mm_set_epi32( m[ sigma[ 7 ] ], m[ sigma[ 5 ] ], \
|
||||
m[ sigma[ 3 ] ], m[ sigma[ 1 ] ] ) ) ); \
|
||||
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
|
||||
V[3] = mm128_shufll_32( V[3] ); \
|
||||
V[2] = mm128_swap_64( V[2] ); \
|
||||
V[1] = mm128_shuflr_32( V[1] ); \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
|
||||
_mm_set_epi32( m[ sigma[14] ], m[ sigma[12] ], \
|
||||
m[ sigma[10] ], m[ sigma[ 8] ] ) ) ); \
|
||||
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
|
||||
_mm_set_epi32( m[ sigma[15] ], m[ sigma[13] ], \
|
||||
m[ sigma[11] ], m[ sigma[ 9] ] ) ) ); \
|
||||
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
|
||||
V[3] = mm128_shuflr_32( V[3] ); \
|
||||
V[2] = mm128_swap_64( V[2] ); \
|
||||
V[1] = mm128_shufll_32( V[1] ); \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define G(r,i,a,b,c,d) \
|
||||
do { \
|
||||
a = a + b + m[blake2s_sigma[r][2*i+0]]; \
|
||||
@@ -247,7 +313,10 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
|
||||
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
|
||||
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
|
||||
} while(0)
|
||||
ROUND( 0 );
|
||||
|
||||
#endif
|
||||
|
||||
ROUND( 0 );
|
||||
ROUND( 1 );
|
||||
ROUND( 2 );
|
||||
ROUND( 3 );
|
||||
|
||||
@@ -91,6 +91,7 @@ static inline void secure_zero_memory(void *v, size_t n)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
enum blake2s_constant
|
||||
{
|
||||
BLAKE2S_BLOCKBYTES = 64,
|
||||
@@ -99,6 +100,13 @@ extern "C" {
|
||||
BLAKE2S_SALTBYTES = 8,
|
||||
BLAKE2S_PERSONALBYTES = 8
|
||||
};
|
||||
*/
|
||||
|
||||
#define BLAKE2S_BLOCKBYTES 64
|
||||
#define BLAKE2S_OUTBYTES 32
|
||||
#define BLAKE2S_KEYBYTES 32
|
||||
#define BLAKE2S_SALTBYTES 8
|
||||
#define BLAKE2S_PERSONALBYTES 8
|
||||
|
||||
#pragma pack(push, 1)
|
||||
typedef struct __blake2s_param
|
||||
|
||||
@@ -103,16 +103,16 @@
|
||||
const uint8_t *sigmaR = sigma[R]; \
|
||||
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
|
||||
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
|
||||
V2 = mm128_alignr_64( V[3], V[2] ); \
|
||||
V3 = mm128_alignr_64( V[2], V[3] ); \
|
||||
V6 = mm128_alignr_64( V[6], V[7] ); \
|
||||
V7 = mm128_alignr_64( V[7], V[6] ); \
|
||||
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
|
||||
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
|
||||
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
|
||||
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
|
||||
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
|
||||
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
|
||||
V[2] = mm128_alignr_64( V2, V3 ); \
|
||||
V[3] = mm128_alignr_64( V3, V2 ); \
|
||||
V[6] = mm128_alignr_64( V7, V6 ); \
|
||||
V[7] = mm128_alignr_64( V6, V7 ); \
|
||||
V[2] = mm128_alignr_64( V2, V3, 1 ); \
|
||||
V[3] = mm128_alignr_64( V3, V2, 1 ); \
|
||||
V[6] = mm128_alignr_64( V7, V6, 1 ); \
|
||||
V[7] = mm128_alignr_64( V6, V7, 1 ); \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
Reference in New Issue
Block a user