This commit is contained in:
Jay D Dee
2023-09-28 18:43:18 -04:00
parent be88afc349
commit bc5a5c6df8
88 changed files with 5526 additions and 3361 deletions

View File

@@ -1,5 +1,5 @@
#include "blake-gate.h"
#include "blake-hash-4way.h"
#include "blake256-hash.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>

View File

@@ -1,5 +1,5 @@
#ifndef BLAKE_HASH_4WAY__
#define BLAKE_HASH_4WAY__ 1
#ifndef BLAKE256_HASH__
#define BLAKE256_HASH__ 1
#include <stddef.h>
#include "simd-utils.h"
@@ -11,13 +11,6 @@
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1, int rounds );
/////////////////////////
//
// Blake-512 1 way SSE2
void blake512_transform_le( uint64_t *H, const uint64_t *buf,
const uint64_t T0, const uint64_t T1 );
//////////////////////////
//
// Blake-256 4 way SSE2
@@ -86,30 +79,6 @@ void blake256r8_8way_init(void *cc);
void blake256r8_8way_update(void *cc, const void *data, size_t len);
void blake256r8_8way_close(void *cc, void *dst);
// Blake-512 4 way AVX2
typedef struct {
__m256i buf[16];
__m256i H[8];
__m256i S[4];
size_t ptr;
uint64_t T0, T1;
} blake_4way_big_context __attribute__ ((aligned (128)));
typedef blake_4way_big_context blake512_4way_context;
void blake512_4way_init( blake_4way_big_context *sc );
void blake512_4way_update( void *cc, const void *data, size_t len );
void blake512_4way_close( void *cc, void *dst );
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
const void *data );
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
////////////////////////////
@@ -137,7 +106,6 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
// 14 rounds, blake, decred
typedef blake_16way_small_context blake256r14_16way_context;
void blake256r14_16way_init(void *cc);
@@ -150,33 +118,7 @@ void blake256r8_16way_init(void *cc);
void blake256r8_16way_update(void *cc, const void *data, size_t len);
void blake256r8_16way_close(void *cc, void *dst);
////////////////////////////
//
//// Blake-512 8 way AVX512
typedef struct {
__m512i buf[16];
__m512i H[8];
__m512i S[4];
size_t ptr;
uint64_t T0, T1;
} blake_8way_big_context __attribute__ ((aligned (128)));
typedef blake_8way_big_context blake512_8way_context;
void blake512_8way_init( blake_8way_big_context *sc );
void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
const void *data );
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
const __m512i nonce, const __m512i *midstate );
#endif // AVX512
#endif // AVX2
#endif // BLAKE_HASH_4WAY_H__
#endif // BLAKE256_HASH_H__

View File

@@ -1,113 +0,0 @@
/**
* Blake2-B Implementation
* tpruvot@github 2015-2016
*/
#include "blake2b-gate.h"
#include <string.h>
#include <stdint.h>
#include "blake2b-hash-4way.h"
#if defined(BLAKE2B_8WAY)
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]); // 3*16+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
blake2b_8way_init( &ctx );
blake2b_8way_update( &ctx, vdata, 80 );
blake2b_8way_final( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(BLAKE2B_4WAY)
// Function not used, code inlined.
void blake2b_4way_hash(void *output, const void *input)
{
blake2b_4way_ctx ctx;
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, input, 80 );
blake2b_4way_final( &ctx, output );
}
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
do {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, vdata, 80 );
blake2b_4way_final( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

View File

@@ -1,20 +0,0 @@
#include "blake2b-gate.h"
bool register_blake2b_algo( algo_gate_t* gate )
{
#if defined(BLAKE2B_8WAY)
gate->scanhash = (void*)&scanhash_blake2b_8way;
// gate->hash = (void*)&blake2b_8way_hash;
#elif defined(BLAKE2B_4WAY)
gate->scanhash = (void*)&scanhash_blake2b_4way;
gate->hash = (void*)&blake2b_4way_hash;
#else
gate->scanhash = (void*)&scanhash_blake2b;
gate->hash = (void*)&blake2b_hash;
#endif
gate->optimizations = AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -1,34 +0,0 @@
#ifndef __BLAKE2B_GATE_H__
#define __BLAKE2B_GATE_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define BLAKE2B_8WAY
#elif defined(__AVX2__)
#define BLAKE2B_4WAY
#endif
bool register_blake2b_algo( algo_gate_t* gate );
#if defined(BLAKE2B_8WAY)
//void blake2b_8way_hash( void *state, const void *input );
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(BLAKE2B_4WAY)
void blake2b_4way_hash( void *state, const void *input );
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void blake2b_hash( void *state, const void *input );
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif

View File

@@ -31,7 +31,7 @@
#include <stdint.h>
#include <string.h>
#include "blake2b-hash-4way.h"
#include "blake2b-hash.h"
#if defined(__AVX2__)
@@ -252,17 +252,17 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
v[ 5] = ctx->h[5];
v[ 6] = ctx->h[6];
v[ 7] = ctx->h[7];
v[ 8] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
v[ 9] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
v[10] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
v[11] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
v[12] = _mm512_set1_epi64( 0x510E527FADE682D1 );
v[13] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
v[14] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
v[15] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
v[ 8] = v512_64( 0x6A09E667F3BCC908 );
v[ 9] = v512_64( 0xBB67AE8584CAA73B );
v[10] = v512_64( 0x3C6EF372FE94F82B );
v[11] = v512_64( 0xA54FF53A5F1D36F1 );
v[12] = v512_64( 0x510E527FADE682D1 );
v[13] = v512_64( 0x9B05688C2B3E6C1F );
v[14] = v512_64( 0x1F83D9ABFB41BD6B );
v[15] = v512_64( 0x5BE0CD19137E2179 );
v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );
v[12] = _mm512_xor_si512( v[12], v512_64( ctx->t[0] ) );
v[13] = _mm512_xor_si512( v[13], v512_64( ctx->t[1] ) );
if ( last )
v[14] = mm512_not( v[14] );
@@ -310,16 +310,16 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
{
size_t i;
ctx->h[0] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
ctx->h[1] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
ctx->h[2] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
ctx->h[3] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
ctx->h[4] = _mm512_set1_epi64( 0x510E527FADE682D1 );
ctx->h[5] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
ctx->h[6] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
ctx->h[7] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
ctx->h[0] = v512_64( 0x6A09E667F3BCC908 );
ctx->h[1] = v512_64( 0xBB67AE8584CAA73B );
ctx->h[2] = v512_64( 0x3C6EF372FE94F82B );
ctx->h[3] = v512_64( 0xA54FF53A5F1D36F1 );
ctx->h[4] = v512_64( 0x510E527FADE682D1 );
ctx->h[5] = v512_64( 0x9B05688C2B3E6C1F );
ctx->h[6] = v512_64( 0x1F83D9ABFB41BD6B );
ctx->h[7] = v512_64( 0x5BE0CD19137E2179 );
ctx->h[0] = _mm512_xor_si512( ctx->h[0], _mm512_set1_epi64( 0x01010020 ) );
ctx->h[0] = _mm512_xor_si512( ctx->h[0], v512_64( 0x01010020 ) );
ctx->t[0] = 0;
ctx->t[1] = 0;
@@ -419,17 +419,17 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
v[ 5] = ctx->h[5];
v[ 6] = ctx->h[6];
v[ 7] = ctx->h[7];
v[ 8] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
v[ 9] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
v[10] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
v[11] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
v[12] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
v[13] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
v[14] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
v[15] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
v[ 8] = v256_64( 0x6A09E667F3BCC908 );
v[ 9] = v256_64( 0xBB67AE8584CAA73B );
v[10] = v256_64( 0x3C6EF372FE94F82B );
v[11] = v256_64( 0xA54FF53A5F1D36F1 );
v[12] = v256_64( 0x510E527FADE682D1 );
v[13] = v256_64( 0x9B05688C2B3E6C1F );
v[14] = v256_64( 0x1F83D9ABFB41BD6B );
v[15] = v256_64( 0x5BE0CD19137E2179 );
v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );
v[12] = _mm256_xor_si256( v[12], v256_64( ctx->t[0] ) );
v[13] = _mm256_xor_si256( v[13], v256_64( ctx->t[1] ) );
if ( last )
v[14] = mm256_not( v[14] );
@@ -477,16 +477,16 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
{
size_t i;
ctx->h[0] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
ctx->h[1] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
ctx->h[2] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
ctx->h[3] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
ctx->h[4] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
ctx->h[5] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
ctx->h[6] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
ctx->h[7] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
ctx->h[0] = v256_64( 0x6A09E667F3BCC908 );
ctx->h[1] = v256_64( 0xBB67AE8584CAA73B );
ctx->h[2] = v256_64( 0x3C6EF372FE94F82B );
ctx->h[3] = v256_64( 0xA54FF53A5F1D36F1 );
ctx->h[4] = v256_64( 0x510E527FADE682D1 );
ctx->h[5] = v256_64( 0x9B05688C2B3E6C1F );
ctx->h[6] = v256_64( 0x1F83D9ABFB41BD6B );
ctx->h[7] = v256_64( 0x5BE0CD19137E2179 );
ctx->h[0] = _mm256_xor_si256( ctx->h[0], _mm256_set1_epi64x( 0x01010020 ) );
ctx->h[0] = _mm256_xor_si256( ctx->h[0], v256_64( 0x01010020 ) );
ctx->t[0] = 0;
ctx->t[1] = 0;

View File

@@ -1,64 +1,175 @@
/**
* Blake2-B Implementation
* tpruvot@github 2015-2016
*/
#include "blake2b-gate.h"
#if !defined(BLAKE2B_8WAY) && !defined(BLAKE2B_4WAY)
#include "algo-gate-api.h"
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake2b.h"
#include "blake2b-hash.h"
#define MIDLEN 76
#define A 64
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define BLAKE2B_8WAY
#elif defined(__AVX2__)
#define BLAKE2B_4WAY
#endif
void blake2b_hash(void *output, const void *input)
#if defined(BLAKE2B_8WAY)
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint8_t _ALIGN(A) hash[32];
sph_blake2b_ctx ctx __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]); // 3*16+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
sph_blake2b_init(&ctx, 32, NULL, 0);
sph_blake2b_update(&ctx, input, 80);
sph_blake2b_final(&ctx, hash);
uint32_t n = first_nonce;
memcpy(output, hash, 32);
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
blake2b_8way_init( &ctx );
blake2b_8way_update( &ctx, vdata, 80 );
blake2b_8way_final( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
#elif defined(BLAKE2B_4WAY)
// Function not used, code inlined.
void blake2b_4way_hash(void *output, const void *input)
{
uint32_t _ALIGN(A) vhashcpu[8];
uint32_t _ALIGN(A) endiandata[20];
blake2b_4way_ctx ctx;
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, input, 80 );
blake2b_4way_final( &ctx, output );
}
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
for (int i=0; i < 19; i++) {
be32enc(&endiandata[i], pdata[i]);
}
mm256_bswap32_intrlv80_4x64( vdata, pdata );
do {
be32enc(&endiandata[19], n);
blake2b_hash(vhashcpu, endiandata);
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, vdata, 80 );
blake2b_4way_final( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
{
pdata[19] = n;
submit_solution( work, vhashcpu, mythr );
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#else
#include "algo/blake/sph_blake2b.h"
void blake2b_hash(void *output, const void *input)
{
uint8_t _ALIGN(32) hash[32];
sph_blake2b_ctx ctx __attribute__ ((aligned (32)));
sph_blake2b_init(&ctx, 32, NULL, 0);
sph_blake2b_update(&ctx, input, 80);
sph_blake2b_final(&ctx, hash);
memcpy(output, hash, 32);
}
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(32) hash64[8];
uint32_t _ALIGN(32) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm128_bswap32_80( endiandata, pdata );
do {
endiandata[19] = n;
blake2b_hash( hash64, endiandata );
if ( unlikely( valid_hash( hash64, ptarget ) ) && !opt_benchmark )
{
pdata[19] = bswap_32( n );
submit_solution( work, hash64, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
return 0;
}
#endif
bool register_blake2b_algo( algo_gate_t* gate )
{
#if defined(BLAKE2B_8WAY)
gate->scanhash = (void*)&scanhash_blake2b_8way;
#elif defined(BLAKE2B_4WAY)
gate->scanhash = (void*)&scanhash_blake2b_4way;
gate->hash = (void*)&blake2b_4way_hash;
#else
gate->scanhash = (void*)&scanhash_blake2b;
gate->hash = (void*)&blake2b_hash;
#endif
gate->optimizations = AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -1,170 +0,0 @@
#include "blake2s-gate.h"
#include "blake2s-hash-4way.h"
#include <string.h>
#include <stdint.h>
#if defined(BLAKE2S_16WAY)
static __thread blake2s_16way_state blake2s_16w_ctx;
void blake2s_16way_hash( void *output, const void *input )
{
blake2s_16way_state ctx;
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
blake2s_16way_update( &ctx, input + (64<<4), 16 );
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
}
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*16] __attribute__ ((aligned (128)));
uint32_t hash[8*16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<4]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
__m512i *noncev = (__m512i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
do {
*noncev = mm512_bswap_32( _mm512_set_epi32(
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
pdata[19] = n;
blake2s_16way_hash( hash, vdata );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_16x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 16;
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(BLAKE2S_8WAY)
static __thread blake2s_8way_state blake2s_8w_ctx;
void blake2s_8way_hash( void *output, const void *input )
{
blake2s_8way_state ctx;
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
blake2s_8way_update( &ctx, input + (64<<3), 16 );
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
}
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
pdata[19] = n;
blake2s_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(BLAKE2S_4WAY)
static __thread blake2s_4way_state blake2s_4w_ctx;
void blake2s_4way_hash( void *output, const void *input )
{
blake2s_4way_state ctx;
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
blake2s_4way_update( &ctx, input + (64<<2), 16 );
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
}
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
__m128i *noncev = (__m128i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
pdata[19] = n;
blake2s_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 4;
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

View File

@@ -1,23 +0,0 @@
#include "blake2s-gate.h"
bool register_blake2s_algo( algo_gate_t* gate )
{
#if defined(BLAKE2S_16WAY)
gate->scanhash = (void*)&scanhash_blake2s_16way;
gate->hash = (void*)&blake2s_16way_hash;
#elif defined(BLAKE2S_8WAY)
//#if defined(BLAKE2S_8WAY)
gate->scanhash = (void*)&scanhash_blake2s_8way;
gate->hash = (void*)&blake2s_8way_hash;
#elif defined(BLAKE2S_4WAY)
gate->scanhash = (void*)&scanhash_blake2s_4way;
gate->hash = (void*)&blake2s_4way_hash;
#else
gate->scanhash = (void*)&scanhash_blake2s;
gate->hash = (void*)&blake2s_hash;
#endif
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -1,46 +0,0 @@
#ifndef __BLAKE2S_GATE_H__
#define __BLAKE2S_GATE_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__SSE2__)
#define BLAKE2S_4WAY
#endif
#if defined(__AVX2__)
#define BLAKE2S_8WAY
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define BLAKE2S_16WAY
#endif
bool register_blake2s_algo( algo_gate_t* gate );
#if defined(BLAKE2S_16WAY)
void blake2s_16way_hash( void *state, const void *input );
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined (BLAKE2S_8WAY)
void blake2s_8way_hash( void *state, const void *input );
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined (BLAKE2S_4WAY)
void blake2s_4way_hash( void *state, const void *input );
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void blake2s_hash( void *state, const void *input );
int scanhash_blake2s( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif

View File

@@ -11,7 +11,7 @@
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#include "blake2s-hash-4way.h"
#include "blake2s-hash.h"
#include <stdint.h>
#include <string.h>
@@ -62,23 +62,23 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
memset( S, 0, sizeof( blake2s_4way_state ) );
S->h[0] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
S->h[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
S->h[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
S->h[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
S->h[4] = _mm_set1_epi64x( 0x510E527F510E527FULL );
S->h[5] = _mm_set1_epi64x( 0x9B05688C9B05688CULL );
S->h[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
S->h[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL );
S->h[0] = v128_64( 0x6A09E6676A09E667ULL );
S->h[1] = v128_64( 0xBB67AE85BB67AE85ULL );
S->h[2] = v128_64( 0x3C6EF3723C6EF372ULL );
S->h[3] = v128_64( 0xA54FF53AA54FF53AULL );
S->h[4] = v128_64( 0x510E527F510E527FULL );
S->h[5] = v128_64( 0x9B05688C9B05688CULL );
S->h[6] = v128_64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = v128_64( 0x5BE0CD195BE0CD19ULL );
// for( int i = 0; i < 8; ++i )
// S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
// S->h[i] = v128_32( blake2s_IV[i] );
uint32_t *p = ( uint32_t * )( P );
/* IV XOR ParamBlock */
for ( size_t i = 0; i < 8; ++i )
S->h[i] = _mm_xor_si128( S->h[i], _mm_set1_epi32( p[i] ) );
S->h[i] = _mm_xor_si128( S->h[i], v128_32( p[i] ) );
return 0;
}
@@ -90,18 +90,18 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
memcpy_128( m, block, 16 );
memcpy_128( v, S->h, 8 );
v[ 8] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
v[ 9] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
v[10] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
v[11] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
_mm_set1_epi64x( 0x510E527F510E527FULL ) );
v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
_mm_set1_epi64x( 0x9B05688C9B05688CULL ) );
v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
_mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
_mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );
v[ 8] = v128_64( 0x6A09E6676A09E667ULL );
v[ 9] = v128_64( 0xBB67AE85BB67AE85ULL );
v[10] = v128_64( 0x3C6EF3723C6EF372ULL );
v[11] = v128_64( 0xA54FF53AA54FF53AULL );
v[12] = _mm_xor_si128( v128_32( S->t[0] ),
v128_64( 0x510E527F510E527FULL ) );
v[13] = _mm_xor_si128( v128_32( S->t[1] ),
v128_64( 0x9B05688C9B05688CULL ) );
v[14] = _mm_xor_si128( v128_32( S->f[0] ),
v128_64( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm_xor_si128( v128_32( S->f[1] ),
v128_64( 0x5BE0CD195BE0CD19ULL ) );
#define G4W( sigma0, sigma1, a, b, c, d ) \
do { \
@@ -269,35 +269,35 @@ int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
memcpy_256( m, block, 16 );
memcpy_256( v, S->h, 8 );
v[ 8] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
v[ 9] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
v[10] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
v[11] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
_mm256_set1_epi64x( 0x510E527F510E527FULL ) );
v[ 8] = v256_64( 0x6A09E6676A09E667ULL );
v[ 9] = v256_64( 0xBB67AE85BB67AE85ULL );
v[10] = v256_64( 0x3C6EF3723C6EF372ULL );
v[11] = v256_64( 0xA54FF53AA54FF53AULL );
v[12] = _mm256_xor_si256( v256_32( S->t[0] ),
v256_64( 0x510E527F510E527FULL ) );
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
_mm256_set1_epi64x( 0x9B05688C9B05688CULL ) );
v[13] = _mm256_xor_si256( v256_32( S->t[1] ),
v256_64( 0x9B05688C9B05688CULL ) );
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
_mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );
v[14] = _mm256_xor_si256( v256_32( S->f[0] ),
v256_64( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
_mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );
v[15] = _mm256_xor_si256( v256_32( S->f[1] ),
v256_64( 0x5BE0CD195BE0CD19ULL ) );
/*
v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
v[ 9] = _mm256_set1_epi32( blake2s_IV[1] );
v[10] = _mm256_set1_epi32( blake2s_IV[2] );
v[11] = _mm256_set1_epi32( blake2s_IV[3] );
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
_mm256_set1_epi32( blake2s_IV[4] ) );
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
_mm256_set1_epi32( blake2s_IV[5] ) );
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
_mm256_set1_epi32( blake2s_IV[6] ) );
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
_mm256_set1_epi32( blake2s_IV[7] ) );
v[ 8] = v256_32( blake2s_IV[0] );
v[ 9] = v256_32( blake2s_IV[1] );
v[10] = v256_32( blake2s_IV[2] );
v[11] = v256_32( blake2s_IV[3] );
v[12] = _mm256_xor_si256( v256_32( S->t[0] ),
v256_32( blake2s_IV[4] ) );
v[13] = _mm256_xor_si256( v256_32( S->t[1] ),
v256_32( blake2s_IV[5] ) );
v[14] = _mm256_xor_si256( v256_32( S->f[0] ),
v256_32( blake2s_IV[6] ) );
v[15] = _mm256_xor_si256( v256_32( S->f[1] ),
v256_32( blake2s_IV[7] ) );
#define G8W(r,i,a,b,c,d) \
@@ -391,24 +391,24 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
memset( P->personal, 0, sizeof( P->personal ) );
memset( S, 0, sizeof( blake2s_8way_state ) );
S->h[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
S->h[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
S->h[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
S->h[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
S->h[4] = _mm256_set1_epi64x( 0x510E527F510E527FULL );
S->h[5] = _mm256_set1_epi64x( 0x9B05688C9B05688CULL );
S->h[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
S->h[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL );
S->h[0] = v256_64( 0x6A09E6676A09E667ULL );
S->h[1] = v256_64( 0xBB67AE85BB67AE85ULL );
S->h[2] = v256_64( 0x3C6EF3723C6EF372ULL );
S->h[3] = v256_64( 0xA54FF53AA54FF53AULL );
S->h[4] = v256_64( 0x510E527F510E527FULL );
S->h[5] = v256_64( 0x9B05688C9B05688CULL );
S->h[6] = v256_64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = v256_64( 0x5BE0CD195BE0CD19ULL );
// for( int i = 0; i < 8; ++i )
// S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
// S->h[i] = v256_32( blake2s_IV[i] );
uint32_t *p = ( uint32_t * )( P );
/* IV XOR ParamBlock */
for ( size_t i = 0; i < 8; ++i )
S->h[i] = _mm256_xor_si256( S->h[i], _mm256_set1_epi32( p[i] ) );
S->h[i] = _mm256_xor_si256( S->h[i], v256_32( p[i] ) );
return 0;
}
@@ -510,21 +510,21 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
memcpy_512( m, block, 16 );
memcpy_512( v, S->h, 8 );
v[ 8] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
v[ 9] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
v[10] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
v[11] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
_mm512_set1_epi64( 0x510E527F510E527FULL ) );
v[ 8] = v512_64( 0x6A09E6676A09E667ULL );
v[ 9] = v512_64( 0xBB67AE85BB67AE85ULL );
v[10] = v512_64( 0x3C6EF3723C6EF372ULL );
v[11] = v512_64( 0xA54FF53AA54FF53AULL );
v[12] = _mm512_xor_si512( v512_32( S->t[0] ),
v512_64( 0x510E527F510E527FULL ) );
v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
_mm512_set1_epi64( 0x9B05688C9B05688CULL ) );
v[13] = _mm512_xor_si512( v512_32( S->t[1] ),
v512_64( 0x9B05688C9B05688CULL ) );
v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
_mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL ) );
v[14] = _mm512_xor_si512( v512_32( S->f[0] ),
v512_64( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
_mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL ) );
v[15] = _mm512_xor_si512( v512_32( S->f[1] ),
v512_64( 0x5BE0CD195BE0CD19ULL ) );
#define G16W( sigma0, sigma1, a, b, c, d) \
@@ -589,20 +589,20 @@ int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )
memset( P->personal, 0, sizeof( P->personal ) );
memset( S, 0, sizeof( blake2s_16way_state ) );
S->h[0] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
S->h[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
S->h[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
S->h[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
S->h[4] = _mm512_set1_epi64( 0x510E527F510E527FULL );
S->h[5] = _mm512_set1_epi64( 0x9B05688C9B05688CULL );
S->h[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL );
S->h[0] = v512_64( 0x6A09E6676A09E667ULL );
S->h[1] = v512_64( 0xBB67AE85BB67AE85ULL );
S->h[2] = v512_64( 0x3C6EF3723C6EF372ULL );
S->h[3] = v512_64( 0xA54FF53AA54FF53AULL );
S->h[4] = v512_64( 0x510E527F510E527FULL );
S->h[5] = v512_64( 0x9B05688C9B05688CULL );
S->h[6] = v512_64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = v512_64( 0x5BE0CD195BE0CD19ULL );
uint32_t *p = ( uint32_t * )( P );
/* IV XOR ParamBlock */
for ( size_t i = 0; i < 8; ++i )
S->h[i] = _mm512_xor_si512( S->h[i], _mm512_set1_epi32( p[i] ) );
S->h[i] = _mm512_xor_si512( S->h[i], v512_32( p[i] ) );
return 0;
}

View File

@@ -1,75 +1,252 @@
#include "blake2s-gate.h"
#if !defined(BLAKE2S_16WAY) && !defined(BLAKE2S_8WAY) && !defined(BLAKE2S)
#include "algo-gate-api.h"
#include "blake2s-hash.h"
#include <string.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define BLAKE2S_16WAY
#elif defined(__AVX2__)
#define BLAKE2S_8WAY
#elif defined(__SSE2__)
#define BLAKE2S_4WAY
#endif
#if defined(BLAKE2S_16WAY)
static __thread blake2s_16way_state blake2s_16w_ctx;
void blake2s_16way_hash( void *output, const void *input )
{
blake2s_16way_state ctx;
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
blake2s_16way_update( &ctx, input + (64<<4), 16 );
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
}
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*16] __attribute__ ((aligned (128)));
uint32_t hash[8*16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<4]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
__m512i *noncev = (__m512i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
do {
*noncev = mm512_bswap_32( _mm512_set_epi32(
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
pdata[19] = n;
blake2s_16way_hash( hash, vdata );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_16x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 16;
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(BLAKE2S_8WAY)
static __thread blake2s_8way_state blake2s_8w_ctx;
void blake2s_8way_hash( void *output, const void *input )
{
blake2s_8way_state ctx;
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
blake2s_8way_update( &ctx, input + (64<<3), 16 );
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
}
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
pdata[19] = n;
blake2s_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(BLAKE2S_4WAY)
static __thread blake2s_4way_state blake2s_4w_ctx;
void blake2s_4way_hash( void *output, const void *input )
{
blake2s_4way_state ctx;
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
blake2s_4way_update( &ctx, input + (64<<2), 16 );
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
}
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
__m128i *noncev = (__m128i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
pdata[19] = n;
blake2s_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 4;
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#else
#include "sph-blake2s.h"
static __thread blake2s_state blake2s_ctx;
//static __thread blake2s_state s_ctx;
#define MIDLEN 76
void blake2s_hash( void *output, const void *input )
{
unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
blake2s_state ctx __attribute__ ((aligned (64)));
unsigned char _ALIGN(32) hash[BLAKE2S_OUTBYTES];
blake2s_state ctx __attribute__ ((aligned (32)));
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
blake2s_update( &ctx, input+64, 16 );
// blake2s_init(&ctx, BLAKE2S_OUTBYTES);
// blake2s_update(&ctx, input, 80);
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
memcpy(output, hash, 32);
memcpy(output, hash, 32);
}
/*
static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
int scanhash_blake2s( struct work *work,uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
s_ctx.buflen = MIDLEN;
memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
uint32_t _ALIGN(32) hash32[8];
uint32_t _ALIGN(32) endiandata[20];
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm128_bswap32_80( endiandata, pdata );
// midstate
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
do
{
endiandata[19] = n;
blake2s_hash( hash32, endiandata );
if ( unlikely( valid_hash( hash32, ptarget ) ) && !opt_benchmark )
{
pdata[19] = bswap_32( n );
submit_solution( work, hash32, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
*/
int scanhash_blake2s( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) hash64[8];
uint32_t _ALIGN(64) endiandata[20];
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
swab32_array( endiandata, pdata, 20 );
// midstate
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
do {
be32enc(&endiandata[19], n);
blake2s_hash( hash64, endiandata );
if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif
bool register_blake2s_algo( algo_gate_t* gate )
{
#if defined(BLAKE2S_16WAY)
gate->scanhash = (void*)&scanhash_blake2s_16way;
gate->hash = (void*)&blake2s_16way_hash;
#elif defined(BLAKE2S_8WAY)
gate->scanhash = (void*)&scanhash_blake2s_8way;
gate->hash = (void*)&blake2s_8way_hash;
#elif defined(BLAKE2S_4WAY)
gate->scanhash = (void*)&scanhash_blake2s_4way;
gate->hash = (void*)&blake2s_4way_hash;
#else
gate->scanhash = (void*)&scanhash_blake2s;
gate->hash = (void*)&blake2s_hash;
#endif
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -1,14 +1,12 @@
#if defined (__AVX2__)
#include <stddef.h>
#include <string.h>
#include <limits.h>
#include "blake-hash-4way.h"
#include "blake512-hash.h"
// Blake-512 common
/*
static const uint64_t IV512[8] =
static const uint64_t BLAKE512_IV[8] __attribute__ ((aligned (32))) =
{
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
@@ -16,6 +14,7 @@ static const uint64_t IV512[8] =
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
/*
static const uint64_t salt_zero_big[4] = { 0, 0, 0, 0 };
static const unsigned sigma[16][16] = {
@@ -36,8 +35,10 @@ static const unsigned sigma[16][16] = {
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
static const uint64_t CB[16] = {
*/
/*
static const uint64_t CB[16] __attribute__ ((aligned (32))) =
{
0x243F6A8885A308D3, 0x13198A2E03707344,
0xA4093822299F31D0, 0x082EFA98EC4E6C89,
0x452821E638D01377, 0xBE5466CF34E90C6C,
@@ -46,7 +47,7 @@ static const uint64_t CB[16] = {
0x2FFD72DBD01ADFB7, 0xB8E1AFED6A267E96,
0xBA7C9045F12C7F99, 0x24A19947B3916CF7,
0x0801F2E2858EFC16, 0x636920D871574E69
}
*/
#define Z00 0
@@ -244,6 +245,290 @@ static const uint64_t CB[16] = {
#define CBE 0x0801F2E2858EFC16
#define CBF 0x636920D871574E69
// Blake-512 1 way AVX2 & SSE2
#if defined(__AVX2__)
#define BLAKE512_ROUND( r ) \
{ \
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
_mm256_set_epi64x( CBx( r, 7 ) ^ Mx( r, 6 ), \
CBx( r, 5 ) ^ Mx( r, 4 ), \
CBx( r, 3 ) ^ Mx( r, 2 ), \
CBx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
V3 = mm256_swap64_32( _mm256_xor_si256( V3, V0 ) ); \
V2 = _mm256_add_epi64( V2, V3 ); \
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 25 ); \
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
_mm256_set_epi64x( CBx( r, 6 ) ^ Mx( r, 7 ), \
CBx( r, 4 ) ^ Mx( r, 5 ), \
CBx( r, 2 ) ^ Mx( r, 3 ), \
CBx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
V3 = mm256_shuflr64_16( _mm256_xor_si256( V3, V0 ) ); \
V2 = _mm256_add_epi64( V2, V3 ); \
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 11 ); \
V0 = mm256_shufll_64( V0 ); \
V3 = mm256_swap_128( V3 ); \
V2 = mm256_shuflr_64( V2 ); \
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
_mm256_set_epi64x( CBx( r, D ) ^ Mx( r, C ), \
CBx( r, B ) ^ Mx( r, A ), \
CBx( r, 9 ) ^ Mx( r, 8 ), \
CBx( r, F ) ^ Mx( r, E ) ) ) ); \
V3 = mm256_swap64_32( _mm256_xor_si256( V3, V0 ) ); \
V2 = _mm256_add_epi64( V2, V3 ); \
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 25 ); \
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
_mm256_set_epi64x( CBx( r, C ) ^ Mx( r, D ), \
CBx( r, A ) ^ Mx( r, B ), \
CBx( r, 8 ) ^ Mx( r, 9 ), \
CBx( r, E ) ^ Mx( r, F ) ) ) ); \
V3 = mm256_shuflr64_16( _mm256_xor_si256( V3, V0 ) ); \
V2 = _mm256_add_epi64( V2, V3 ); \
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 11 ); \
V0 = mm256_shuflr_64( V0 ); \
V3 = mm256_swap_128( V3 ); \
V2 = mm256_shufll_64( V2 ); \
}
void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
const uint64_t T1 )
{
__m256i V0, V1, V2, V3;
uint64_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
V0 = casti_m256i( H, 0 );
V1 = casti_m256i( H, 1 );
V2 = _mm256_set_epi64x( CB3, CB2, CB1, CB0 );
V3 = _mm256_set_epi64x( T1 ^ CB7, T1 ^ CB6, T0 ^ CB5, T0 ^ CB4 );
M0 = bswap_64( buf[ 0] );
M1 = bswap_64( buf[ 1] );
M2 = bswap_64( buf[ 2] );
M3 = bswap_64( buf[ 3] );
M4 = bswap_64( buf[ 4] );
M5 = bswap_64( buf[ 5] );
M6 = bswap_64( buf[ 6] );
M7 = bswap_64( buf[ 7] );
M8 = bswap_64( buf[ 8] );
M9 = bswap_64( buf[ 9] );
MA = bswap_64( buf[10] );
MB = bswap_64( buf[11] );
MC = bswap_64( buf[12] );
MD = bswap_64( buf[13] );
ME = bswap_64( buf[14] );
MF = bswap_64( buf[15] );
BLAKE512_ROUND( 0 );
BLAKE512_ROUND( 1 );
BLAKE512_ROUND( 2 );
BLAKE512_ROUND( 3 );
BLAKE512_ROUND( 4 );
BLAKE512_ROUND( 5 );
BLAKE512_ROUND( 6 );
BLAKE512_ROUND( 7 );
BLAKE512_ROUND( 8 );
BLAKE512_ROUND( 9 );
BLAKE512_ROUND( 0 );
BLAKE512_ROUND( 1 );
BLAKE512_ROUND( 2 );
BLAKE512_ROUND( 3 );
BLAKE512_ROUND( 4 );
BLAKE512_ROUND( 5 );
casti_m256i( H, 0 ) = mm256_xor3( casti_m256i( H, 0 ), V0, V2 );
casti_m256i( H, 1 ) = mm256_xor3( casti_m256i( H, 1 ), V1, V3 );
}
#else
#define BLAKE512_G( r, Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( CBx( r, Sd ) ^ Mx( r, Sc ), \
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 25 ); \
\
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( CBx( r, Sc ) ^ Mx( r, Sd ), \
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 11 ); \
}
#define BLAKE512_ROUND( R ) \
{ \
__m128i V32, V23, V67, V76; \
BLAKE512_G( R, V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE512_G( R, V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V32 = mm128_alignr_64( V[3], V[2], 1 ); \
V23 = mm128_alignr_64( V[2], V[3], 1 ); \
V67 = mm128_alignr_64( V[6], V[7], 1 ); \
V76 = mm128_alignr_64( V[7], V[6], 1 ); \
BLAKE512_G( R, V[0], V32, V[5], V67, 8, 9, A, B ); \
BLAKE512_G( R, V[1], V23, V[4], V76, C, D, E, F ); \
V[2] = mm128_alignr_64( V32, V23, 1 ); \
V[3] = mm128_alignr_64( V23, V32, 1 ); \
V[6] = mm128_alignr_64( V76, V67, 1 ); \
V[7] = mm128_alignr_64( V67, V76, 1 ); \
}
void blake512_transform( uint64_t *H, const uint64_t *buf,
const uint64_t T0, const uint64_t T1 )
{
__m128i V[8];
uint64_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
V[0] = casti_m128i( H, 0 );
V[1] = casti_m128i( H, 1 );
V[2] = casti_m128i( H, 2 );
V[3] = casti_m128i( H, 3 );
V[4] = _mm_set_epi64x( CB1, CB0 );
V[5] = _mm_set_epi64x( CB3, CB2 );
V[6] = _mm_set_epi64x( T0 ^ CB5, T0 ^ CB4 );
V[7] = _mm_set_epi64x( T1 ^ CB7, T1 ^ CB6 );
M0 = bswap_64( buf[ 0] );
M1 = bswap_64( buf[ 1] );
M2 = bswap_64( buf[ 2] );
M3 = bswap_64( buf[ 3] );
M4 = bswap_64( buf[ 4] );
M5 = bswap_64( buf[ 5] );
M6 = bswap_64( buf[ 6] );
M7 = bswap_64( buf[ 7] );
M8 = bswap_64( buf[ 8] );
M9 = bswap_64( buf[ 9] );
MA = bswap_64( buf[10] );
MB = bswap_64( buf[11] );
MC = bswap_64( buf[12] );
MD = bswap_64( buf[13] );
ME = bswap_64( buf[14] );
MF = bswap_64( buf[15] );
BLAKE512_ROUND( 0 );
BLAKE512_ROUND( 1 );
BLAKE512_ROUND( 2 );
BLAKE512_ROUND( 3 );
BLAKE512_ROUND( 4 );
BLAKE512_ROUND( 5 );
BLAKE512_ROUND( 6 );
BLAKE512_ROUND( 7 );
BLAKE512_ROUND( 8 );
BLAKE512_ROUND( 9 );
BLAKE512_ROUND( 0 );
BLAKE512_ROUND( 1 );
BLAKE512_ROUND( 2 );
BLAKE512_ROUND( 3 );
BLAKE512_ROUND( 4 );
BLAKE512_ROUND( 5 );
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V[0], V[4] );
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V[1], V[5] );
casti_m128i( H, 2 ) = mm128_xor3( casti_m128i( H, 2 ), V[2], V[6] );
casti_m128i( H, 3 ) = mm128_xor3( casti_m128i( H, 3 ), V[3], V[7] );
}
#endif
void blake512_init( blake512_context *sc )
{
memcpy( sc->H, BLAKE512_IV, 8 * sizeof(uint64_t) );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
void blake512_update(blake512_context *sc, const void *data, size_t len)
{
if ( len < (sizeof sc->buf) - sc->ptr )
{
memcpy( sc->buf + sc->ptr, data, len );
sc->ptr += len;
return;
}
while ( len > 0 )
{
size_t clen;
clen = (sizeof sc->buf) - sc->ptr;
if ( clen > len ) clen = len;
memcpy( sc->buf + sc->ptr, data, clen );
sc->ptr += clen;
data = (const unsigned char *)data + clen;
len -= clen;
if ( sc->ptr == sizeof sc->buf )
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 += 1;
blake512_transform( sc->H, (uint64_t*)sc->buf, sc->T0, sc->T1 );
sc->ptr = 0;
}
}
}
void blake512_close( blake512_context *sc, void *dst )
{
unsigned char buf[128] __attribute__((aligned(32)));
size_t ptr, k;
unsigned bit_len;
uint64_t th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr] = 0x80;
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00;
sc->T1 = 0xFFFFFFFFFFFFFFFF;
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00 + bit_len;
sc->T1 -= 1;
}
else
sc->T0 -= 1024 - bit_len;
if ( bit_len <= 894 )
{
memset( buf + ptr + 1, 0, 111 - ptr );
buf[111] |= 1;
*((uint64_t*)(buf + 112)) = bswap_64( th );
*((uint64_t*)(buf + 120)) = bswap_64( tl );
blake512_update( sc, buf + ptr, 128 - ptr );
}
else
{
memset( buf + ptr + 1, 0, 127 - ptr );
blake512_update( sc, buf + ptr, 128 - ptr );
sc->T0 = 0xFFFFFFFFFFFFFC00;
sc->T1 = 0xFFFFFFFFFFFFFFFF;
memset( buf, 0, 112 );
buf[111] = 1;
*(uint64_t*)(buf + 112) = bswap_64( th );
*(uint64_t*)(buf + 120) = bswap_64( tl );
blake512_update( sc, buf, 128 );
}
for ( k = 0; k < 8; k ++ )
((uint64_t*)dst)[k] = bswap_64( sc->H[k] );
}
void blake512_full( blake512_context *sc, void *dst, const void *data,
size_t len )
{
blake512_init( sc );
blake512_update( sc, data, len );
blake512_close( sc, dst );
}
#if defined(__AVX2__)
#define READ_STATE64(state) do { \
H0 = (state)->H[0]; \
H1 = (state)->H[1]; \
@@ -277,12 +562,12 @@ static const uint64_t CB[16] = {
#define GB_8WAY( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
_mm512_set1_epi64( c1 ), m0 ), b ), a ); \
v512_64( c1 ), m0 ), b ), a ); \
d = mm512_swap64_32( _mm512_xor_si512( d, a ) ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
_mm512_set1_epi64( c0 ), m1 ), b ), a ); \
v512_64( c0 ), m1 ), b ), a ); \
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \
@@ -318,14 +603,14 @@ static const uint64_t CB[16] = {
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm512_set1_epi64( CB0 ); \
V9 = _mm512_set1_epi64( CB1 ); \
VA = _mm512_set1_epi64( CB2 ); \
VB = _mm512_set1_epi64( CB3 ); \
VC = _mm512_set1_epi64( T0 ^ CB4 ); \
VD = _mm512_set1_epi64( T0 ^ CB5 ); \
VE = _mm512_set1_epi64( T1 ^ CB6 ); \
VF = _mm512_set1_epi64( T1 ^ CB7 ); \
V8 = v512_64( CB0 ); \
V9 = v512_64( CB1 ); \
VA = v512_64( CB2 ); \
VB = v512_64( CB3 ); \
VC = v512_64( T0 ^ CB4 ); \
VD = v512_64( T0 ^ CB5 ); \
VE = v512_64( T1 ^ CB6 ); \
VF = v512_64( T1 ^ CB7 ); \
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
@@ -385,14 +670,14 @@ void blake512_8way_compress( blake_8way_big_context *sc )
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm512_set1_epi64( CB0 );
V9 = _mm512_set1_epi64( CB1 );
VA = _mm512_set1_epi64( CB2 );
VB = _mm512_set1_epi64( CB3 );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
V8 = v512_64( CB0 );
V9 = v512_64( CB1 );
VA = v512_64( CB2 );
VB = v512_64( CB3 );
VC = v512_64( sc->T0 ^ CB4 );
VD = v512_64( sc->T0 ^ CB5 );
VE = v512_64( sc->T1 ^ CB6 );
VF = v512_64( sc->T1 ^ CB7 );
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
@@ -457,14 +742,14 @@ void blake512_8way_compress_le( blake_8way_big_context *sc )
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm512_set1_epi64( CB0 );
V9 = _mm512_set1_epi64( CB1 );
VA = _mm512_set1_epi64( CB2 );
VB = _mm512_set1_epi64( CB3 );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
V8 = v512_64( CB0 );
V9 = v512_64( CB1 );
VA = v512_64( CB2 );
VB = v512_64( CB3 );
VC = v512_64( sc->T0 ^ CB4 );
VD = v512_64( sc->T0 ^ CB5 );
VE = v512_64( sc->T1 ^ CB6 );
VF = v512_64( sc->T1 ^ CB7 );
M0 = sc->buf[ 0];
M1 = sc->buf[ 1];
@@ -519,23 +804,23 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
// initial hash
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
// fill buffer
memcpy_512( sc->buf, (__m512i*)data, 80>>3 );
sc->buf[10] = _mm512_set1_epi64( 0x8000000000000000ULL );
sc->buf[10] = v512_64( 0x8000000000000000ULL );
sc->buf[11] =
sc->buf[12] = m512_zero;
sc->buf[13] = m512_one_64;
sc->buf[13] = v512_64( 1 );
sc->buf[14] = m512_zero;
sc->buf[15] = _mm512_set1_epi64( 80*8 );
sc->buf[15] = v512_64( 80*8 );
// build working variables
V0 = sc->H[0];
@@ -546,14 +831,14 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm512_set1_epi64( CB0 );
V9 = _mm512_set1_epi64( CB1 );
VA = _mm512_set1_epi64( CB2 );
VB = _mm512_set1_epi64( CB3 );
VC = _mm512_set1_epi64( CB4 ^ 0x280ULL );
VD = _mm512_set1_epi64( CB5 ^ 0x280ULL );
VE = _mm512_set1_epi64( CB6 );
VF = _mm512_set1_epi64( CB7 );
V8 = v512_64( CB0 );
V9 = v512_64( CB1 );
VA = v512_64( CB2 );
VB = v512_64( CB3 );
VC = v512_64( CB4 ^ 0x280ULL );
VD = v512_64( CB5 ^ 0x280ULL );
VE = v512_64( CB6 );
VF = v512_64( CB7 );
// round 0
GB_8WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
@@ -565,7 +850,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
// GB_8WAY( sc->buf[ 8], sc->buf[ 9], CBx(0, 8), CBx(0, 9), V0, V5, VA, VF );
V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
v512_64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm512_swap64_32( _mm512_xor_si512( VF, V0 ) );
VA = _mm512_add_epi64( VA, VF );
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 );
@@ -578,7 +863,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
// round 1
// G1
// GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD);
V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( CB8 ),
V1 = _mm512_add_epi64( V1, _mm512_xor_si512( v512_64( CB8 ),
sc->buf[ 4] ) );
// G2
@@ -588,7 +873,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
// G3
// GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF);
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB6 ), sc->buf[13] ), V7 ) );
v512_64( CB6 ), sc->buf[13] ), V7 ) );
// save midstate for second part
midstate[ 0] = V0;
@@ -656,7 +941,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
// finish round 0 with the nonce now available
V0 = _mm512_add_epi64( V0, _mm512_xor_si512(
_mm512_set1_epi64( CB8 ), M9 ) );
v512_64( CB8 ), M9 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 16 );
VA = _mm512_add_epi64( VA, VF );
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 11 );
@@ -667,14 +952,14 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
// G1
// GB_8WAY(Mx(1, 2), Mx(1, 3), CBx(1, 2), CBx(1, 3), V1, V5, V9, VD);
// V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 );
// V1 = _mm512_add_epi64( V1, _mm512_xor_si512( v512_64( c1 ), m0 );
V1 = _mm512_add_epi64( V1, V5 );
VD = mm512_swap64_32( _mm512_xor_si512( VD, V1 ) );
V9 = _mm512_add_epi64( V9, VD );
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 );
V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CBx(1,2) ), Mx(1,3) ), V5 ) );
v512_64( CBx(1,2) ), Mx(1,3) ), V5 ) );
VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 16 );
V9 = _mm512_add_epi64( V9, VD );
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 11 );
@@ -683,12 +968,12 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
// GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
// V2 = _mm512_add_epi64( V2, V6 );
V2 = _mm512_add_epi64( V2, _mm512_xor_si512(
_mm512_set1_epi64( CBF ), M9 ) );
v512_64( CBF ), M9 ) );
VE = mm512_swap64_32( _mm512_xor_si512( VE, V2 ) );
VA = _mm512_add_epi64( VA, VE );
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 );
V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB9 ), MF ), V6 ) );
v512_64( CB9 ), MF ), V6 ) );
VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 16 );
VA = _mm512_add_epi64( VA, VE );
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 11 );
@@ -696,13 +981,13 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
// G3
// GB_8WAY(Mx(1, 6), Mx(1, 7), CBx(1, 6), CBx(1, 7), V3, V7, VB, VF);
// V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
// _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );
// v512_64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );
VF = mm512_swap64_32( _mm512_xor_si512( VF, V3 ) );
VB = _mm512_add_epi64( VB, VF );
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 );
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
v512_64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 16 );
VB = _mm512_add_epi64( VB, VF );
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 11 );
@@ -744,14 +1029,14 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
void blake512_8way_init( blake_8way_big_context *sc )
{
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -815,7 +1100,7 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>3] = _mm512_set1_epi64( 0x80 );
buf[ptr>>3] = v512_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
@@ -836,9 +1121,9 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
{
memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
buf[104>>3] = _mm512_or_si512( buf[104>>3],
_mm512_set1_epi64( 0x0100000000000000ULL ) );
buf[112>>3] = _mm512_set1_epi64( bswap_64( th ) );
buf[120>>3] = _mm512_set1_epi64( bswap_64( tl ) );
v512_64( 0x0100000000000000ULL ) );
buf[112>>3] = v512_64( bswap_64( th ) );
buf[120>>3] = v512_64( bswap_64( tl ) );
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
}
@@ -850,9 +1135,9 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
memset_zero_512( buf, 112>>3 );
buf[104>>3] = _mm512_set1_epi64( 0x0100000000000000ULL );
buf[112>>3] = _mm512_set1_epi64( bswap_64( th ) );
buf[120>>3] = _mm512_set1_epi64( bswap_64( tl ) );
buf[104>>3] = v512_64( 0x0100000000000000ULL );
buf[112>>3] = v512_64( bswap_64( th ) );
buf[120>>3] = v512_64( bswap_64( tl ) );
blake64_8way( sc, buf, 128 );
}
@@ -866,14 +1151,14 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
// init
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -897,7 +1182,7 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = _mm512_set1_epi64( 0x80 );
sc->buf[ptr64] = v512_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
@@ -915,9 +1200,9 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
sc->T0 -= 1024 - bit_len;
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = _mm512_set1_epi64( 0x0100000000000000ULL );
sc->buf[14] = _mm512_set1_epi64( bswap_64( th ) );
sc->buf[15] = _mm512_set1_epi64( bswap_64( tl ) );
sc->buf[13] = v512_64( 0x0100000000000000ULL );
sc->buf[14] = v512_64( bswap_64( th ) );
sc->buf[15] = v512_64( bswap_64( tl ) );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
@@ -933,14 +1218,14 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
// init
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -964,7 +1249,7 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = _mm512_set1_epi64( 0x8000000000000000ULL );
sc->buf[ptr64] = v512_64( 0x8000000000000000ULL );
tl = sc->T0 + bit_len;
th = sc->T1;
@@ -982,9 +1267,9 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
sc->T0 -= 1024 - bit_len;
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m512_one_64;
sc->buf[14] = _mm512_set1_epi64( th );
sc->buf[15] = _mm512_set1_epi64( tl );
sc->buf[13] = v512_64( 1 );
sc->buf[14] = v512_64( th );
sc->buf[15] = v512_64( tl );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
@@ -1013,12 +1298,12 @@ blake512_8way_close(void *cc, void *dst)
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) \
{ \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set1_epi64x( c1 ), m0 ), b ), a ); \
v256_64( c1 ), m0 ), b ), a ); \
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set1_epi64x( c0 ), m1 ), b ), a ); \
v256_64( c0 ), m1 ), b ), a ); \
d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
@@ -1054,14 +1339,14 @@ blake512_8way_close(void *cc, void *dst)
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_set1_epi64x( CB0 ); \
V9 = _mm256_set1_epi64x( CB1 ); \
VA = _mm256_set1_epi64x( CB2 ); \
VB = _mm256_set1_epi64x( CB3 ); \
VC = _mm256_set1_epi64x( T0 ^ CB4 ); \
VD = _mm256_set1_epi64x( T0 ^ CB5 ); \
VE = _mm256_set1_epi64x( T1 ^ CB6 ); \
VF = _mm256_set1_epi64x( T1 ^ CB7 ); \
V8 = v256_64( CB0 ); \
V9 = v256_64( CB1 ); \
VA = v256_64( CB2 ); \
VB = v256_64( CB3 ); \
VC = v256_64( T0 ^ CB4 ); \
VD = v256_64( T0 ^ CB5 ); \
VE = v256_64( T1 ^ CB6 ); \
VF = v256_64( T1 ^ CB7 ); \
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
@@ -1122,18 +1407,18 @@ void blake512_4way_compress( blake_4way_big_context *sc )
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm256_set1_epi64x( CB0 );
V9 = _mm256_set1_epi64x( CB1 );
VA = _mm256_set1_epi64x( CB2 );
VB = _mm256_set1_epi64x( CB3 );
VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
_mm256_set1_epi64x( CB4 ) );
VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
_mm256_set1_epi64x( CB5 ) );
VE = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
_mm256_set1_epi64x( CB6 ) );
VF = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
_mm256_set1_epi64x( CB7 ) );
V8 = v256_64( CB0 );
V9 = v256_64( CB1 );
VA = v256_64( CB2 );
VB = v256_64( CB3 );
VC = _mm256_xor_si256( v256_64( sc->T0 ),
v256_64( CB4 ) );
VD = _mm256_xor_si256( v256_64( sc->T0 ),
v256_64( CB5 ) );
VE = _mm256_xor_si256( v256_64( sc->T1 ),
v256_64( CB6 ) );
VF = _mm256_xor_si256( v256_64( sc->T1 ),
v256_64( CB7 ) );
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
@@ -1188,23 +1473,23 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
// initial hash
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = v256_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = v256_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = v256_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = v256_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = v256_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = v256_64( 0x5BE0CD19137E2179 );
// fill buffer
memcpy_256( sc->buf, (__m256i*)data, 80>>3 );
sc->buf[10] = _mm256_set1_epi64x( 0x8000000000000000ULL );
sc->buf[10] = v256_64( 0x8000000000000000ULL );
sc->buf[11] = m256_zero;
sc->buf[12] = m256_zero;
sc->buf[13] = m256_one_64;
sc->buf[13] = v256_64( 1 );
sc->buf[14] = m256_zero;
sc->buf[15] = _mm256_set1_epi64x( 80*8 );
sc->buf[15] = v256_64( 80*8 );
// build working variables
V0 = sc->H[0];
@@ -1215,14 +1500,14 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm256_set1_epi64x( CB0 );
V9 = _mm256_set1_epi64x( CB1 );
VA = _mm256_set1_epi64x( CB2 );
VB = _mm256_set1_epi64x( CB3 );
VC = _mm256_set1_epi64x( CB4 ^ 0x280ULL );
VD = _mm256_set1_epi64x( CB5 ^ 0x280ULL );
VE = _mm256_set1_epi64x( CB6 );
VF = _mm256_set1_epi64x( CB7 );
V8 = v256_64( CB0 );
V9 = v256_64( CB1 );
VA = v256_64( CB2 );
VB = v256_64( CB3 );
VC = v256_64( CB4 ^ 0x280ULL );
VD = v256_64( CB5 ^ 0x280ULL );
VE = v256_64( CB6 );
VF = v256_64( CB7 );
// round 0
GB_4WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
@@ -1232,7 +1517,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
// G4 skip nonce
V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 );
v256_64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm256_swap64_32( _mm256_xor_si256( VF, V0 ) );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 );
@@ -1244,7 +1529,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
// round 1
// G1
V1 = _mm256_add_epi64( V1, _mm256_xor_si256( _mm256_set1_epi64x( CB8 ),
V1 = _mm256_add_epi64( V1, _mm256_xor_si256( v256_64( CB8 ),
sc->buf[ 4] ) );
// G2
@@ -1252,7 +1537,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
// G3
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB6 ), sc->buf[13] ), V7 ) );
v256_64( CB6 ), sc->buf[13] ), V7 ) );
// save midstate for second part
midstate[ 0] = V0;
@@ -1319,7 +1604,7 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
// finish round 0, with the nonce now available
V0 = _mm256_add_epi64( V0, _mm256_xor_si256(
_mm256_set1_epi64x( CB8 ), M9 ) );
v256_64( CB8 ), M9 ) );
VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V0 ) );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 );
@@ -1334,19 +1619,19 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 );
V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) );
v256_64( CBx(1,2) ), Mx(1,3) ), V5 ) );
VD = mm256_shuflr64_16( _mm256_xor_si256( VD, V1 ) );
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 );
// G2
V2 = _mm256_add_epi64( V2, _mm256_xor_si256(
_mm256_set1_epi64x( CBF ), M9 ) );
v256_64( CBF ), M9 ) );
VE = mm256_swap64_32( _mm256_xor_si256( VE, V2 ) );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 );
V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), MF ), V6 ) );
v256_64( CB9 ), MF ), V6 ) );
VE = mm256_shuflr64_16( _mm256_xor_si256( VE, V2 ) );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 );
@@ -1356,7 +1641,7 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 );
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
v256_64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V3 ) );
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 );
@@ -1398,14 +1683,14 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
void blake512_4way_init( blake_4way_big_context *sc )
{
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = v256_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = v256_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = v256_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = v256_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = v256_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = v256_64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -1465,7 +1750,7 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
buf[ptr>>3] = v256_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
@@ -1487,9 +1772,9 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
{
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
buf[104>>3] = _mm256_or_si256( buf[104>>3],
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
buf[120>>3] = _mm256_set1_epi64x( bswap_64( tl ) );
v256_64( 0x0100000000000000ULL ) );
buf[112>>3] = v256_64( bswap_64( th ) );
buf[120>>3] = v256_64( bswap_64( tl ) );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
}
@@ -1501,9 +1786,9 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
memset_zero_256( buf, 112>>3 );
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
buf[120>>3] = _mm256_set1_epi64x( bswap_64( tl ) );
buf[104>>3] = v256_64( 0x0100000000000000ULL );
buf[112>>3] = v256_64( bswap_64( th ) );
buf[120>>3] = v256_64( bswap_64( tl ) );
blake64_4way( sc, buf, 128 );
}
@@ -1517,14 +1802,14 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
// init
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = v256_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = v256_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = v256_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = v256_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = v256_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = v256_64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -1548,7 +1833,7 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = _mm256_set1_epi64x( 0x80 );
sc->buf[ptr64] = v256_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( sc->ptr == 0 )
@@ -1565,9 +1850,9 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
sc->T0 -= 1024 - bit_len;
memset_zero_256( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = _mm256_set1_epi64x( 0x0100000000000000ULL );
sc->buf[14] = _mm256_set1_epi64x( bswap_64( th ) );
sc->buf[15] = _mm256_set1_epi64x( bswap_64( tl ) );
sc->buf[13] = v256_64( 0x0100000000000000ULL );
sc->buf[14] = v256_64( bswap_64( th ) );
sc->buf[15] = v256_64( bswap_64( tl ) );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
@@ -1589,4 +1874,4 @@ blake512_4way_close(void *cc, void *dst)
blake64_4way_close( cc, dst );
}
#endif
#endif // AVX2

View File

@@ -0,0 +1,83 @@
#ifndef BLAKE512_HASH__
#define BLAKE512_HASH__ 1
#include <stddef.h>
#include "simd-utils.h"
/////////////////////////
//
// Blake-512 1 way SSE2 & AVX2
typedef struct {
unsigned char buf[128]; /* first field, for alignment */
uint64_t H[8];
uint64_t T0, T1;
size_t ptr;
} blake512_context __attribute__ ((aligned (32)));
void blake512_transform( uint64_t *H, const uint64_t *buf,
const uint64_t T0, const uint64_t T1 );
void blake512_init( blake512_context *sc );
void blake512_update( blake512_context *sc, const void *data, size_t len );
void blake512_close( blake512_context *sc, void *dst );
void blake512_full( blake512_context *sc, void *dst, const void *data,
size_t len );
#ifdef __AVX2__
// Blake-512 4 way AVX2
typedef struct {
__m256i buf[16];
__m256i H[8];
__m256i S[4];
size_t ptr;
uint64_t T0, T1;
} blake_4way_big_context __attribute__ ((aligned (64)));
typedef blake_4way_big_context blake512_4way_context;
void blake512_4way_init( blake_4way_big_context *sc );
void blake512_4way_update( void *cc, const void *data, size_t len );
void blake512_4way_close( void *cc, void *dst );
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
const void *data );
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
////////////////////////////
//
//// Blake-512 8 way AVX512
typedef struct {
__m512i buf[16];
__m512i H[8];
__m512i S[4];
size_t ptr;
uint64_t T0, T1;
} blake_8way_big_context __attribute__ ((aligned (128)));
typedef blake_8way_big_context blake512_8way_context;
void blake512_8way_init( blake_8way_big_context *sc );
void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
const void *data );
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
const __m512i nonce, const __m512i *midstate );
#endif // AVX512
#endif // AVX2
#endif // BLAKE512_HASH_H__

View File

@@ -1,5 +1,5 @@
#include "blakecoin-gate.h"
#include "blake-hash-4way.h"
#include "blake256-hash.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
@@ -30,25 +30,25 @@ int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
const uint32_t last_nonce = max_nonce - 16;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i sixteen = _mm512_set1_epi32( 16 );
const __m512i sixteen = v512_32( 16 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0, rounds );
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
block0_hash[2] = _mm512_set1_epi32( phash[2] );
block0_hash[3] = _mm512_set1_epi32( phash[3] );
block0_hash[4] = _mm512_set1_epi32( phash[4] );
block0_hash[5] = _mm512_set1_epi32( phash[5] );
block0_hash[6] = _mm512_set1_epi32( phash[6] );
block0_hash[7] = _mm512_set1_epi32( phash[7] );
block0_hash[0] = v512_32( phash[0] );
block0_hash[1] = v512_32( phash[1] );
block0_hash[2] = v512_32( phash[2] );
block0_hash[3] = v512_32( phash[3] );
block0_hash[4] = v512_32( phash[4] );
block0_hash[5] = v512_32( phash[5] );
block0_hash[6] = v512_32( phash[6] );
block0_hash[7] = v512_32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[0] = _mm512_set1_epi32( pdata[16] );
block_buf[1] = _mm512_set1_epi32( pdata[17] );
block_buf[2] = _mm512_set1_epi32( pdata[18] );
block_buf[0] = v512_32( pdata[16] );
block_buf[1] = v512_32( pdata[17] );
block_buf[2] = v512_32( pdata[18] );
block_buf[3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
@@ -101,25 +101,25 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
const uint32_t last_nonce = max_nonce - 8;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i eight = v256_32( 8 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0, rounds );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
block0_hash[0] = v256_32( phash[0] );
block0_hash[1] = v256_32( phash[1] );
block0_hash[2] = v256_32( phash[2] );
block0_hash[3] = v256_32( phash[3] );
block0_hash[4] = v256_32( phash[4] );
block0_hash[5] = v256_32( phash[5] );
block0_hash[6] = v256_32( phash[6] );
block0_hash[7] = v256_32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[0] = _mm256_set1_epi32( pdata[16] );
block_buf[1] = _mm256_set1_epi32( pdata[17] );
block_buf[2] = _mm256_set1_epi32( pdata[18] );
block_buf[0] = v256_32( pdata[16] );
block_buf[1] = v256_32( pdata[17] );
block_buf[2] = v256_32( pdata[18] );
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
@@ -203,142 +203,3 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
#endif
#if 0
//#if defined(BLAKECOIN_8WAY)
blake256r8_8way_context blakecoin_8w_ctx;
void blakecoin_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256r8_8way_context ctx;
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
blake256r8_8way_update( &ctx, input + (64<<3), 16 );
blake256r8_8way_close( &ctx, vhash );
dintrlv_8x32( state, state+ 32, state+ 64, state+ 96, state+128,
state+160, state+192, state+224, vhash, 256 );
}
/*
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash32[8*8] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
__m256i block0_hash[8] __attribute__ ((aligned (64)));
__m256i block_buf[16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = (uint32_t*)work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = _mm256_set1_epi32( 8 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0, 8 );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[0] = _mm256_set1_epi32( pdata[16] );
block_buf[1] = _mm256_set1_epi32( pdata[17] );
block_buf[2] = _mm256_set1_epi32( pdata[18] );
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
// Partialy prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
block_buf );
for ( int lane = 0; lane < 8; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
block_buf[3] = _mm256_add_epi32( block_buf[3], eight );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
*/
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash32[8*8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
blake256r8_8way_context ctx __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( ((__m256i*)hash32)[7] );
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t HTarget = ptarget[7];
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
HTarget = 0x7f;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256r8_8way_init( &blakecoin_8w_ctx );
blake256r8_8way_update( &blakecoin_8w_ctx, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
pdata[19] = n;
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
blake256r8_8way_update( &ctx, (const void*)vdata + (64<<3), 16 );
blake256r8_8way_close( &ctx, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( hash32_d7[ lane ] <= HTarget )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

View File

@@ -6,9 +6,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "blake-hash-4way.h"
#include "sph_blake.h"
#include "blake512-hash.h"
extern void pentablakehash_4way( void *output, const void *input )
{

View File

@@ -14,8 +14,9 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "simd-utils.h"
#include "compat/sph_types.h"
#include "compat.h"
#include "sph-blake2s.h"
static const uint32_t blake2s_IV[8] =
@@ -208,8 +209,8 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c
int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
{
uint32_t m[16];
uint32_t v[16];
uint32_t _ALIGN(32) m[16];
uint32_t _ALIGN(32) v[16];
for( size_t i = 0; i < 16; ++i )
m[i] = load32( block + i * sizeof( m[i] ) );
@@ -225,6 +226,58 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
v[13] = S->t[1] ^ blake2s_IV[5];
v[14] = S->f[0] ^ blake2s_IV[6];
v[15] = S->f[1] ^ blake2s_IV[7];
#if defined(__SSE2__)
__m128i *V = (__m128i*)v;
#define BLAKE2S_ROUND( r ) \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
m[blake2s_sigma[r][ 6]], m[blake2s_sigma[r][ 4]], \
m[blake2s_sigma[r][ 2]], m[blake2s_sigma[r][ 0]] ) ) ); \
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
m[blake2s_sigma[r][ 7]], m[blake2s_sigma[r][ 5]], \
m[blake2s_sigma[r][ 3]], m[blake2s_sigma[r][ 1]] ) ) ); \
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
V[0] = mm128_shufll_32( V[0] ); \
V[3] = mm128_swap_64( V[3] ); \
V[2] = mm128_shuflr_32( V[2] ); \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
m[blake2s_sigma[r][12]], m[blake2s_sigma[r][10]], \
m[blake2s_sigma[r][ 8]], m[blake2s_sigma[r][14]] ) ) ); \
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
m[blake2s_sigma[r][13]], m[blake2s_sigma[r][11]], \
m[blake2s_sigma[r][ 9]], m[blake2s_sigma[r][15]] ) ) ); \
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
V[0] = mm128_shuflr_32( V[0] ); \
V[3] = mm128_swap_64( V[3] ); \
V[2] = mm128_shufll_32( V[2] )
BLAKE2S_ROUND(0);
BLAKE2S_ROUND(1);
BLAKE2S_ROUND(2);
BLAKE2S_ROUND(3);
BLAKE2S_ROUND(4);
BLAKE2S_ROUND(5);
BLAKE2S_ROUND(6);
BLAKE2S_ROUND(7);
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);
#undef BLAKE2S_ROUND
#else
#define G(r,i,a,b,c,d) \
do { \
a = a + b + m[blake2s_sigma[r][2*i+0]]; \
@@ -236,6 +289,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
c = c + d; \
b = SPH_ROTR32(b ^ c, 7); \
} while(0)
#define ROUND(r) \
do { \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
@@ -247,7 +301,8 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
} while(0)
ROUND( 0 );
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );
@@ -258,6 +313,8 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
ROUND( 8 );
ROUND( 9 );
#endif
for( size_t i = 0; i < 8; ++i )
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];