Compare commits


1 Commit

Author | SHA1 | Message | Date
Jay D Dee | bc5a5c6df8 | v3.23.3 | 2023-09-28 18:43:18 -04:00
88 changed files with 5526 additions and 3361 deletions

View File

@@ -36,21 +36,17 @@ cpuminer_SOURCES = \
algo/argon2/argon2d/argon2d/argon2d_thread.c \
algo/argon2/argon2d/argon2d/encoding.c \
algo/blake/sph_blake.c \
algo/blake/blake256-hash-4way.c \
algo/blake/blake512-hash-4way.c \
algo/blake/blake256-hash.c \
algo/blake/blake512-hash.c \
algo/blake/blake-gate.c \
algo/blake/blake.c \
algo/blake/blake-4way.c \
algo/blake/sph_blake2b.c \
algo/blake/sph-blake2s.c \
algo/blake/blake2s-hash-4way.c \
algo/blake/blake2s-hash.c \
algo/blake/blake2s.c \
algo/blake/blake2s-gate.c \
algo/blake/blake2s-4way.c \
algo/blake/blake2b-hash-4way.c \
algo/blake/blake2b-hash.c \
algo/blake/blake2b.c \
algo/blake/blake2b-gate.c \
algo/blake/blake2b-4way.c \
algo/blake/blakecoin-gate.c \
algo/blake/mod_blakecoin.c \
algo/blake/blakecoin.c \

View File

@@ -65,6 +65,14 @@ If not, what makes it happen or not happen?
Change Log
----------
v3.23.3
#400: Removed excessive thread restarts when mining solo.
Fixed build_msys2.sh for gcc-13 by removing unsupported option "--param=evrp-mode=legacy" from CFLAGS.
Added CPUID detection and reporting of CPUs and SW builds supporting the SHA512 extension.
Added a prototype of SHA-512 using SHA512 intrinsics, untested.
Other improvements and code cleanup.
v3.23.2
sha256dt, sha256t & sha256d +10% with SHA, small improvement with AVX2.
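
A note on the v3.23.3 SHA512 detection item above: the extension is reported via CPUID leaf 7, sub-leaf 1, EAX bit 0. A minimal stand-alone sketch of such a check (independent of cpuminer-opt's own CPUID helpers, which are not shown in this diff):

#include <cpuid.h>     // GCC/Clang CPUID wrapper
#include <stdbool.h>
#include <stdio.h>

// SHA512 feature flag: CPUID.(EAX=7,ECX=1):EAX bit 0.
static bool cpu_has_sha512( void )
{
   unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
   if ( !__get_cpuid_count( 7, 1, &eax, &ebx, &ecx, &edx ) )
      return false;          // CPU doesn't report leaf 7 sub-leaf 1
   return ( eax & 1 ) != 0;
}

int main(void)
{
   printf( "SHA512 extension: %s\n", cpu_has_sha512() ? "yes" : "no" );
   return 0;
}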

View File

@@ -97,6 +97,7 @@ typedef uint32_t set_t;
#define SHA_OPT 0x20 // Zen1, Icelake (deprecated)
#define AVX512_OPT 0x40 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
#define VAES_OPT 0x80 // Icelake, Zen3
#define SHA512_OPT 0x100 // Lunar Lake, Arrow Lake
// AVX10 does not have explicit algo features:
// AVX10_512 is compatible with AVX512 + VAES
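// For context: these set_t bits are OR'd into gate->optimizations by each
// algorithm's register function (see the register_*_algo diffs below).
// An algo with a SHA512 code path would presumably advertise the new bit
// the same way, e.g.:
//    gate->optimizations = AVX2_OPT | AVX512_OPT | SHA512_OPT;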

View File

@@ -1,5 +1,5 @@
#include "blake-gate.h"
#include "blake-hash-4way.h"
#include "blake256-hash.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>

View File

@@ -1,5 +1,5 @@
#ifndef BLAKE_HASH_4WAY__
#define BLAKE_HASH_4WAY__ 1
#ifndef BLAKE256_HASH__
#define BLAKE256_HASH__ 1
#include <stddef.h>
#include "simd-utils.h"
@@ -11,13 +11,6 @@
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1, int rounds );
/////////////////////////
//
// Blake-512 1 way SSE2
void blake512_transform_le( uint64_t *H, const uint64_t *buf,
const uint64_t T0, const uint64_t T1 );
//////////////////////////
//
// Blake-256 4 way SSE2
@@ -86,30 +79,6 @@ void blake256r8_8way_init(void *cc);
void blake256r8_8way_update(void *cc, const void *data, size_t len);
void blake256r8_8way_close(void *cc, void *dst);
// Blake-512 4 way AVX2
typedef struct {
__m256i buf[16];
__m256i H[8];
__m256i S[4];
size_t ptr;
uint64_t T0, T1;
} blake_4way_big_context __attribute__ ((aligned (128)));
typedef blake_4way_big_context blake512_4way_context;
void blake512_4way_init( blake_4way_big_context *sc );
void blake512_4way_update( void *cc, const void *data, size_t len );
void blake512_4way_close( void *cc, void *dst );
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
const void *data );
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate );
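// Usage sketch for the prehash/final split above (assumptions: vdata is the
// 4-way interleaved 80-byte header, noncev holds the 4 lane nonces, and the
// midstate capacity shown is illustrative): the nonce-independent part of
// the first compression is computed once per work unit, and only the
// nonce-dependent tail is redone per batch.
//
//    blake512_4way_context ctx;
//    __m256i midstate[16];          // capacity is an assumption
//    blake512_4way_prehash_le( &ctx, midstate, vdata );
//    blake512_4way_final_le( &ctx, hash, noncev, midstate );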
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
////////////////////////////
@@ -137,7 +106,6 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
// 14 rounds, blake, decred
typedef blake_16way_small_context blake256r14_16way_context;
void blake256r14_16way_init(void *cc);
@@ -150,33 +118,7 @@ void blake256r8_16way_init(void *cc);
void blake256r8_16way_update(void *cc, const void *data, size_t len);
void blake256r8_16way_close(void *cc, void *dst);
////////////////////////////
//
//// Blake-512 8 way AVX512
typedef struct {
__m512i buf[16];
__m512i H[8];
__m512i S[4];
size_t ptr;
uint64_t T0, T1;
} blake_8way_big_context __attribute__ ((aligned (128)));
typedef blake_8way_big_context blake512_8way_context;
void blake512_8way_init( blake_8way_big_context *sc );
void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
const void *data );
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
const __m512i nonce, const __m512i *midstate );
#endif // AVX512
#endif // AVX2
#endif // BLAKE_HASH_4WAY__
#endif // BLAKE256_HASH__

View File

@@ -1,113 +0,0 @@
/**
* Blake2-B Implementation
* tpruvot@github 2015-2016
*/
#include "blake2b-gate.h"
#include <string.h>
#include <stdint.h>
#include "blake2b-hash-4way.h"
#if defined(BLAKE2B_8WAY)
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]); // 3*16+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
blake2b_8way_init( &ctx );
blake2b_8way_update( &ctx, vdata, 80 );
blake2b_8way_final( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(BLAKE2B_4WAY)
// Function not used, code inlined.
void blake2b_4way_hash(void *output, const void *input)
{
blake2b_4way_ctx ctx;
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, input, 80 );
blake2b_4way_final( &ctx, output );
}
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
do {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, vdata, 80 );
blake2b_4way_final( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

View File

@@ -1,20 +0,0 @@
#include "blake2b-gate.h"
bool register_blake2b_algo( algo_gate_t* gate )
{
#if defined(BLAKE2B_8WAY)
gate->scanhash = (void*)&scanhash_blake2b_8way;
// gate->hash = (void*)&blake2b_8way_hash;
#elif defined(BLAKE2B_4WAY)
gate->scanhash = (void*)&scanhash_blake2b_4way;
gate->hash = (void*)&blake2b_4way_hash;
#else
gate->scanhash = (void*)&scanhash_blake2b;
gate->hash = (void*)&blake2b_hash;
#endif
gate->optimizations = AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -1,34 +0,0 @@
#ifndef __BLAKE2B_GATE_H__
#define __BLAKE2B_GATE_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define BLAKE2B_8WAY
#elif defined(__AVX2__)
#define BLAKE2B_4WAY
#endif
bool register_blake2b_algo( algo_gate_t* gate );
#if defined(BLAKE2B_8WAY)
//void blake2b_8way_hash( void *state, const void *input );
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(BLAKE2B_4WAY)
void blake2b_4way_hash( void *state, const void *input );
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void blake2b_hash( void *state, const void *input );
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif

View File

@@ -31,7 +31,7 @@
#include <stdint.h>
#include <string.h>
#include "blake2b-hash-4way.h"
#include "blake2b-hash.h"
#if defined(__AVX2__)
@@ -252,17 +252,17 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
v[ 5] = ctx->h[5];
v[ 6] = ctx->h[6];
v[ 7] = ctx->h[7];
v[ 8] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
v[ 9] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
v[10] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
v[11] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
v[12] = _mm512_set1_epi64( 0x510E527FADE682D1 );
v[13] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
v[14] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
v[15] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
v[ 8] = v512_64( 0x6A09E667F3BCC908 );
v[ 9] = v512_64( 0xBB67AE8584CAA73B );
v[10] = v512_64( 0x3C6EF372FE94F82B );
v[11] = v512_64( 0xA54FF53A5F1D36F1 );
v[12] = v512_64( 0x510E527FADE682D1 );
v[13] = v512_64( 0x9B05688C2B3E6C1F );
v[14] = v512_64( 0x1F83D9ABFB41BD6B );
v[15] = v512_64( 0x5BE0CD19137E2179 );
v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );
v[12] = _mm512_xor_si512( v[12], v512_64( ctx->t[0] ) );
v[13] = _mm512_xor_si512( v[13], v512_64( ctx->t[1] ) );
if ( last )
v[14] = mm512_not( v[14] );
@@ -310,16 +310,16 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
{
size_t i;
ctx->h[0] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
ctx->h[1] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
ctx->h[2] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
ctx->h[3] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
ctx->h[4] = _mm512_set1_epi64( 0x510E527FADE682D1 );
ctx->h[5] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
ctx->h[6] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
ctx->h[7] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
ctx->h[0] = v512_64( 0x6A09E667F3BCC908 );
ctx->h[1] = v512_64( 0xBB67AE8584CAA73B );
ctx->h[2] = v512_64( 0x3C6EF372FE94F82B );
ctx->h[3] = v512_64( 0xA54FF53A5F1D36F1 );
ctx->h[4] = v512_64( 0x510E527FADE682D1 );
ctx->h[5] = v512_64( 0x9B05688C2B3E6C1F );
ctx->h[6] = v512_64( 0x1F83D9ABFB41BD6B );
ctx->h[7] = v512_64( 0x5BE0CD19137E2179 );
ctx->h[0] = _mm512_xor_si512( ctx->h[0], _mm512_set1_epi64( 0x01010020 ) );
ctx->h[0] = _mm512_xor_si512( ctx->h[0], v512_64( 0x01010020 ) );
ctx->t[0] = 0;
ctx->t[1] = 0;
@@ -419,17 +419,17 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
v[ 5] = ctx->h[5];
v[ 6] = ctx->h[6];
v[ 7] = ctx->h[7];
v[ 8] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
v[ 9] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
v[10] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
v[11] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
v[12] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
v[13] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
v[14] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
v[15] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
v[ 8] = v256_64( 0x6A09E667F3BCC908 );
v[ 9] = v256_64( 0xBB67AE8584CAA73B );
v[10] = v256_64( 0x3C6EF372FE94F82B );
v[11] = v256_64( 0xA54FF53A5F1D36F1 );
v[12] = v256_64( 0x510E527FADE682D1 );
v[13] = v256_64( 0x9B05688C2B3E6C1F );
v[14] = v256_64( 0x1F83D9ABFB41BD6B );
v[15] = v256_64( 0x5BE0CD19137E2179 );
v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );
v[12] = _mm256_xor_si256( v[12], v256_64( ctx->t[0] ) );
v[13] = _mm256_xor_si256( v[13], v256_64( ctx->t[1] ) );
if ( last )
v[14] = mm256_not( v[14] );
@@ -477,16 +477,16 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
{
size_t i;
ctx->h[0] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
ctx->h[1] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
ctx->h[2] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
ctx->h[3] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
ctx->h[4] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
ctx->h[5] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
ctx->h[6] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
ctx->h[7] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
ctx->h[0] = v256_64( 0x6A09E667F3BCC908 );
ctx->h[1] = v256_64( 0xBB67AE8584CAA73B );
ctx->h[2] = v256_64( 0x3C6EF372FE94F82B );
ctx->h[3] = v256_64( 0xA54FF53A5F1D36F1 );
ctx->h[4] = v256_64( 0x510E527FADE682D1 );
ctx->h[5] = v256_64( 0x9B05688C2B3E6C1F );
ctx->h[6] = v256_64( 0x1F83D9ABFB41BD6B );
ctx->h[7] = v256_64( 0x5BE0CD19137E2179 );
ctx->h[0] = _mm256_xor_si256( ctx->h[0], _mm256_set1_epi64x( 0x01010020 ) );
ctx->h[0] = _mm256_xor_si256( ctx->h[0], v256_64( 0x01010020 ) );
ctx->t[0] = 0;
ctx->t[1] = 0;

View File

@@ -1,64 +1,175 @@
/**
* Blake2-B Implementation
* tpruvot@github 2015-2016
*/
#include "blake2b-gate.h"
#if !defined(BLAKE2B_8WAY) && !defined(BLAKE2B_4WAY)
#include "algo-gate-api.h"
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake2b.h"
#include "blake2b-hash.h"
#define MIDLEN 76
#define A 64
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define BLAKE2B_8WAY
#elif defined(__AVX2__)
#define BLAKE2B_4WAY
#endif
void blake2b_hash(void *output, const void *input)
#if defined(BLAKE2B_8WAY)
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint8_t _ALIGN(A) hash[32];
sph_blake2b_ctx ctx __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]); // 3*16+1
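// hash is interleaved by 64-bit word across 8 lanes: 64-bit word 3 (32-bit
// hash words 6 and 7) starts at 32-bit index 3*16 = 48, so hash7[lane<<1]
// below reads 32-bit hash word 7 of each lane.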
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
sph_blake2b_init(&ctx, 32, NULL, 0);
sph_blake2b_update(&ctx, input, 80);
sph_blake2b_final(&ctx, hash);
uint32_t n = first_nonce;
memcpy(output, hash, 32);
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
blake2b_8way_init( &ctx );
blake2b_8way_update( &ctx, vdata, 80 );
blake2b_8way_final( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
#elif defined(BLAKE2B_4WAY)
// Function not used, code inlined.
void blake2b_4way_hash(void *output, const void *input)
{
uint32_t _ALIGN(A) vhashcpu[8];
uint32_t _ALIGN(A) endiandata[20];
blake2b_4way_ctx ctx;
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, input, 80 );
blake2b_4way_final( &ctx, output );
}
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
for (int i=0; i < 19; i++) {
be32enc(&endiandata[i], pdata[i]);
}
mm256_bswap32_intrlv80_4x64( vdata, pdata );
do {
be32enc(&endiandata[19], n);
blake2b_hash(vhashcpu, endiandata);
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, vdata, 80 );
blake2b_4way_final( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
{
pdata[19] = n;
submit_solution( work, vhashcpu, mythr );
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#else
#include "algo/blake/sph_blake2b.h"
void blake2b_hash(void *output, const void *input)
{
uint8_t _ALIGN(32) hash[32];
sph_blake2b_ctx ctx __attribute__ ((aligned (32)));
sph_blake2b_init(&ctx, 32, NULL, 0);
sph_blake2b_update(&ctx, input, 80);
sph_blake2b_final(&ctx, hash);
memcpy(output, hash, 32);
}
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(32) hash64[8];
uint32_t _ALIGN(32) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm128_bswap32_80( endiandata, pdata );
do {
endiandata[19] = n;
blake2b_hash( hash64, endiandata );
if ( unlikely( valid_hash( hash64, ptarget ) ) && !opt_benchmark )
{
pdata[19] = bswap_32( n );
submit_solution( work, hash64, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif
bool register_blake2b_algo( algo_gate_t* gate )
{
#if defined(BLAKE2B_8WAY)
gate->scanhash = (void*)&scanhash_blake2b_8way;
#elif defined(BLAKE2B_4WAY)
gate->scanhash = (void*)&scanhash_blake2b_4way;
gate->hash = (void*)&blake2b_4way_hash;
#else
gate->scanhash = (void*)&scanhash_blake2b;
gate->hash = (void*)&blake2b_hash;
#endif
gate->optimizations = AVX2_OPT | AVX512_OPT;
return true;
};
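// Hypothetical caller sketch (the miner core's real call sites are not part
// of this diff): whichever scanhash was registered above is reached through
// the gate, so the SIMD width stays a build-time detail.
//
//    algo_gate_t gate;
//    register_blake2b_algo( &gate );
//    gate.scanhash( work, max_nonce, &hashes_done, mythr );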

View File

@@ -1,170 +0,0 @@
#include "blake2s-gate.h"
#include "blake2s-hash-4way.h"
#include <string.h>
#include <stdint.h>
#if defined(BLAKE2S_16WAY)
static __thread blake2s_16way_state blake2s_16w_ctx;
void blake2s_16way_hash( void *output, const void *input )
{
blake2s_16way_state ctx;
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
blake2s_16way_update( &ctx, input + (64<<4), 16 );
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
}
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*16] __attribute__ ((aligned (128)));
uint32_t hash[8*16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<4]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
__m512i *noncev = (__m512i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
do {
*noncev = mm512_bswap_32( _mm512_set_epi32(
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
pdata[19] = n;
blake2s_16way_hash( hash, vdata );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_16x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 16;
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(BLAKE2S_8WAY)
static __thread blake2s_8way_state blake2s_8w_ctx;
void blake2s_8way_hash( void *output, const void *input )
{
blake2s_8way_state ctx;
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
blake2s_8way_update( &ctx, input + (64<<3), 16 );
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
}
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
pdata[19] = n;
blake2s_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(BLAKE2S_4WAY)
static __thread blake2s_4way_state blake2s_4w_ctx;
void blake2s_4way_hash( void *output, const void *input )
{
blake2s_4way_state ctx;
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
blake2s_4way_update( &ctx, input + (64<<2), 16 );
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
}
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
__m128i *noncev = (__m128i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
pdata[19] = n;
blake2s_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 4;
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

View File

@@ -1,23 +0,0 @@
#include "blake2s-gate.h"
bool register_blake2s_algo( algo_gate_t* gate )
{
#if defined(BLAKE2S_16WAY)
gate->scanhash = (void*)&scanhash_blake2s_16way;
gate->hash = (void*)&blake2s_16way_hash;
#elif defined(BLAKE2S_8WAY)
//#if defined(BLAKE2S_8WAY)
gate->scanhash = (void*)&scanhash_blake2s_8way;
gate->hash = (void*)&blake2s_8way_hash;
#elif defined(BLAKE2S_4WAY)
gate->scanhash = (void*)&scanhash_blake2s_4way;
gate->hash = (void*)&blake2s_4way_hash;
#else
gate->scanhash = (void*)&scanhash_blake2s;
gate->hash = (void*)&blake2s_hash;
#endif
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -1,46 +0,0 @@
#ifndef __BLAKE2S_GATE_H__
#define __BLAKE2S_GATE_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__SSE2__)
#define BLAKE2S_4WAY
#endif
#if defined(__AVX2__)
#define BLAKE2S_8WAY
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define BLAKE2S_16WAY
#endif
bool register_blake2s_algo( algo_gate_t* gate );
#if defined(BLAKE2S_16WAY)
void blake2s_16way_hash( void *state, const void *input );
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined (BLAKE2S_8WAY)
void blake2s_8way_hash( void *state, const void *input );
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined (BLAKE2S_4WAY)
void blake2s_4way_hash( void *state, const void *input );
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void blake2s_hash( void *state, const void *input );
int scanhash_blake2s( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif

View File

@@ -11,7 +11,7 @@
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#include "blake2s-hash-4way.h"
#include "blake2s-hash.h"
#include <stdint.h>
#include <string.h>
@@ -62,23 +62,23 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
memset( S, 0, sizeof( blake2s_4way_state ) );
S->h[0] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
S->h[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
S->h[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
S->h[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
S->h[4] = _mm_set1_epi64x( 0x510E527F510E527FULL );
S->h[5] = _mm_set1_epi64x( 0x9B05688C9B05688CULL );
S->h[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
S->h[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL );
S->h[0] = v128_64( 0x6A09E6676A09E667ULL );
S->h[1] = v128_64( 0xBB67AE85BB67AE85ULL );
S->h[2] = v128_64( 0x3C6EF3723C6EF372ULL );
S->h[3] = v128_64( 0xA54FF53AA54FF53AULL );
S->h[4] = v128_64( 0x510E527F510E527FULL );
S->h[5] = v128_64( 0x9B05688C9B05688CULL );
S->h[6] = v128_64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = v128_64( 0x5BE0CD195BE0CD19ULL );
// for( int i = 0; i < 8; ++i )
// S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
// S->h[i] = v128_32( blake2s_IV[i] );
uint32_t *p = ( uint32_t * )( P );
/* IV XOR ParamBlock */
for ( size_t i = 0; i < 8; ++i )
S->h[i] = _mm_xor_si128( S->h[i], _mm_set1_epi32( p[i] ) );
S->h[i] = _mm_xor_si128( S->h[i], v128_32( p[i] ) );
return 0;
}
@@ -90,18 +90,18 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
memcpy_128( m, block, 16 );
memcpy_128( v, S->h, 8 );
v[ 8] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
v[ 9] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
v[10] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
v[11] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
_mm_set1_epi64x( 0x510E527F510E527FULL ) );
v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
_mm_set1_epi64x( 0x9B05688C9B05688CULL ) );
v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
_mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
_mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );
v[ 8] = v128_64( 0x6A09E6676A09E667ULL );
v[ 9] = v128_64( 0xBB67AE85BB67AE85ULL );
v[10] = v128_64( 0x3C6EF3723C6EF372ULL );
v[11] = v128_64( 0xA54FF53AA54FF53AULL );
v[12] = _mm_xor_si128( v128_32( S->t[0] ),
v128_64( 0x510E527F510E527FULL ) );
v[13] = _mm_xor_si128( v128_32( S->t[1] ),
v128_64( 0x9B05688C9B05688CULL ) );
v[14] = _mm_xor_si128( v128_32( S->f[0] ),
v128_64( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm_xor_si128( v128_32( S->f[1] ),
v128_64( 0x5BE0CD195BE0CD19ULL ) );
#define G4W( sigma0, sigma1, a, b, c, d ) \
do { \
@@ -269,35 +269,35 @@ int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
memcpy_256( m, block, 16 );
memcpy_256( v, S->h, 8 );
v[ 8] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
v[ 9] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
v[10] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
v[11] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
_mm256_set1_epi64x( 0x510E527F510E527FULL ) );
v[ 8] = v256_64( 0x6A09E6676A09E667ULL );
v[ 9] = v256_64( 0xBB67AE85BB67AE85ULL );
v[10] = v256_64( 0x3C6EF3723C6EF372ULL );
v[11] = v256_64( 0xA54FF53AA54FF53AULL );
v[12] = _mm256_xor_si256( v256_32( S->t[0] ),
v256_64( 0x510E527F510E527FULL ) );
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
_mm256_set1_epi64x( 0x9B05688C9B05688CULL ) );
v[13] = _mm256_xor_si256( v256_32( S->t[1] ),
v256_64( 0x9B05688C9B05688CULL ) );
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
_mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );
v[14] = _mm256_xor_si256( v256_32( S->f[0] ),
v256_64( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
_mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );
v[15] = _mm256_xor_si256( v256_32( S->f[1] ),
v256_64( 0x5BE0CD195BE0CD19ULL ) );
/*
v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
v[ 9] = _mm256_set1_epi32( blake2s_IV[1] );
v[10] = _mm256_set1_epi32( blake2s_IV[2] );
v[11] = _mm256_set1_epi32( blake2s_IV[3] );
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
_mm256_set1_epi32( blake2s_IV[4] ) );
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
_mm256_set1_epi32( blake2s_IV[5] ) );
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
_mm256_set1_epi32( blake2s_IV[6] ) );
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
_mm256_set1_epi32( blake2s_IV[7] ) );
v[ 8] = v256_32( blake2s_IV[0] );
v[ 9] = v256_32( blake2s_IV[1] );
v[10] = v256_32( blake2s_IV[2] );
v[11] = v256_32( blake2s_IV[3] );
v[12] = _mm256_xor_si256( v256_32( S->t[0] ),
v256_32( blake2s_IV[4] ) );
v[13] = _mm256_xor_si256( v256_32( S->t[1] ),
v256_32( blake2s_IV[5] ) );
v[14] = _mm256_xor_si256( v256_32( S->f[0] ),
v256_32( blake2s_IV[6] ) );
v[15] = _mm256_xor_si256( v256_32( S->f[1] ),
v256_32( blake2s_IV[7] ) );
#define G8W(r,i,a,b,c,d) \
@@ -391,24 +391,24 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
memset( P->personal, 0, sizeof( P->personal ) );
memset( S, 0, sizeof( blake2s_8way_state ) );
S->h[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
S->h[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
S->h[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
S->h[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
S->h[4] = _mm256_set1_epi64x( 0x510E527F510E527FULL );
S->h[5] = _mm256_set1_epi64x( 0x9B05688C9B05688CULL );
S->h[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
S->h[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL );
S->h[0] = v256_64( 0x6A09E6676A09E667ULL );
S->h[1] = v256_64( 0xBB67AE85BB67AE85ULL );
S->h[2] = v256_64( 0x3C6EF3723C6EF372ULL );
S->h[3] = v256_64( 0xA54FF53AA54FF53AULL );
S->h[4] = v256_64( 0x510E527F510E527FULL );
S->h[5] = v256_64( 0x9B05688C9B05688CULL );
S->h[6] = v256_64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = v256_64( 0x5BE0CD195BE0CD19ULL );
// for( int i = 0; i < 8; ++i )
// S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
// S->h[i] = v256_32( blake2s_IV[i] );
uint32_t *p = ( uint32_t * )( P );
/* IV XOR ParamBlock */
for ( size_t i = 0; i < 8; ++i )
S->h[i] = _mm256_xor_si256( S->h[i], _mm256_set1_epi32( p[i] ) );
S->h[i] = _mm256_xor_si256( S->h[i], v256_32( p[i] ) );
return 0;
}
@@ -510,21 +510,21 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
memcpy_512( m, block, 16 );
memcpy_512( v, S->h, 8 );
v[ 8] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
v[ 9] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
v[10] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
v[11] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
_mm512_set1_epi64( 0x510E527F510E527FULL ) );
v[ 8] = v512_64( 0x6A09E6676A09E667ULL );
v[ 9] = v512_64( 0xBB67AE85BB67AE85ULL );
v[10] = v512_64( 0x3C6EF3723C6EF372ULL );
v[11] = v512_64( 0xA54FF53AA54FF53AULL );
v[12] = _mm512_xor_si512( v512_32( S->t[0] ),
v512_64( 0x510E527F510E527FULL ) );
v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
_mm512_set1_epi64( 0x9B05688C9B05688CULL ) );
v[13] = _mm512_xor_si512( v512_32( S->t[1] ),
v512_64( 0x9B05688C9B05688CULL ) );
v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
_mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL ) );
v[14] = _mm512_xor_si512( v512_32( S->f[0] ),
v512_64( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
_mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL ) );
v[15] = _mm512_xor_si512( v512_32( S->f[1] ),
v512_64( 0x5BE0CD195BE0CD19ULL ) );
#define G16W( sigma0, sigma1, a, b, c, d) \
@@ -589,20 +589,20 @@ int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )
memset( P->personal, 0, sizeof( P->personal ) );
memset( S, 0, sizeof( blake2s_16way_state ) );
S->h[0] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
S->h[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
S->h[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
S->h[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
S->h[4] = _mm512_set1_epi64( 0x510E527F510E527FULL );
S->h[5] = _mm512_set1_epi64( 0x9B05688C9B05688CULL );
S->h[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL );
S->h[0] = v512_64( 0x6A09E6676A09E667ULL );
S->h[1] = v512_64( 0xBB67AE85BB67AE85ULL );
S->h[2] = v512_64( 0x3C6EF3723C6EF372ULL );
S->h[3] = v512_64( 0xA54FF53AA54FF53AULL );
S->h[4] = v512_64( 0x510E527F510E527FULL );
S->h[5] = v512_64( 0x9B05688C9B05688CULL );
S->h[6] = v512_64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = v512_64( 0x5BE0CD195BE0CD19ULL );
uint32_t *p = ( uint32_t * )( P );
/* IV XOR ParamBlock */
for ( size_t i = 0; i < 8; ++i )
S->h[i] = _mm512_xor_si512( S->h[i], _mm512_set1_epi32( p[i] ) );
S->h[i] = _mm512_xor_si512( S->h[i], v512_32( p[i] ) );
return 0;
}

View File

@@ -1,75 +1,252 @@
#include "blake2s-gate.h"
#if !defined(BLAKE2S_16WAY) && !defined(BLAKE2S_8WAY) && !defined(BLAKE2S_4WAY)
#include "algo-gate-api.h"
#include "blake2s-hash.h"
#include <string.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define BLAKE2S_16WAY
#elif defined(__AVX2__)
#define BLAKE2S_8WAY
#elif defined(__SSE2__)
#define BLAKE2S_4WAY
#endif
#if defined(BLAKE2S_16WAY)
static __thread blake2s_16way_state blake2s_16w_ctx;
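// The 80-byte block header is absorbed in two steps: the first 64 bytes go
// into this thread-local midstate once per work unit, and only the final
// 16 nonce-bearing bytes are re-hashed per scan iteration (hence the
// input + (64<<4) offset below, which skips 64 bytes in all 16 lanes).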
void blake2s_16way_hash( void *output, const void *input )
{
blake2s_16way_state ctx;
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
blake2s_16way_update( &ctx, input + (64<<4), 16 );
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
}
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*16] __attribute__ ((aligned (128)));
uint32_t hash[8*16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<4]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
__m512i *noncev = (__m512i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
do {
*noncev = mm512_bswap_32( _mm512_set_epi32(
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
pdata[19] = n;
blake2s_16way_hash( hash, vdata );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_16x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 16;
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(BLAKE2S_8WAY)
static __thread blake2s_8way_state blake2s_8w_ctx;
void blake2s_8way_hash( void *output, const void *input )
{
blake2s_8way_state ctx;
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
blake2s_8way_update( &ctx, input + (64<<3), 16 );
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
}
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
pdata[19] = n;
blake2s_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(BLAKE2S_4WAY)
static __thread blake2s_4way_state blake2s_4w_ctx;
void blake2s_4way_hash( void *output, const void *input )
{
blake2s_4way_state ctx;
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
blake2s_4way_update( &ctx, input + (64<<2), 16 );
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
}
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
__m128i *noncev = (__m128i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
pdata[19] = n;
blake2s_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 4;
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#else
#include "sph-blake2s.h"
static __thread blake2s_state blake2s_ctx;
//static __thread blake2s_state s_ctx;
#define MIDLEN 76
void blake2s_hash( void *output, const void *input )
{
unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
blake2s_state ctx __attribute__ ((aligned (64)));
unsigned char _ALIGN(32) hash[BLAKE2S_OUTBYTES];
blake2s_state ctx __attribute__ ((aligned (32)));
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
blake2s_update( &ctx, input+64, 16 );
// blake2s_init(&ctx, BLAKE2S_OUTBYTES);
// blake2s_update(&ctx, input, 80);
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
memcpy(output, hash, 32);
}
/*
static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
int scanhash_blake2s( struct work *work,uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
s_ctx.buflen = MIDLEN;
memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
uint32_t _ALIGN(32) hash32[8];
uint32_t _ALIGN(32) endiandata[20];
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm128_bswap32_80( endiandata, pdata );
// midstate
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
do
{
endiandata[19] = n;
blake2s_hash( hash32, endiandata );
if ( unlikely( valid_hash( hash32, ptarget ) ) && !opt_benchmark )
{
pdata[19] = bswap_32( n );
submit_solution( work, hash32, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
*/
int scanhash_blake2s( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) hash64[8];
uint32_t _ALIGN(64) endiandata[20];
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
swab32_array( endiandata, pdata, 20 );
// midstate
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
do {
be32enc(&endiandata[19], n);
blake2s_hash( hash64, endiandata );
if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif
bool register_blake2s_algo( algo_gate_t* gate )
{
#if defined(BLAKE2S_16WAY)
gate->scanhash = (void*)&scanhash_blake2s_16way;
gate->hash = (void*)&blake2s_16way_hash;
#elif defined(BLAKE2S_8WAY)
gate->scanhash = (void*)&scanhash_blake2s_8way;
gate->hash = (void*)&blake2s_8way_hash;
#elif defined(BLAKE2S_4WAY)
gate->scanhash = (void*)&scanhash_blake2s_4way;
gate->hash = (void*)&blake2s_4way_hash;
#else
gate->scanhash = (void*)&scanhash_blake2s;
gate->hash = (void*)&blake2s_hash;
#endif
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -1,14 +1,12 @@
#if defined (__AVX2__)
#include <stddef.h>
#include <string.h>
#include <limits.h>
#include "blake-hash-4way.h"
#include "blake512-hash.h"
// Blake-512 common
/*
static const uint64_t IV512[8] =
static const uint64_t BLAKE512_IV[8] __attribute__ ((aligned (32))) =
{
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
@@ -16,6 +14,7 @@ static const uint64_t IV512[8] =
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
/*
static const uint64_t salt_zero_big[4] = { 0, 0, 0, 0 };
static const unsigned sigma[16][16] = {
@@ -36,8 +35,10 @@ static const unsigned sigma[16][16] = {
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
static const uint64_t CB[16] = {
*/
/*
static const uint64_t CB[16] __attribute__ ((aligned (32))) =
{
0x243F6A8885A308D3, 0x13198A2E03707344,
0xA4093822299F31D0, 0x082EFA98EC4E6C89,
0x452821E638D01377, 0xBE5466CF34E90C6C,
@@ -46,7 +47,7 @@ static const uint64_t CB[16] = {
0x2FFD72DBD01ADFB7, 0xB8E1AFED6A267E96,
0xBA7C9045F12C7F99, 0x24A19947B3916CF7,
0x0801F2E2858EFC16, 0x636920D871574E69
}
*/
#define Z00 0
@@ -244,6 +245,290 @@ static const uint64_t CB[16] = {
#define CBE 0x0801F2E2858EFC16
#define CBF 0x636920D871574E69
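// CB0..CBF are the sixteen BLAKE-512 round constants, the leading hex
// digits of pi as listed in the BLAKE specification.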
// Blake-512 1 way AVX2 & SSE2
#if defined(__AVX2__)
#define BLAKE512_ROUND( r ) \
{ \
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
_mm256_set_epi64x( CBx( r, 7 ) ^ Mx( r, 6 ), \
CBx( r, 5 ) ^ Mx( r, 4 ), \
CBx( r, 3 ) ^ Mx( r, 2 ), \
CBx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
V3 = mm256_swap64_32( _mm256_xor_si256( V3, V0 ) ); \
V2 = _mm256_add_epi64( V2, V3 ); \
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 25 ); \
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
_mm256_set_epi64x( CBx( r, 6 ) ^ Mx( r, 7 ), \
CBx( r, 4 ) ^ Mx( r, 5 ), \
CBx( r, 2 ) ^ Mx( r, 3 ), \
CBx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
V3 = mm256_shuflr64_16( _mm256_xor_si256( V3, V0 ) ); \
V2 = _mm256_add_epi64( V2, V3 ); \
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 11 ); \
V0 = mm256_shufll_64( V0 ); \
V3 = mm256_swap_128( V3 ); \
V2 = mm256_shuflr_64( V2 ); \
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
_mm256_set_epi64x( CBx( r, D ) ^ Mx( r, C ), \
CBx( r, B ) ^ Mx( r, A ), \
CBx( r, 9 ) ^ Mx( r, 8 ), \
CBx( r, F ) ^ Mx( r, E ) ) ) ); \
V3 = mm256_swap64_32( _mm256_xor_si256( V3, V0 ) ); \
V2 = _mm256_add_epi64( V2, V3 ); \
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 25 ); \
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
_mm256_set_epi64x( CBx( r, C ) ^ Mx( r, D ), \
CBx( r, A ) ^ Mx( r, B ), \
CBx( r, 8 ) ^ Mx( r, 9 ), \
CBx( r, E ) ^ Mx( r, F ) ) ) ); \
V3 = mm256_shuflr64_16( _mm256_xor_si256( V3, V0 ) ); \
V2 = _mm256_add_epi64( V2, V3 ); \
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 11 ); \
V0 = mm256_shuflr_64( V0 ); \
V3 = mm256_swap_128( V3 ); \
V2 = mm256_shufll_64( V2 ); \
}
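// One BLAKE512_ROUND = the four column G's (the first two add/xor/rotate
// blocks, on all columns in parallel) then the four diagonal G's; the
// shufll/swap/shuflr lane rotations in the middle bring the diagonals into
// column position, and the final rotations restore row order.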
void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
const uint64_t T1 )
{
__m256i V0, V1, V2, V3;
uint64_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
V0 = casti_m256i( H, 0 );
V1 = casti_m256i( H, 1 );
V2 = _mm256_set_epi64x( CB3, CB2, CB1, CB0 );
V3 = _mm256_set_epi64x( T1 ^ CB7, T1 ^ CB6, T0 ^ CB5, T0 ^ CB4 );
M0 = bswap_64( buf[ 0] );
M1 = bswap_64( buf[ 1] );
M2 = bswap_64( buf[ 2] );
M3 = bswap_64( buf[ 3] );
M4 = bswap_64( buf[ 4] );
M5 = bswap_64( buf[ 5] );
M6 = bswap_64( buf[ 6] );
M7 = bswap_64( buf[ 7] );
M8 = bswap_64( buf[ 8] );
M9 = bswap_64( buf[ 9] );
MA = bswap_64( buf[10] );
MB = bswap_64( buf[11] );
MC = bswap_64( buf[12] );
MD = bswap_64( buf[13] );
ME = bswap_64( buf[14] );
MF = bswap_64( buf[15] );
BLAKE512_ROUND( 0 );
BLAKE512_ROUND( 1 );
BLAKE512_ROUND( 2 );
BLAKE512_ROUND( 3 );
BLAKE512_ROUND( 4 );
BLAKE512_ROUND( 5 );
BLAKE512_ROUND( 6 );
BLAKE512_ROUND( 7 );
BLAKE512_ROUND( 8 );
BLAKE512_ROUND( 9 );
BLAKE512_ROUND( 0 );
BLAKE512_ROUND( 1 );
BLAKE512_ROUND( 2 );
BLAKE512_ROUND( 3 );
BLAKE512_ROUND( 4 );
BLAKE512_ROUND( 5 );
casti_m256i( H, 0 ) = mm256_xor3( casti_m256i( H, 0 ), V0, V2 );
casti_m256i( H, 1 ) = mm256_xor3( casti_m256i( H, 1 ), V1, V3 );
}
#else
#define BLAKE512_G( r, Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( CBx( r, Sd ) ^ Mx( r, Sc ), \
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 25 ); \
\
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( CBx( r, Sc ) ^ Mx( r, Sd ), \
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 11 ); \
}
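// BLAKE512_G mixes two columns at once; the 32- and 16-bit rotations are
// done as shuffles (swap64_32, shuflr64_16), the 25- and 11-bit ones as
// generic 64-bit rotates.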
#define BLAKE512_ROUND( R ) \
{ \
__m128i V32, V23, V67, V76; \
BLAKE512_G( R, V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE512_G( R, V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V32 = mm128_alignr_64( V[3], V[2], 1 ); \
V23 = mm128_alignr_64( V[2], V[3], 1 ); \
V67 = mm128_alignr_64( V[6], V[7], 1 ); \
V76 = mm128_alignr_64( V[7], V[6], 1 ); \
BLAKE512_G( R, V[0], V32, V[5], V67, 8, 9, A, B ); \
BLAKE512_G( R, V[1], V23, V[4], V76, C, D, E, F ); \
V[2] = mm128_alignr_64( V32, V23, 1 ); \
V[3] = mm128_alignr_64( V23, V32, 1 ); \
V[6] = mm128_alignr_64( V76, V67, 1 ); \
V[7] = mm128_alignr_64( V67, V76, 1 ); \
}
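// SSE2 path: each BLAKE512_G mixes two columns, so the first pair of calls
// covers all four columns; mm128_alignr_64 then splices the row-pair
// registers so the second pair of G calls runs on the diagonals, and the
// final four alignr's splice them back into row order.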
void blake512_transform( uint64_t *H, const uint64_t *buf,
const uint64_t T0, const uint64_t T1 )
{
__m128i V[8];
uint64_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
V[0] = casti_m128i( H, 0 );
V[1] = casti_m128i( H, 1 );
V[2] = casti_m128i( H, 2 );
V[3] = casti_m128i( H, 3 );
V[4] = _mm_set_epi64x( CB1, CB0 );
V[5] = _mm_set_epi64x( CB3, CB2 );
V[6] = _mm_set_epi64x( T0 ^ CB5, T0 ^ CB4 );
V[7] = _mm_set_epi64x( T1 ^ CB7, T1 ^ CB6 );
M0 = bswap_64( buf[ 0] );
M1 = bswap_64( buf[ 1] );
M2 = bswap_64( buf[ 2] );
M3 = bswap_64( buf[ 3] );
M4 = bswap_64( buf[ 4] );
M5 = bswap_64( buf[ 5] );
M6 = bswap_64( buf[ 6] );
M7 = bswap_64( buf[ 7] );
M8 = bswap_64( buf[ 8] );
M9 = bswap_64( buf[ 9] );
MA = bswap_64( buf[10] );
MB = bswap_64( buf[11] );
MC = bswap_64( buf[12] );
MD = bswap_64( buf[13] );
ME = bswap_64( buf[14] );
MF = bswap_64( buf[15] );
BLAKE512_ROUND( 0 );
BLAKE512_ROUND( 1 );
BLAKE512_ROUND( 2 );
BLAKE512_ROUND( 3 );
BLAKE512_ROUND( 4 );
BLAKE512_ROUND( 5 );
BLAKE512_ROUND( 6 );
BLAKE512_ROUND( 7 );
BLAKE512_ROUND( 8 );
BLAKE512_ROUND( 9 );
BLAKE512_ROUND( 0 );
BLAKE512_ROUND( 1 );
BLAKE512_ROUND( 2 );
BLAKE512_ROUND( 3 );
BLAKE512_ROUND( 4 );
BLAKE512_ROUND( 5 );
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V[0], V[4] );
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V[1], V[5] );
casti_m128i( H, 2 ) = mm128_xor3( casti_m128i( H, 2 ), V[2], V[6] );
casti_m128i( H, 3 ) = mm128_xor3( casti_m128i( H, 3 ), V[3], V[7] );
}
#endif
void blake512_init( blake512_context *sc )
{
memcpy( sc->H, BLAKE512_IV, 8 * sizeof(uint64_t) );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
void blake512_update(blake512_context *sc, const void *data, size_t len)
{
if ( len < (sizeof sc->buf) - sc->ptr )
{
memcpy( sc->buf + sc->ptr, data, len );
sc->ptr += len;
return;
}
while ( len > 0 )
{
size_t clen;
clen = (sizeof sc->buf) - sc->ptr;
if ( clen > len ) clen = len;
memcpy( sc->buf + sc->ptr, data, clen );
sc->ptr += clen;
data = (const unsigned char *)data + clen;
len -= clen;
if ( sc->ptr == sizeof sc->buf )
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 += 1;
blake512_transform( sc->H, (uint64_t*)sc->buf, sc->T0, sc->T1 );
sc->ptr = 0;
}
}
}
void blake512_close( blake512_context *sc, void *dst )
{
unsigned char buf[128] __attribute__((aligned(32)));
size_t ptr, k;
unsigned bit_len;
uint64_t th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr] = 0x80;
tl = sc->T0 + bit_len;
th = sc->T1;
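// The padding block goes back through blake512_update, which will add a
// full 1024 bits to the counters; pre-adjust them here so the final count
// comes out right. The all-ones values wrap to zero for a padding block
// that carries no message bits.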
if ( ptr == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00;
sc->T1 = 0xFFFFFFFFFFFFFFFF;
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00 + bit_len;
sc->T1 -= 1;
}
else
sc->T0 -= 1024 - bit_len;
if ( bit_len <= 894 )
{
memset( buf + ptr + 1, 0, 111 - ptr );
buf[111] |= 1;
*((uint64_t*)(buf + 112)) = bswap_64( th );
*((uint64_t*)(buf + 120)) = bswap_64( tl );
blake512_update( sc, buf + ptr, 128 - ptr );
}
else
{
memset( buf + ptr + 1, 0, 127 - ptr );
blake512_update( sc, buf + ptr, 128 - ptr );
sc->T0 = 0xFFFFFFFFFFFFFC00;
sc->T1 = 0xFFFFFFFFFFFFFFFF;
memset( buf, 0, 112 );
buf[111] = 1;
*(uint64_t*)(buf + 112) = bswap_64( th );
*(uint64_t*)(buf + 120) = bswap_64( tl );
blake512_update( sc, buf, 128 );
}
for ( k = 0; k < 8; k ++ )
((uint64_t*)dst)[k] = bswap_64( sc->H[k] );
}
void blake512_full( blake512_context *sc, void *dst, const void *data,
size_t len )
{
blake512_init( sc );
blake512_update( sc, data, len );
blake512_close( sc, dst );
}
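// Usage sketch for the 1-way API above, e.g. hashing an 80-byte block
// header into a 64-byte digest:
//
//    blake512_context ctx;
//    uint8_t digest[64] __attribute__((aligned(32)));
//    blake512_full( &ctx, digest, header, 80 );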
#if defined(__AVX2__)
#define READ_STATE64(state) do { \
H0 = (state)->H[0]; \
H1 = (state)->H[1]; \
@@ -277,12 +562,12 @@ static const uint64_t CB[16] = {
#define GB_8WAY( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
_mm512_set1_epi64( c1 ), m0 ), b ), a ); \
v512_64( c1 ), m0 ), b ), a ); \
d = mm512_swap64_32( _mm512_xor_si512( d, a ) ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
_mm512_set1_epi64( c0 ), m1 ), b ), a ); \
v512_64( c0 ), m1 ), b ), a ); \
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \
@@ -318,14 +603,14 @@ static const uint64_t CB[16] = {
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm512_set1_epi64( CB0 ); \
V9 = _mm512_set1_epi64( CB1 ); \
VA = _mm512_set1_epi64( CB2 ); \
VB = _mm512_set1_epi64( CB3 ); \
VC = _mm512_set1_epi64( T0 ^ CB4 ); \
VD = _mm512_set1_epi64( T0 ^ CB5 ); \
VE = _mm512_set1_epi64( T1 ^ CB6 ); \
VF = _mm512_set1_epi64( T1 ^ CB7 ); \
V8 = v512_64( CB0 ); \
V9 = v512_64( CB1 ); \
VA = v512_64( CB2 ); \
VB = v512_64( CB3 ); \
VC = v512_64( T0 ^ CB4 ); \
VD = v512_64( T0 ^ CB5 ); \
VE = v512_64( T1 ^ CB6 ); \
VF = v512_64( T1 ^ CB7 ); \
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
@@ -385,14 +670,14 @@ void blake512_8way_compress( blake_8way_big_context *sc )
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm512_set1_epi64( CB0 );
V9 = _mm512_set1_epi64( CB1 );
VA = _mm512_set1_epi64( CB2 );
VB = _mm512_set1_epi64( CB3 );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
V8 = v512_64( CB0 );
V9 = v512_64( CB1 );
VA = v512_64( CB2 );
VB = v512_64( CB3 );
VC = v512_64( sc->T0 ^ CB4 );
VD = v512_64( sc->T0 ^ CB5 );
VE = v512_64( sc->T1 ^ CB6 );
VF = v512_64( sc->T1 ^ CB7 );
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
@@ -457,14 +742,14 @@ void blake512_8way_compress_le( blake_8way_big_context *sc )
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm512_set1_epi64( CB0 );
V9 = _mm512_set1_epi64( CB1 );
VA = _mm512_set1_epi64( CB2 );
VB = _mm512_set1_epi64( CB3 );
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
V8 = v512_64( CB0 );
V9 = v512_64( CB1 );
VA = v512_64( CB2 );
VB = v512_64( CB3 );
VC = v512_64( sc->T0 ^ CB4 );
VD = v512_64( sc->T0 ^ CB5 );
VE = v512_64( sc->T1 ^ CB6 );
VF = v512_64( sc->T1 ^ CB7 );
M0 = sc->buf[ 0];
M1 = sc->buf[ 1];
@@ -519,23 +804,23 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
// initial hash
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
// fill buffer
memcpy_512( sc->buf, (__m512i*)data, 80>>3 );
sc->buf[10] = _mm512_set1_epi64( 0x8000000000000000ULL );
sc->buf[10] = v512_64( 0x8000000000000000ULL );
sc->buf[11] =
sc->buf[12] = m512_zero;
sc->buf[13] = m512_one_64;
sc->buf[13] = v512_64( 1 );
sc->buf[14] = m512_zero;
sc->buf[15] = _mm512_set1_epi64( 80*8 );
sc->buf[15] = v512_64( 80*8 );
// build working variables
V0 = sc->H[0];
@@ -546,14 +831,14 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm512_set1_epi64( CB0 );
V9 = _mm512_set1_epi64( CB1 );
VA = _mm512_set1_epi64( CB2 );
VB = _mm512_set1_epi64( CB3 );
VC = _mm512_set1_epi64( CB4 ^ 0x280ULL );
VD = _mm512_set1_epi64( CB5 ^ 0x280ULL );
VE = _mm512_set1_epi64( CB6 );
VF = _mm512_set1_epi64( CB7 );
V8 = v512_64( CB0 );
V9 = v512_64( CB1 );
VA = v512_64( CB2 );
VB = v512_64( CB3 );
VC = v512_64( CB4 ^ 0x280ULL );
VD = v512_64( CB5 ^ 0x280ULL );
VE = v512_64( CB6 );
VF = v512_64( CB7 );
// round 0
GB_8WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
@@ -565,7 +850,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
// GB_8WAY( sc->buf[ 8], sc->buf[ 9], CBx(0, 8), CBx(0, 9), V0, V5, VA, VF );
V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
v512_64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm512_swap64_32( _mm512_xor_si512( VF, V0 ) );
VA = _mm512_add_epi64( VA, VF );
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 );
@@ -578,7 +863,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
// round 1
// G1
// GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD);
V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( CB8 ),
V1 = _mm512_add_epi64( V1, _mm512_xor_si512( v512_64( CB8 ),
sc->buf[ 4] ) );
// G2
@@ -588,7 +873,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
// G3
// GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF);
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB6 ), sc->buf[13] ), V7 ) );
v512_64( CB6 ), sc->buf[13] ), V7 ) );
// save midstate for second part
midstate[ 0] = V0;
@@ -656,7 +941,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
// finish round 0 with the nonce now available
V0 = _mm512_add_epi64( V0, _mm512_xor_si512(
_mm512_set1_epi64( CB8 ), M9 ) );
v512_64( CB8 ), M9 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 16 );
VA = _mm512_add_epi64( VA, VF );
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 11 );
@@ -667,14 +952,14 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
// G1
// GB_8WAY(Mx(1, 2), Mx(1, 3), CBx(1, 2), CBx(1, 3), V1, V5, V9, VD);
// V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 );
// V1 = _mm512_add_epi64( V1, _mm512_xor_si512( v512_64( c1 ), m0 );
V1 = _mm512_add_epi64( V1, V5 );
VD = mm512_swap64_32( _mm512_xor_si512( VD, V1 ) );
V9 = _mm512_add_epi64( V9, VD );
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 );
V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CBx(1,2) ), Mx(1,3) ), V5 ) );
v512_64( CBx(1,2) ), Mx(1,3) ), V5 ) );
VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 16 );
V9 = _mm512_add_epi64( V9, VD );
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 11 );
@@ -683,12 +968,12 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
// GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
// V2 = _mm512_add_epi64( V2, V6 );
V2 = _mm512_add_epi64( V2, _mm512_xor_si512(
_mm512_set1_epi64( CBF ), M9 ) );
v512_64( CBF ), M9 ) );
VE = mm512_swap64_32( _mm512_xor_si512( VE, V2 ) );
VA = _mm512_add_epi64( VA, VE );
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 );
V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB9 ), MF ), V6 ) );
v512_64( CB9 ), MF ), V6 ) );
VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 16 );
VA = _mm512_add_epi64( VA, VE );
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 11 );
@@ -696,13 +981,13 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
// G3
// GB_8WAY(Mx(1, 6), Mx(1, 7), CBx(1, 6), CBx(1, 7), V3, V7, VB, VF);
// V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
// _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );
// v512_64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );
VF = mm512_swap64_32( _mm512_xor_si512( VF, V3 ) );
VB = _mm512_add_epi64( VB, VF );
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 );
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
v512_64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 16 );
VB = _mm512_add_epi64( VB, VF );
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 11 );
@@ -744,14 +1029,14 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
void blake512_8way_init( blake_8way_big_context *sc )
{
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -815,7 +1100,7 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>3] = _mm512_set1_epi64( 0x80 );
buf[ptr>>3] = v512_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
@@ -836,9 +1121,9 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
{
memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
buf[104>>3] = _mm512_or_si512( buf[104>>3],
_mm512_set1_epi64( 0x0100000000000000ULL ) );
buf[112>>3] = _mm512_set1_epi64( bswap_64( th ) );
buf[120>>3] = _mm512_set1_epi64( bswap_64( tl ) );
v512_64( 0x0100000000000000ULL ) );
buf[112>>3] = v512_64( bswap_64( th ) );
buf[120>>3] = v512_64( bswap_64( tl ) );
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
}
@@ -850,9 +1135,9 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
memset_zero_512( buf, 112>>3 );
buf[104>>3] = _mm512_set1_epi64( 0x0100000000000000ULL );
buf[112>>3] = _mm512_set1_epi64( bswap_64( th ) );
buf[120>>3] = _mm512_set1_epi64( bswap_64( tl ) );
buf[104>>3] = v512_64( 0x0100000000000000ULL );
buf[112>>3] = v512_64( bswap_64( th ) );
buf[120>>3] = v512_64( bswap_64( tl ) );
blake64_8way( sc, buf, 128 );
}
@@ -866,14 +1151,14 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
// init
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -897,7 +1182,7 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = _mm512_set1_epi64( 0x80 );
sc->buf[ptr64] = v512_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
@@ -915,9 +1200,9 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
sc->T0 -= 1024 - bit_len;
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = _mm512_set1_epi64( 0x0100000000000000ULL );
sc->buf[14] = _mm512_set1_epi64( bswap_64( th ) );
sc->buf[15] = _mm512_set1_epi64( bswap_64( tl ) );
sc->buf[13] = v512_64( 0x0100000000000000ULL );
sc->buf[14] = v512_64( bswap_64( th ) );
sc->buf[15] = v512_64( bswap_64( tl ) );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
@@ -933,14 +1218,14 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
// init
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -964,7 +1249,7 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = _mm512_set1_epi64( 0x8000000000000000ULL );
sc->buf[ptr64] = v512_64( 0x8000000000000000ULL );
tl = sc->T0 + bit_len;
th = sc->T1;
@@ -982,9 +1267,9 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
sc->T0 -= 1024 - bit_len;
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m512_one_64;
sc->buf[14] = _mm512_set1_epi64( th );
sc->buf[15] = _mm512_set1_epi64( tl );
sc->buf[13] = v512_64( 1 );
sc->buf[14] = v512_64( th );
sc->buf[15] = v512_64( tl );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
@@ -1013,12 +1298,12 @@ blake512_8way_close(void *cc, void *dst)
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) \
{ \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set1_epi64x( c1 ), m0 ), b ), a ); \
v256_64( c1 ), m0 ), b ), a ); \
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set1_epi64x( c0 ), m1 ), b ), a ); \
v256_64( c0 ), m1 ), b ), a ); \
d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
@@ -1054,14 +1339,14 @@ blake512_8way_close(void *cc, void *dst)
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_set1_epi64x( CB0 ); \
V9 = _mm256_set1_epi64x( CB1 ); \
VA = _mm256_set1_epi64x( CB2 ); \
VB = _mm256_set1_epi64x( CB3 ); \
VC = _mm256_set1_epi64x( T0 ^ CB4 ); \
VD = _mm256_set1_epi64x( T0 ^ CB5 ); \
VE = _mm256_set1_epi64x( T1 ^ CB6 ); \
VF = _mm256_set1_epi64x( T1 ^ CB7 ); \
V8 = v256_64( CB0 ); \
V9 = v256_64( CB1 ); \
VA = v256_64( CB2 ); \
VB = v256_64( CB3 ); \
VC = v256_64( T0 ^ CB4 ); \
VD = v256_64( T0 ^ CB5 ); \
VE = v256_64( T1 ^ CB6 ); \
VF = v256_64( T1 ^ CB7 ); \
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
@@ -1122,18 +1407,18 @@ void blake512_4way_compress( blake_4way_big_context *sc )
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm256_set1_epi64x( CB0 );
V9 = _mm256_set1_epi64x( CB1 );
VA = _mm256_set1_epi64x( CB2 );
VB = _mm256_set1_epi64x( CB3 );
VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
_mm256_set1_epi64x( CB4 ) );
VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
_mm256_set1_epi64x( CB5 ) );
VE = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
_mm256_set1_epi64x( CB6 ) );
VF = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
_mm256_set1_epi64x( CB7 ) );
V8 = v256_64( CB0 );
V9 = v256_64( CB1 );
VA = v256_64( CB2 );
VB = v256_64( CB3 );
VC = _mm256_xor_si256( v256_64( sc->T0 ),
v256_64( CB4 ) );
VD = _mm256_xor_si256( v256_64( sc->T0 ),
v256_64( CB5 ) );
VE = _mm256_xor_si256( v256_64( sc->T1 ),
v256_64( CB6 ) );
VF = _mm256_xor_si256( v256_64( sc->T1 ),
v256_64( CB7 ) );
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
@@ -1188,23 +1473,23 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
// initial hash
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = v256_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = v256_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = v256_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = v256_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = v256_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = v256_64( 0x5BE0CD19137E2179 );
// fill buffer
memcpy_256( sc->buf, (__m256i*)data, 80>>3 );
sc->buf[10] = _mm256_set1_epi64x( 0x8000000000000000ULL );
sc->buf[10] = v256_64( 0x8000000000000000ULL );
sc->buf[11] = m256_zero;
sc->buf[12] = m256_zero;
sc->buf[13] = m256_one_64;
sc->buf[13] = v256_64( 1 );
sc->buf[14] = m256_zero;
sc->buf[15] = _mm256_set1_epi64x( 80*8 );
sc->buf[15] = v256_64( 80*8 );
// build working variables
V0 = sc->H[0];
@@ -1215,14 +1500,14 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm256_set1_epi64x( CB0 );
V9 = _mm256_set1_epi64x( CB1 );
VA = _mm256_set1_epi64x( CB2 );
VB = _mm256_set1_epi64x( CB3 );
VC = _mm256_set1_epi64x( CB4 ^ 0x280ULL );
VD = _mm256_set1_epi64x( CB5 ^ 0x280ULL );
VE = _mm256_set1_epi64x( CB6 );
VF = _mm256_set1_epi64x( CB7 );
V8 = v256_64( CB0 );
V9 = v256_64( CB1 );
VA = v256_64( CB2 );
VB = v256_64( CB3 );
VC = v256_64( CB4 ^ 0x280ULL );
VD = v256_64( CB5 ^ 0x280ULL );
VE = v256_64( CB6 );
VF = v256_64( CB7 );
// round 0
GB_4WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
@@ -1232,7 +1517,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
// G4 skip nonce
V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 );
v256_64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm256_swap64_32( _mm256_xor_si256( VF, V0 ) );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 );
@@ -1244,7 +1529,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
// round 1
// G1
V1 = _mm256_add_epi64( V1, _mm256_xor_si256( _mm256_set1_epi64x( CB8 ),
V1 = _mm256_add_epi64( V1, _mm256_xor_si256( v256_64( CB8 ),
sc->buf[ 4] ) );
// G2
@@ -1252,7 +1537,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
// G3
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB6 ), sc->buf[13] ), V7 ) );
v256_64( CB6 ), sc->buf[13] ), V7 ) );
// save midstate for second part
midstate[ 0] = V0;
@@ -1319,7 +1604,7 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
// finish round 0, with the nonce now available
V0 = _mm256_add_epi64( V0, _mm256_xor_si256(
_mm256_set1_epi64x( CB8 ), M9 ) );
v256_64( CB8 ), M9 ) );
VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V0 ) );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 );
@@ -1334,19 +1619,19 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 );
V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) );
v256_64( CBx(1,2) ), Mx(1,3) ), V5 ) );
VD = mm256_shuflr64_16( _mm256_xor_si256( VD, V1 ) );
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 );
// G2
V2 = _mm256_add_epi64( V2, _mm256_xor_si256(
_mm256_set1_epi64x( CBF ), M9 ) );
v256_64( CBF ), M9 ) );
VE = mm256_swap64_32( _mm256_xor_si256( VE, V2 ) );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 );
V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), MF ), V6 ) );
v256_64( CB9 ), MF ), V6 ) );
VE = mm256_shuflr64_16( _mm256_xor_si256( VE, V2 ) );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 );
@@ -1356,7 +1641,7 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 );
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
v256_64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V3 ) );
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 );
@@ -1398,14 +1683,14 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
void blake512_4way_init( blake_4way_big_context *sc )
{
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = v256_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = v256_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = v256_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = v256_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = v256_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = v256_64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -1465,7 +1750,7 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
buf[ptr>>3] = v256_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
@@ -1487,9 +1772,9 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
{
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
buf[104>>3] = _mm256_or_si256( buf[104>>3],
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
buf[120>>3] = _mm256_set1_epi64x( bswap_64( tl ) );
v256_64( 0x0100000000000000ULL ) );
buf[112>>3] = v256_64( bswap_64( th ) );
buf[120>>3] = v256_64( bswap_64( tl ) );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
}
@@ -1501,9 +1786,9 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
memset_zero_256( buf, 112>>3 );
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
buf[120>>3] = _mm256_set1_epi64x( bswap_64( tl ) );
buf[104>>3] = v256_64( 0x0100000000000000ULL );
buf[112>>3] = v256_64( bswap_64( th ) );
buf[120>>3] = v256_64( bswap_64( tl ) );
blake64_4way( sc, buf, 128 );
}
@@ -1517,14 +1802,14 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
// init
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = v256_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = v256_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = v256_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = v256_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = v256_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = v256_64( 0x5BE0CD19137E2179 );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
@@ -1548,7 +1833,7 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = _mm256_set1_epi64x( 0x80 );
sc->buf[ptr64] = v256_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( sc->ptr == 0 )
@@ -1565,9 +1850,9 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
sc->T0 -= 1024 - bit_len;
memset_zero_256( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = _mm256_set1_epi64x( 0x0100000000000000ULL );
sc->buf[14] = _mm256_set1_epi64x( bswap_64( th ) );
sc->buf[15] = _mm256_set1_epi64x( bswap_64( tl ) );
sc->buf[13] = v256_64( 0x0100000000000000ULL );
sc->buf[14] = v256_64( bswap_64( th ) );
sc->buf[15] = v256_64( bswap_64( tl ) );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
@@ -1589,4 +1874,4 @@ blake512_4way_close(void *cc, void *dst)
blake64_4way_close( cc, dst );
}
#endif
#endif // AVX2

View File

@@ -0,0 +1,83 @@
#ifndef BLAKE512_HASH__
#define BLAKE512_HASH__ 1
#include <stddef.h>
#include "simd-utils.h"
/////////////////////////
//
// Blake-512 1 way SSE2 & AVX2
typedef struct {
unsigned char buf[128]; /* first field, for alignment */
uint64_t H[8];
uint64_t T0, T1;
size_t ptr;
} blake512_context __attribute__ ((aligned (32)));
void blake512_transform( uint64_t *H, const uint64_t *buf,
const uint64_t T0, const uint64_t T1 );
void blake512_init( blake512_context *sc );
void blake512_update( blake512_context *sc, const void *data, size_t len );
void blake512_close( blake512_context *sc, void *dst );
void blake512_full( blake512_context *sc, void *dst, const void *data,
size_t len );
#ifdef __AVX2__
// Blake-512 4 way AVX2
typedef struct {
__m256i buf[16];
__m256i H[8];
__m256i S[4];
size_t ptr;
uint64_t T0, T1;
} blake_4way_big_context __attribute__ ((aligned (64)));
typedef blake_4way_big_context blake512_4way_context;
void blake512_4way_init( blake_4way_big_context *sc );
void blake512_4way_update( void *cc, const void *data, size_t len );
void blake512_4way_close( void *cc, void *dst );
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
const void *data );
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
////////////////////////////
//
//// Blake-512 8 way AVX512
typedef struct {
__m512i buf[16];
__m512i H[8];
__m512i S[4];
size_t ptr;
uint64_t T0, T1;
} blake_8way_big_context __attribute__ ((aligned (128)));
typedef blake_8way_big_context blake512_8way_context;
void blake512_8way_init( blake_8way_big_context *sc );
void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
const void *data );
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
const __m512i nonce, const __m512i *midstate );
#endif // AVX512
#endif // AVX2
#endif // BLAKE512_HASH__
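// A minimal usage sketch of the 4-way prehash/final pair declared above;
// the midstate save-area size and the nonce layout are assumptions for
// illustration only, not taken from this commit.
#ifdef __AVX2__
static inline void blake512_4way_scan_sketch( const void *data80, uint32_t n,
                                              void *hash /* 4 lanes */ )
{
   blake512_4way_context ctx;
   __m256i midstate[16];                                  // assumed size
   blake512_4way_prehash_le( &ctx, midstate, data80 );    // once per block
   const __m256i nonce = _mm256_set_epi64x( n+3, n+2, n+1, n );
   blake512_4way_final_le( &ctx, hash, nonce, midstate ); // per 4 nonces
}
#endif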

View File

@@ -1,5 +1,5 @@
#include "blakecoin-gate.h"
#include "blake-hash-4way.h"
#include "blake256-hash.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
@@ -30,25 +30,25 @@ int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
const uint32_t last_nonce = max_nonce - 16;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i sixteen = _mm512_set1_epi32( 16 );
const __m512i sixteen = v512_32( 16 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0, rounds );
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
block0_hash[2] = _mm512_set1_epi32( phash[2] );
block0_hash[3] = _mm512_set1_epi32( phash[3] );
block0_hash[4] = _mm512_set1_epi32( phash[4] );
block0_hash[5] = _mm512_set1_epi32( phash[5] );
block0_hash[6] = _mm512_set1_epi32( phash[6] );
block0_hash[7] = _mm512_set1_epi32( phash[7] );
block0_hash[0] = v512_32( phash[0] );
block0_hash[1] = v512_32( phash[1] );
block0_hash[2] = v512_32( phash[2] );
block0_hash[3] = v512_32( phash[3] );
block0_hash[4] = v512_32( phash[4] );
block0_hash[5] = v512_32( phash[5] );
block0_hash[6] = v512_32( phash[6] );
block0_hash[7] = v512_32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[0] = _mm512_set1_epi32( pdata[16] );
block_buf[1] = _mm512_set1_epi32( pdata[17] );
block_buf[2] = _mm512_set1_epi32( pdata[18] );
block_buf[0] = v512_32( pdata[16] );
block_buf[1] = v512_32( pdata[17] );
block_buf[2] = v512_32( pdata[18] );
block_buf[3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
@@ -101,25 +101,25 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
const uint32_t last_nonce = max_nonce - 8;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i eight = v256_32( 8 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0, rounds );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
block0_hash[0] = v256_32( phash[0] );
block0_hash[1] = v256_32( phash[1] );
block0_hash[2] = v256_32( phash[2] );
block0_hash[3] = v256_32( phash[3] );
block0_hash[4] = v256_32( phash[4] );
block0_hash[5] = v256_32( phash[5] );
block0_hash[6] = v256_32( phash[6] );
block0_hash[7] = v256_32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[0] = _mm256_set1_epi32( pdata[16] );
block_buf[1] = _mm256_set1_epi32( pdata[17] );
block_buf[2] = _mm256_set1_epi32( pdata[18] );
block_buf[0] = v256_32( pdata[16] );
block_buf[1] = v256_32( pdata[17] );
block_buf[2] = v256_32( pdata[18] );
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
// Partially prehash second block without touching nonces in block_buf[3].
@@ -203,142 +203,3 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
#endif
#if 0
//#if defined(BLAKECOIN_8WAY)
blake256r8_8way_context blakecoin_8w_ctx;
void blakecoin_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256r8_8way_context ctx;
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
blake256r8_8way_update( &ctx, input + (64<<3), 16 );
blake256r8_8way_close( &ctx, vhash );
dintrlv_8x32( state, state+ 32, state+ 64, state+ 96, state+128,
state+160, state+192, state+224, vhash, 256 );
}
/*
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash32[8*8] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
__m256i block0_hash[8] __attribute__ ((aligned (64)));
__m256i block_buf[16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = (uint32_t*)work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = _mm256_set1_epi32( 8 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0, 8 );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[0] = _mm256_set1_epi32( pdata[16] );
block_buf[1] = _mm256_set1_epi32( pdata[17] );
block_buf[2] = _mm256_set1_epi32( pdata[18] );
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
// Partially prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
block_buf );
for ( int lane = 0; lane < 8; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
block_buf[3] = _mm256_add_epi32( block_buf[3], eight );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
*/
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash32[8*8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
blake256r8_8way_context ctx __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( ((__m256i*)hash32)[7] );
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t HTarget = ptarget[7];
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
HTarget = 0x7f;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256r8_8way_init( &blakecoin_8w_ctx );
blake256r8_8way_update( &blakecoin_8w_ctx, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
pdata[19] = n;
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
blake256r8_8way_update( &ctx, (const void*)vdata + (64<<3), 16 );
blake256r8_8way_close( &ctx, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( hash32_d7[ lane ] <= HTarget )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

View File

@@ -6,9 +6,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "blake-hash-4way.h"
#include "sph_blake.h"
#include "blake512-hash.h"
extern void pentablakehash_4way( void *output, const void *input )
{

View File

@@ -14,8 +14,9 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "simd-utils.h"
#include "compat/sph_types.h"
#include "compat.h"
#include "sph-blake2s.h"
static const uint32_t blake2s_IV[8] =
@@ -208,8 +209,8 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c
int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
{
uint32_t m[16];
uint32_t v[16];
uint32_t _ALIGN(32) m[16];
uint32_t _ALIGN(32) v[16];
for( size_t i = 0; i < 16; ++i )
m[i] = load32( block + i * sizeof( m[i] ) );
@@ -225,6 +226,58 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
v[13] = S->t[1] ^ blake2s_IV[5];
v[14] = S->f[0] ^ blake2s_IV[6];
v[15] = S->f[1] ^ blake2s_IV[7];
#if defined(__SSE2__)
__m128i *V = (__m128i*)v;
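// v[] is viewed as four row vectors V[0..3]: each BLAKE2S_ROUND applies the
// column half of G, rotates the rows into diagonal position (shufll/swap/
// shuflr), applies the diagonal half, then rotates the rows back.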
#define BLAKE2S_ROUND( r ) \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
m[blake2s_sigma[r][ 6]], m[blake2s_sigma[r][ 4]], \
m[blake2s_sigma[r][ 2]], m[blake2s_sigma[r][ 0]] ) ) ); \
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
m[blake2s_sigma[r][ 7]], m[blake2s_sigma[r][ 5]], \
m[blake2s_sigma[r][ 3]], m[blake2s_sigma[r][ 1]] ) ) ); \
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
V[0] = mm128_shufll_32( V[0] ); \
V[3] = mm128_swap_64( V[3] ); \
V[2] = mm128_shuflr_32( V[2] ); \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
m[blake2s_sigma[r][12]], m[blake2s_sigma[r][10]], \
m[blake2s_sigma[r][ 8]], m[blake2s_sigma[r][14]] ) ) ); \
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
m[blake2s_sigma[r][13]], m[blake2s_sigma[r][11]], \
m[blake2s_sigma[r][ 9]], m[blake2s_sigma[r][15]] ) ) ); \
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
V[0] = mm128_shuflr_32( V[0] ); \
V[3] = mm128_swap_64( V[3] ); \
V[2] = mm128_shufll_32( V[2] )
BLAKE2S_ROUND(0);
BLAKE2S_ROUND(1);
BLAKE2S_ROUND(2);
BLAKE2S_ROUND(3);
BLAKE2S_ROUND(4);
BLAKE2S_ROUND(5);
BLAKE2S_ROUND(6);
BLAKE2S_ROUND(7);
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);
#undef BLAKE2S_ROUND
#else
#define G(r,i,a,b,c,d) \
do { \
a = a + b + m[blake2s_sigma[r][2*i+0]]; \
@@ -236,6 +289,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
c = c + d; \
b = SPH_ROTR32(b ^ c, 7); \
} while(0)
#define ROUND(r) \
do { \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
@@ -247,7 +301,8 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
} while(0)
ROUND( 0 );
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );
@@ -258,6 +313,8 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
ROUND( 8 );
ROUND( 9 );
#endif
for( size_t i = 0; i < 8; ++i )
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];

View File

@@ -383,65 +383,77 @@ static const uint32_t T512[64][16] = {
#define S1E ME
#define S1F MF
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Hamsi 8 way AVX512
// Intel says _mm512_movepi64_mask has (1L/1T) timimg while
// _mm512_cmplt_epi64_mask as (3L/1T) timing, however, when tested hashing X13
// on i9-9940x cmplt with zero was 3% faster than movepi.
// Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have the same
// timing. However, when tested hashing X13 on an i9-9940x, using cmplt with
// zero gave a 3% faster overall hashrate than using movepi.
#define INPUT_BIG8 \
do { \
{ \
__m512i db = _mm512_ror_epi64( *buf, 1 ); \
const __m512i zero = m512_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int u = 0; u < 64; u++ ) \
for ( int i = 0; i < 64*8; i += 8, db = _mm512_ror_epi64( db, 1 ) ) \
{ \
const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \
m0 = _mm512_mask_xor_epi64( m0, dm, m0, v512_64( tp[0] ) ); \
m1 = _mm512_mask_xor_epi64( m1, dm, m1, v512_64( tp[1] ) ); \
m2 = _mm512_mask_xor_epi64( m2, dm, m2, v512_64( tp[2] ) ); \
m3 = _mm512_mask_xor_epi64( m3, dm, m3, v512_64( tp[3] ) ); \
m4 = _mm512_mask_xor_epi64( m4, dm, m4, v512_64( tp[4] ) ); \
m5 = _mm512_mask_xor_epi64( m5, dm, m5, v512_64( tp[5] ) ); \
m6 = _mm512_mask_xor_epi64( m6, dm, m6, v512_64( tp[6] ) ); \
m7 = _mm512_mask_xor_epi64( m7, dm, m7, v512_64( tp[7] ) ); \
db = _mm512_ror_epi64( db, 1 ); \
tp += 8; \
m0 = _mm512_mask_xor_epi64( m0, dm, m0, v512_64( tp[i+0] ) ); \
m1 = _mm512_mask_xor_epi64( m1, dm, m1, v512_64( tp[i+1] ) ); \
m2 = _mm512_mask_xor_epi64( m2, dm, m2, v512_64( tp[i+2] ) ); \
m3 = _mm512_mask_xor_epi64( m3, dm, m3, v512_64( tp[i+3] ) ); \
m4 = _mm512_mask_xor_epi64( m4, dm, m4, v512_64( tp[i+4] ) ); \
m5 = _mm512_mask_xor_epi64( m5, dm, m5, v512_64( tp[i+5] ) ); \
m6 = _mm512_mask_xor_epi64( m6, dm, m6, v512_64( tp[i+6] ) ); \
m7 = _mm512_mask_xor_epi64( m7, dm, m7, v512_64( tp[i+7] ) ); \
} \
} while (0)
}
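// A minimal sketch (helper names assumed, not part of this code) of the two
// sign-mask alternatives compared above; movepi requires AVX512DQ while the
// cmplt form needs only AVX512F.
static inline __mmask8 sign_mask_movepi( __m512i v )
{   return _mm512_movepi64_mask( v );   }
static inline __mmask8 sign_mask_cmplt( __m512i v )
{   return _mm512_cmplt_epi64_mask( v, _mm512_setzero_si512() );   }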
#define SBOX8( a, b, c, d ) \
{ \
__m512i tb, td; \
td = mm512_xorand( d, a, c ); \
c = mm512_xor3( c, td, b ); \
tb = mm512_xoror( b, d, a ); \
a = _mm512_xor_si512( a, c ); \
b = mm512_xoror( td, tb, a ); \
td = mm512_xorand( a, td, tb ); \
a = c; \
c = mm512_xor3( tb, b, td ); \
d = mm512_not( td ); \
}
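// The xor3/xorand/xoror/not helpers above presumably collapse to single
// vpternlogq instructions; plausible definitions (immediate bits indexed
// by a<<2 | b<<1 | c), shown here as an assumption:
// #define mm512_xor3( a, b, c )   _mm512_ternarylogic_epi64( a, b, c, 0x96 ) // a^b^c
// #define mm512_xorand( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x78 ) // a^(b&c)
// #define mm512_xoror( a, b, c )  _mm512_ternarylogic_epi64( a, b, c, 0x1e ) // a^(b|c)
// #define mm512_not( a )          _mm512_ternarylogic_epi64( a, a, a, 0x01 ) // ~a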
/*
#define SBOX8( a, b, c, d ) \
do { \
__m512i t; \
t = a; \
a = mm512_xorand( d, a, c ); \
c = mm512_xor3( a, b, c ); \
b = mm512_xoror( b, d, t ); \
t = _mm512_xor_si512( t, c ); \
d = mm512_xoror( a, b, t ); \
t = mm512_xorand( t, a, b ); \
__m512i t = mm512_xorand( d, a, c ); \
c = mm512_xor3( c, t, b ); \
b = mm512_xoror( b, d, a ); \
a = _mm512_xor_si512( a, c ); \
d = mm512_xoror( t, b, a ); \
t = mm512_xorand( a, t, b ); \
a = c; \
c = mm512_xor3( b, d, t ); \
b = d; \
d = mm512_not( t ); \
} while (0)
*/
#define L8( a, b, c, d ) \
do { \
a = mm512_rol_32( a, 13 ); \
c = mm512_rol_32( c, 3 ); \
b = mm512_xor3( a, b, c ); \
d = mm512_xor3( d, c, _mm512_slli_epi32( a, 3 ) ); \
b = mm512_rol_32( b, 1 ); \
b = mm512_xor3( a, b, c ); \
d = mm512_rol_32( d, 7 ); \
a = mm512_xor3( a, b, d ); \
b = mm512_rol_32( b, 1 ); \
c = mm512_xor3( c, d, _mm512_slli_epi32( b, 7 ) ); \
a = mm512_rol_32( a, 5 ); \
a = mm512_xor3( a, b, d ); \
c = mm512_rol_32( c, 22 ); \
} while (0)
#define DECL_STATE_BIG8 \
__m512i c0, c1, c2, c3, c4, c5, c6, c7; \
@@ -628,7 +640,7 @@ do { /* order is important */ \
///////////////////////
//
// Experimental
// Found to be slower than running 8x64 twice.
// Hamsi 16 way 32 bit.
@@ -674,75 +686,128 @@ do { /* order is important */ \
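// The rework below reads T512 as 64-bit words: the low and high 32-bit halves
// of each load feed adjacent M registers, halving the number of table loads.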
#define INPUT_16X32 \
{ \
const __m512i zero = _mm512_setzero_si512(); \
const uint32_t *tp = (const uint32_t*)T512; \
const __m512i zero = (const __m512i)_mm512_setzero_si512(); \
const uint64_t *tp = (const uint64_t*)T512; \
M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = \
M8 = M9 = MA = MB = MC = MD = ME = MF = zero; \
for ( int v = 0; v < 2; v++ ) \
__m512i db = _mm512_ror_epi32( buf[0], 1 ); \
for ( int u = 0; u < 32; u++ ) \
{ \
__m512i db = _mm512_ror_epi32( buf[v], 1 ); \
for ( int u = 0; u < 32; u++ ) \
{ \
__mmask16 dm = _mm512_cmplt_epi32_mask( db, zero ); \
M0 = _mm512_mask_xor_epi32( M0, dm, M0, v512_32( tp[ 0] ) ); \
M1 = _mm512_mask_xor_epi32( M1, dm, M1, v512_32( tp[ 1] ) ); \
M2 = _mm512_mask_xor_epi32( M2, dm, M2, v512_32( tp[ 2] ) ); \
M3 = _mm512_mask_xor_epi32( M3, dm, M3, v512_32( tp[ 3] ) ); \
M4 = _mm512_mask_xor_epi32( M4, dm, M4, v512_32( tp[ 4] ) ); \
M5 = _mm512_mask_xor_epi32( M5, dm, M5, v512_32( tp[ 5] ) ); \
M6 = _mm512_mask_xor_epi32( M6, dm, M6, v512_32( tp[ 6] ) ); \
M7 = _mm512_mask_xor_epi32( M7, dm, M7, v512_32( tp[ 7] ) ); \
M8 = _mm512_mask_xor_epi32( M8, dm, M8, v512_32( tp[ 8] ) ); \
M9 = _mm512_mask_xor_epi32( M9, dm, M9, v512_32( tp[ 9] ) ); \
MA = _mm512_mask_xor_epi32( MA, dm, MA, v512_32( tp[10] ) ); \
MB = _mm512_mask_xor_epi32( MB, dm, MB, v512_32( tp[11] ) ); \
MC = _mm512_mask_xor_epi32( MC, dm, MC, v512_32( tp[12] ) ); \
MD = _mm512_mask_xor_epi32( MD, dm, MD, v512_32( tp[13] ) ); \
ME = _mm512_mask_xor_epi32( ME, dm, ME, v512_32( tp[14] ) ); \
MF = _mm512_mask_xor_epi32( MF, dm, MF, v512_32( tp[15] ) ); \
db = _mm512_ror_epi32( db, 1 ); \
tp += 16; \
} \
const __mmask16 dm = (const __mmask16)_mm512_cmplt_epi32_mask( db, zero );\
M0 = _mm512_mask_xor_epi32( M0, dm, M0,\
v512_32( (const uint32_t)(tp[0] & 0xffffffffull) ) );\
M1 = _mm512_mask_xor_epi32( M1, dm, M1, \
v512_32( (const uint32_t)(tp[0] >> 32) ) ); \
M2 = _mm512_mask_xor_epi32( M2, dm, M2, \
v512_32( (const uint32_t)(tp[1] & 0xffffffffull) ) );\
M3 = _mm512_mask_xor_epi32( M3, dm, M3, \
v512_32( (const uint32_t)(tp[1] >> 32) ) ); \
M4 = _mm512_mask_xor_epi32( M4, dm, M4, \
v512_32( (const uint32_t)(tp[2] & 0xffffffffull) ) );\
M5 = _mm512_mask_xor_epi32( M5, dm, M5, \
v512_32( (const uint32_t)(tp[2] >> 32) ) ); \
M6 = _mm512_mask_xor_epi32( M6, dm, M6, \
v512_32( (const uint32_t)(tp[3] & 0xffffffffull) ) );\
M7 = _mm512_mask_xor_epi32( M7, dm, M7, \
v512_32( (const uint32_t)(tp[3] >> 32) ) ); \
M8 = _mm512_mask_xor_epi32( M8, dm, M8, \
v512_32( (const uint32_t)(tp[4] & 0xffffffffull) ) );\
M9 = _mm512_mask_xor_epi32( M9, dm, M9, \
v512_32( (const uint32_t)(tp[4] >> 32) ) ); \
MA = _mm512_mask_xor_epi32( MA, dm, MA, \
v512_32( (const uint32_t)(tp[5] & 0xffffffffull) ) );\
MB = _mm512_mask_xor_epi32( MB, dm, MB, \
v512_32( (const uint32_t)(tp[5] >> 32) ) ); \
MC = _mm512_mask_xor_epi32( MC, dm, MC, \
v512_32( (const uint32_t)(tp[6] & 0xffffffffull) ) );\
MD = _mm512_mask_xor_epi32( MD, dm, MD, \
v512_32( (const uint32_t)(tp[6] >> 32) ) ); \
ME = _mm512_mask_xor_epi32( ME, dm, ME, \
v512_32( (const uint32_t)(tp[7] & 0xffffffffull) ) );\
MF = _mm512_mask_xor_epi32( MF, dm, MF, \
v512_32( (const uint32_t)(tp[7] >> 32) ) ); \
db = _mm512_ror_epi32( db, 1 ); \
tp += 8; \
} \
db = _mm512_ror_epi32( buf[1], 1 ); \
for ( int u = 0; u < 32; u++ ) \
{ \
const __mmask16 dm = (const __mmask16)_mm512_cmplt_epi32_mask( db, zero ); \
M0 = _mm512_mask_xor_epi32( M0, dm, M0,\
v512_32( (const uint32_t)(tp[0] & 0xffffffffull) ) );\
M1 = _mm512_mask_xor_epi32( M1, dm, M1, \
v512_32( (const uint32_t)(tp[0] >> 32) ) ); \
M2 = _mm512_mask_xor_epi32( M2, dm, M2, \
v512_32( (const uint32_t)(tp[1] & 0xffffffffull) ) );\
M3 = _mm512_mask_xor_epi32( M3, dm, M3, \
v512_32( (const uint32_t)(tp[1] >> 32) ) ); \
M4 = _mm512_mask_xor_epi32( M4, dm, M4, \
v512_32( (const uint32_t)(tp[2] & 0xffffffffull) ) );\
M5 = _mm512_mask_xor_epi32( M5, dm, M5, \
v512_32( (const uint32_t)(tp[2] >> 32) ) ); \
M6 = _mm512_mask_xor_epi32( M6, dm, M6, \
v512_32( (const uint32_t)(tp[3] & 0xffffffffull) ) );\
M7 = _mm512_mask_xor_epi32( M7, dm, M7, \
v512_32( (const uint32_t)(tp[3] >> 32) ) ); \
M8 = _mm512_mask_xor_epi32( M8, dm, M8, \
v512_32( (const uint32_t)(tp[4] & 0xffffffffull) ) );\
M9 = _mm512_mask_xor_epi32( M9, dm, M9, \
v512_32( (const uint32_t)(tp[4] >> 32) ) ); \
MA = _mm512_mask_xor_epi32( MA, dm, MA, \
v512_32( (const uint32_t)(tp[5] & 0xffffffffull) ) );\
MB = _mm512_mask_xor_epi32( MB, dm, MB, \
v512_32( (const uint32_t)(tp[5] >> 32) ) ); \
MC = _mm512_mask_xor_epi32( MC, dm, MC, \
v512_32( (const uint32_t)(tp[6] & 0xffffffffull) ) );\
MD = _mm512_mask_xor_epi32( MD, dm, MD, \
v512_32( (const uint32_t)(tp[6] >> 32) ) ); \
ME = _mm512_mask_xor_epi32( ME, dm, ME, \
v512_32( (const uint32_t)(tp[7] & 0xffffffffull) ) );\
MF = _mm512_mask_xor_epi32( MF, dm, MF, \
v512_32( (const uint32_t)(tp[7] >> 32) ) ); \
db = _mm512_ror_epi32( db, 1 ); \
tp += 8; \
} \
}
#define SBOX_16X32 SBOX8
#define L_16X32 L8
#define ROUND_16X32( rc, alpha ) \
#define ROUND_16X32( alpha ) \
{ \
S00 = _mm512_xor_si512( S00, v512_32( alpha[ 0] ) ); \
S01 = _mm512_xor_si512( S01, v512_32( alpha[ 1] ^ rc ) ); \
S02 = _mm512_xor_si512( S02, v512_32( alpha[ 2] ) ); \
S03 = _mm512_xor_si512( S03, v512_32( alpha[ 3] ) ); \
S04 = _mm512_xor_si512( S04, v512_32( alpha[ 4] ) ); \
S05 = _mm512_xor_si512( S05, v512_32( alpha[ 5] ) ); \
S06 = _mm512_xor_si512( S06, v512_32( alpha[ 6] ) ); \
S07 = _mm512_xor_si512( S07, v512_32( alpha[ 7] ) ); \
S08 = _mm512_xor_si512( S08, v512_32( alpha[ 8] ) ); \
S09 = _mm512_xor_si512( S09, v512_32( alpha[ 9] ) ); \
S0A = _mm512_xor_si512( S0A, v512_32( alpha[10] ) ); \
S0B = _mm512_xor_si512( S0B, v512_32( alpha[11] ) ); \
S0C = _mm512_xor_si512( S0C, v512_32( alpha[12] ) ); \
S0D = _mm512_xor_si512( S0D, v512_32( alpha[13] ) ); \
S0E = _mm512_xor_si512( S0E, v512_32( alpha[14] ) ); \
S0F = _mm512_xor_si512( S0F, v512_32( alpha[15] ) ); \
S10 = _mm512_xor_si512( S10, v512_32( alpha[16] ) ); \
S11 = _mm512_xor_si512( S11, v512_32( alpha[17] ) ); \
S12 = _mm512_xor_si512( S12, v512_32( alpha[18] ) ); \
S13 = _mm512_xor_si512( S13, v512_32( alpha[19] ) ); \
S14 = _mm512_xor_si512( S14, v512_32( alpha[20] ) ); \
S15 = _mm512_xor_si512( S15, v512_32( alpha[21] ) ); \
S16 = _mm512_xor_si512( S16, v512_32( alpha[22] ) ); \
S17 = _mm512_xor_si512( S17, v512_32( alpha[23] ) ); \
S18 = _mm512_xor_si512( S18, v512_32( alpha[24] ) ); \
S19 = _mm512_xor_si512( S19, v512_32( alpha[25] ) ); \
S1A = _mm512_xor_si512( S1A, v512_32( alpha[26] ) ); \
S1B = _mm512_xor_si512( S1B, v512_32( alpha[27] ) ); \
S1C = _mm512_xor_si512( S1C, v512_32( alpha[28] ) ); \
S1D = _mm512_xor_si512( S1D, v512_32( alpha[29] ) ); \
S1E = _mm512_xor_si512( S1E, v512_32( alpha[30] ) ); \
S1F = _mm512_xor_si512( S1F, v512_32( alpha[31] ) ); \
S00 = _mm512_xor_si512( S00, alpha[ 0] ); \
S01 = _mm512_xor_si512( S01, alpha[ 1] ); \
S02 = _mm512_xor_si512( S02, alpha[ 2] ); \
S03 = _mm512_xor_si512( S03, alpha[ 3] ); \
S04 = _mm512_xor_si512( S04, alpha[ 4] ); \
S05 = _mm512_xor_si512( S05, alpha[ 5] ); \
S06 = _mm512_xor_si512( S06, alpha[ 6] ); \
S07 = _mm512_xor_si512( S07, alpha[ 7] ); \
S08 = _mm512_xor_si512( S08, alpha[ 8] ); \
S09 = _mm512_xor_si512( S09, alpha[ 9] ); \
S0A = _mm512_xor_si512( S0A, alpha[10] ); \
S0B = _mm512_xor_si512( S0B, alpha[11] ); \
S0C = _mm512_xor_si512( S0C, alpha[12] ); \
S0D = _mm512_xor_si512( S0D, alpha[13] ); \
S0E = _mm512_xor_si512( S0E, alpha[14] ); \
S0F = _mm512_xor_si512( S0F, alpha[15] ); \
S10 = _mm512_xor_si512( S10, alpha[16] ); \
S11 = _mm512_xor_si512( S11, alpha[17] ); \
S12 = _mm512_xor_si512( S12, alpha[18] ); \
S13 = _mm512_xor_si512( S13, alpha[19] ); \
S14 = _mm512_xor_si512( S14, alpha[20] ); \
S15 = _mm512_xor_si512( S15, alpha[21] ); \
S16 = _mm512_xor_si512( S16, alpha[22] ); \
S17 = _mm512_xor_si512( S17, alpha[23] ); \
S18 = _mm512_xor_si512( S18, alpha[24] ); \
S19 = _mm512_xor_si512( S19, alpha[25] ); \
S1A = _mm512_xor_si512( S1A, alpha[26] ); \
S1B = _mm512_xor_si512( S1B, alpha[27] ); \
S1C = _mm512_xor_si512( S1C, alpha[28] ); \
S1D = _mm512_xor_si512( S1D, alpha[29] ); \
S1E = _mm512_xor_si512( S1E, alpha[30] ); \
S1F = _mm512_xor_si512( S1F, alpha[31] ); \
SBOX_16X32( S00, S08, S10, S18 ); \
SBOX_16X32( S01, S09, S11, S19 ); \
SBOX_16X32( S02, S0A, S12, S1A ); \
@@ -766,26 +831,54 @@ do { /* order is important */ \
}
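// Only alpha[1] is xored with the round counter, so hoisting the 32
// broadcasts out of ROUND_16X32 leaves a single v512_32 patch per round.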
#define P_16X32 \
ROUND_16X32( 0, alpha_n ); \
ROUND_16X32( 1, alpha_n ); \
ROUND_16X32( 2, alpha_n ); \
ROUND_16X32( 3, alpha_n ); \
ROUND_16X32( 4, alpha_n ); \
ROUND_16X32( 5, alpha_n );
{ \
__m512i alpha[32]; \
const uint32_t A1 = ( (const uint32_t*)alpha_n )[1]; \
for( int i = 0; i < 32; i++ ) \
alpha[i] = v512_32( ( (uint32_t*)alpha_n )[i] ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 1 ^ (A1) ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 2 ^ (A1) ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 3 ^ (A1) ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 4 ^ (A1) ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 5 ^ (A1) ); \
ROUND_16X32( alpha ); \
}
#define PF_16X32 \
ROUND_16X32( 0, alpha_f ); \
ROUND_16X32( 1, alpha_f ); \
ROUND_16X32( 2, alpha_f ); \
ROUND_16X32( 3, alpha_f ); \
ROUND_16X32( 4, alpha_f ); \
ROUND_16X32( 5, alpha_f ); \
ROUND_16X32( 6, alpha_f ); \
ROUND_16X32( 7, alpha_f ); \
ROUND_16X32( 8, alpha_f ); \
ROUND_16X32( 9, alpha_f ); \
ROUND_16X32( 10, alpha_f ); \
ROUND_16X32( 11, alpha_f );
{ \
__m512i alpha[32]; \
const uint32_t A1 = ( (const uint32_t*)alpha_f )[1]; \
for( int i = 0; i < 32; i++ ) \
alpha[i] = v512_32( ( (uint32_t*)alpha_f )[i] ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 1 ^ A1 ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 2 ^ A1 ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 3 ^ A1 ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 4 ^ A1 ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 5 ^ A1 ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 6 ^ A1 ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 7 ^ A1 ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 8 ^ A1 ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 9 ^ A1 ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 10 ^ A1 ); \
ROUND_16X32( alpha ); \
alpha[1] = v512_32( 11 ^ A1 ); \
ROUND_16X32( alpha ); \
}
#define T_16X32 \
/* order is important */ \
@@ -841,7 +934,7 @@ void hamsi_16x32_big_final( hamsi_16x32_big_context *sc, __m512i *buf )
WRITE_STATE_16X32( sc );
}
void hamsi512_16x32_init( hamsi_16x32_big_context *sc )
void hamsi512_16x32_init( hamsi512_16x32_context *sc )
{
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
@@ -863,7 +956,7 @@ void hamsi512_16x32_init( hamsi_16x32_big_context *sc )
sc->h[15] = v512_32( HAMSI_IV512[15] );
}
void hamsi512_16x32_update( hamsi_16x32_big_context *sc, const void *data,
void hamsi512_16x32_update( hamsi512_16x32_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
@@ -875,7 +968,7 @@ void hamsi512_16x32_update( hamsi_16x32_big_context *sc, const void *data,
sc->partial_len = len;
}
void hamsi512_16x32_close( hamsi_16x32_big_context *sc, void *dst )
void hamsi512_16x32_close( hamsi512_16x32_context *sc, void *dst )
{
__m512i pad[2];
uint32_t ch, cl;
@@ -893,7 +986,7 @@ void hamsi512_16x32_close( hamsi_16x32_big_context *sc, void *dst )
mm512_block_bswap_32( (__m512i*)dst + 8, sc->h + 8 );
}
void hamsi512_16x32_full( hamsi_16x32_big_context *sc, void *dst,
void hamsi512_16x32_full( hamsi512_16x32_context *sc, void *dst,
const void *data, size_t len )
{
// init
@@ -996,16 +1089,6 @@ void hamsi512_8way_init( hamsi_8way_big_context *sc )
sc->h[5] = v512_64( iv[5] );
sc->h[6] = v512_64( iv[6] );
sc->h[7] = v512_64( iv[7] );
/*
sc->h[0] = v512_64( 0x6c70617273746565 );
sc->h[1] = v512_64( 0x656e62656b204172 );
sc->h[2] = v512_64( 0x302c206272672031 );
sc->h[3] = v512_64( 0x3434362c75732032 );
sc->h[4] = v512_64( 0x3030312020422d33 );
sc->h[5] = v512_64( 0x656e2d484c657576 );
sc->h[6] = v512_64( 0x6c65652c65766572 );
sc->h[7] = v512_64( 0x6769756d2042656c );
*/
}
void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
@@ -1047,22 +1130,34 @@ do { \
const __m256i zero = m256_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int u = 0; u < 64; u++ ) \
for ( int i = 0; i < 64*8; i+=8, db = _mm256_ror_epi64( db, 1 ) ) \
{ \
const __mmask8 dm = _mm256_cmplt_epi64_mask( db, zero ); \
m0 = _mm256_mask_xor_epi64( m0, dm, m0, v256_64( tp[0] ) ); \
m1 = _mm256_mask_xor_epi64( m1, dm, m1, v256_64( tp[1] ) ); \
m2 = _mm256_mask_xor_epi64( m2, dm, m2, v256_64( tp[2] ) ); \
m3 = _mm256_mask_xor_epi64( m3, dm, m3, v256_64( tp[3] ) ); \
m4 = _mm256_mask_xor_epi64( m4, dm, m4, v256_64( tp[4] ) ); \
m5 = _mm256_mask_xor_epi64( m5, dm, m5, v256_64( tp[5] ) ); \
m6 = _mm256_mask_xor_epi64( m6, dm, m6, v256_64( tp[6] ) ); \
m7 = _mm256_mask_xor_epi64( m7, dm, m7, v256_64( tp[7] ) ); \
db = _mm256_ror_epi64( db, 1 ); \
tp += 8; \
m0 = _mm256_mask_xor_epi64( m0, dm, m0, v256_64( tp[i+0] ) ); \
m1 = _mm256_mask_xor_epi64( m1, dm, m1, v256_64( tp[i+1] ) ); \
m2 = _mm256_mask_xor_epi64( m2, dm, m2, v256_64( tp[i+2] ) ); \
m3 = _mm256_mask_xor_epi64( m3, dm, m3, v256_64( tp[i+3] ) ); \
m4 = _mm256_mask_xor_epi64( m4, dm, m4, v256_64( tp[i+4] ) ); \
m5 = _mm256_mask_xor_epi64( m5, dm, m5, v256_64( tp[i+5] ) ); \
m6 = _mm256_mask_xor_epi64( m6, dm, m6, v256_64( tp[i+6] ) ); \
m7 = _mm256_mask_xor_epi64( m7, dm, m7, v256_64( tp[i+7] ) ); \
} \
} while (0)
// v3 ternary logic, 8 instructions, 2 local vars
#define SBOX( a, b, c, d ) \
{ \
__m256i tb, td; \
td = mm256_xorand( d, a, c ); \
tb = mm256_xoror( b, d, a ); \
c = mm256_xor3( c, td, b ); \
a = _mm256_xor_si256( a, c ); \
b = mm256_xoror( td, tb, a ); \
d = _mm256_ternarylogic_epi64( a, td, tb, 0x87 );/* mm256_not( mm256_xorand( a, td, tb ) ); */ \
a = c; \
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /*mm256_not( mm256_xor3( tb, b, d ) );*/ \
}
#else
#define INPUT_BIG \
@@ -1071,46 +1166,54 @@ do { \
const __m256i zero = m256_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int u = 63; u >= 0; u-- ) \
for ( int i = 63; i >= 0; i-- ) \
{ \
__m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
v256_64( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
v256_64( tp[1] ) ) ); \
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \
v256_64( tp[2] ) ) ); \
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \
v256_64( tp[3] ) ) ); \
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \
v256_64( tp[4] ) ) ); \
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \
v256_64( tp[5] ) ) ); \
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \
v256_64( tp[6] ) ) ); \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
v256_64( tp[7] ) ) ); \
__m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, i ) ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
tp += 8; \
} \
} while (0)
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX( a, b, c, d ) \
{ \
__m256i tb, td; \
td = mm256_xorand( d, a, c ); \
tb = mm256_xoror( b, d, a ); \
c = mm256_xor3( c, td, b ); \
a = _mm256_xor_si256( a, c ); \
b = mm256_xoror( td, tb, a ); \
td = mm256_xorand( a, td, tb ); \
a = c; \
c = mm256_xor3( tb, b, td ); \
d = mm256_not( td ); \
}
#endif
/*
// v2, 16 instructions, 10 TL equivalent instructions
#define SBOX( a, b, c, d ) \
do { \
__m256i t; \
t = a; \
a = mm256_xorand( d, a, c ); \
c = mm256_xor3( a, b, c ); \
b = mm256_xoror( b, d, t ); \
t = _mm256_xor_si256( t, c ); \
d = mm256_xoror( a, b, t ); \
t = mm256_xorand( t, a, b ); \
{ \
__m256i t = mm256_xorand( d, a, c ); \
c = mm256_xor3( t, b, c ); \
b = mm256_xoror( b, d, a); \
a = _mm256_xor_si256( a, c ); \
d = mm256_xoror( t, b, a ); \
t = mm256_xorand( a, t, b ); \
a = c; \
c = mm256_xor3( b, d, t ); \
b = d; \
d = mm256_not( t ); \
} while (0)
}
*/
#define L( a, b, c, d ) \
do { \
@@ -1127,6 +1230,7 @@ do { \
} while (0)
/*
// original, 18 instructions
#define SBOX( a, b, c, d ) \
do { \
__m256i t; \
@@ -1405,31 +1509,51 @@ do { /* order is important */ \
const uint32_t *tp = (const uint32_t*)T512; \
M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = \
M8 = M9 = MA = MB = MC = MD = ME = MF = zero; \
for ( int v = 0; v < 2; v++ ) \
__m256i db = _mm256_ror_epi32( buf[0], 1 ); \
for ( int u = 0; u < 32; u++ ) \
{ \
__m256i db = _mm256_ror_epi32( buf[v], 1 ); \
for ( int u = 0; u < 32; u++ ) \
{ \
__mmask8 dm = _mm256_cmplt_epi32_mask( db, zero ); \
M0 = _mm256_mask_xor_epi32( M0, dm, M0, v256_32( tp[ 0] ) ); \
M1 = _mm256_mask_xor_epi32( M1, dm, M1, v256_32( tp[ 1] ) ); \
M2 = _mm256_mask_xor_epi32( M2, dm, M2, v256_32( tp[ 2] ) ); \
M3 = _mm256_mask_xor_epi32( M3, dm, M3, v256_32( tp[ 3] ) ); \
M4 = _mm256_mask_xor_epi32( M4, dm, M4, v256_32( tp[ 4] ) ); \
M5 = _mm256_mask_xor_epi32( M5, dm, M5, v256_32( tp[ 5] ) ); \
M6 = _mm256_mask_xor_epi32( M6, dm, M6, v256_32( tp[ 6] ) ); \
M7 = _mm256_mask_xor_epi32( M7, dm, M7, v256_32( tp[ 7] ) ); \
M8 = _mm256_mask_xor_epi32( M8, dm, M8, v256_32( tp[ 8] ) ); \
M9 = _mm256_mask_xor_epi32( M9, dm, M9, v256_32( tp[ 9] ) ); \
MA = _mm256_mask_xor_epi32( MA, dm, MA, v256_32( tp[10] ) ); \
MB = _mm256_mask_xor_epi32( MB, dm, MB, v256_32( tp[11] ) ); \
MC = _mm256_mask_xor_epi32( MC, dm, MC, v256_32( tp[12] ) ); \
MD = _mm256_mask_xor_epi32( MD, dm, MD, v256_32( tp[13] ) ); \
ME = _mm256_mask_xor_epi32( ME, dm, ME, v256_32( tp[14] ) ); \
MF = _mm256_mask_xor_epi32( MF, dm, MF, v256_32( tp[15] ) ); \
db = _mm256_ror_epi32( db, 1 ); \
tp += 16; \
} \
__mmask8 dm = _mm256_cmplt_epi32_mask( db, zero ); \
M0 = _mm256_mask_xor_epi32( M0, dm, M0, v256_32( tp[ 0] ) ); \
M1 = _mm256_mask_xor_epi32( M1, dm, M1, v256_32( tp[ 1] ) ); \
M2 = _mm256_mask_xor_epi32( M2, dm, M2, v256_32( tp[ 2] ) ); \
M3 = _mm256_mask_xor_epi32( M3, dm, M3, v256_32( tp[ 3] ) ); \
M4 = _mm256_mask_xor_epi32( M4, dm, M4, v256_32( tp[ 4] ) ); \
M5 = _mm256_mask_xor_epi32( M5, dm, M5, v256_32( tp[ 5] ) ); \
M6 = _mm256_mask_xor_epi32( M6, dm, M6, v256_32( tp[ 6] ) ); \
M7 = _mm256_mask_xor_epi32( M7, dm, M7, v256_32( tp[ 7] ) ); \
M8 = _mm256_mask_xor_epi32( M8, dm, M8, v256_32( tp[ 8] ) ); \
M9 = _mm256_mask_xor_epi32( M9, dm, M9, v256_32( tp[ 9] ) ); \
MA = _mm256_mask_xor_epi32( MA, dm, MA, v256_32( tp[10] ) ); \
MB = _mm256_mask_xor_epi32( MB, dm, MB, v256_32( tp[11] ) ); \
MC = _mm256_mask_xor_epi32( MC, dm, MC, v256_32( tp[12] ) ); \
MD = _mm256_mask_xor_epi32( MD, dm, MD, v256_32( tp[13] ) ); \
ME = _mm256_mask_xor_epi32( ME, dm, ME, v256_32( tp[14] ) ); \
MF = _mm256_mask_xor_epi32( MF, dm, MF, v256_32( tp[15] ) ); \
db = _mm256_ror_epi32( db, 1 ); \
tp += 16; \
} \
db = _mm256_ror_epi32( buf[1], 1 ); \
for ( int u = 0; u < 32; u++ ) \
{ \
__mmask8 dm = _mm256_cmplt_epi32_mask( db, zero ); \
M0 = _mm256_mask_xor_epi32( M0, dm, M0, v256_32( tp[ 0] ) ); \
M1 = _mm256_mask_xor_epi32( M1, dm, M1, v256_32( tp[ 1] ) ); \
M2 = _mm256_mask_xor_epi32( M2, dm, M2, v256_32( tp[ 2] ) ); \
M3 = _mm256_mask_xor_epi32( M3, dm, M3, v256_32( tp[ 3] ) ); \
M4 = _mm256_mask_xor_epi32( M4, dm, M4, v256_32( tp[ 4] ) ); \
M5 = _mm256_mask_xor_epi32( M5, dm, M5, v256_32( tp[ 5] ) ); \
M6 = _mm256_mask_xor_epi32( M6, dm, M6, v256_32( tp[ 6] ) ); \
M7 = _mm256_mask_xor_epi32( M7, dm, M7, v256_32( tp[ 7] ) ); \
M8 = _mm256_mask_xor_epi32( M8, dm, M8, v256_32( tp[ 8] ) ); \
M9 = _mm256_mask_xor_epi32( M9, dm, M9, v256_32( tp[ 9] ) ); \
MA = _mm256_mask_xor_epi32( MA, dm, MA, v256_32( tp[10] ) ); \
MB = _mm256_mask_xor_epi32( MB, dm, MB, v256_32( tp[11] ) ); \
MC = _mm256_mask_xor_epi32( MC, dm, MC, v256_32( tp[12] ) ); \
MD = _mm256_mask_xor_epi32( MD, dm, MD, v256_32( tp[13] ) ); \
ME = _mm256_mask_xor_epi32( ME, dm, ME, v256_32( tp[14] ) ); \
MF = _mm256_mask_xor_epi32( MF, dm, MF, v256_32( tp[15] ) ); \
db = _mm256_ror_epi32( db, 1 ); \
tp += 16; \
} \
}
@@ -1441,30 +1565,49 @@ do { /* order is important */ \
const uint32_t *tp = (const uint32_t*)T512; \
M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = \
M8 = M9 = MA = MB = MC = MD = ME = MF = zero; \
for ( int v = 0; v < 2; v++ ) \
__m256i db = buf[0]; \
for ( int u = 31; u >= 0; u-- ) \
{ \
__m256i db = buf[v]; \
for ( int u = 31; u >= 0; u-- ) \
{ \
__m256i dm = _mm256_cmpgt_epi32( zero, _mm256_slli_epi32( db, u ) ); \
M0 = _mm256_xor_si256( M0, _mm256_and_si256( dm, v256_32( tp[ 0] ) ) ); \
M1 = _mm256_xor_si256( M1, _mm256_and_si256( dm, v256_32( tp[ 1] ) ) ); \
M2 = _mm256_xor_si256( M2, _mm256_and_si256( dm, v256_32( tp[ 2] ) ) ); \
M3 = _mm256_xor_si256( M3, _mm256_and_si256( dm, v256_32( tp[ 3] ) ) ); \
M4 = _mm256_xor_si256( M4, _mm256_and_si256( dm, v256_32( tp[ 4] ) ) ); \
M5 = _mm256_xor_si256( M5, _mm256_and_si256( dm, v256_32( tp[ 5] ) ) ); \
M6 = _mm256_xor_si256( M6, _mm256_and_si256( dm, v256_32( tp[ 6] ) ) ); \
M7 = _mm256_xor_si256( M7, _mm256_and_si256( dm, v256_32( tp[ 7] ) ) ); \
M8 = _mm256_xor_si256( M8, _mm256_and_si256( dm, v256_32( tp[ 8] ) ) ); \
M9 = _mm256_xor_si256( M9, _mm256_and_si256( dm, v256_32( tp[ 9] ) ) ); \
MA = _mm256_xor_si256( MA, _mm256_and_si256( dm, v256_32( tp[10] ) ) ); \
MB = _mm256_xor_si256( MB, _mm256_and_si256( dm, v256_32( tp[11] ) ) ); \
MC = _mm256_xor_si256( MC, _mm256_and_si256( dm, v256_32( tp[12] ) ) ); \
MD = _mm256_xor_si256( MD, _mm256_and_si256( dm, v256_32( tp[13] ) ) ); \
ME = _mm256_xor_si256( ME, _mm256_and_si256( dm, v256_32( tp[14] ) ) ); \
MF = _mm256_xor_si256( MF, _mm256_and_si256( dm, v256_32( tp[15] ) ) ); \
tp += 16; \
} \
__m256i dm = _mm256_cmpgt_epi32( zero, _mm256_slli_epi32( db, u ) ); \
M0 = _mm256_xor_si256( M0, _mm256_and_si256( dm, v256_32( tp[ 0] ) ) ); \
M1 = _mm256_xor_si256( M1, _mm256_and_si256( dm, v256_32( tp[ 1] ) ) ); \
M2 = _mm256_xor_si256( M2, _mm256_and_si256( dm, v256_32( tp[ 2] ) ) ); \
M3 = _mm256_xor_si256( M3, _mm256_and_si256( dm, v256_32( tp[ 3] ) ) ); \
M4 = _mm256_xor_si256( M4, _mm256_and_si256( dm, v256_32( tp[ 4] ) ) ); \
M5 = _mm256_xor_si256( M5, _mm256_and_si256( dm, v256_32( tp[ 5] ) ) ); \
M6 = _mm256_xor_si256( M6, _mm256_and_si256( dm, v256_32( tp[ 6] ) ) ); \
M7 = _mm256_xor_si256( M7, _mm256_and_si256( dm, v256_32( tp[ 7] ) ) ); \
M8 = _mm256_xor_si256( M8, _mm256_and_si256( dm, v256_32( tp[ 8] ) ) ); \
M9 = _mm256_xor_si256( M9, _mm256_and_si256( dm, v256_32( tp[ 9] ) ) ); \
MA = _mm256_xor_si256( MA, _mm256_and_si256( dm, v256_32( tp[10] ) ) ); \
MB = _mm256_xor_si256( MB, _mm256_and_si256( dm, v256_32( tp[11] ) ) ); \
MC = _mm256_xor_si256( MC, _mm256_and_si256( dm, v256_32( tp[12] ) ) ); \
MD = _mm256_xor_si256( MD, _mm256_and_si256( dm, v256_32( tp[13] ) ) ); \
ME = _mm256_xor_si256( ME, _mm256_and_si256( dm, v256_32( tp[14] ) ) ); \
MF = _mm256_xor_si256( MF, _mm256_and_si256( dm, v256_32( tp[15] ) ) ); \
tp += 16; \
} \
db = buf[1]; \
for ( int u = 31; u >= 0; u-- ) \
{ \
__m256i dm = _mm256_cmpgt_epi32( zero, _mm256_slli_epi32( db, u ) ); \
M0 = _mm256_xor_si256( M0, _mm256_and_si256( dm, v256_32( tp[ 0] ) ) ); \
M1 = _mm256_xor_si256( M1, _mm256_and_si256( dm, v256_32( tp[ 1] ) ) ); \
M2 = _mm256_xor_si256( M2, _mm256_and_si256( dm, v256_32( tp[ 2] ) ) ); \
M3 = _mm256_xor_si256( M3, _mm256_and_si256( dm, v256_32( tp[ 3] ) ) ); \
M4 = _mm256_xor_si256( M4, _mm256_and_si256( dm, v256_32( tp[ 4] ) ) ); \
M5 = _mm256_xor_si256( M5, _mm256_and_si256( dm, v256_32( tp[ 5] ) ) ); \
M6 = _mm256_xor_si256( M6, _mm256_and_si256( dm, v256_32( tp[ 6] ) ) ); \
M7 = _mm256_xor_si256( M7, _mm256_and_si256( dm, v256_32( tp[ 7] ) ) ); \
M8 = _mm256_xor_si256( M8, _mm256_and_si256( dm, v256_32( tp[ 8] ) ) ); \
M9 = _mm256_xor_si256( M9, _mm256_and_si256( dm, v256_32( tp[ 9] ) ) ); \
MA = _mm256_xor_si256( MA, _mm256_and_si256( dm, v256_32( tp[10] ) ) ); \
MB = _mm256_xor_si256( MB, _mm256_and_si256( dm, v256_32( tp[11] ) ) ); \
MC = _mm256_xor_si256( MC, _mm256_and_si256( dm, v256_32( tp[12] ) ) ); \
MD = _mm256_xor_si256( MD, _mm256_and_si256( dm, v256_32( tp[13] ) ) ); \
ME = _mm256_xor_si256( ME, _mm256_and_si256( dm, v256_32( tp[14] ) ) ); \
MF = _mm256_xor_si256( MF, _mm256_and_si256( dm, v256_32( tp[15] ) ) ); \
tp += 16; \
} \
}
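
The two INPUT variants above differ only in how the per-lane condition is applied: the AVX-512VL form keeps the condition in a mask register and uses a masked XOR, while the plain AVX2 form materializes a full-width vector mask and ANDs it in. A minimal side-by-side sketch of one accumulation step (helper names are illustrative; the first form needs -mavx512f -mavx512vl):

#include <immintrin.h>

/* AVX-512VL: condition lives in a k register, no blend needed. */
static inline __m256i accum_row_avx512( __m256i m, __m256i db, __m256i row )
{
    __mmask8 dm = _mm256_cmplt_epi32_mask( db, _mm256_setzero_si256() );
    return _mm256_mask_xor_epi32( m, dm, m, row );
}

/* AVX2: shift bit u to the sign position, expand to a vector mask. */
static inline __m256i accum_row_avx2( __m256i m, __m256i db, int u,
                                      __m256i row )
{
    __m256i dm = _mm256_cmpgt_epi32( _mm256_setzero_si256(),
                                     _mm256_slli_epi32( db, u ) );
    return _mm256_xor_si256( m, _mm256_and_si256( dm, row ) );
}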

View File

@@ -0,0 +1,115 @@
/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
/*
* Helper code, included (three times !) by HAVAL implementation.
*
* TODO: try to merge this with md_helper.c.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#undef SPH_XCAT
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
#undef SPH_XCAT_
#define SPH_XCAT_(a, b) a ## b
static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _16way_update)
( haval_16way_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
unsigned current;
current = (unsigned)sc->count_low & 127U;
while ( len > 0 )
{
unsigned clen;
uint32_t clow, clow2;
clen = 128U - current;
if ( clen > len )
clen = len;
memcpy_512( sc->buf + (current>>2), vdata, clen>>2 );
vdata += clen>>2;
current += clen;
len -= clen;
if ( current == 128U )
{
DSTATE_16W;
IN_PREPARE_16W(sc->buf);
RSTATE_16W;
SPH_XCAT(CORE_16W, PASSES)(INW_16W);
WSTATE_16W;
current = 0;
}
clow = sc->count_low;
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high ++;
}
}
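
The update function above is the standard streaming pattern: fill a 128-byte block buffer, compress when full, and track the message length as two 32-bit halves with a wraparound carry test. A one-lane scalar model of the same logic (names illustrative):

#include <stdint.h>
#include <string.h>

typedef struct {
    uint8_t  buf[128];
    uint32_t count_low, count_high;
} stream_ctx;

/* Mirrors the buffering in the 16-way update: copy up to a block
   boundary, compress on a full block, carry into count_high when
   count_low wraps ( clow2 < clow ). */
static void stream_update( stream_ctx *sc, const uint8_t *data, size_t len,
                           void (*compress)( stream_ctx * ) )
{
    unsigned current = (unsigned)sc->count_low & 127U;
    while ( len > 0 )
    {
        unsigned clen = 128U - current;
        if ( clen > len ) clen = (unsigned)len;
        memcpy( sc->buf + current, data, clen );
        data    += clen;
        current += clen;
        len     -= clen;
        if ( current == 128U ) { compress( sc ); current = 0; }
        uint32_t clow = sc->count_low, clow2 = clow + clen;
        sc->count_low = clow2;
        if ( clow2 < clow ) sc->count_high++;   /* 32-bit carry */
    }
}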
static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _16way_close)( haval_16way_context *sc,
void *dst)
{
unsigned current;
DSTATE_16W;
current = (unsigned)sc->count_low & 127UL;
sc->buf[ current>>2 ] = v512_32( 1 );
current += 4;
RSTATE_16W;
if ( current > 116UL )
{
memset_zero_512( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
do
{
IN_PREPARE_16W(sc->buf);
SPH_XCAT(CORE_16W, PASSES)(INW_16W);
} while (0);
current = 0;
}
uint32_t t1, t2;
memset_zero_512( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
t1 = 0x01 | (PASSES << 3);
t2 = sc->olen << 3;
sc->buf[ 116>>2 ] = v512_32( ( t1 << 16 ) | ( t2 << 24 ) );
sc->buf[ 120>>2 ] = v512_32( sc->count_low << 3 );
sc->buf[ 124>>2 ] = v512_32( (sc->count_high << 3)
| (sc->count_low >> 29) );
do
{
IN_PREPARE_16W(sc->buf);
SPH_XCAT(CORE_16W, PASSES)(INW_16W);
} while (0);
WSTATE_16W;
haval_16way_out( sc, dst );
}
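
The close routine packs HAVAL's trailer field from t1 (version 1 in the low 3 bits, the pass count above it) and t2 (output length in 32-bit words, scaled by 8). For HAVAL-256/5 the word written at byte offset 116 works out as follows, using only the arithmetic visible above:

#include <stdint.h>
#include <stdio.h>

int main( void )
{
    const unsigned PASSES = 5, olen = 8;    /* 8 x 32-bit words = 256 bits */
    uint32_t t1 = 0x01 | ( PASSES << 3 );   /* 0x29 */
    uint32_t t2 = olen << 3;                /* 0x40 */
    uint32_t field = ( t1 << 16 ) | ( t2 << 24 );
    printf( "trailer word at offset 116: 0x%08x\n", field ); /* 0x40290000 */
    return 0;
}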

View File

@@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
current = (unsigned)sc->count_low & 127UL;
sc->buf[ current>>2 ] = m128_one_32;
sc->buf[ current>>2 ] = v128_32( 1 );
current += 4;
RSTATE;
if ( current > 116UL )

View File

@@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
current = (unsigned)sc->count_low & 127UL;
sc->buf[ current>>2 ] = m256_one_32;
sc->buf[ current>>2 ] = v256_32( 1 );
current += 4;
RSTATE_8W;
if ( current > 116UL )
@@ -101,9 +101,9 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
memset_zero_256( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
t1 = 0x01 | (PASSES << 3);
t2 = sc->olen << 3;
sc->buf[ 116>>2 ] = _mm256_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
sc->buf[ 120>>2 ] = _mm256_set1_epi32( sc->count_low << 3 );
sc->buf[ 124>>2 ] = _mm256_set1_epi32( (sc->count_high << 3)
sc->buf[ 116>>2 ] = v256_32( ( t1 << 16 ) | ( t2 << 24 ) );
sc->buf[ 120>>2 ] = v256_32( sc->count_low << 3 );
sc->buf[ 124>>2 ] = v256_32( (sc->count_high << 3)
| (sc->count_low >> 29) );
do
{

View File

@@ -189,7 +189,7 @@ do { \
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
mm128_ror_32( x7, 11 ) ), \
_mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
_mm_add_epi32( w, v128_32( c ) ) ); \
} while (0)
#define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
@@ -471,14 +471,14 @@ do { \
static void
haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes )
{
sc->s0 = _mm_set1_epi32( 0x243F6A88UL );
sc->s1 = _mm_set1_epi32( 0x85A308D3UL );
sc->s2 = _mm_set1_epi32( 0x13198A2EUL );
sc->s3 = _mm_set1_epi32( 0x03707344UL );
sc->s4 = _mm_set1_epi32( 0xA4093822UL );
sc->s5 = _mm_set1_epi32( 0x299F31D0UL );
sc->s6 = _mm_set1_epi32( 0x082EFA98UL );
sc->s7 = _mm_set1_epi32( 0xEC4E6C89UL );
sc->s0 = v128_32( 0x243F6A88UL );
sc->s1 = v128_32( 0x85A308D3UL );
sc->s2 = v128_32( 0x13198A2EUL );
sc->s3 = v128_32( 0x03707344UL );
sc->s4 = v128_32( 0xA4093822UL );
sc->s5 = v128_32( 0x299F31D0UL );
sc->s6 = v128_32( 0x082EFA98UL );
sc->s7 = v128_32( 0xEC4E6C89UL );
sc->olen = olen;
sc->passes = passes;
sc->count_high = 0;
@@ -662,7 +662,7 @@ do { \
__m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
mm256_ror_32( x7, 11 ) ), \
_mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
_mm256_add_epi32( w, v256_32( c ) ) ); \
} while (0)
#define STEP1_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
@@ -793,14 +793,14 @@ do { \
static void
haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
{
sc->s0 = _mm256_set1_epi32( 0x243F6A88UL );
sc->s1 = _mm256_set1_epi32( 0x85A308D3UL );
sc->s2 = _mm256_set1_epi32( 0x13198A2EUL );
sc->s3 = _mm256_set1_epi32( 0x03707344UL );
sc->s4 = _mm256_set1_epi32( 0xA4093822UL );
sc->s5 = _mm256_set1_epi32( 0x299F31D0UL );
sc->s6 = _mm256_set1_epi32( 0x082EFA98UL );
sc->s7 = _mm256_set1_epi32( 0xEC4E6C89UL );
sc->s0 = v256_32( 0x243F6A88UL );
sc->s1 = v256_32( 0x85A308D3UL );
sc->s2 = v256_32( 0x13198A2EUL );
sc->s3 = v256_32( 0x03707344UL );
sc->s4 = v256_32( 0xA4093822UL );
sc->s5 = v256_32( 0x299F31D0UL );
sc->s6 = v256_32( 0x082EFA98UL );
sc->s7 = v256_32( 0xEC4E6C89UL );
sc->olen = olen;
sc->passes = passes;
sc->count_high = 0;
@@ -879,10 +879,300 @@ do { \
#define INMSG_8W(i) msg[i]
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// ( ~( a ^ b ) ) & c
#define mm512_andnotxor( a, b, c ) \
_mm512_ternarylogic_epi32( a, b, c, 0x82 )
#define F1_16W(x6, x5, x4, x3, x2, x1, x0) \
mm512_xor3( x0, mm512_andxor( x1, x0, x4 ), \
_mm512_xor_si512( _mm512_and_si512( x2, x5 ), \
_mm512_and_si512( x3, x6 ) ) ) \
#define F2_16W(x6, x5, x4, x3, x2, x1, x0) \
mm512_xor3( mm512_andxor( x2, _mm512_andnot_si512( x3, x1 ), \
mm512_xor3( _mm512_and_si512( x4, x5 ), x6, x0 ) ), \
mm512_andxor( x4, x1, x5 ), \
mm512_xorand( x0, x3, x5 ) ) \
#define F3_16W(x6, x5, x4, x3, x2, x1, x0) \
mm512_xor3( x0, \
_mm512_and_si512( x3, \
mm512_xor3( _mm512_and_si512( x1, x2 ), x6, x0 ) ), \
_mm512_xor_si512( _mm512_and_si512( x1, x4 ), \
_mm512_and_si512( x2, x5 ) ) )
#define F4_16W(x6, x5, x4, x3, x2, x1, x0) \
mm512_xor3( \
mm512_andxor( x3, x5, \
_mm512_xor_si512( _mm512_and_si512( x1, x2 ), \
_mm512_or_si512( x4, x6 ) ) ), \
_mm512_and_si512( x4, \
mm512_xor3( x0, _mm512_andnot_si512( x2, x5 ), \
_mm512_xor_si512( x1, x6 ) ) ), \
mm512_xorand( x0, x2, x6 ) )
#define F5_16W(x6, x5, x4, x3, x2, x1, x0) \
_mm512_xor_si512( \
mm512_andnotxor( mm512_and3( x1, x2, x3 ), x5, x0 ), \
mm512_xor3( _mm512_and_si512( x1, x4 ), \
_mm512_and_si512( x2, x5 ), \
_mm512_and_si512( x3, x6 ) ) )
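
Unfolding the helpers (andxor(a,b,c) = a & (b ^ c), etc.), F1_16W above matches the standard HAVAL f_1. A scalar reference for cross-checking, under that assumed helper semantics:

#include <stdint.h>

/* F1 = x0 ^ ( x1 & (x0 ^ x4) ) ^ ( x2 & x5 ) ^ ( x3 & x6 );
   the SIMD form folds pairs of these ops into single vpternlog
   instructions via the compound helpers. */
static inline uint32_t haval_f1( uint32_t x6, uint32_t x5, uint32_t x4,
                                 uint32_t x3, uint32_t x2, uint32_t x1,
                                 uint32_t x0 )
{
    return x0 ^ ( x1 & ( x0 ^ x4 ) ) ^ ( x2 & x5 ) ^ ( x3 & x6 );
}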
#define FP3_1_16W(x6, x5, x4, x3, x2, x1, x0) \
F1_16W(x1, x0, x3, x5, x6, x2, x4)
#define FP3_2_16W(x6, x5, x4, x3, x2, x1, x0) \
F2_16W(x4, x2, x1, x0, x5, x3, x6)
#define FP3_3_16W(x6, x5, x4, x3, x2, x1, x0) \
F3_16W(x6, x1, x2, x3, x4, x5, x0)
#define FP4_1_16W(x6, x5, x4, x3, x2, x1, x0) \
F1_16W(x2, x6, x1, x4, x5, x3, x0)
#define FP4_2_16W(x6, x5, x4, x3, x2, x1, x0) \
F2_16W(x3, x5, x2, x0, x1, x6, x4)
#define FP4_3_16W(x6, x5, x4, x3, x2, x1, x0) \
F3_16W(x1, x4, x3, x6, x0, x2, x5)
#define FP4_4_16W(x6, x5, x4, x3, x2, x1, x0) \
F4_16W(x6, x4, x0, x5, x2, x1, x3)
#define FP5_1_16W(x6, x5, x4, x3, x2, x1, x0) \
F1_16W(x3, x4, x1, x0, x5, x2, x6)
#define FP5_2_16W(x6, x5, x4, x3, x2, x1, x0) \
F2_16W(x6, x2, x1, x0, x3, x4, x5)
#define FP5_3_16W(x6, x5, x4, x3, x2, x1, x0) \
F3_16W(x2, x6, x0, x4, x3, x1, x5)
#define FP5_4_16W(x6, x5, x4, x3, x2, x1, x0) \
F4_16W(x1, x5, x3, x2, x0, x4, x6)
#define FP5_5_16W(x6, x5, x4, x3, x2, x1, x0) \
F5_16W(x2, x5, x0, x6, x4, x3, x1)
#define STEP_16W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
do { \
__m512i t = FP ## n ## _ ## p ## _16W(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm512_add_epi32( _mm512_add_epi32( mm512_ror_32( t, 7 ), \
mm512_ror_32( x7, 11 ) ), \
_mm512_add_epi32( w, v512_32( c ) ) ); \
} while (0)
#define STEP1_16W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
do { \
__m512i t = FP ## n ## _ ## p ## _16W(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm512_add_epi32( _mm512_add_epi32( mm512_ror_32( t, 7 ), \
mm512_ror_32( x7, 11 ) ), w ); \
} while (0)
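
Each step rotates the Boolean-function output right by 7 and the chaining word right by 11, then adds the message word and, from pass 2 onward, a round constant (STEP1_16W omits the constant). Scalar form:

#include <stdint.h>

static inline uint32_t ror32( uint32_t x, unsigned r )
{ return ( x >> r ) | ( x << ( 32 - r ) ); }

/* t = F(x6..x0); w = message word; c = round constant (0 in pass 1). */
static inline uint32_t haval_step( uint32_t t, uint32_t x7,
                                   uint32_t w, uint32_t c )
{
    return ror32( t, 7 ) + ror32( x7, 11 ) + w + c;
}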
#define PASS1_16W(n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP1_16W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0) ); \
STEP1_16W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1) ); \
STEP1_16W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2) ); \
STEP1_16W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3) ); \
STEP1_16W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4) ); \
STEP1_16W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5) ); \
STEP1_16W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6) ); \
STEP1_16W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7) ); \
} \
} while (0)
#define PASSG_16W(p, n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP_16W(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
in(MP ## p[pass_count + 0]), \
RK ## p[pass_count + 0]); \
STEP_16W(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
in(MP ## p[pass_count + 1]), \
RK ## p[pass_count + 1]); \
STEP_16W(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
in(MP ## p[pass_count + 2]), \
RK ## p[pass_count + 2]); \
STEP_16W(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
in(MP ## p[pass_count + 3]), \
RK ## p[pass_count + 3]); \
STEP_16W(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
in(MP ## p[pass_count + 4]), \
RK ## p[pass_count + 4]); \
STEP_16W(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
in(MP ## p[pass_count + 5]), \
RK ## p[pass_count + 5]); \
STEP_16W(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
in(MP ## p[pass_count + 6]), \
RK ## p[pass_count + 6]); \
STEP_16W(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
in(MP ## p[pass_count + 7]), \
RK ## p[pass_count + 7]); \
} \
} while (0)
#define PASS2_16W(n, in) PASSG_16W(2, n, in)
#define PASS3_16W(n, in) PASSG_16W(3, n, in)
#define PASS4_16W(n, in) PASSG_16W(4, n, in)
#define PASS5_16W(n, in) PASSG_16W(5, n, in)
#define SAVE_STATE_16W \
__m512i u0, u1, u2, u3, u4, u5, u6, u7; \
do { \
u0 = s0; \
u1 = s1; \
u2 = s2; \
u3 = s3; \
u4 = s4; \
u5 = s5; \
u6 = s6; \
u7 = s7; \
} while (0)
#define UPDATE_STATE_16W \
do { \
s0 = _mm512_add_epi32( s0, u0 ); \
s1 = _mm512_add_epi32( s1, u1 ); \
s2 = _mm512_add_epi32( s2, u2 ); \
s3 = _mm512_add_epi32( s3, u3 ); \
s4 = _mm512_add_epi32( s4, u4 ); \
s5 = _mm512_add_epi32( s5, u5 ); \
s6 = _mm512_add_epi32( s6, u6 ); \
s7 = _mm512_add_epi32( s7, u7 ); \
} while (0)
#define CORE_16W5(in) do { \
SAVE_STATE_16W; \
PASS1_16W(5, in); \
PASS2_16W(5, in); \
PASS3_16W(5, in); \
PASS4_16W(5, in); \
PASS5_16W(5, in); \
UPDATE_STATE_16W; \
} while (0)
#define DSTATE_16W __m512i s0, s1, s2, s3, s4, s5, s6, s7
#define RSTATE_16W \
do { \
s0 = sc->s0; \
s1 = sc->s1; \
s2 = sc->s2; \
s3 = sc->s3; \
s4 = sc->s4; \
s5 = sc->s5; \
s6 = sc->s6; \
s7 = sc->s7; \
} while (0)
#define WSTATE_16W \
do { \
sc->s0 = s0; \
sc->s1 = s1; \
sc->s2 = s2; \
sc->s3 = s3; \
sc->s4 = s4; \
sc->s5 = s5; \
sc->s6 = s6; \
sc->s7 = s7; \
} while (0)
static void
haval_16way_init( haval_16way_context *sc, unsigned olen, unsigned passes )
{
sc->s0 = v512_32( 0x243F6A88UL );
sc->s1 = v512_32( 0x85A308D3UL );
sc->s2 = v512_32( 0x13198A2EUL );
sc->s3 = v512_32( 0x03707344UL );
sc->s4 = v512_32( 0xA4093822UL );
sc->s5 = v512_32( 0x299F31D0UL );
sc->s6 = v512_32( 0x082EFA98UL );
sc->s7 = v512_32( 0xEC4E6C89UL );
sc->olen = olen;
sc->passes = passes;
sc->count_high = 0;
sc->count_low = 0;
}
#define IN_PREPARE_16W(indata) const __m512i *const load_ptr_16w = (indata)
#define INW_16W(i) load_ptr_16w[ i ]
static void
haval_16way_out( haval_16way_context *sc, void *dst )
{
__m512i *buf = (__m512i*)dst;
DSTATE_16W;
RSTATE_16W;
buf[0] = s0;
buf[1] = s1;
buf[2] = s2;
buf[3] = s3;
buf[4] = s4;
buf[5] = s5;
buf[6] = s6;
buf[7] = s7;
}
#undef PASSES
#define PASSES 5
#include "haval-16way-helper.c"
#define API_16W(xxx, y) \
void \
haval ## xxx ## _ ## y ## _16way_init(void *cc) \
{ \
haval_16way_init(cc, xxx >> 5, y); \
} \
\
void \
haval ## xxx ## _ ## y ## _16way_update (void *cc, const void *data, size_t len) \
{ \
haval ## y ## _16way_update(cc, data, len); \
} \
\
void \
haval ## xxx ## _ ## y ## _16way_close(void *cc, void *dst) \
{ \
haval ## y ## _16way_close(cc, dst); \
} \
API_16W(256, 5)
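
A hedged usage sketch of the generated wrappers (prototypes as declared in the 16-way header further below; data and digest are 32-bit interleaved across 16 lanes, and len is assumed to be bytes per lane):

#include <stddef.h>

void haval256_5_16way_init( void *cc );
void haval256_5_16way_update( void *cc, const void *data, size_t len );
void haval256_5_16way_close( void *cc, void *dst );

/* Hash 16 interleaved inputs of len bytes per lane; dst receives
   8 x 64 = 512 bytes of interleaved digest words. ctx must point to
   an aligned haval256_5_16way_context. */
static void haval256_5_16x( void *dst, const void *interleaved, size_t len,
                            void *ctx )
{
    haval256_5_16way_init( ctx );
    haval256_5_16way_update( ctx, interleaved, len );
    haval256_5_16way_close( ctx, dst );
}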
#define RVAL_16W \
do { \
s0 = val[0]; \
s1 = val[1]; \
s2 = val[2]; \
s3 = val[3]; \
s4 = val[4]; \
s5 = val[5]; \
s6 = val[6]; \
s7 = val[7]; \
} while (0)
#define WVAL_16W \
do { \
val[0] = s0; \
val[1] = s1; \
val[2] = s2; \
val[3] = s3; \
val[4] = s4; \
val[5] = s5; \
val[6] = s6; \
val[7] = s7; \
} while (0)
#define INMSG_16W(i) msg[i]
#endif
#ifdef __cplusplus
}
#endif

View File

@@ -107,6 +107,25 @@ void haval256_5_8way_close( void *cc, void *dst );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
__m512i buf[32];
__m512i s0, s1, s2, s3, s4, s5, s6, s7;
unsigned olen, passes;
uint32_t count_high, count_low;
} haval_16way_context __attribute__ ((aligned (64)));
typedef haval_16way_context haval256_5_16way_context;
void haval256_5_16way_init( void *cc );
void haval256_5_16way_update( void *cc, const void *data, size_t len );
void haval256_5_16way_close( void *cc, void *dst );
#endif // AVX512
#ifdef __cplusplus
}
#endif

View File

@@ -6,7 +6,7 @@
#if defined(JHA_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"

View File

@@ -33,8 +33,7 @@
#define MULT2( a0, a1 ) \
{ \
__m128i b = _mm_xor_si128( a0, \
_mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
__m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
}

View File

@@ -1,7 +1,7 @@
#include "lyra2-gate.h"
#include <memory.h>
#include <mm_malloc.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake256-hash.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"

View File

@@ -5,8 +5,7 @@
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
//#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake256-hash.h"
__thread uint64_t* lyra2h_4way_matrix;

View File

@@ -1,6 +1,6 @@
#include "lyra2-gate.h"
#include <memory.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake256-hash.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"

View File

@@ -1,7 +1,7 @@
#include "lyra2-gate.h"
#include <memory.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake256-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"

View File

@@ -2,8 +2,7 @@
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake256-hash.h"
#if defined(LYRA2Z_16WAY)

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"

View File

@@ -98,7 +98,7 @@ do { \
( g ## n0 = _mm_xor_si128( a ## n0, _mm_xor_si128( a ## n1, a ## n4 ) ) )
#define SIGMA_ALL_4W do { \
a0 = _mm_xor_si128( g0, m128_one_32 ); \
a0 = _mm_xor_si128( g0, v128_32( 1 ) ); \
a1 = _mm_xor_si128( g1, INW2( 0 ) ); \
a2 = _mm_xor_si128( g2, INW2( 1 ) ); \
a3 = _mm_xor_si128( g3, INW2( 2 ) ); \
@@ -268,7 +268,7 @@ panama_4way_close( void *cc, void *dst )
sc = cc;
current = sc->data_ptr;
*(__m128i*)( sc->data + current ) = m128_one_32;
*(__m128i*)( sc->data + current ) = v128_32( 1 );
current++;
memset_zero_128( (__m128i*)sc->data + current, 32 - current );
panama_4way_push( sc, sc->data, 1 );
@@ -354,7 +354,7 @@ do { \
#define SIGMA_ALL_8W do { \
a0 = _mm256_xor_si256( g0, m256_one_32 ); \
a0 = _mm256_xor_si256( g0, v256_32( 1 ) ); \
a1 = _mm256_xor_si256( g1, INW2( 0 ) ); \
a2 = _mm256_xor_si256( g2, INW2( 1 ) ); \
a3 = _mm256_xor_si256( g3, INW2( 2 ) ); \
@@ -521,7 +521,7 @@ panama_8way_close( void *cc, void *dst )
sc = cc;
current = sc->data_ptr;
*(__m256i*)( sc->data + current ) = m256_one_32;
*(__m256i*)( sc->data + current ) = v256_32( 1 );
current++;
memset_zero_256( (__m256i*)sc->data + current, 32 - current );
panama_8way_push( sc, sc->data, 1 );

View File

@@ -3,7 +3,7 @@
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"

View File

@@ -1,7 +1,7 @@
#include "hmq1725-gate.h"
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"

View File

@@ -3,7 +3,7 @@
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"

View File

@@ -6,13 +6,12 @@
#include "sha256-hash.h"
#include "compat.h"
/*
static const uint32_t H256[8] =
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
*/
static const uint32_t K256[64] =
{
@@ -83,7 +82,7 @@ static const uint32_t K256[64] =
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m128i T1, T2; \
__m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \
__m128i K = v128_32( K256[( (j)+(i) )] ); \
T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
K, W[i] ) ); \
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
@@ -358,19 +357,19 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 48 );
T0 = _mm_add_epi32( _mm_set1_epi32( K256[58] ),
T0 = _mm_add_epi32( v128_32( K256[58] ),
mm128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
B = _mm_add_epi32( B, T0 );
T1 = _mm_add_epi32( _mm_set1_epi32( K256[59] ),
T1 = _mm_add_epi32( v128_32( K256[59] ),
mm128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
A = _mm_add_epi32( A, T1 );
T2 = _mm_add_epi32( _mm_set1_epi32( K256[60] ),
T2 = _mm_add_epi32( v128_32( K256[60] ),
mm128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
H = _mm_add_epi32( H, T2 );
targ = _mm_set1_epi32( target[7] );
targ = v128_32( target[7] );
hash = mm128_bswap_32( _mm_add_epi32( H, IV7 ) );
flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ mm128_movmask_32( hash );
@@ -386,13 +385,13 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
// round 61 part 1
W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm_add_epi32( _mm_set1_epi32( K256[61] ),
T0 = _mm_add_epi32( v128_32( K256[61] ),
mm128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
G = _mm_add_epi32( G, T0 );
if ( t6_mask )
{
targ = _mm_and_si128( vmask, _mm_set1_epi32( target[6] ) );
targ = _mm_and_si128( vmask, v128_32( target[6] ) );
hash = mm128_bswap_32( _mm_add_epi32( G, IV6 ) );
if ( ( 0 != ( t6_mask & mm128_movmask_32(
@@ -440,14 +439,14 @@ return 1;
void sha256_4way_init( sha256_4way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
sc->val[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
sc->val[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
sc->val[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
sc->val[4] = _mm_set1_epi64x( 0x510E527F510E527F );
sc->val[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
sc->val[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
sc->val[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
sc->val[0] = v128_32( sha256_iv[0] );
sc->val[1] = v128_32( sha256_iv[1] );
sc->val[2] = v128_32( sha256_iv[2] );
sc->val[3] = v128_32( sha256_iv[3] );
sc->val[4] = v128_32( sha256_iv[4] );
sc->val[5] = v128_32( sha256_iv[5] );
sc->val[6] = v128_32( sha256_iv[6] );
sc->val[7] = v128_32( sha256_iv[7] );
}
void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
@@ -490,7 +489,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
sc->buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
@@ -506,8 +505,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] = _mm_set1_epi32( bswap_32( high ) );
sc->buf[( pad+4 ) >> 2 ] = _mm_set1_epi32( bswap_32( low ) );
sc->buf[ pad >> 2 ] = v128_32( bswap_32( high ) );
sc->buf[( pad+4 ) >> 2 ] = v128_32( bswap_32( low ) );
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
mm128_block_bswap_32( dst, sc->val );
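
The close routines all apply MD-style padding: a broadcast 0x80 marker, zero fill, then the 64-bit big-endian bit count in the last 8 bytes of the block. A one-lane scalar sketch, assuming the message tail leaves room in the current 64-byte block (the real code compresses an extra block when it does not):

#include <stdint.h>
#include <string.h>

static void sha256_pad_tail( uint8_t block[64], uint64_t total_len_bytes,
                             size_t used )   /* used must be <= 55 here */
{
    uint64_t bits = total_len_bytes << 3;
    block[used] = 0x80;                       /* padding marker */
    memset( block + used + 1, 0, 56 - ( used + 1 ) );
    for ( int i = 0; i < 8; i++ )             /* big-endian bit count */
        block[56 + i] = (uint8_t)( bits >> ( 8 * ( 7 - i ) ) );
}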
@@ -580,7 +579,7 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i) ] ), \
W[ i ] ); \
__m256i T1 = BSG2_1x( E ); \
__m256i T2 = BSG2_0x( A ); \
@@ -614,7 +613,7 @@ do { \
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
v256_32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
@@ -634,7 +633,7 @@ do { \
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
v256_32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi32( D, T1 ); \
@@ -643,7 +642,7 @@ do { \
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \
__m256i T0 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \
__m256i T1 = BSG2_1x( E ); \
__m256i T2 = BSG2_0x( A ); \
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
@@ -666,7 +665,7 @@ do { \
#define SHA256_8WAY_2ROUNDS( A, B, C, D, E, F, G, H, i0, i1, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i0) ] ), \
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \
W[ i0 ] ); \
__m256i T1 = BSG2_1x( E ); \
__m256i T2 = BSG2_0x( A ); \
@@ -677,7 +676,7 @@ do { \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
\
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i1) ] ), \
T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \
W[ (i1) ] ); \
T1 = BSG2_1x( D ); \
T2 = BSG2_0x( H ); \
@@ -790,7 +789,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
// round 3 part 1, avoid nonces W[3]
T1 = mm256_add4_32( E, BSG2_1x(B), CHx(B, C, D),
_mm256_set1_epi32( K256[3] ) );
v256_32( K256[3] ) );
A = _mm256_add_epi32( A, T1 );
E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x(F),
MAJx(F, G, H) ) );
@@ -910,12 +909,11 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target )
{
__m256i A, B, C, D, E, F, G, H, T0, T1, T2;
int flip;
int t6_mask;
__m256i vmask, targ, hash;
__m256i W[16]; memcpy_256( W, data, 16 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
uint8_t flip, t6_mask;
A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in+1 );
@@ -991,26 +989,28 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
// round 58 to 60 part 1
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[58] ),
T0 = _mm256_add_epi32( v256_32( K256[58] ),
mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), W[10], F ) );
B = _mm256_add_epi32( B, T0 );
T1 = _mm256_add_epi32( _mm256_set1_epi32( K256[59] ),
T1 = _mm256_add_epi32( v256_32( K256[59] ),
mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), W[11], E ) );
A = _mm256_add_epi32( A, T1 );
T2 = _mm256_add_epi32( _mm256_set1_epi32( K256[60] ),
T2 = _mm256_add_epi32( v256_32( K256[60] ),
mm256_add4_32( BSG2_1x( A ), CHx( A, B, C ), W[12], D ) );
H = _mm256_add_epi32( H, T2 );
// Got H, test it.
targ = _mm256_set1_epi32( target[7] );
targ = v256_32( target[7] );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( H, IV7 ), bswap_shuf );
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0xff == ( flip ^
if ( target[7] )
{
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0xff == ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ))
return 0;
return 0;
}
t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) );
// round 58 part 2
@@ -1018,14 +1018,14 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
MAJx( G, H, A ) ) );
// round 61 part 1
W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[61] ),
T0 = _mm256_add_epi32( v256_32( K256[61] ),
mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) );
G = _mm256_add_epi32( G, T0 );
if ( t6_mask )
{
// Testing H was inconclusive: hash7 == target7, need to test G
targ = _mm256_and_si256( vmask, _mm256_set1_epi32( target[6] ) );
targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
@@ -1078,14 +1078,14 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
void sha256_8way_init( sha256_8way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
sc->val[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
sc->val[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
sc->val[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
sc->val[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
sc->val[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
sc->val[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
sc->val[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
sc->val[0] = v256_32( sha256_iv[0] );
sc->val[1] = v256_32( sha256_iv[1] );
sc->val[2] = v256_32( sha256_iv[2] );
sc->val[3] = v256_32( sha256_iv[3] );
sc->val[4] = v256_32( sha256_iv[4] );
sc->val[5] = v256_32( sha256_iv[5] );
sc->val[6] = v256_32( sha256_iv[6] );
sc->val[7] = v256_32( sha256_iv[7] );
}
// need to handle odd byte length for yespower.
@@ -1131,7 +1131,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = _mm256_set1_epi64x( 0x0000008000000080 );
sc->buf[ ptr>>2 ] = v256_64( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
@@ -1147,8 +1147,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] = _mm256_set1_epi32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = _mm256_set1_epi32( bswap_32( low ) );
sc->buf[ pad >> 2 ] = v256_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = v256_32( bswap_32( low ) );
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
@@ -1210,7 +1210,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
#define SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[(j)+(i)] ), W[i] ); \
__m512i T0 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \
__m512i T1 = BSG2_1x16( E ); \
__m512i T2 = BSG2_0x16( A ); \
T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \
@@ -1224,7 +1224,7 @@ do { \
#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
_mm512_set1_epi32( K256[(i)+(j)] ) ); \
v512_32( K256[(i)+(j)] ) ); \
__m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
@@ -1234,7 +1234,7 @@ do { \
#define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \
do { \
__m512i T1, T2; \
__m512i K = _mm512_set1_epi32( K256[( (j)+(i) )] ); \
__m512i K = v512_32( K256[( (j)+(i) )] ); \
T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \
K, W[i] ) ); \
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
@@ -1345,7 +1345,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
// round 3 part 1, avoid nonces W[3]
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
_mm512_set1_epi32( K256[3] ) );
v512_32( K256[3] ) );
A = _mm512_add_epi32( A, T1 );
E = _mm512_add_epi32( T1, _mm512_add_epi32( BSG2_0x16(F),
MAJx16(F, G, H) ) );
@@ -1566,21 +1566,22 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
// rounds 58 to 60 part 1
T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[58] ),
T0 = _mm512_add_epi32( v512_32( K256[58] ),
mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) );
B = _mm512_add_epi32( B, T0 );
T1 = _mm512_add_epi32( _mm512_set1_epi32( K256[59] ),
T1 = _mm512_add_epi32( v512_32( K256[59] ),
mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) );
A = _mm512_add_epi32( A, T1 );
T2 = _mm512_add_epi32( _mm512_set1_epi32( K256[60] ),
T2 = _mm512_add_epi32( v512_32( K256[60] ),
mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ) );
H = _mm512_add_epi32( H, T2 );
// got H, test it against target[7]
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
targ = _mm512_set1_epi32( target[7] );
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
targ = v512_32( target[7] );
if ( target[7] )
if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) ))
return 0;
t6_mask = _mm512_cmpeq_epi32_mask( hash, targ );
@@ -1591,15 +1592,15 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
// round 61 part 1
W[13] = SHA256_16WAY_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[61] ),
T0 = _mm512_add_epi32( v512_32( K256[61] ),
mm512_add4_32( BSG2_1x16( H ), CHx16( H, A, B ), W[13], C ) );
G = _mm512_add_epi32( G, T0 );
// got G, test it against target[6] if indicated
if ( t6_mask != 0 )
if ( (uint16_t)t6_mask )
{
hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
targ = _mm512_set1_epi32( target[6] );
targ = v512_32( target[6] );
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
return 0;
}
@@ -1637,14 +1638,14 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
void sha256_16way_init( sha256_16way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
sc->val[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
sc->val[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
sc->val[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
sc->val[4] = _mm512_set1_epi64( 0x510E527F510E527F );
sc->val[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
sc->val[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
sc->val[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
sc->val[0] = v512_32( sha256_iv[0] );
sc->val[1] = v512_32( sha256_iv[1] );
sc->val[2] = v512_32( sha256_iv[2] );
sc->val[3] = v512_32( sha256_iv[3] );
sc->val[4] = v512_32( sha256_iv[4] );
sc->val[5] = v512_32( sha256_iv[5] );
sc->val[6] = v512_32( sha256_iv[6] );
sc->val[7] = v512_32( sha256_iv[7] );
}
void sha256_16way_update( sha256_16way_context *sc, const void *data,
@@ -1688,7 +1689,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = _mm512_set1_epi64( 0x0000008000000080 );
sc->buf[ ptr>>2 ] = v512_64( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
@@ -1704,8 +1705,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] = _mm512_set1_epi32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = _mm512_set1_epi32( bswap_32( low ) );
sc->buf[ pad >> 2 ] = v512_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = v512_32( bswap_32( low ) );
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
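
The *_transform_le_short functions above stop as soon as final hash word 7 is known, discarding lanes that already exceed target[7] and completing the hash only for survivors and exact ties, which are resolved on word 6 via t6_mask. The per-lane decision, as a scalar model:

#include <stdint.h>
#include <stdbool.h>

/* hash7/hash6 are the big-endian final words 7 and 6 (the highest
   difficulty bits); targ7/targ6 come from the target. Returns false
   when the lane can be rejected without finishing the hash. */
static bool lane_may_qualify( uint32_t hash7, uint32_t hash6,
                              uint32_t targ7, uint32_t targ6 )
{
    if ( hash7 > targ7 ) return false;   /* definitely above target */
    if ( hash7 < targ7 ) return true;    /* definitely below */
    return hash6 <= targ6;               /* tie: compare next word */
}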

File diff suppressed because it is too large

View File

@@ -118,10 +118,10 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i last_byte = v512_32( 0x80000000 );
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const __m512i sixteen = _mm512_set1_epi32( 16 );
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
@@ -130,42 +130,42 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
sha256_transform_le( phash, pdata, sha256_iv );
// vectorize block 0 hash for second block
mstate1[0] = _mm512_set1_epi32( phash[0] );
mstate1[1] = _mm512_set1_epi32( phash[1] );
mstate1[2] = _mm512_set1_epi32( phash[2] );
mstate1[3] = _mm512_set1_epi32( phash[3] );
mstate1[4] = _mm512_set1_epi32( phash[4] );
mstate1[5] = _mm512_set1_epi32( phash[5] );
mstate1[6] = _mm512_set1_epi32( phash[6] );
mstate1[7] = _mm512_set1_epi32( phash[7] );
mstate1[0] = v512_32( phash[0] );
mstate1[1] = v512_32( phash[1] );
mstate1[2] = v512_32( phash[2] );
mstate1[3] = v512_32( phash[3] );
mstate1[4] = v512_32( phash[4] );
mstate1[5] = v512_32( phash[5] );
mstate1[6] = v512_32( phash[6] );
mstate1[7] = v512_32( phash[7] );
// second message block data, with nonce & padding
buf[0] = _mm512_set1_epi32( pdata[16] );
buf[1] = _mm512_set1_epi32( pdata[17] );
buf[2] = _mm512_set1_epi32( pdata[18] );
buf[0] = v512_32( pdata[16] );
buf[1] = v512_32( pdata[17] );
buf[2] = v512_32( pdata[18] );
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
buf[4] = last_byte;
memset_zero_512( buf+5, 10 );
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
buf[15] = v512_32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for second hash
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
istate[0] = v512_32( sha256_iv[0] );
istate[1] = v512_32( sha256_iv[1] );
istate[2] = v512_32( sha256_iv[2] );
istate[3] = v512_32( sha256_iv[3] );
istate[4] = v512_32( sha256_iv[4] );
istate[5] = v512_32( sha256_iv[5] );
istate[6] = v512_32( sha256_iv[6] );
istate[7] = v512_32( sha256_iv[7] );
// initialize padding for second hash
block[ 8] = last_byte;
memset_zero_512( block+9, 6 );
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
block[15] = v512_32( 32*8 ); // bit count
do
{
@@ -216,33 +216,33 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm256_set1_epi32( pdata[i] );
vdata[i] = v256_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = _mm256_set1_epi32( 80*8 );
vdata[16+15] = v256_32( 80*8 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 32*8 );
block[15] = v256_32( 32*8 );
// initialize state for second hash
istate[0] = _mm256_set1_epi32( sha256_iv[0] );
istate[1] = _mm256_set1_epi32( sha256_iv[1] );
istate[2] = _mm256_set1_epi32( sha256_iv[2] );
istate[3] = _mm256_set1_epi32( sha256_iv[3] );
istate[4] = _mm256_set1_epi32( sha256_iv[4] );
istate[5] = _mm256_set1_epi32( sha256_iv[5] );
istate[6] = _mm256_set1_epi32( sha256_iv[6] );
istate[7] = _mm256_set1_epi32( sha256_iv[7] );
istate[0] = v256_32( sha256_iv[0] );
istate[1] = v256_32( sha256_iv[1] );
istate[2] = v256_32( sha256_iv[2] );
istate[3] = v256_32( sha256_iv[3] );
istate[4] = v256_32( sha256_iv[4] );
istate[5] = v256_32( sha256_iv[5] );
istate[6] = v256_32( sha256_iv[6] );
istate[7] = v256_32( sha256_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
@@ -298,31 +298,31 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
const __m128i last_byte = v128_32( 0x80000000 );
const __m128i four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm_set1_epi32( pdata[i] );
vdata[i] = v128_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = _mm_set1_epi32( 80*8 );
vdata[16+15] = v128_32( 80*8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = _mm_set1_epi32( 32*8 );
block[15] = v128_32( 32*8 );
// initialize state
istate[0] = _mm_set1_epi32( sha256_iv[0] );
istate[1] = _mm_set1_epi32( sha256_iv[1] );
istate[2] = _mm_set1_epi32( sha256_iv[2] );
istate[3] = _mm_set1_epi32( sha256_iv[3] );
istate[4] = _mm_set1_epi32( sha256_iv[4] );
istate[5] = _mm_set1_epi32( sha256_iv[5] );
istate[6] = _mm_set1_epi32( sha256_iv[6] );
istate[7] = _mm_set1_epi32( sha256_iv[7] );
istate[0] = v128_32( sha256_iv[0] );
istate[1] = v128_32( sha256_iv[1] );
istate[2] = v128_32( sha256_iv[2] );
istate[3] = v128_32( sha256_iv[3] );
istate[4] = v128_32( sha256_iv[4] );
istate[5] = v128_32( sha256_iv[5] );
istate[6] = v128_32( sha256_iv[6] );
istate[7] = v128_32( sha256_iv[7] );
// hash first 64 bytes of data
sha256_4way_transform_le( mstate, vdata, istate );

View File

@@ -51,8 +51,7 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
// fill & pad second bock without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 3] = block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
@@ -128,10 +127,10 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i last_byte = v512_32( 0x80000000 );
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const __m512i sixteen = _mm512_set1_epi32( 16 );
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
@@ -140,42 +139,42 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
sha256_transform_le( phash, pdata, sha256dt_iv );
// vectorize block 0 hash for second block
mstate1[0] = _mm512_set1_epi32( phash[0] );
mstate1[1] = _mm512_set1_epi32( phash[1] );
mstate1[2] = _mm512_set1_epi32( phash[2] );
mstate1[3] = _mm512_set1_epi32( phash[3] );
mstate1[4] = _mm512_set1_epi32( phash[4] );
mstate1[5] = _mm512_set1_epi32( phash[5] );
mstate1[6] = _mm512_set1_epi32( phash[6] );
mstate1[7] = _mm512_set1_epi32( phash[7] );
mstate1[0] = v512_32( phash[0] );
mstate1[1] = v512_32( phash[1] );
mstate1[2] = v512_32( phash[2] );
mstate1[3] = v512_32( phash[3] );
mstate1[4] = v512_32( phash[4] );
mstate1[5] = v512_32( phash[5] );
mstate1[6] = v512_32( phash[6] );
mstate1[7] = v512_32( phash[7] );
// second message block data, with nonce & padding
buf[0] = _mm512_set1_epi32( pdata[16] );
buf[1] = _mm512_set1_epi32( pdata[17] );
buf[2] = _mm512_set1_epi32( pdata[18] );
buf[0] = v512_32( pdata[16] );
buf[1] = v512_32( pdata[17] );
buf[2] = v512_32( pdata[18] );
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
buf[4] = last_byte;
memset_zero_512( buf+5, 10 );
buf[15] = _mm512_set1_epi32( 0x480 ); // sha256dt funky bit count
buf[15] = v512_32( 0x480 ); // sha256dt funky bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for second hash
istate[0] = _mm512_set1_epi32( sha256dt_iv[0] );
istate[1] = _mm512_set1_epi32( sha256dt_iv[1] );
istate[2] = _mm512_set1_epi32( sha256dt_iv[2] );
istate[3] = _mm512_set1_epi32( sha256dt_iv[3] );
istate[4] = _mm512_set1_epi32( sha256dt_iv[4] );
istate[5] = _mm512_set1_epi32( sha256dt_iv[5] );
istate[6] = _mm512_set1_epi32( sha256dt_iv[6] );
istate[7] = _mm512_set1_epi32( sha256dt_iv[7] );
istate[0] = v512_32( sha256dt_iv[0] );
istate[1] = v512_32( sha256dt_iv[1] );
istate[2] = v512_32( sha256dt_iv[2] );
istate[3] = v512_32( sha256dt_iv[3] );
istate[4] = v512_32( sha256dt_iv[4] );
istate[5] = v512_32( sha256dt_iv[5] );
istate[6] = v512_32( sha256dt_iv[6] );
istate[7] = v512_32( sha256dt_iv[7] );
// initialize padding for second hash
block[ 8] = last_byte;
memset_zero_512( block+9, 6 );
block[15] = _mm512_set1_epi32( 0x300 ); // bit count
block[15] = v512_32( 0x300 ); // bit count
do
{
@@ -226,33 +225,33 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm256_set1_epi32( pdata[i] );
vdata[i] = v256_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = _mm256_set1_epi32( 0x480 );
vdata[16+15] = v256_32( 0x480 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 0x300 );
block[15] = v256_32( 0x300 );
// initialize state for swecond hash
istate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
istate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
istate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
istate[3] = _mm256_set1_epi64x( 0xd338e869d338e869 );
istate[4] = _mm256_set1_epi64x( 0xaa3ff126aa3ff126 );
istate[5] = _mm256_set1_epi64x( 0x475bbf30475bbf30 );
istate[6] = _mm256_set1_epi64x( 0x8fd52e5b8fd52e5b );
istate[7] = _mm256_set1_epi64x( 0x9f75c9ad9f75c9ad );
// initialize state for second hash
istate[0] = v256_32( sha256dt_iv[0] );
istate[1] = v256_32( sha256dt_iv[1] );
istate[2] = v256_32( sha256dt_iv[2] );
istate[3] = v256_32( sha256dt_iv[3] );
istate[4] = v256_32( sha256dt_iv[4] );
istate[5] = v256_32( sha256dt_iv[5] );
istate[6] = v256_32( sha256dt_iv[6] );
istate[7] = v256_32( sha256dt_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
@@ -308,31 +307,31 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
const __m128i last_byte = v128_32( 0x80000000 );
const __m128i four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm_set1_epi32( pdata[i] );
vdata[i] = v128_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = _mm_set1_epi32( 0x480 );
vdata[16+15] = v128_32( 0x480 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = _mm_set1_epi32( 0x300 );
block[15] = v128_32( 0x300 );
// initialize state
initstate[0] = _mm_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
initstate[1] = _mm_set1_epi64x( 0xb72074d4b72074d4 );
initstate[2] = _mm_set1_epi64x( 0x6bb011226bb01122 );
initstate[3] = _mm_set1_epi64x( 0xd338e869d338e869 );
initstate[4] = _mm_set1_epi64x( 0xaa3ff126aa3ff126 );
initstate[5] = _mm_set1_epi64x( 0x475bbf30475bbf30 );
initstate[6] = _mm_set1_epi64x( 0x8fd52e5b8fd52e5b );
initstate[7] = _mm_set1_epi64x( 0x9f75c9ad9f75c9ad );
initstate[0] = v128_32( sha256dt_iv[0] );
initstate[1] = v128_32( sha256dt_iv[1] );
initstate[2] = v128_32( sha256dt_iv[2] );
initstate[3] = v128_32( sha256dt_iv[3] );
initstate[4] = v128_32( sha256dt_iv[4] );
initstate[5] = v128_32( sha256dt_iv[5] );
initstate[6] = v128_32( sha256dt_iv[6] );
initstate[7] = v128_32( sha256dt_iv[7] );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate, vdata, initstate );
@@ -342,21 +341,18 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
sha256_4way_transform_le( block, vdata+16, midstate );
sha256_4way_transform_le( hash32, block, initstate );
// if ( sha256_4way_transform_le_short( hash32, block, initstate, ptarget ) )
// {
mm128_block_bswap_32( hash32, hash32 );
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
// }
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
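
The scanhash loops above share one skeleton: lanes carry consecutive nonces, the nonce vector advances by the lane count each pass, and only lanes passing the cheap word-7 test get the full target validation. A hypothetical scalar skeleton (the function-pointer names are illustrative, not part of the codebase):

#include <stdint.h>
#include <stdbool.h>

/* hash_lanes fills hash word 7 for nonces n..n+3; full_check is the
   expensive validation done only for lanes passing the quick
   hash32_d7 <= targ32_d7 comparison seen above. */
static uint32_t scan_range( uint32_t n, uint32_t last_nonce, uint32_t targ7,
                            void (*hash_lanes)( uint32_t n, uint32_t h7[4] ),
                            bool (*full_check)( uint32_t nonce ) )
{
    uint32_t h7[4];
    while ( n < last_nonce )
    {
        hash_lanes( n, h7 );
        for ( int lane = 0; lane < 4; lane++ )
            if ( h7[lane] <= targ7 && full_check( n + lane ) )
                return n + lane;        /* solution nonce */
        n += 4;                         /* four lanes per pass */
    }
    return 0;                           /* range exhausted */
}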

View File

@@ -30,10 +30,10 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i last_byte = v512_32( 0x80000000 );
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const __m512i sixteen = _mm512_set1_epi32( 16 );
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
@@ -42,42 +42,42 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
sha256_transform_le( phash, pdata, sha256_iv );
// vectorize block 0 hash for second block
mstate1[0] = _mm512_set1_epi32( phash[0] );
mstate1[1] = _mm512_set1_epi32( phash[1] );
mstate1[2] = _mm512_set1_epi32( phash[2] );
mstate1[3] = _mm512_set1_epi32( phash[3] );
mstate1[4] = _mm512_set1_epi32( phash[4] );
mstate1[5] = _mm512_set1_epi32( phash[5] );
mstate1[6] = _mm512_set1_epi32( phash[6] );
mstate1[7] = _mm512_set1_epi32( phash[7] );
mstate1[0] = v512_32( phash[0] );
mstate1[1] = v512_32( phash[1] );
mstate1[2] = v512_32( phash[2] );
mstate1[3] = v512_32( phash[3] );
mstate1[4] = v512_32( phash[4] );
mstate1[5] = v512_32( phash[5] );
mstate1[6] = v512_32( phash[6] );
mstate1[7] = v512_32( phash[7] );
// second message block data, with nonce & padding
buf[0] = _mm512_set1_epi32( pdata[16] );
buf[1] = _mm512_set1_epi32( pdata[17] );
buf[2] = _mm512_set1_epi32( pdata[18] );
buf[0] = v512_32( pdata[16] );
buf[1] = v512_32( pdata[17] );
buf[2] = v512_32( pdata[18] );
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
buf[4] = last_byte;
memset_zero_512( buf+5, 10 );
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
buf[15] = v512_32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd & 3rd sha256
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
istate[0] = v512_32( sha256_iv[0] );
istate[1] = v512_32( sha256_iv[1] );
istate[2] = v512_32( sha256_iv[2] );
istate[3] = v512_32( sha256_iv[3] );
istate[4] = v512_32( sha256_iv[4] );
istate[5] = v512_32( sha256_iv[5] );
istate[6] = v512_32( sha256_iv[6] );
istate[7] = v512_32( sha256_iv[7] );
// initialize padding for 2nd & 3rd sha256
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
block[15] = v512_32( 32*8 ); // bit count
do
{
@@ -222,33 +222,33 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm256_set1_epi32( pdata[i] );
vdata[i] = v256_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
vdata[16+15] = v256_32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
block[15] = v256_32( 32*8 ); // bit count
// initialize state
istate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
istate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
istate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
istate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
istate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
istate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
istate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
istate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
istate[0] = v256_32( sha256_iv[0] );
istate[1] = v256_32( sha256_iv[1] );
istate[2] = v256_32( sha256_iv[2] );
istate[3] = v256_32( sha256_iv[3] );
istate[4] = v256_32( sha256_iv[4] );
istate[5] = v256_32( sha256_iv[5] );
istate[6] = v256_32( sha256_iv[6] );
istate[7] = v256_32( sha256_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
@@ -313,31 +313,31 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
const __m128i last_byte = v128_32( 0x80000000 );
const __m128i four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm_set1_epi32( pdata[i] );
vdata[i] = v128_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
vdata[16+15] = v128_32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = _mm_set1_epi32( 32*8 ); // bit count
block[15] = v128_32( 32*8 ); // bit count
// initialize state
istate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
istate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
istate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
istate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
istate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
istate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
istate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
istate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
istate[0] = v128_32( sha256_iv[0] );
istate[1] = v128_32( sha256_iv[1] );
istate[2] = v128_32( sha256_iv[2] );
istate[3] = v128_32( sha256_iv[3] );
istate[4] = v128_32( sha256_iv[4] );
istate[5] = v128_32( sha256_iv[5] );
istate[6] = v128_32( sha256_iv[6] );
istate[7] = v128_32( sha256_iv[7] );
// hash first 64 bytes of data
sha256_4way_transform_le( mstate, vdata, istate );

View File

@@ -39,57 +39,429 @@
/*
static const uint64_t H512[8] =
{
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
*/
static const uint64_t K512[80] =
{
0x428A2F98D728AE22, 0x7137449123EF65CD,
0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0x3956C25BF348B538, 0x59F111F1B605D019,
0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE,
0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1,
0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3,
0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483,
0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210,
0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926,
0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8,
0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001,
0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910,
0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB,
0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60,
0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9,
0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207,
0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493,
0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A,
0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
#if defined(__AVX2__) && defined(__SHA512__)
// SHA-512 implemented using SHA512 CPU extension.
// Experimental. Not tested. Not reviewed. Compile-tested only.
// Needs GCC-13 for compilation.
// Needs an Intel Lunar Lake or Arrow Lake CPU, or AMD Zen-{5,6}?, for execution.
// Modelled after noloader sha256 implementation.
// It's not clear how SHA512 will be supported before AVX10, considering how
// dependent it is on _mm256_alignr_epi64, which is only available with
// AVX512VL until AVX10-256.
#if defined(__AVX512VL__)
#define mm256_alignr_1x64( v1, v0 ) _mm256_alignr_epi64( v1, v0, 1 )
#else
// Ugly workaround to make it work with AVX2: select lane 3 (the top 64-bit
// element) from the rotated v1 and lanes 0-2 from the rotated v0,
// emulating valignq( v1, v0, 1 ).
static const __m256i mask __attribute__ ((aligned (32)))
= { 0ull, 0ull, 0ull, 0xffffffffffffffffull };
#define mm256_alignr_1x64( v1, v0 ) \
_mm256_or_si256( _mm256_and_si256( mm256_shuflr_64( v1 ), mask ), \
_mm256_and_si256( mm256_shuflr_64( v0 ), mm256_not( mask ) ) )
#endif
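// For reference, a scalar model of what mm256_alignr_1x64( v1, v0 ) must
// produce (valignq with imm 1: concatenate v1:v0 as eight 64-bit lanes,
// shift right one lane, keep the low four). A hedged sketch, not part of
// the build:
//
//    static inline void alignr_1x64_model( uint64_t r[4],
//                        const uint64_t v1[4], const uint64_t v0[4] )
//    {
//       r[0] = v0[1];  r[1] = v0[2];  r[2] = v0[3];  r[3] = v1[0];
//    }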
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in )
{
__m256i STATE0, STATE1;
__m256i MSG, TMP, BSWAP64;
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
__m256i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f,
0x0001020304050607 ) );
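// BSWAP64 reverses the bytes within each 64-bit lane, converting the
// big-endian message words to the host's little-endian order.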
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
STATE1 = _mm256_blend_epi32( STATE1, TMP, 0xF0 ); // CDGH
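// sha512rnds2 consumes the state split across two registers as (A,B,E,F)
// and (C,D,G,H) rather than the linear A..H order of state_in; the
// permutes above build that layout and it is undone after the rounds.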
// Save initial state
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+32) );
TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+64) );
TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+96) );
TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 16-19
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 20-23
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 24-27
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 28-31
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 32-35
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 36-39
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 40-43
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 44-47
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 48-51
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 52-55
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 56-59
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 60-63
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Add initial state
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
STATE1 = _mm256_add_epi64( STATE1, CDGH_SAVE );
TMP = _mm256_permute4x64_epi64( STATE0, 0x1B ); // FEBA
STATE1 = _mm256_permute4x64_epi64( STATE1, 0xB1 ); // DCHG
STATE0 = _mm256_blend_epi32( TMP, STATE1, 0xF0 ); // DCBA
STATE1 = _mm256_permute2x128_si256( STATE1, TMP, 0x21 ); // ABEF
// Save state
_mm256_store_si256((__m256i*) &state_out[0], STATE0 );
_mm256_store_si256((__m256i*) &state_out[4], STATE1 );
}
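// Hedged usage sketch (not part of the build): hash one preformatted
// 128-byte big-endian block with the standard FIPS 180-4 SHA-512 IV.
// "msg" must already contain the 0x80 pad byte and the 128-bit message
// bit length in its last 16 bytes, and both pointers must be 32-byte
// aligned since the transform uses aligned loads and stores.
//
//    static const uint64_t SHA512_IV[8] __attribute__ ((aligned (32))) =
//    {
//       0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
//       0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
//    };
//
//    static inline void sha512_one_block_example( uint64_t digest[8],
//                                                 const uint8_t msg[128] )
//    {
//       sha512_opt_transform_be( digest, msg, SHA512_IV );
//    }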
void sha512_opt_transform_le( uint64_t *state_out, const void *input,
const uint64_t *state_in )
{
__m256i STATE0, STATE1;
__m256i MSG, TMP, BSWAP64;
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
__m256i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
STATE1 = _mm256_blend_epi32( STATE1, TMP, 0xF0 ); // CDGH
// Save initial state
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+32) );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+64) );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+96) );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 16-19
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 20-23
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 24-27
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 28-31
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 32-35
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 36-39
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 40-43
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 44-47
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 48-51
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 52-55
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 56-59
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 60-63
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Add initial state
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
STATE1 = _mm256_add_epi64( STATE1, CDGH_SAVE );
TMP = _mm256_permute4x64_epi64( STATE0, 0x1B ); // FEBA
STATE1 = _mm256_permute4x64_epi64( STATE1, 0xB1 ); // DCHG
STATE0 = _mm256_blend_epi32( TMP, STATE1, 0xF0 ); // DCBA
STATE1 = _mm256_permute2x128_si256( STATE1, TMP, 0x21 ); // ABEF
// Save state
_mm256_store_si256((__m256i*) &state_out[0], STATE0 );
_mm256_store_si256((__m256i*) &state_out[4], STATE1 );
}
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -117,7 +489,7 @@ static const uint64_t K512[80] =
#define SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m512i T0 = _mm512_add_epi64( _mm512_set1_epi64( K512[i] ), W[ i ] ); \
__m512i T0 = _mm512_add_epi64( v512_64( K512[i] ), W[ i ] ); \
__m512i T1 = BSG8W_5_1( E ); \
__m512i T2 = BSG8W_5_0( A ); \
T0 = _mm512_add_epi64( T0, CH8W( E, F, G ) ); \
@@ -155,14 +527,14 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
}
else
{
A = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
B = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
C = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
D = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
E = _mm512_set1_epi64( 0x510E527FADE682D1 );
F = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
G = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
H = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
A = v512_64( 0x6A09E667F3BCC908 );
B = v512_64( 0xBB67AE8584CAA73B );
C = v512_64( 0x3C6EF372FE94F82B );
D = v512_64( 0xA54FF53A5F1D36F1 );
E = v512_64( 0x510E527FADE682D1 );
F = v512_64( 0x9B05688C2B3E6C1F );
G = v512_64( 0x1F83D9ABFB41BD6B );
H = v512_64( 0x5BE0CD19137E2179 );
}
for ( i = 0; i < 80; i += 8 )
@@ -191,14 +563,14 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
else
{
ctx->initialized = true;
r[0] = _mm512_add_epi64( A, _mm512_set1_epi64( 0x6A09E667F3BCC908 ) );
r[1] = _mm512_add_epi64( B, _mm512_set1_epi64( 0xBB67AE8584CAA73B ) );
r[2] = _mm512_add_epi64( C, _mm512_set1_epi64( 0x3C6EF372FE94F82B ) );
r[3] = _mm512_add_epi64( D, _mm512_set1_epi64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm512_add_epi64( E, _mm512_set1_epi64( 0x510E527FADE682D1 ) );
r[5] = _mm512_add_epi64( F, _mm512_set1_epi64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm512_add_epi64( G, _mm512_set1_epi64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm512_add_epi64( H, _mm512_set1_epi64( 0x5BE0CD19137E2179 ) );
r[0] = _mm512_add_epi64( A, v512_64( 0x6A09E667F3BCC908 ) );
r[1] = _mm512_add_epi64( B, v512_64( 0xBB67AE8584CAA73B ) );
r[2] = _mm512_add_epi64( C, v512_64( 0x3C6EF372FE94F82B ) );
r[3] = _mm512_add_epi64( D, v512_64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm512_add_epi64( E, v512_64( 0x510E527FADE682D1 ) );
r[5] = _mm512_add_epi64( F, v512_64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm512_add_epi64( G, v512_64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm512_add_epi64( H, v512_64( 0x5BE0CD19137E2179 ) );
}
}
@@ -243,7 +615,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
sc->buf[ ptr>>3 ] = v512_64( 0x80 );
ptr += 8;
if ( ptr > pad )
{
@@ -255,9 +627,9 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
_mm512_set1_epi64( sc->count >> 61 ), shuff_bswap64 );
v512_64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
_mm512_set1_epi64( sc->count << 3 ), shuff_bswap64 );
v512_64( sc->count << 3 ), shuff_bswap64 );
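// The two final qwords encode the 128-bit big-endian message length:
// count<<3 is the bit count (count is in bytes) and count>>61 carries its
// high-order bits; both are byte-swapped into big-endian before hashing.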
sha512_8way_round( sc, sc->buf, sc->val );
mm512_block_bswap_64( dst, sc->val );
@@ -295,7 +667,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
__m256i T0 = _mm256_add_epi64( v256_64( K512[i] ), W[i] ); \
__m256i T1 = BSG5_1( E ); \
__m256i T2 = BSG5_0( A ); \
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
@@ -317,7 +689,7 @@ do { \
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
__m256i T0 = _mm256_add_epi64( v256_64( K512[i] ), W[i] ); \
__m256i T1 = BSG5_1( E ); \
__m256i T2 = BSG5_0( A ); \
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
@@ -364,14 +736,14 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
}
else
{
A = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
B = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
C = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
D = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
E = _mm256_set1_epi64x( 0x510E527FADE682D1 );
F = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
G = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
H = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
A = v256_64( 0x6A09E667F3BCC908 );
B = v256_64( 0xBB67AE8584CAA73B );
C = v256_64( 0x3C6EF372FE94F82B );
D = v256_64( 0xA54FF53A5F1D36F1 );
E = v256_64( 0x510E527FADE682D1 );
F = v256_64( 0x9B05688C2B3E6C1F );
G = v256_64( 0x1F83D9ABFB41BD6B );
H = v256_64( 0x5BE0CD19137E2179 );
}
#if !defined(__AVX512VL__)
@@ -405,14 +777,14 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
else
{
ctx->initialized = true;
r[0] = _mm256_add_epi64( A, _mm256_set1_epi64x( 0x6A09E667F3BCC908 ) );
r[1] = _mm256_add_epi64( B, _mm256_set1_epi64x( 0xBB67AE8584CAA73B ) );
r[2] = _mm256_add_epi64( C, _mm256_set1_epi64x( 0x3C6EF372FE94F82B ) );
r[3] = _mm256_add_epi64( D, _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm256_add_epi64( E, _mm256_set1_epi64x( 0x510E527FADE682D1 ) );
r[5] = _mm256_add_epi64( F, _mm256_set1_epi64x( 0x9B05688C2B3E6C1F ) );
r[6] = _mm256_add_epi64( G, _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm256_add_epi64( H, _mm256_set1_epi64x( 0x5BE0CD19137E2179 ) );
r[0] = _mm256_add_epi64( A, v256_64( 0x6A09E667F3BCC908 ) );
r[1] = _mm256_add_epi64( B, v256_64( 0xBB67AE8584CAA73B ) );
r[2] = _mm256_add_epi64( C, v256_64( 0x3C6EF372FE94F82B ) );
r[3] = _mm256_add_epi64( D, v256_64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm256_add_epi64( E, v256_64( 0x510E527FADE682D1 ) );
r[5] = _mm256_add_epi64( F, v256_64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm256_add_epi64( G, v256_64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm256_add_epi64( H, v256_64( 0x5BE0CD19137E2179 ) );
}
}
@@ -457,7 +829,7 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
sc->buf[ ptr>>3 ] = v256_64( 0x80 );
ptr += 8;
if ( ptr > pad )
{
@@ -469,9 +841,9 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = _mm256_shuffle_epi8(
_mm256_set1_epi64x( sc->count >> 61 ), shuff_bswap64 );
v256_64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm256_shuffle_epi8(
_mm256_set1_epi64x( sc->count << 3 ), shuff_bswap64 );
v256_64( sc->count << 3 ), shuff_bswap64 );
sha512_4way_round( sc, sc->buf, sc->val );
mm256_block_bswap_64( dst, sc->val );

View File

@@ -5,11 +5,32 @@
#include "simd-utils.h"
#include "sph_sha2.h"
#if defined(__SHA512__) && defined(__AVX2__)
// Experimental, untested.
// Intended as a replacement for sph_sha512.
typedef struct
{
uint64_t buf[128>>3];
uint64_t val[8];
uint64_t count;
} sha512_context __attribute__ ((aligned (64)));
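// Minimal single-lane context, mirroring the N-way contexts below: buf
// holds one 128-byte message block, val the eight chaining words, and
// count (presumably, as in the vectorized code) the bytes processed so
// far, used to encode the 128-bit length at finalization.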
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in );
void sha512_opt_transform_le( uint64_t *state_out, const void *input,
const uint64_t *state_in );
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way
typedef struct {
typedef struct
{
__m512i buf[128>>3];
__m512i val[8];
uint64_t count;
@@ -28,7 +49,8 @@ void sha512_8way_full( void *dst, const void *data, size_t len );
// SHA-512 4 way
typedef struct {
typedef struct
{
__m256i buf[128>>3];
__m256i val[8];
uint64_t count;

View File

@@ -16,14 +16,14 @@ static void sha512256d_8way_init( sha512_8way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = _mm512_set1_epi64( 0x22312194FC2BF72C );
ctx->val[1] = _mm512_set1_epi64( 0x9F555FA3C84C64C2 );
ctx->val[2] = _mm512_set1_epi64( 0x2393B86B6F53B151 );
ctx->val[3] = _mm512_set1_epi64( 0x963877195940EABD );
ctx->val[4] = _mm512_set1_epi64( 0x96283EE2A88EFFE3 );
ctx->val[5] = _mm512_set1_epi64( 0xBE5E1E2553863992 );
ctx->val[6] = _mm512_set1_epi64( 0x2B0199FC2C85B8AA );
ctx->val[7] = _mm512_set1_epi64( 0x0EB72DDC81C52CA2 );
ctx->val[0] = v512_64( 0x22312194FC2BF72C );
ctx->val[1] = v512_64( 0x9F555FA3C84C64C2 );
ctx->val[2] = v512_64( 0x2393B86B6F53B151 );
ctx->val[3] = v512_64( 0x963877195940EABD );
ctx->val[4] = v512_64( 0x96283EE2A88EFFE3 );
ctx->val[5] = v512_64( 0xBE5E1E2553863992 );
ctx->val[6] = v512_64( 0x2B0199FC2C85B8AA );
ctx->val[7] = v512_64( 0x0EB72DDC81C52CA2 );
}
int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
@@ -43,7 +43,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
__m512i *noncev = (__m512i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i eight = _mm512_set1_epi64( 0x0000000800000000 );
const __m512i eight = v512_64( 0x0000000800000000 );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
@@ -84,14 +84,14 @@ static void sha512256d_4way_init( sha512_4way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = _mm256_set1_epi64x( 0x22312194FC2BF72C );
ctx->val[1] = _mm256_set1_epi64x( 0x9F555FA3C84C64C2 );
ctx->val[2] = _mm256_set1_epi64x( 0x2393B86B6F53B151 );
ctx->val[3] = _mm256_set1_epi64x( 0x963877195940EABD );
ctx->val[4] = _mm256_set1_epi64x( 0x96283EE2A88EFFE3 );
ctx->val[5] = _mm256_set1_epi64x( 0xBE5E1E2553863992 );
ctx->val[6] = _mm256_set1_epi64x( 0x2B0199FC2C85B8AA );
ctx->val[7] = _mm256_set1_epi64x( 0x0EB72DDC81C52CA2 );
ctx->val[0] = v256_64( 0x22312194FC2BF72C );
ctx->val[1] = v256_64( 0x9F555FA3C84C64C2 );
ctx->val[2] = v256_64( 0x2393B86B6F53B151 );
ctx->val[3] = v256_64( 0x963877195940EABD );
ctx->val[4] = v256_64( 0x96283EE2A88EFFE3 );
ctx->val[5] = v256_64( 0xBE5E1E2553863992 );
ctx->val[6] = v256_64( 0x2B0199FC2C85B8AA );
ctx->val[7] = v256_64( 0x0EB72DDC81C52CA2 );
}
int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
@@ -111,7 +111,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i four = _mm256_set1_epi64x( 0x0000000400000000 );
const __m256i four = v256_64( 0x0000000400000000 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(

File diff suppressed because it is too large

View File

@@ -10,6 +10,33 @@
#define SPH_SIZE_shabal512 512
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
__m512i buf[16];
__m512i A[12], B[16], C[16];
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_16way_context __attribute__ ((aligned (64)));
typedef shabal_16way_context shabal256_16way_context;
typedef shabal_16way_context shabal512_16way_context;
void shabal256_16way_init( void *cc );
void shabal256_16way_update( void *cc, const void *data, size_t len );
void shabal256_16way_close( void *cc, void *dst );
void shabal256_16way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_16way_init( void *cc );
void shabal512_16way_update( void *cc, const void *data, size_t len );
void shabal512_16way_close( void *cc, void *dst );
void shabal512_16way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#endif
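// Hedged usage sketch (assumed calling convention, mirroring the other
// N-way hashes in this codebase; data and dst are 16x32-bit interleaved):
//
//    shabal512_16way_context ctx;
//    shabal512_16way_init( &ctx );
//    shabal512_16way_update( &ctx, vdata, 64 );
//    shabal512_16way_close( &ctx, vhash );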
#if defined(__AVX2__)
typedef struct {

View File

@@ -2,7 +2,7 @@
#include "c11-gate.h"
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"

View File

@@ -6,7 +6,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"

View File

@@ -6,7 +6,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"

View File

@@ -2,7 +2,7 @@
#include "x11-gate.h"
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"

View File

@@ -6,7 +6,7 @@
#include <string.h>
#include <stdint.h>
#include <compat/portable_endian.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"

View File

@@ -2,7 +2,7 @@
#include "x11gost-gate.h"
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"

View File

@@ -6,7 +6,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"

View File

@@ -41,9 +41,9 @@ int hex_hash( void* output, const void* input, int thrid )
switch ( algo )
{
case BLAKE:
sph_blake512_init( &ctx.blake );
sph_blake512( &ctx.blake, in, size );
sph_blake512_close( &ctx.blake, hash );
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, in, size );
blake512_close( &ctx.blake, hash );
break;
case BMW:
sph_bmw512_init( &ctx.bmw );

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
@@ -50,7 +50,7 @@ struct TortureGarden
sph_groestl512_context groestl;
sph_fugue512_context fugue;
#endif
sph_blake512_context blake;
blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
sph_jh512_context jh;
@@ -80,9 +80,9 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
switch ( algo )
{
case 0:
sph_blake512_init(&garden->blake);
sph_blake512(&garden->blake, input, 64);
sph_blake512_close(&garden->blake, hash);
blake512_init(&garden->blake);
blake512_update(&garden->blake, input, 64);
blake512_close(&garden->blake, hash);
break;
case 1:
sph_bmw512_init(&garden->bmw);

View File

@@ -5,7 +5,7 @@
#include "simd-utils.h"
#include <stdint.h>
#include <unistd.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
@@ -30,7 +30,6 @@
#if defined (__AVX2__)
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
@@ -199,7 +198,7 @@ union _x16r_context_overlay
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
sph_blake512_context blake;
blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
sph_jh512_context jh;

View File

@@ -64,9 +64,9 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
switch ( algo )
{
case BLAKE:
sph_blake512_init( &ctx.blake );
sph_blake512( &ctx.blake, in, size );
sph_blake512_close( &ctx.blake, hash );
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, in, size );
blake512_close( &ctx.blake, hash );
break;
case BMW:
sph_bmw512_init( &ctx.bmw );
@@ -219,13 +219,13 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce,
mm128_bswap32_80( edata, pdata );
static __thread uint32_t s_ntime = UINT32_MAX;
if ( s_ntime != pdata[17] )
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
uint32_t ntime = swab32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
if ( !opt_quiet && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
}
x16r_prehash( edata, pdata );

View File

@@ -21,7 +21,7 @@ union _x16rv2_context_overlay
sph_echo512_context echo;
sph_fugue512_context fugue;
#endif
sph_blake512_context blake;
blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
sph_jh512_context jh;
@@ -58,9 +58,9 @@ int x16rv2_hash( void* output, const void* input, int thrid )
switch ( algo )
{
case BLAKE:
sph_blake512_init( &ctx.blake );
sph_blake512( &ctx.blake, in, size );
sph_blake512_close( &ctx.blake, hash );
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, in, size );
blake512_close( &ctx.blake, hash );
break;
case BMW:
sph_bmw512_init( &ctx.bmw );

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
@@ -27,6 +27,431 @@
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#if defined(X17_16X32)
union _x17_16way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_2buf_context cube;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
echo_4way_context echo;
#else
hashState_groestl groestl;
sph_shavite512_context shavite;
hashState_echo echo;
#endif
simd_4way_context simd;
hamsi512_8way_context hamsi;
// hamsi512_16x32_context hamsi;
hashState_fugue fugue;
shabal512_16way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_16way_context haval;
} __attribute__ ((aligned (64)));
typedef union _x17_16way_context_overlay x17_16way_context_overlay;
static __thread __m512i x17_16way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
int x17_16way_hash( void *state, const __m512i nonceA, const __m512i nonceB,
int thr_id )
{
uint64_t vhashA[8*16] __attribute__ ((aligned (128)));
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
uint64_t vhashC[8*4] __attribute__ ((aligned (64)));
uint64_t vhashD[8*4] __attribute__ ((aligned (64)));
uint64_t hash00[8] __attribute__ ((aligned (32)));
uint64_t hash01[8] __attribute__ ((aligned (32)));
uint64_t hash02[8] __attribute__ ((aligned (32)));
uint64_t hash03[8] __attribute__ ((aligned (32)));
uint64_t hash04[8] __attribute__ ((aligned (32)));
uint64_t hash05[8] __attribute__ ((aligned (32)));
uint64_t hash06[8] __attribute__ ((aligned (32)));
uint64_t hash07[8] __attribute__ ((aligned (32)));
uint64_t hash08[8] __attribute__ ((aligned (32)));
uint64_t hash09[8] __attribute__ ((aligned (32)));
uint64_t hash10[8] __attribute__ ((aligned (32)));
uint64_t hash11[8] __attribute__ ((aligned (32)));
uint64_t hash12[8] __attribute__ ((aligned (32)));
uint64_t hash13[8] __attribute__ ((aligned (32)));
uint64_t hash14[8] __attribute__ ((aligned (32)));
uint64_t hash15[8] __attribute__ ((aligned (32)));
x17_16way_context_overlay ctx;
memcpy( &ctx.blake, &blake512_8way_ctx, sizeof (blake512_8way_ctx) );
blake512_8way_final_le( &blake512_8way_ctx, vhashA, nonceA,
x17_16way_midstate );
blake512_8way_final_le( &ctx.blake, vhashB, nonceB,
x17_16way_midstate );
bmw512_8way_full( &ctx.bmw, vhashA, vhashA, 64 );
bmw512_8way_full( &ctx.bmw, vhashB, vhashB, 64 );
#if defined(__VAES__)
rintrlv_8x64_4x128( vhashC, vhashD, vhashA, 512 );
groestl512_4way_full( &ctx.groestl, vhashC, vhashC, 64 );
groestl512_4way_full( &ctx.groestl, vhashD, vhashD, 64 );
rintrlv_4x128_8x64( vhashA, vhashC, vhashD, 512 );
rintrlv_8x64_4x128( vhashC, vhashD, vhashB, 512 );
groestl512_4way_full( &ctx.groestl, vhashC, vhashC, 64 );
groestl512_4way_full( &ctx.groestl, vhashD, vhashD, 64 );
rintrlv_4x128_8x64( vhashA, vhashC, vhashD, 512 );
#else
dintrlv_8x64_512( hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07, vhashA );
dintrlv_8x64_512( hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15, vhashB );
groestl512_full( &ctx.groestl, (char*)hash00, (char*)hash00, 512 );
groestl512_full( &ctx.groestl, (char*)hash01, (char*)hash01, 512 );
groestl512_full( &ctx.groestl, (char*)hash02, (char*)hash02, 512 );
groestl512_full( &ctx.groestl, (char*)hash03, (char*)hash03, 512 );
groestl512_full( &ctx.groestl, (char*)hash04, (char*)hash04, 512 );
groestl512_full( &ctx.groestl, (char*)hash05, (char*)hash05, 512 );
groestl512_full( &ctx.groestl, (char*)hash06, (char*)hash06, 512 );
groestl512_full( &ctx.groestl, (char*)hash07, (char*)hash07, 512 );
groestl512_full( &ctx.groestl, (char*)hash08, (char*)hash08, 512 );
groestl512_full( &ctx.groestl, (char*)hash09, (char*)hash09, 512 );
groestl512_full( &ctx.groestl, (char*)hash10, (char*)hash10, 512 );
groestl512_full( &ctx.groestl, (char*)hash11, (char*)hash11, 512 );
groestl512_full( &ctx.groestl, (char*)hash12, (char*)hash12, 512 );
groestl512_full( &ctx.groestl, (char*)hash13, (char*)hash13, 512 );
groestl512_full( &ctx.groestl, (char*)hash14, (char*)hash14, 512 );
groestl512_full( &ctx.groestl, (char*)hash15, (char*)hash15, 512 );
intrlv_8x64_512( vhashA, hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07 );
intrlv_8x64_512( vhashB, hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15 );
#endif
skein512_8way_full( &ctx.skein, vhashA, vhashA, 64 );
skein512_8way_full( &ctx.skein, vhashB, vhashB, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhashA, 64 );
jh512_8way_close( &ctx.jh, vhashA );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhashB, 64 );
jh512_8way_close( &ctx.jh, vhashB );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhashA, 64 );
keccak512_8way_close( &ctx.keccak, vhashA );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhashB, 64 );
keccak512_8way_close( &ctx.keccak, vhashB );
//
rintrlv_8x64_4x128( vhashC, vhashD, vhashA, 512 );
luffa512_4way_full( &ctx.luffa, vhashC, vhashC, 64 );
luffa512_4way_full( &ctx.luffa, vhashD, vhashD, 64 );
cube_4way_2buf_full( &ctx.cube, vhashC, vhashD, 512, vhashC, vhashD, 64 );
#if defined(__VAES__)
shavite512_4way_full( &ctx.shavite, vhashC, vhashC, 64 );
shavite512_4way_full( &ctx.shavite, vhashD, vhashD, 64 );
#else
dintrlv_4x128_512( hash00, hash01, hash02, hash03, vhashC );
dintrlv_4x128_512( hash04, hash05, hash06, hash07, vhashD );
shavite512_full( &ctx.shavite, hash00, hash00, 64 );
shavite512_full( &ctx.shavite, hash01, hash01, 64 );
shavite512_full( &ctx.shavite, hash02, hash02, 64 );
shavite512_full( &ctx.shavite, hash03, hash03, 64 );
shavite512_full( &ctx.shavite, hash04, hash04, 64 );
shavite512_full( &ctx.shavite, hash05, hash05, 64 );
shavite512_full( &ctx.shavite, hash06, hash06, 64 );
shavite512_full( &ctx.shavite, hash07, hash07, 64 );
intrlv_4x128_512( vhashC, hash00, hash01, hash02, hash03 );
intrlv_4x128_512( vhashD, hash04, hash05, hash06, hash07 );
#endif
simd512_4way_full( &ctx.simd, vhashC, vhashC, 64 );
simd512_4way_full( &ctx.simd, vhashD, vhashD, 64 );
#if defined(__VAES__)
echo_4way_full( &ctx.echo, vhashC, 512, vhashC, 64 );
echo_4way_full( &ctx.echo, vhashD, 512, vhashD, 64 );
rintrlv_4x128_8x64( vhashA, vhashC, vhashD, 512 );
#else
dintrlv_4x128_512( hash00, hash01, hash02, hash03, vhashC );
dintrlv_4x128_512( hash04, hash05, hash06, hash07, vhashD );
echo_full( &ctx.echo, (BitSequence *)hash00, 512,
(const BitSequence *)hash00, 64 );
echo_full( &ctx.echo, (BitSequence *)hash01, 512,
(const BitSequence *)hash01, 64 );
echo_full( &ctx.echo, (BitSequence *)hash02, 512,
(const BitSequence *)hash02, 64 );
echo_full( &ctx.echo, (BitSequence *)hash03, 512,
(const BitSequence *)hash03, 64 );
echo_full( &ctx.echo, (BitSequence *)hash04, 512,
(const BitSequence *)hash04, 64 );
echo_full( &ctx.echo, (BitSequence *)hash05, 512,
(const BitSequence *)hash05, 64 );
echo_full( &ctx.echo, (BitSequence *)hash06, 512,
(const BitSequence *)hash06, 64 );
echo_full( &ctx.echo, (BitSequence *)hash07, 512,
(const BitSequence *)hash07, 64 );
intrlv_8x64_512( vhashA, hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07 );
#endif
//
rintrlv_8x64_4x128( vhashC, vhashD, vhashB, 512 );
luffa512_4way_full( &ctx.luffa, vhashC, vhashC, 64 );
luffa512_4way_full( &ctx.luffa, vhashD, vhashD, 64 );
cube_4way_2buf_full( &ctx.cube, vhashC, vhashD, 512, vhashC, vhashD, 64 );
#if defined(__VAES__)
shavite512_4way_full( &ctx.shavite, vhashC, vhashC, 64 );
shavite512_4way_full( &ctx.shavite, vhashD, vhashD, 64 );
#else
dintrlv_4x128_512( hash08, hash09, hash10, hash11, vhashC );
dintrlv_4x128_512( hash12, hash13, hash14, hash15, vhashD );
shavite512_full( &ctx.shavite, hash08, hash08, 64 );
shavite512_full( &ctx.shavite, hash09, hash09, 64 );
shavite512_full( &ctx.shavite, hash10, hash10, 64 );
shavite512_full( &ctx.shavite, hash11, hash11, 64 );
shavite512_full( &ctx.shavite, hash12, hash12, 64 );
shavite512_full( &ctx.shavite, hash13, hash13, 64 );
shavite512_full( &ctx.shavite, hash14, hash14, 64 );
shavite512_full( &ctx.shavite, hash15, hash15, 64 );
intrlv_4x128_512( vhashC, hash08, hash09, hash10, hash11 );
intrlv_4x128_512( vhashD, hash12, hash13, hash14, hash15 );
#endif
simd512_4way_full( &ctx.simd, vhashC, vhashC, 64 );
simd512_4way_full( &ctx.simd, vhashD, vhashD, 64 );
#if defined(__VAES__)
echo_4way_full( &ctx.echo, vhashC, 512, vhashC, 64 );
echo_4way_full( &ctx.echo, vhashD, 512, vhashD, 64 );
rintrlv_4x128_8x64( vhashB, vhashC, vhashD, 512 );
#else
dintrlv_4x128_512( hash08, hash09, hash10, hash11, vhashC );
dintrlv_4x128_512( hash12, hash13, hash14, hash15, vhashD );
echo_full( &ctx.echo, (BitSequence *)hash08, 512,
(const BitSequence *)hash08, 64 );
echo_full( &ctx.echo, (BitSequence *)hash09, 512,
(const BitSequence *)hash09, 64 );
echo_full( &ctx.echo, (BitSequence *)hash10, 512,
(const BitSequence *)hash10, 64 );
echo_full( &ctx.echo, (BitSequence *)hash11, 512,
(const BitSequence *)hash11, 64 );
echo_full( &ctx.echo, (BitSequence *)hash12, 512,
(const BitSequence *)hash12, 64 );
echo_full( &ctx.echo, (BitSequence *)hash13, 512,
(const BitSequence *)hash13, 64 );
echo_full( &ctx.echo, (BitSequence *)hash14, 512,
(const BitSequence *)hash14, 64 );
echo_full( &ctx.echo, (BitSequence *)hash15, 512,
(const BitSequence *)hash15, 64 );
intrlv_8x64_512( vhashB, hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15 );
#endif
//
/*
intrlv_16x32( vhashA, hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07,
hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15, 512 );
hamsi512_16x32_full( &ctx.hamsi, vhashA, vhashA, 64 );
dintrlv_16x32( hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07,
hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15, vhashA, 512 );
*/
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhashA, 64 );
hamsi512_8way_close( &ctx.hamsi, vhashA );
dintrlv_8x64_512( hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07, vhashA );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhashB, 64 );
hamsi512_8way_close( &ctx.hamsi, vhashB );
dintrlv_8x64_512( hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15, vhashB );
fugue512_full( &ctx.fugue, hash00, hash00, 64 );
fugue512_full( &ctx.fugue, hash01, hash01, 64 );
fugue512_full( &ctx.fugue, hash02, hash02, 64 );
fugue512_full( &ctx.fugue, hash03, hash03, 64 );
fugue512_full( &ctx.fugue, hash04, hash04, 64 );
fugue512_full( &ctx.fugue, hash05, hash05, 64 );
fugue512_full( &ctx.fugue, hash06, hash06, 64 );
fugue512_full( &ctx.fugue, hash07, hash07, 64 );
fugue512_full( &ctx.fugue, hash08, hash08, 64 );
fugue512_full( &ctx.fugue, hash09, hash09, 64 );
fugue512_full( &ctx.fugue, hash10, hash10, 64 );
fugue512_full( &ctx.fugue, hash11, hash11, 64 );
fugue512_full( &ctx.fugue, hash12, hash12, 64 );
fugue512_full( &ctx.fugue, hash13, hash13, 64 );
fugue512_full( &ctx.fugue, hash14, hash14, 64 );
fugue512_full( &ctx.fugue, hash15, hash15, 64 );
intrlv_16x32_512( vhashA, hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07,
hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15 );
shabal512_16way_init( &ctx.shabal );
shabal512_16way_update( &ctx.shabal, vhashA, 64 );
shabal512_16way_close( &ctx.shabal, vhashA );
dintrlv_16x32_512( hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07,
hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15, vhashA );
sph_whirlpool512_full( &ctx.whirlpool, hash00, hash00, 64 );
sph_whirlpool512_full( &ctx.whirlpool, hash01, hash01, 64 );
sph_whirlpool512_full( &ctx.whirlpool, hash02, hash02, 64 );
sph_whirlpool512_full( &ctx.whirlpool, hash03, hash03, 64 );
sph_whirlpool512_full( &ctx.whirlpool, hash04, hash04, 64 );
sph_whirlpool512_full( &ctx.whirlpool, hash05, hash05, 64 );
sph_whirlpool512_full( &ctx.whirlpool, hash06, hash06, 64 );
sph_whirlpool512_full( &ctx.whirlpool, hash07, hash07, 64 );
sph_whirlpool512_full( &ctx.whirlpool, hash08, hash08, 64 );
sph_whirlpool512_full( &ctx.whirlpool, hash09, hash09, 64 );
sph_whirlpool512_full( &ctx.whirlpool, hash10, hash10, 64 );
sph_whirlpool512_full( &ctx.whirlpool, hash11, hash11, 64 );
sph_whirlpool512_full( &ctx.whirlpool, hash12, hash12, 64 );
sph_whirlpool512_full( &ctx.whirlpool, hash13, hash13, 64 );
sph_whirlpool512_full( &ctx.whirlpool, hash14, hash14, 64 );
sph_whirlpool512_full( &ctx.whirlpool, hash15, hash15, 64 );
intrlv_8x64_512( vhashA, hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07 );
intrlv_8x64_512( vhashB, hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhashA, 64 );
sha512_8way_close( &ctx.sha512, vhashA );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhashB, 64 );
sha512_8way_close( &ctx.sha512, vhashB );
dintrlv_8x64_512( hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07, vhashA );
dintrlv_8x64_512( hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15, vhashB );
intrlv_16x32_512( vhashA, hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07,
hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15 );
haval256_5_16way_init( &ctx.haval );
haval256_5_16way_update( &ctx.haval, vhashA, 64 );
haval256_5_16way_close( &ctx.haval, state );
return 1;
}
int scanhash_x17_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash32[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
__m128i edata[5] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
uint32_t *hash32_d7 = &(hash32[7*16]);
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i nonceA, nonceB;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const __m512i sixteen = v512_64( 16 );
const bool bench = opt_benchmark;
// Convert LE32 to LE64 by swapping the 32-bit halves of each 64-bit lane.
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
blake512_8way_prehash_le( &blake512_8way_ctx, x17_16way_midstate, vdata );
nonceA = _mm512_add_epi32( casti_m512i( vdata, 9 ),
_mm512_set_epi64( 7, 6, 5, 4, 3, 2, 1, 0 ) );
nonceB = _mm512_add_epi32( nonceA, v512_64( 8 ) );
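// nonceA seeds lanes 0-7 and nonceB lanes 8-15, one per 8x64 Blake-512 pass.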
do
{
if ( likely( x17_16way_hash( hash32, nonceA, nonceB, thr_id ) ) )
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) ) )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
nonceA = _mm512_add_epi32( nonceA, sixteen );
nonceB = _mm512_add_epi32( nonceB, sixteen );
n += 16;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(X17_8WAY)
union _x17_8way_context_overlay

View File

@@ -2,7 +2,10 @@
bool register_x17_algo( algo_gate_t* gate )
{
#if defined (X17_8WAY)
#if defined (X17_16X32)
gate->scanhash = (void*)&scanhash_x17_16way;
// gate->hash = (void*)&x17_16way_hash;
#elif defined (X17_8WAY)
gate->scanhash = (void*)&scanhash_x17_8way;
gate->hash = (void*)&x17_8way_hash;
#elif defined (X17_4WAY)

View File

@@ -6,13 +6,21 @@
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X17_8WAY 1
#define X17_16X32 1
#elif defined(__AVX2__) && defined(__AES__)
#define X17_4WAY 1
#define X17_8X32 1
#endif
bool register_x17_algo( algo_gate_t* gate );
#if defined(X17_8WAY)
#if defined(X17_8WAY) || defined(X17_16X32)
int scanhash_x17_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
//int x17_16way_hash( void *state, const void *input, int thr_id );
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -6,7 +6,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
@@ -32,8 +32,8 @@
union _x17_context_overlay
{
sph_blake512_context blake;
sph_bmw512_context bmw;
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
@@ -60,13 +60,10 @@ typedef union _x17_context_overlay x17_context_overlay;
int x17_hash(void *output, const void *input, int thr_id )
{
// unsigned char hash[64 * 4] __attribute__((aligned(64))) = {0};
unsigned char hash[64] __attribute__((aligned(64)));
x17_context_overlay ctx;
sph_blake512_init(&ctx.blake);
sph_blake512(&ctx.blake, input, 80);
sph_blake512_close(&ctx.blake, hash);
blake512_full( &ctx.blake, hash, input, 80 );
sph_bmw512_init(&ctx.bmw);
sph_bmw512(&ctx.bmw, (const void*) hash, 64);
@@ -95,19 +92,15 @@ int x17_hash(void *output, const void *input, int thr_id )
luffa_full( &ctx.luffa, (BitSequence*)hash, 512,
(const BitSequence*)hash, 64 );
// 8 Cube
cubehash_full( &ctx.cube, (byte*) hash, 512, (const byte*)hash, 64 );
// 9 Shavite
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
// 10 Simd
simd_full( &ctx.simd, (BitSequence*)hash,
(const BitSequence*)hash, 512 );
//11---echo---
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence *)hash, 512,
(const BitSequence *)hash, 64 );
@@ -117,25 +110,20 @@ int x17_hash(void *output, const void *input, int thr_id )
sph_echo512_close( &ctx.echo, hash );
#endif
// X13 algos
// 12 Hamsi
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hash, 64 );
sph_hamsi512_close( &ctx.hamsi, hash );
// 13 Fugue
#if defined(__AES__)
fugue512_full( &ctx.fugue, hash, hash, 64 );
#else
sph_fugue512_full( &ctx.fugue, hash, hash, 64 );
#endif
// X14 Shabal
sph_shabal512_init( &ctx.shabal );
sph_shabal512(&ctx.shabal, hash, 64);
sph_shabal512_close( &ctx.shabal, hash );
// X15 Whirlpool
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash );

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/jh/jh-hash-4way.h"

View File

@@ -1,5 +1,5 @@
#include "x22i-gate.h"
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"

View File

@@ -2,7 +2,7 @@
#if !( defined(X22I_8WAY) || defined(X22I_4WAY) )
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -33,8 +33,8 @@
union _x22i_context_overlay
{
sph_blake512_context blake;
sph_bmw512_context bmw;
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
@@ -67,9 +67,9 @@ int x22i_hash( void *output, const void *input, int thrid )
unsigned char hash2[65] __attribute__((aligned(64))) = {0};
x22i_context_overlay ctx;
sph_blake512_init(&ctx.blake);
sph_blake512(&ctx.blake, input, 80);
sph_blake512_close(&ctx.blake, hash);
blake512_init(&ctx.blake);
blake512_update(&ctx.blake, input, 80);
blake512_close(&ctx.blake, hash);
sph_bmw512_init(&ctx.bmw);
sph_bmw512(&ctx.bmw, (const void*) hash, 64);

View File

@@ -1,5 +1,5 @@
#include "x22i-gate.h"
#include "algo/blake/blake-hash-4way.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
@@ -9,7 +9,7 @@
#include "algo/sha/sha512-hash.h"
#include "algo/sha/sha256-hash.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/blake/blake2s-hash-4way.h"
#include "algo/blake/blake2s-hash.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/luffa/luffa_for_sse2.h"

View File

@@ -2,7 +2,7 @@
#if !( defined(X25X_8WAY) || defined(X25X_4WAY) )
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -35,7 +35,7 @@
union _x25x_context_overlay
{
sph_blake512_context blake;
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
hashState_groestl groestl;
@@ -71,9 +71,9 @@ int x25x_hash( void *output, const void *input, int thrid )
unsigned char hash[25][64] __attribute__((aligned(64))) = {0};
x25x_context_overlay ctx;
sph_blake512_init(&ctx.blake);
sph_blake512(&ctx.blake, input, 80);
sph_blake512_close(&ctx.blake, &hash[0] );
blake512_init(&ctx.blake);
blake512_update(&ctx.blake, input, 80);
blake512_close(&ctx.blake, &hash[0] );
sph_bmw512_init(&ctx.bmw);
sph_bmw512(&ctx.bmw, (const void*) &hash[0], 64);

View File

@@ -5,6 +5,6 @@
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 --param=evrp-mode=legacy -march=native -Wall -D_WIN32_WINNT=0x0601" ./configure --with-curl
CFLAGS="-O3 -march=native -Wall -D_WIN32_WINNT=0x0601" ./configure --with-curl
make -j 4
strip -s cpuminer

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.2.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.3.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.23.2'
PACKAGE_STRING='cpuminer-opt 3.23.2'
PACKAGE_VERSION='3.23.3'
PACKAGE_STRING='cpuminer-opt 3.23.3'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.23.2 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.23.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1432,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.23.2:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.23.3:";;
esac
cat <<\_ACEOF
@@ -1538,7 +1538,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.23.2
cpuminer-opt configure 3.23.3
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.23.2, which was
It was created by cpuminer-opt $as_me 3.23.3, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.23.2'
VERSION='3.23.3'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.23.2, which was
This file was extended by cpuminer-opt $as_me 3.23.3, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 3.23.2
cpuminer-opt config.status 3.23.3
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.23.2])
AC_INIT([cpuminer-opt], [3.23.3])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.23.2.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.23.3.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.23.2'
PACKAGE_STRING='cpuminer-opt 3.23.2'
PACKAGE_VERSION='3.23.3'
PACKAGE_STRING='cpuminer-opt 3.23.3'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.23.2 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.23.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.23.2:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.23.3:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.23.2
cpuminer-opt configure 3.23.3
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.23.2, which was
It was created by cpuminer-opt $as_me 3.23.3, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.23.2'
VERSION='3.23.3'
cat >>confdefs.h <<_ACEOF
@@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.23.2, which was
This file was extended by cpuminer-opt $as_me 3.23.3, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6784,7 +6784,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.23.2
cpuminer-opt config.status 3.23.3
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1586,39 +1586,43 @@ start:
else
new_work = false;
if ( new_work && !opt_quiet )
if ( new_work )
{
double miner_hr = 0.;
double net_hr = net_hashrate;
double nd = net_diff * exp32;
char net_hr_units[4] = {0};
char miner_hr_units[4] = {0};
char net_ttf[32];
char miner_ttf[32];
if ( !opt_quiet )
{
double miner_hr = 0.;
double net_hr = net_hashrate;
double nd = net_diff * exp32;
char net_hr_units[4] = {0};
char miner_hr_units[4] = {0};
char net_ttf[32];
char miner_ttf[32];
pthread_mutex_lock( &stats_lock );
pthread_mutex_lock( &stats_lock );
for ( int i = 0; i < opt_n_threads; i++ )
miner_hr += thr_hashrates[i];
global_hashrate = miner_hr;
for ( int i = 0; i < opt_n_threads; i++ )
miner_hr += thr_hashrates[i];
global_hashrate = miner_hr;
pthread_mutex_unlock( &stats_lock );
pthread_mutex_unlock( &stats_lock );
if ( net_hr > 0. )
sprintf_et( net_ttf, nd / net_hr );
else
sprintf( net_ttf, "NA" );
if ( miner_hr > 0. )
sprintf_et( miner_ttf, nd / miner_hr );
else
sprintf( miner_ttf, "NA" );
if ( net_hr > 0. )
sprintf_et( net_ttf, nd / net_hr );
else
sprintf( net_ttf, "NA" );
if ( miner_hr > 0. )
sprintf_et( miner_ttf, nd / miner_hr );
else
sprintf( miner_ttf, "NA" );
scale_hash_for_display ( &miner_hr, miner_hr_units );
scale_hash_for_display ( &net_hr, net_hr_units );
applog2( LOG_INFO,
scale_hash_for_display ( &miner_hr, miner_hr_units );
scale_hash_for_display ( &net_hr, net_hr_units );
applog2( LOG_INFO,
"Miner TTF @ %.2f %sh/s %s, Net TTF @ %.2f %sh/s %s",
miner_hr, miner_hr_units, miner_ttf, net_hr,
net_hr_units, net_ttf );
}
restart_threads();
}
} // rc
@@ -1872,10 +1876,14 @@ bool submit_solution( struct work *work, const void *hash,
uint32_t* t = (uint32_t*)work->target;
uint32_t* d = (uint32_t*)work->data;
applog( LOG_INFO, "Data[ 0: 9]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
applog( LOG_INFO, "Data[10:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19] );
applog( LOG_INFO, "Hash[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0] );
applog( LOG_INFO, "Targ[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0] );
applog( LOG_INFO, "Data[ 0: 9]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x",
d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
applog( LOG_INFO, "Data[10:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x",
d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19] );
applog( LOG_INFO, "Hash[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x",
h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0] );
applog( LOG_INFO, "Targ[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x",
t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0] );
}
}
return true;
@@ -2226,7 +2234,7 @@ static void *miner_thread( void *userdata )
goto out;
}
g_work_time = time(NULL);
restart_threads();
// restart_threads();
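// thread restarts are now issued centrally from the new-work path above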
}
pthread_rwlock_unlock( &g_work_lock );
@@ -2823,44 +2831,45 @@ static bool cpu_capability( bool display_only )
{
char cpu_brand[0x40];
bool cpu_has_sse2 = has_sse2();
bool cpu_has_aes = has_aes_ni();
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_sha = has_sha();
bool cpu_has_avx512 = has_avx512();
bool cpu_has_vaes = has_vaes();
bool cpu_has_avx10 = has_avx10();
bool sw_has_aes = false;
bool cpu_has_aes = has_aes_ni();
bool cpu_has_vaes = has_vaes();
bool cpu_has_sha = has_sha();
bool cpu_has_sha512 = has_sha512();
bool sw_has_sse2 = false;
bool sw_has_sse42 = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_avx512 = false;
bool sw_has_sha = false;
bool sw_has_aes = false;
bool sw_has_vaes = false;
bool sw_has_sha = false;
bool sw_has_sha512 = false;
set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
bool use_aes;
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features );
bool use_sse2;
bool use_sse42;
bool use_avx;
bool use_avx2;
bool use_avx512;
bool use_sha;
bool use_aes;
bool use_vaes;
bool use_sha;
bool use_sha512;
bool use_none;
#ifdef __AES__
sw_has_aes = true;
#endif
#ifdef __SSE2__
sw_has_sse2 = true;
#endif
@@ -2876,17 +2885,18 @@ static bool cpu_capability( bool display_only )
#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
sw_has_avx512 = true;
#endif
#ifdef __SHA__
sw_has_sha = true;
#ifdef __AES__
sw_has_aes = true;
#endif
#ifdef __VAES__
sw_has_vaes = true;
#endif
// #if !((__AES__) || (__SSE2__))
// printf("Neither __AES__ nor __SSE2__ defined.\n");
// #endif
#ifdef __SHA__
sw_has_sha = true;
#endif
#ifdef __SHA512__
sw_has_sha512 = true;
#endif
cpu_brand_string( cpu_brand );
printf( "CPU: %s\n", cpu_brand );
@@ -2909,7 +2919,8 @@ static bool cpu_capability( bool display_only )
else if ( cpu_has_sse2 ) printf( " SSE2 " );
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha ) printf( " SHA" );
if ( cpu_has_sha512 ) printf( " SHA512" );
else if ( cpu_has_sha ) printf( " SHA" );
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d",
avx10_version(), avx10_vector_length() );
@@ -2921,7 +2932,8 @@ static bool cpu_capability( bool display_only )
else if ( sw_has_sse2 ) printf( " SSE2 " );
if ( sw_has_vaes ) printf( " VAES" );
else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha ) printf( " SHA" );
if ( sw_has_sha512 ) printf( " SHA512" );
else if ( sw_has_sha ) printf( " SHA" );
if ( !display_only )
{
@@ -2935,7 +2947,8 @@ static bool cpu_capability( bool display_only )
else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_vaes ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" );
if ( algo_has_sha512 ) printf( " SHA512" );
else if ( algo_has_sha ) printf( " SHA" );
}
}
printf("\n");
@@ -2970,16 +2983,17 @@ static bool cpu_capability( bool display_only )
}
// Determine mining options
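// A feature is selected only when the CPU, the build (SW), and the algo all support it.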
use_sse2 = cpu_has_sse2 && algo_has_sse2;
use_sse2 = cpu_has_sse2 && sw_has_sse2 && algo_has_sse2;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
use_avx = cpu_has_avx && sw_has_avx && algo_has_avx;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes;
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
use_sha512 = cpu_has_sha512 && sw_has_sha512 && algo_has_sha512;
use_none = !( use_sse2 || use_sse42 || use_avx || use_aes || use_avx512
|| use_avx2 || use_sha || use_vaes );
|| use_avx2 || use_sha || use_vaes || use_sha512 );
// Display best options
printf( "\nStarting miner with" );
@@ -2993,7 +3007,8 @@ static bool cpu_capability( bool display_only )
else if ( use_sse2 ) printf( " SSE2" );
if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" );
if ( use_sha ) printf( " SHA" );
if ( use_sha512 ) printf( " SHA512" );
else if ( use_sha ) printf( " SHA" );
}
printf( "...\n\n" );

View File

@@ -738,10 +738,10 @@ static inline void extr_lane_8x32( void *d, const void *s,
// Combine byte swap & broadcast in one permute
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
const __m256i c0 = _mm256_set1_epi32( 0x00010203 );
const __m256i c1 = _mm256_set1_epi32( 0x04050607 );
const __m256i c2 = _mm256_set1_epi32( 0x08090a0b );
const __m256i c3 = _mm256_set1_epi32( 0x0c0d0e0f );
const __m256i c0 = v256_32( 0x00010203 );
const __m256i c1 = v256_32( 0x04050607 );
const __m256i c2 = v256_32( 0x08090a0b );
const __m256i c3 = v256_32( 0x0c0d0e0f );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
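A control word like c0 above reverses the source bytes while replicating them, so one shuffle performs the byte swap and the broadcast together. A minimal sketch under the same AVX2 assumptions (the helper name is illustrative, not from the source):

#include <immintrin.h>
// Every destination word receives bytes 3..0 of source word 0:
// byte swap and broadcast in a single shuffle.
static inline __m256i bcast_word0_bswap32( const __m128i s )
{
   const __m256i c0 = _mm256_set1_epi32( 0x00010203 );
   return _mm256_shuffle_epi8( _mm256_broadcastsi128_si256( s ), c0 );
}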
@@ -796,7 +796,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m256i c1 = _mm256_set1_epi32( 1 );
const __m256i c1 = v256_32( 1 );
const __m256i c2 = _mm256_add_epi32( c1, c1 );
const __m256i c3 = _mm256_add_epi32( c2, c1 );
@@ -1244,10 +1244,10 @@ static inline void extr_lane_16x32( void *d, const void *s,
// Combine byte swap & broadcast in one permute
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
const __m512i c0 = _mm512_set1_epi32( 0x00010203 );
const __m512i c1 = _mm512_set1_epi32( 0x04050607 );
const __m512i c2 = _mm512_set1_epi32( 0x08090a0b );
const __m512i c3 = _mm512_set1_epi32( 0x0c0d0e0f );
const __m512i c0 = v512_32( 0x00010203 );
const __m512i c1 = v512_32( 0x04050607 );
const __m512i c2 = v512_32( 0x08090a0b );
const __m512i c3 = v512_32( 0x0c0d0e0f );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
@@ -1302,7 +1302,7 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m512i c1 = _mm512_set1_epi32( 1 );
const __m512i c1 = v512_32( 1 );
const __m512i c2 = _mm512_add_epi32( c1, c1 );
const __m512i c3 = _mm512_add_epi32( c2, c1 );
__m128i s0 = casti_m128i( src,0 );
@@ -1566,8 +1566,8 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
const __m256i c0 = _mm256_set1_epi64x( 0x0405060700010203 );
const __m256i c1 = _mm256_set1_epi64x( 0x0c0d0e0f08090a0b );
const __m256i c0 = v256_64( 0x0405060700010203 );
const __m256i c1 = v256_64( 0x0c0d0e0f08090a0b );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
@@ -1958,16 +1958,16 @@ static inline void mm512_intrlv80_8x64( void *dst, const void *src )
__m512i *d = (__m512i*)dst;
const uint64_t *s = (const uint64_t*)src;
d[0] = _mm512_set1_epi64( s[0] );
d[1] = _mm512_set1_epi64( s[1] );
d[2] = _mm512_set1_epi64( s[2] );
d[3] = _mm512_set1_epi64( s[3] );
d[4] = _mm512_set1_epi64( s[4] );
d[5] = _mm512_set1_epi64( s[5] );
d[6] = _mm512_set1_epi64( s[6] );
d[7] = _mm512_set1_epi64( s[7] );
d[8] = _mm512_set1_epi64( s[8] );
d[9] = _mm512_set1_epi64( s[9] );
d[0] = v512_64( s[0] );
d[1] = v512_64( s[1] );
d[2] = v512_64( s[2] );
d[3] = v512_64( s[3] );
d[4] = v512_64( s[4] );
d[5] = v512_64( s[5] );
d[6] = v512_64( s[6] );
d[7] = v512_64( s[7] );
d[8] = v512_64( s[8] );
d[9] = v512_64( s[9] );
}
// byte swap and broadcast to all lanes
@@ -1977,8 +1977,8 @@ static inline void mm512_intrlv80_8x64( void *dst, const void *src )
// Combine byte swap & broadcast in one permute
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
const __m512i c0 = _mm512_set1_epi64( 0x0405060700010203 );
const __m512i c1 = _mm512_set1_epi64( 0x0c0d0e0f08090a0b );
const __m512i c0 = v512_64( 0x0405060700010203 );
const __m512i c1 = v512_64( 0x0c0d0e0f08090a0b );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
@@ -2013,7 +2013,7 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m512i c1 = _mm512_set1_epi64( 1 );
const __m512i c1 = v512_64( 1 );
__m128i s0 = casti_m128i( src,0 );
__m128i s1 = casti_m128i( src,1 );
__m128i s2 = casti_m128i( src,2 );

View File

@@ -3,7 +3,7 @@
#if defined(__SSE2__)
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
//
// 128 bit SSE vectors
//
@@ -20,9 +20,9 @@
// define a local const for repeated references to the same constant.
//
// One common use for simd constants is as a control index for vector
// instructions like blend and shuffle. Although the ultimate instruction
// may execute in a single clock cycle, generating the control index adds
// several more cycles to the entire operation.
// shuffle instructions. Although the ultimate instruction may execute in a
// single clock cycle, generating the control index adds several more cycles
// to the entire operation.
//
// All of the utilities here assume all data is in registers except
// in rare cases where arguments are pointers.
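Concretely, hoisting the control index pays its generation cost once per function instead of once per shuffle. A minimal sketch (assumes SSSE3 for _mm_shuffle_epi8; the helper name is illustrative):

#include <immintrin.h>
// One local const serves all four shuffles; the index is generated once.
static inline void v128_bswap32_x4( __m128i v[4] )
{
   const __m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
                                       0x0405060700010203 );
   v[0] = _mm_shuffle_epi8( v[0], ctl );
   v[1] = _mm_shuffle_epi8( v[1], ctl );
   v[2] = _mm_shuffle_epi8( v[2], ctl );
   v[3] = _mm_shuffle_epi8( v[3], ctl );
}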
@@ -32,7 +32,7 @@
// Intrinsics automatically promote from REX to VEX when AVX is available
// but ASM needs to be done manually.
//
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Used instead of casting.
@@ -43,8 +43,8 @@ typedef union
} __attribute__ ((aligned (16))) m128_ovly;
#define v128_64(i) _mm_set1_epi64x(i)
#define v128_32(i) _mm_set1_epi32(i)
#define v128_64(i64) _mm_set1_epi64x(i64)
#define v128_32(i32) _mm_set1_epi32(i32)
// Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements
// that make these functions either unnecessary or inefficient.
@@ -81,8 +81,6 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
// Pseudo constants
#define m128_zero _mm_setzero_si128()
#define m128_one_128 mm128_mov64_128( 1 )
//#define m128_one_64 _mm_set1_epi64x( 1 )
#define m128_one_32 _mm_set1_epi32( 1 )
// ASM avoids the need to initialize the return variable just to silence a compiler warning.
// Macro abstracts function parentheses to look like an identifier.
@@ -100,7 +98,7 @@ static inline __m128i mm128_neg1_fn()
#if defined(__SSE4_1__)
/////////////////////////////
/////////////////////////////////////////////////////////////
//
// _mm_insert_ps( __m128i v1, __m128i v2, imm8 c )
//

View File

@@ -90,10 +90,7 @@ typedef union
// code and therefore can't be used as compile time initializers.
#define m256_zero _mm256_setzero_si256()
//#define m256_one_256 mm256_mov64_256( 1 )
#define m256_one_128 mm256_bcast_m128( m128_one_128 )
#define m256_one_64 _mm256_set1_epi64x( 1 )
#define m256_one_32 _mm256_set1_epi32( 1 )
static inline __m256i mm256_neg1_fn()
{

View File

@@ -97,8 +97,8 @@ typedef union
uint64_t u64[8];
} __attribute__ ((aligned (64))) m512_ovly;
#define v512_64(i) _mm512_set1_epi64(i)
#define v512_32(i) _mm512_set1_epi32(i)
#define v512_64(i64) _mm512_set1_epi64(i64)
#define v512_32(i32) _mm512_set1_epi32(i32)
// A simple 128 bit permute, using function instead of macro avoids
// problems if the v arg is passed as an expression.
@@ -118,9 +118,6 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
// Pseudo constants.
#define m512_zero _mm512_setzero_si512()
// Deprecated
#define m512_one_64 _mm512_set1_epi64( 1 )
#define m512_one_32 _mm512_set1_epi32( 1 )
// use asm to avoid compiler warning for uninitialized local
static inline __m512i mm512_neg1_fn()

View File

@@ -516,6 +516,17 @@ static inline bool has_sha()
#endif
}
static inline bool has_sha512()
{
#ifdef __arm__
return false;
#else
unsigned int cpu_info[4] = { 0 };
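// subleaf 1 of the extended features leaf reports SHA512 in EAX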
cpuid( EXTENDED_FEATURES, 1, cpu_info );
return cpu_info[ EAX_Reg ] & SHA512_Flag;
#endif
}
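On the consuming side an algo opts in through its gate. A hypothetical registration sketch (register_myalgo and scanhash_myalgo are illustrative names, not from this commit; SHA512_OPT and the gate fields follow the conventions shown elsewhere in this diff):

// Advertise SHA512 so cpu_capability() can detect, report and select it.
bool register_myalgo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT | SHA512_OPT;
   gate->scanhash = (void*)&scanhash_myalgo;
   return true;
}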
static inline bool has_sse2()
{
#ifdef __arm__