mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
1 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
bc5a5c6df8 |
12
Makefile.am
12
Makefile.am
@@ -36,21 +36,17 @@ cpuminer_SOURCES = \
|
||||
algo/argon2/argon2d/argon2d/argon2d_thread.c \
|
||||
algo/argon2/argon2d/argon2d/encoding.c \
|
||||
algo/blake/sph_blake.c \
|
||||
algo/blake/blake256-hash-4way.c \
|
||||
algo/blake/blake512-hash-4way.c \
|
||||
algo/blake/blake256-hash.c \
|
||||
algo/blake/blake512-hash.c \
|
||||
algo/blake/blake-gate.c \
|
||||
algo/blake/blake.c \
|
||||
algo/blake/blake-4way.c \
|
||||
algo/blake/sph_blake2b.c \
|
||||
algo/blake/sph-blake2s.c \
|
||||
algo/blake/blake2s-hash-4way.c \
|
||||
algo/blake/blake2s-hash.c \
|
||||
algo/blake/blake2s.c \
|
||||
algo/blake/blake2s-gate.c \
|
||||
algo/blake/blake2s-4way.c \
|
||||
algo/blake/blake2b-hash-4way.c \
|
||||
algo/blake/blake2b-hash.c \
|
||||
algo/blake/blake2b.c \
|
||||
algo/blake/blake2b-gate.c \
|
||||
algo/blake/blake2b-4way.c \
|
||||
algo/blake/blakecoin-gate.c \
|
||||
algo/blake/mod_blakecoin.c \
|
||||
algo/blake/blakecoin.c \
|
||||
|
@@ -65,6 +65,14 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.23.3
|
||||
|
||||
#400: Removed excessive thread restarts when mining solo.
|
||||
Fixed build_msys2.sh for gcc-13 by removing unsupported option "--param=evrp-mode=legacy" from CFLAGS.
|
||||
Added CPUID detection and reporting of CPUs and SW builds supporting SHA512 extension.
|
||||
Added prototype of sha-512 using SHA512 intrinsics, untested.
|
||||
Other improvements and code cleanup.
|
||||
|
||||
v3.23.2
|
||||
|
||||
sha256dt, sha256t & sha256d +10% with SHA, small improvement with AVX2.
|
||||
|
@@ -97,6 +97,7 @@ typedef uint32_t set_t;
|
||||
#define SHA_OPT 0x20 // Zen1, Icelake (deprecated)
|
||||
#define AVX512_OPT 0x40 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
|
||||
#define VAES_OPT 0x80 // Icelake, Zen3
|
||||
#define SHA512_OPT 0x100 // Lunar Lake, Arrow Lake
|
||||
|
||||
// AVX10 does not have explicit algo features:
|
||||
// AVX10_512 is compatible with AVX512 + VAES
|
||||
|
@@ -1,5 +1,5 @@
|
||||
#include "blake-gate.h"
|
||||
#include "blake-hash-4way.h"
|
||||
#include "blake256-hash.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,5 @@
|
||||
#ifndef BLAKE_HASH_4WAY__
|
||||
#define BLAKE_HASH_4WAY__ 1
|
||||
#ifndef BLAKE256_HASH__
|
||||
#define BLAKE256_HASH__ 1
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
@@ -11,13 +11,6 @@
|
||||
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
const uint32_t T0, const uint32_t T1, int rounds );
|
||||
|
||||
/////////////////////////
|
||||
//
|
||||
// Blake-512 1 way SSE2
|
||||
|
||||
void blake512_transform_le( uint64_t *H, const uint64_t *buf,
|
||||
const uint64_t T0, const uint64_t T1 );
|
||||
|
||||
//////////////////////////
|
||||
//
|
||||
// Blake-256 4 way SSE2
|
||||
@@ -86,30 +79,6 @@ void blake256r8_8way_init(void *cc);
|
||||
void blake256r8_8way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_8way_close(void *cc, void *dst);
|
||||
|
||||
// Blake-512 4 way AVX2
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16];
|
||||
__m256i H[8];
|
||||
__m256i S[4];
|
||||
size_t ptr;
|
||||
uint64_t T0, T1;
|
||||
} blake_4way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_4way_big_context blake512_4way_context;
|
||||
|
||||
void blake512_4way_init( blake_4way_big_context *sc );
|
||||
void blake512_4way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_4way_close( void *cc, void *dst );
|
||||
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
const void *data );
|
||||
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||
const __m256i nonce, const __m256i *midstate );
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
////////////////////////////
|
||||
@@ -137,7 +106,6 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data, const int rounds );
|
||||
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_16way_small_context blake256r14_16way_context;
|
||||
void blake256r14_16way_init(void *cc);
|
||||
@@ -150,33 +118,7 @@ void blake256r8_16way_init(void *cc);
|
||||
void blake256r8_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_16way_close(void *cc, void *dst);
|
||||
|
||||
////////////////////////////
|
||||
//
|
||||
//// Blake-512 8 way AVX512
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[8];
|
||||
__m512i S[4];
|
||||
size_t ptr;
|
||||
uint64_t T0, T1;
|
||||
} blake_8way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_8way_big_context blake512_8way_context;
|
||||
|
||||
void blake512_8way_init( blake_8way_big_context *sc );
|
||||
void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_8way_close( void *cc, void *dst );
|
||||
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
const void *data );
|
||||
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
const __m512i nonce, const __m512i *midstate );
|
||||
|
||||
#endif // AVX512
|
||||
#endif // AVX2
|
||||
|
||||
#endif // BLAKE_HASH_4WAY_H__
|
||||
#endif // BLAKE256_HASH_H__
|
@@ -1,113 +0,0 @@
|
||||
/**
|
||||
* Blake2-B Implementation
|
||||
* tpruvot@github 2015-2016
|
||||
*/
|
||||
|
||||
#include "blake2b-gate.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "blake2b-hash-4way.h"
|
||||
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
blake2b_8way_init( &ctx );
|
||||
blake2b_8way_update( &ctx, vdata, 80 );
|
||||
blake2b_8way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
// Function not used, code inlined.
|
||||
void blake2b_4way_hash(void *output, const void *input)
|
||||
{
|
||||
blake2b_4way_ctx ctx;
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, input, 80 );
|
||||
blake2b_4way_final( &ctx, output );
|
||||
}
|
||||
|
||||
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, vdata, 80 );
|
||||
blake2b_4way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,20 +0,0 @@
|
||||
#include "blake2b-gate.h"
|
||||
|
||||
|
||||
/*
 * Install the blake2b entry points into the algo gate, choosing the widest
 * implementation enabled at compile time (8-way, 4-way, else scalar).
 *
 * gate - gate structure to fill in.
 *
 * Returns true unconditionally.
 */
bool register_blake2b_algo( algo_gate_t* gate )
{
#if defined(BLAKE2B_8WAY)
  // No 8-way one-shot hash wrapper exists, so gate->hash is left as the
  // caller set it.
  gate->scanhash  = (void*)&scanhash_blake2b_8way;
#elif defined(BLAKE2B_4WAY)
  gate->scanhash  = (void*)&scanhash_blake2b_4way;
  gate->hash      = (void*)&blake2b_4way_hash;
#else
  gate->scanhash  = (void*)&scanhash_blake2b;
  gate->hash      = (void*)&blake2b_hash;
#endif
  gate->optimizations = AVX2_OPT | AVX512_OPT;
  return true;
}
|
||||
|
||||
|
@@ -1,34 +0,0 @@
|
||||
#ifndef __BLAKE2B_GATE_H__
|
||||
#define __BLAKE2B_GATE_H__ 1
|
||||
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2B_8WAY
|
||||
#elif defined(__AVX2__)
|
||||
#define BLAKE2B_4WAY
|
||||
#endif
|
||||
|
||||
bool register_blake2b_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
//void blake2b_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
void blake2b_4way_hash( void *state, const void *input );
|
||||
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#else
|
||||
|
||||
void blake2b_hash( void *state, const void *input );
|
||||
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
@@ -31,7 +31,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "blake2b-hash-4way.h"
|
||||
#include "blake2b-hash.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
@@ -252,17 +252,17 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
|
||||
v[ 5] = ctx->h[5];
|
||||
v[ 6] = ctx->h[6];
|
||||
v[ 7] = ctx->h[7];
|
||||
v[ 8] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
v[10] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
v[11] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
v[13] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
v[ 8] = v512_64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = v512_64( 0xBB67AE8584CAA73B );
|
||||
v[10] = v512_64( 0x3C6EF372FE94F82B );
|
||||
v[11] = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = v512_64( 0x510E527FADE682D1 );
|
||||
v[13] = v512_64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = v512_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
|
||||
v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );
|
||||
v[12] = _mm512_xor_si512( v[12], v512_64( ctx->t[0] ) );
|
||||
v[13] = _mm512_xor_si512( v[13], v512_64( ctx->t[1] ) );
|
||||
|
||||
if ( last )
|
||||
v[14] = mm512_not( v[14] );
|
||||
@@ -310,16 +310,16 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
|
||||
{
|
||||
size_t i;
|
||||
|
||||
ctx->h[0] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
ctx->h[0] = v512_64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = v512_64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = v512_64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = v512_64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = v512_64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = v512_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
ctx->h[0] = _mm512_xor_si512( ctx->h[0], _mm512_set1_epi64( 0x01010020 ) );
|
||||
ctx->h[0] = _mm512_xor_si512( ctx->h[0], v512_64( 0x01010020 ) );
|
||||
|
||||
ctx->t[0] = 0;
|
||||
ctx->t[1] = 0;
|
||||
@@ -419,17 +419,17 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
|
||||
v[ 5] = ctx->h[5];
|
||||
v[ 6] = ctx->h[6];
|
||||
v[ 7] = ctx->h[7];
|
||||
v[ 8] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
|
||||
v[10] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
|
||||
v[11] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
|
||||
v[13] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
|
||||
v[14] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
|
||||
v[ 8] = v256_64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = v256_64( 0xBB67AE8584CAA73B );
|
||||
v[10] = v256_64( 0x3C6EF372FE94F82B );
|
||||
v[11] = v256_64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = v256_64( 0x510E527FADE682D1 );
|
||||
v[13] = v256_64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = v256_64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = v256_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
|
||||
v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );
|
||||
v[12] = _mm256_xor_si256( v[12], v256_64( ctx->t[0] ) );
|
||||
v[13] = _mm256_xor_si256( v[13], v256_64( ctx->t[1] ) );
|
||||
|
||||
if ( last )
|
||||
v[14] = mm256_not( v[14] );
|
||||
@@ -477,16 +477,16 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
|
||||
{
|
||||
size_t i;
|
||||
|
||||
ctx->h[0] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
|
||||
ctx->h[0] = v256_64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = v256_64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = v256_64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = v256_64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = v256_64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = v256_64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = v256_64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = v256_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
ctx->h[0] = _mm256_xor_si256( ctx->h[0], _mm256_set1_epi64x( 0x01010020 ) );
|
||||
ctx->h[0] = _mm256_xor_si256( ctx->h[0], v256_64( 0x01010020 ) );
|
||||
|
||||
ctx->t[0] = 0;
|
||||
ctx->t[1] = 0;
|
@@ -1,64 +1,175 @@
|
||||
/**
|
||||
* Blake2-B Implementation
|
||||
* tpruvot@github 2015-2016
|
||||
*/
|
||||
|
||||
#include "blake2b-gate.h"
|
||||
|
||||
#if !defined(BLAKE2B_8WAY) && !defined(BLAKE2B_4WAY)
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/sph_blake2b.h"
|
||||
#include "blake2b-hash.h"
|
||||
|
||||
#define MIDLEN 76
|
||||
#define A 64
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2B_8WAY
|
||||
#elif defined(__AVX2__)
|
||||
#define BLAKE2B_4WAY
|
||||
#endif
|
||||
|
||||
void blake2b_hash(void *output, const void *input)
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint8_t _ALIGN(A) hash[32];
|
||||
sph_blake2b_ctx ctx __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
sph_blake2b_init(&ctx, 32, NULL, 0);
|
||||
sph_blake2b_update(&ctx, input, 80);
|
||||
sph_blake2b_final(&ctx, hash);
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
blake2b_8way_init( &ctx );
|
||||
blake2b_8way_update( &ctx, vdata, 80 );
|
||||
blake2b_8way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
// Function not used, code inlined.
|
||||
void blake2b_4way_hash(void *output, const void *input)
|
||||
{
|
||||
uint32_t _ALIGN(A) vhashcpu[8];
|
||||
uint32_t _ALIGN(A) endiandata[20];
|
||||
blake2b_4way_ctx ctx;
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, input, 80 );
|
||||
blake2b_4way_final( &ctx, output );
|
||||
}
|
||||
|
||||
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
int thr_id = mythr->id;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
for (int i=0; i < 19; i++) {
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
}
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
blake2b_hash(vhashcpu, endiandata);
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, vdata, 80 );
|
||||
blake2b_4way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, vhashcpu, mythr );
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#include "algo/blake/sph_blake2b.h"
|
||||
|
||||
void blake2b_hash(void *output, const void *input)
|
||||
{
|
||||
uint8_t _ALIGN(32) hash[32];
|
||||
sph_blake2b_ctx ctx __attribute__ ((aligned (32)));
|
||||
|
||||
sph_blake2b_init(&ctx, 32, NULL, 0);
|
||||
sph_blake2b_update(&ctx, input, 80);
|
||||
sph_blake2b_final(&ctx, hash);
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(32) hash64[8];
|
||||
uint32_t _ALIGN(32) endiandata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm128_bswap32_80( endiandata, pdata );
|
||||
|
||||
do {
|
||||
endiandata[19] = n;
|
||||
blake2b_hash( hash64, endiandata );
|
||||
if ( unlikely( valid_hash( hash64, ptarget ) ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = bswap_32( n );
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
n++;
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
bool register_blake2b_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2b_8way;
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2b_4way;
|
||||
gate->hash = (void*)&blake2b_4way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_blake2b;
|
||||
gate->hash = (void*)&blake2b_hash;
|
||||
#endif
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -1,170 +0,0 @@
|
||||
#include "blake2s-gate.h"
|
||||
#include "blake2s-hash-4way.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
static __thread blake2s_16way_state blake2s_16w_ctx;
|
||||
|
||||
void blake2s_16way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_16way_state ctx;
|
||||
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
|
||||
blake2s_16way_update( &ctx, input + (64<<4), 16 );
|
||||
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*16] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[8*16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<4]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm512_bswap_32( _mm512_set_epi32(
|
||||
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_16way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 16;
|
||||
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
|
||||
static __thread blake2s_8way_state blake2s_8w_ctx;
|
||||
|
||||
void blake2s_8way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_8way_state ctx;
|
||||
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
|
||||
blake2s_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_8way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
|
||||
static __thread blake2s_4way_state blake2s_4w_ctx;
|
||||
|
||||
void blake2s_4way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_4way_state ctx;
|
||||
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
|
||||
blake2s_4way_update( &ctx, input + (64<<2), 16 );
|
||||
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_4way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,23 +0,0 @@
|
||||
#include "blake2s-gate.h"
|
||||
|
||||
/*
 * Install the blake2s entry points into the algo gate, choosing the widest
 * implementation enabled at compile time (16-way, 8-way, 4-way, else
 * scalar).
 *
 * gate - gate structure to fill in.
 *
 * Returns true unconditionally.
 */
bool register_blake2s_algo( algo_gate_t* gate )
{
#if defined(BLAKE2S_16WAY)
  gate->scanhash  = (void*)&scanhash_blake2s_16way;
  gate->hash      = (void*)&blake2s_16way_hash;
#elif defined(BLAKE2S_8WAY)
  gate->scanhash  = (void*)&scanhash_blake2s_8way;
  gate->hash      = (void*)&blake2s_8way_hash;
#elif defined(BLAKE2S_4WAY)
  gate->scanhash  = (void*)&scanhash_blake2s_4way;
  gate->hash      = (void*)&blake2s_4way_hash;
#else
  gate->scanhash  = (void*)&scanhash_blake2s;
  gate->hash      = (void*)&blake2s_hash;
#endif
  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
  return true;
}
|
||||
|
||||
|
@@ -1,46 +0,0 @@
|
||||
#ifndef __BLAKE2S_GATE_H__
|
||||
#define __BLAKE2S_GATE_H__ 1
|
||||
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__SSE2__)
|
||||
#define BLAKE2S_4WAY
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define BLAKE2S_8WAY
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2S_16WAY
|
||||
#endif
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
void blake2s_16way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined (BLAKE2S_8WAY)
|
||||
|
||||
void blake2s_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined (BLAKE2S_4WAY)
|
||||
|
||||
void blake2s_4way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#else
|
||||
|
||||
void blake2s_hash( void *state, const void *input );
|
||||
int scanhash_blake2s( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
@@ -11,7 +11,7 @@
|
||||
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include "blake2s-hash-4way.h"
|
||||
#include "blake2s-hash.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
@@ -62,23 +62,23 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
|
||||
|
||||
memset( S, 0, sizeof( blake2s_4way_state ) );
|
||||
|
||||
S->h[0] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = _mm_set1_epi64x( 0x510E527F510E527FULL );
|
||||
S->h[5] = _mm_set1_epi64x( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL );
|
||||
S->h[0] = v128_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = v128_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = v128_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = v128_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = v128_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = v128_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = v128_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = v128_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
// for( int i = 0; i < 8; ++i )
|
||||
// S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
|
||||
// S->h[i] = v128_32( blake2s_IV[i] );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
/* IV XOR ParamBlock */
|
||||
for ( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm_xor_si128( S->h[i], _mm_set1_epi32( p[i] ) );
|
||||
S->h[i] = _mm_xor_si128( S->h[i], v128_32( p[i] ) );
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -90,18 +90,18 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
|
||||
memcpy_128( m, block, 16 );
|
||||
memcpy_128( v, S->h, 8 );
|
||||
|
||||
v[ 8] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
|
||||
_mm_set1_epi64x( 0x510E527F510E527FULL ) );
|
||||
v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
|
||||
_mm_set1_epi64x( 0x9B05688C9B05688CULL ) );
|
||||
v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
|
||||
_mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
|
||||
_mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );
|
||||
v[ 8] = v128_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = v128_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = v128_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = v128_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm_xor_si128( v128_32( S->t[0] ),
|
||||
v128_64( 0x510E527F510E527FULL ) );
|
||||
v[13] = _mm_xor_si128( v128_32( S->t[1] ),
|
||||
v128_64( 0x9B05688C9B05688CULL ) );
|
||||
v[14] = _mm_xor_si128( v128_32( S->f[0] ),
|
||||
v128_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[15] = _mm_xor_si128( v128_32( S->f[1] ),
|
||||
v128_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
#define G4W( sigma0, sigma1, a, b, c, d ) \
|
||||
do { \
|
||||
@@ -269,35 +269,35 @@ int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
|
||||
memcpy_256( m, block, 16 );
|
||||
memcpy_256( v, S->h, 8 );
|
||||
|
||||
v[ 8] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
|
||||
_mm256_set1_epi64x( 0x510E527F510E527FULL ) );
|
||||
v[ 8] = v256_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = v256_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = v256_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = v256_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm256_xor_si256( v256_32( S->t[0] ),
|
||||
v256_64( 0x510E527F510E527FULL ) );
|
||||
|
||||
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
|
||||
_mm256_set1_epi64x( 0x9B05688C9B05688CULL ) );
|
||||
v[13] = _mm256_xor_si256( v256_32( S->t[1] ),
|
||||
v256_64( 0x9B05688C9B05688CULL ) );
|
||||
|
||||
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
|
||||
_mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[14] = _mm256_xor_si256( v256_32( S->f[0] ),
|
||||
v256_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
|
||||
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
|
||||
_mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );
|
||||
v[15] = _mm256_xor_si256( v256_32( S->f[1] ),
|
||||
v256_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
/*
|
||||
v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
|
||||
v[ 9] = _mm256_set1_epi32( blake2s_IV[1] );
|
||||
v[10] = _mm256_set1_epi32( blake2s_IV[2] );
|
||||
v[11] = _mm256_set1_epi32( blake2s_IV[3] );
|
||||
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
|
||||
_mm256_set1_epi32( blake2s_IV[4] ) );
|
||||
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
|
||||
_mm256_set1_epi32( blake2s_IV[5] ) );
|
||||
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
|
||||
_mm256_set1_epi32( blake2s_IV[6] ) );
|
||||
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
|
||||
_mm256_set1_epi32( blake2s_IV[7] ) );
|
||||
v[ 8] = v256_32( blake2s_IV[0] );
|
||||
v[ 9] = v256_32( blake2s_IV[1] );
|
||||
v[10] = v256_32( blake2s_IV[2] );
|
||||
v[11] = v256_32( blake2s_IV[3] );
|
||||
v[12] = _mm256_xor_si256( v256_32( S->t[0] ),
|
||||
v256_32( blake2s_IV[4] ) );
|
||||
v[13] = _mm256_xor_si256( v256_32( S->t[1] ),
|
||||
v256_32( blake2s_IV[5] ) );
|
||||
v[14] = _mm256_xor_si256( v256_32( S->f[0] ),
|
||||
v256_32( blake2s_IV[6] ) );
|
||||
v[15] = _mm256_xor_si256( v256_32( S->f[1] ),
|
||||
v256_32( blake2s_IV[7] ) );
|
||||
|
||||
|
||||
#define G8W(r,i,a,b,c,d) \
|
||||
@@ -391,24 +391,24 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
|
||||
memset( P->personal, 0, sizeof( P->personal ) );
|
||||
|
||||
memset( S, 0, sizeof( blake2s_8way_state ) );
|
||||
S->h[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = _mm256_set1_epi64x( 0x510E527F510E527FULL );
|
||||
S->h[5] = _mm256_set1_epi64x( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL );
|
||||
S->h[0] = v256_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = v256_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = v256_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = v256_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = v256_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = v256_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = v256_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = v256_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
|
||||
// for( int i = 0; i < 8; ++i )
|
||||
// S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
|
||||
// S->h[i] = v256_32( blake2s_IV[i] );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
/* IV XOR ParamBlock */
|
||||
for ( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm256_xor_si256( S->h[i], _mm256_set1_epi32( p[i] ) );
|
||||
S->h[i] = _mm256_xor_si256( S->h[i], v256_32( p[i] ) );
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -510,21 +510,21 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
|
||||
memcpy_512( m, block, 16 );
|
||||
memcpy_512( v, S->h, 8 );
|
||||
|
||||
v[ 8] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
|
||||
_mm512_set1_epi64( 0x510E527F510E527FULL ) );
|
||||
v[ 8] = v512_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = v512_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = v512_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = v512_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm512_xor_si512( v512_32( S->t[0] ),
|
||||
v512_64( 0x510E527F510E527FULL ) );
|
||||
|
||||
v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
|
||||
_mm512_set1_epi64( 0x9B05688C9B05688CULL ) );
|
||||
v[13] = _mm512_xor_si512( v512_32( S->t[1] ),
|
||||
v512_64( 0x9B05688C9B05688CULL ) );
|
||||
|
||||
v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
|
||||
_mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[14] = _mm512_xor_si512( v512_32( S->f[0] ),
|
||||
v512_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
|
||||
v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
|
||||
_mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
v[15] = _mm512_xor_si512( v512_32( S->f[1] ),
|
||||
v512_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
|
||||
#define G16W( sigma0, sigma1, a, b, c, d) \
|
||||
@@ -589,20 +589,20 @@ int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )
|
||||
memset( P->personal, 0, sizeof( P->personal ) );
|
||||
|
||||
memset( S, 0, sizeof( blake2s_16way_state ) );
|
||||
S->h[0] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = _mm512_set1_epi64( 0x510E527F510E527FULL );
|
||||
S->h[5] = _mm512_set1_epi64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL );
|
||||
S->h[0] = v512_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = v512_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = v512_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = v512_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = v512_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = v512_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = v512_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = v512_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
/* IV XOR ParamBlock */
|
||||
for ( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm512_xor_si512( S->h[i], _mm512_set1_epi32( p[i] ) );
|
||||
S->h[i] = _mm512_xor_si512( S->h[i], v512_32( p[i] ) );
|
||||
return 0;
|
||||
}
|
||||
|
@@ -1,75 +1,252 @@
|
||||
#include "blake2s-gate.h"
|
||||
|
||||
#if !defined(BLAKE2S_16WAY) && !defined(BLAKE2S_8WAY) && !defined(BLAKE2S)
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include "blake2s-hash.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2S_16WAY
|
||||
#elif defined(__AVX2__)
|
||||
#define BLAKE2S_8WAY
|
||||
#elif defined(__SSE2__)
|
||||
#define BLAKE2S_4WAY
|
||||
#endif
|
||||
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
static __thread blake2s_16way_state blake2s_16w_ctx;
|
||||
|
||||
void blake2s_16way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_16way_state ctx;
|
||||
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
|
||||
blake2s_16way_update( &ctx, input + (64<<4), 16 );
|
||||
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*16] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[8*16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<4]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm512_bswap_32( _mm512_set_epi32(
|
||||
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_16way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 16;
|
||||
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
|
||||
static __thread blake2s_8way_state blake2s_8w_ctx;
|
||||
|
||||
void blake2s_8way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_8way_state ctx;
|
||||
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
|
||||
blake2s_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_8way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
|
||||
static __thread blake2s_4way_state blake2s_4w_ctx;
|
||||
|
||||
void blake2s_4way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_4way_state ctx;
|
||||
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
|
||||
blake2s_4way_update( &ctx, input + (64<<2), 16 );
|
||||
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_4way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#include "sph-blake2s.h"
|
||||
|
||||
static __thread blake2s_state blake2s_ctx;
|
||||
//static __thread blake2s_state s_ctx;
|
||||
#define MIDLEN 76
|
||||
|
||||
void blake2s_hash( void *output, const void *input )
|
||||
{
|
||||
unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
|
||||
blake2s_state ctx __attribute__ ((aligned (64)));
|
||||
|
||||
unsigned char _ALIGN(32) hash[BLAKE2S_OUTBYTES];
|
||||
blake2s_state ctx __attribute__ ((aligned (32)));
|
||||
|
||||
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
|
||||
blake2s_update( &ctx, input+64, 16 );
|
||||
|
||||
// blake2s_init(&ctx, BLAKE2S_OUTBYTES);
|
||||
// blake2s_update(&ctx, input, 80);
|
||||
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
|
||||
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
/*
|
||||
static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
|
||||
|
||||
int scanhash_blake2s( struct work *work,uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
s_ctx.buflen = MIDLEN;
|
||||
memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
|
||||
blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
|
||||
blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
uint32_t _ALIGN(32) hash32[8];
|
||||
uint32_t _ALIGN(32) endiandata[20];
|
||||
const int thr_id = mythr->id;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm128_bswap32_80( endiandata, pdata );
|
||||
|
||||
// midstate
|
||||
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
|
||||
|
||||
do
|
||||
{
|
||||
endiandata[19] = n;
|
||||
blake2s_hash( hash32, endiandata );
|
||||
if ( unlikely( valid_hash( hash32, ptarget ) ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = bswap_32( n );
|
||||
submit_solution( work, hash32, mythr );
|
||||
}
|
||||
n++;
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
int scanhash_blake2s( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
|
||||
uint32_t _ALIGN(64) hash64[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
// midstate
|
||||
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
blake2s_hash( hash64, endiandata );
|
||||
if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return true;
|
||||
}
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_16way;
|
||||
gate->hash = (void*)&blake2s_16way_hash;
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_8way;
|
||||
gate->hash = (void*)&blake2s_8way_hash;
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_4way;
|
||||
gate->hash = (void*)&blake2s_4way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_blake2s;
|
||||
gate->hash = (void*)&blake2s_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -1,14 +1,12 @@
|
||||
#if defined (__AVX2__)
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include "blake-hash-4way.h"
|
||||
#include "blake512-hash.h"
|
||||
|
||||
// Blake-512 common
|
||||
|
||||
/*
|
||||
static const uint64_t IV512[8] =
|
||||
|
||||
static const uint64_t BLAKE512_IV[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
|
||||
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
@@ -16,6 +14,7 @@ static const uint64_t IV512[8] =
|
||||
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||
};
|
||||
|
||||
/*
|
||||
static const uint64_t salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
|
||||
static const unsigned sigma[16][16] = {
|
||||
@@ -36,8 +35,10 @@ static const unsigned sigma[16][16] = {
|
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
|
||||
};
|
||||
|
||||
static const uint64_t CB[16] = {
|
||||
*/
|
||||
/*
|
||||
static const uint64_t CB[16] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x243F6A8885A308D3, 0x13198A2E03707344,
|
||||
0xA4093822299F31D0, 0x082EFA98EC4E6C89,
|
||||
0x452821E638D01377, 0xBE5466CF34E90C6C,
|
||||
@@ -46,7 +47,7 @@ static const uint64_t CB[16] = {
|
||||
0x2FFD72DBD01ADFB7, 0xB8E1AFED6A267E96,
|
||||
0xBA7C9045F12C7F99, 0x24A19947B3916CF7,
|
||||
0x0801F2E2858EFC16, 0x636920D871574E69
|
||||
|
||||
}
|
||||
*/
|
||||
|
||||
#define Z00 0
|
||||
@@ -244,6 +245,290 @@ static const uint64_t CB[16] = {
|
||||
#define CBE 0x0801F2E2858EFC16
|
||||
#define CBF 0x636920D871574E69
|
||||
|
||||
// Blake-512 1 way AVX2 & SSE2
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#define BLAKE512_ROUND( r ) \
|
||||
{ \
|
||||
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
|
||||
_mm256_set_epi64x( CBx( r, 7 ) ^ Mx( r, 6 ), \
|
||||
CBx( r, 5 ) ^ Mx( r, 4 ), \
|
||||
CBx( r, 3 ) ^ Mx( r, 2 ), \
|
||||
CBx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
|
||||
V3 = mm256_swap64_32( _mm256_xor_si256( V3, V0 ) ); \
|
||||
V2 = _mm256_add_epi64( V2, V3 ); \
|
||||
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 25 ); \
|
||||
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
|
||||
_mm256_set_epi64x( CBx( r, 6 ) ^ Mx( r, 7 ), \
|
||||
CBx( r, 4 ) ^ Mx( r, 5 ), \
|
||||
CBx( r, 2 ) ^ Mx( r, 3 ), \
|
||||
CBx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
|
||||
V3 = mm256_shuflr64_16( _mm256_xor_si256( V3, V0 ) ); \
|
||||
V2 = _mm256_add_epi64( V2, V3 ); \
|
||||
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 11 ); \
|
||||
V0 = mm256_shufll_64( V0 ); \
|
||||
V3 = mm256_swap_128( V3 ); \
|
||||
V2 = mm256_shuflr_64( V2 ); \
|
||||
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
|
||||
_mm256_set_epi64x( CBx( r, D ) ^ Mx( r, C ), \
|
||||
CBx( r, B ) ^ Mx( r, A ), \
|
||||
CBx( r, 9 ) ^ Mx( r, 8 ), \
|
||||
CBx( r, F ) ^ Mx( r, E ) ) ) ); \
|
||||
V3 = mm256_swap64_32( _mm256_xor_si256( V3, V0 ) ); \
|
||||
V2 = _mm256_add_epi64( V2, V3 ); \
|
||||
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 25 ); \
|
||||
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
|
||||
_mm256_set_epi64x( CBx( r, C ) ^ Mx( r, D ), \
|
||||
CBx( r, A ) ^ Mx( r, B ), \
|
||||
CBx( r, 8 ) ^ Mx( r, 9 ), \
|
||||
CBx( r, E ) ^ Mx( r, F ) ) ) ); \
|
||||
V3 = mm256_shuflr64_16( _mm256_xor_si256( V3, V0 ) ); \
|
||||
V2 = _mm256_add_epi64( V2, V3 ); \
|
||||
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 11 ); \
|
||||
V0 = mm256_shuflr_64( V0 ); \
|
||||
V3 = mm256_swap_128( V3 ); \
|
||||
V2 = mm256_shufll_64( V2 ); \
|
||||
}
|
||||
|
||||
void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
|
||||
const uint64_t T1 )
|
||||
{
|
||||
__m256i V0, V1, V2, V3;
|
||||
uint64_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
|
||||
|
||||
V0 = casti_m256i( H, 0 );
|
||||
V1 = casti_m256i( H, 1 );
|
||||
V2 = _mm256_set_epi64x( CB3, CB2, CB1, CB0 );
|
||||
V3 = _mm256_set_epi64x( T1 ^ CB7, T1 ^ CB6, T0 ^ CB5, T0 ^ CB4 );
|
||||
|
||||
M0 = bswap_64( buf[ 0] );
|
||||
M1 = bswap_64( buf[ 1] );
|
||||
M2 = bswap_64( buf[ 2] );
|
||||
M3 = bswap_64( buf[ 3] );
|
||||
M4 = bswap_64( buf[ 4] );
|
||||
M5 = bswap_64( buf[ 5] );
|
||||
M6 = bswap_64( buf[ 6] );
|
||||
M7 = bswap_64( buf[ 7] );
|
||||
M8 = bswap_64( buf[ 8] );
|
||||
M9 = bswap_64( buf[ 9] );
|
||||
MA = bswap_64( buf[10] );
|
||||
MB = bswap_64( buf[11] );
|
||||
MC = bswap_64( buf[12] );
|
||||
MD = bswap_64( buf[13] );
|
||||
ME = bswap_64( buf[14] );
|
||||
MF = bswap_64( buf[15] );
|
||||
|
||||
BLAKE512_ROUND( 0 );
|
||||
BLAKE512_ROUND( 1 );
|
||||
BLAKE512_ROUND( 2 );
|
||||
BLAKE512_ROUND( 3 );
|
||||
BLAKE512_ROUND( 4 );
|
||||
BLAKE512_ROUND( 5 );
|
||||
BLAKE512_ROUND( 6 );
|
||||
BLAKE512_ROUND( 7 );
|
||||
BLAKE512_ROUND( 8 );
|
||||
BLAKE512_ROUND( 9 );
|
||||
BLAKE512_ROUND( 0 );
|
||||
BLAKE512_ROUND( 1 );
|
||||
BLAKE512_ROUND( 2 );
|
||||
BLAKE512_ROUND( 3 );
|
||||
BLAKE512_ROUND( 4 );
|
||||
BLAKE512_ROUND( 5 );
|
||||
|
||||
casti_m256i( H, 0 ) = mm256_xor3( casti_m256i( H, 0 ), V0, V2 );
|
||||
casti_m256i( H, 1 ) = mm256_xor3( casti_m256i( H, 1 ), V1, V3 );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define BLAKE512_G( r, Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
|
||||
{ \
|
||||
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
|
||||
_mm_set_epi64x( CBx( r, Sd ) ^ Mx( r, Sc ), \
|
||||
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
|
||||
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
|
||||
Vc = _mm_add_epi64( Vc, Vd ); \
|
||||
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 25 ); \
|
||||
\
|
||||
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
|
||||
_mm_set_epi64x( CBx( r, Sc ) ^ Mx( r, Sd ), \
|
||||
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
|
||||
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
|
||||
Vc = _mm_add_epi64( Vc, Vd ); \
|
||||
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 11 ); \
|
||||
}
|
||||
|
||||
#define BLAKE512_ROUND( R ) \
|
||||
{ \
|
||||
__m128i V32, V23, V67, V76; \
|
||||
BLAKE512_G( R, V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
|
||||
BLAKE512_G( R, V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
|
||||
V32 = mm128_alignr_64( V[3], V[2], 1 ); \
|
||||
V23 = mm128_alignr_64( V[2], V[3], 1 ); \
|
||||
V67 = mm128_alignr_64( V[6], V[7], 1 ); \
|
||||
V76 = mm128_alignr_64( V[7], V[6], 1 ); \
|
||||
BLAKE512_G( R, V[0], V32, V[5], V67, 8, 9, A, B ); \
|
||||
BLAKE512_G( R, V[1], V23, V[4], V76, C, D, E, F ); \
|
||||
V[2] = mm128_alignr_64( V32, V23, 1 ); \
|
||||
V[3] = mm128_alignr_64( V23, V32, 1 ); \
|
||||
V[6] = mm128_alignr_64( V76, V67, 1 ); \
|
||||
V[7] = mm128_alignr_64( V67, V76, 1 ); \
|
||||
}
|
||||
|
||||
void blake512_transform( uint64_t *H, const uint64_t *buf,
|
||||
const uint64_t T0, const uint64_t T1 )
|
||||
{
|
||||
__m128i V[8];
|
||||
uint64_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
|
||||
|
||||
V[0] = casti_m128i( H, 0 );
|
||||
V[1] = casti_m128i( H, 1 );
|
||||
V[2] = casti_m128i( H, 2 );
|
||||
V[3] = casti_m128i( H, 3 );
|
||||
V[4] = _mm_set_epi64x( CB1, CB0 );
|
||||
V[5] = _mm_set_epi64x( CB3, CB2 );
|
||||
V[6] = _mm_set_epi64x( T0 ^ CB5, T0 ^ CB4 );
|
||||
V[7] = _mm_set_epi64x( T1 ^ CB7, T1 ^ CB6 );
|
||||
|
||||
M0 = bswap_64( buf[ 0] );
|
||||
M1 = bswap_64( buf[ 1] );
|
||||
M2 = bswap_64( buf[ 2] );
|
||||
M3 = bswap_64( buf[ 3] );
|
||||
M4 = bswap_64( buf[ 4] );
|
||||
M5 = bswap_64( buf[ 5] );
|
||||
M6 = bswap_64( buf[ 6] );
|
||||
M7 = bswap_64( buf[ 7] );
|
||||
M8 = bswap_64( buf[ 8] );
|
||||
M9 = bswap_64( buf[ 9] );
|
||||
MA = bswap_64( buf[10] );
|
||||
MB = bswap_64( buf[11] );
|
||||
MC = bswap_64( buf[12] );
|
||||
MD = bswap_64( buf[13] );
|
||||
ME = bswap_64( buf[14] );
|
||||
MF = bswap_64( buf[15] );
|
||||
|
||||
BLAKE512_ROUND( 0 );
|
||||
BLAKE512_ROUND( 1 );
|
||||
BLAKE512_ROUND( 2 );
|
||||
BLAKE512_ROUND( 3 );
|
||||
BLAKE512_ROUND( 4 );
|
||||
BLAKE512_ROUND( 5 );
|
||||
BLAKE512_ROUND( 6 );
|
||||
BLAKE512_ROUND( 7 );
|
||||
BLAKE512_ROUND( 8 );
|
||||
BLAKE512_ROUND( 9 );
|
||||
BLAKE512_ROUND( 0 );
|
||||
BLAKE512_ROUND( 1 );
|
||||
BLAKE512_ROUND( 2 );
|
||||
BLAKE512_ROUND( 3 );
|
||||
BLAKE512_ROUND( 4 );
|
||||
BLAKE512_ROUND( 5 );
|
||||
|
||||
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V[0], V[4] );
|
||||
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V[1], V[5] );
|
||||
casti_m128i( H, 2 ) = mm128_xor3( casti_m128i( H, 2 ), V[2], V[6] );
|
||||
casti_m128i( H, 3 ) = mm128_xor3( casti_m128i( H, 3 ), V[3], V[7] );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void blake512_init( blake512_context *sc )
|
||||
{
|
||||
memcpy( sc->H, BLAKE512_IV, 8 * sizeof(uint64_t) );
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
|
||||
void blake512_update(blake512_context *sc, const void *data, size_t len)
|
||||
{
|
||||
if ( len < (sizeof sc->buf) - sc->ptr )
|
||||
{
|
||||
memcpy( sc->buf + sc->ptr, data, len );
|
||||
sc->ptr += len;
|
||||
return;
|
||||
}
|
||||
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = (sizeof sc->buf) - sc->ptr;
|
||||
if ( clen > len ) clen = len;
|
||||
memcpy( sc->buf + sc->ptr, data, clen );
|
||||
sc->ptr += clen;
|
||||
data = (const unsigned char *)data + clen;
|
||||
len -= clen;
|
||||
if ( sc->ptr == sizeof sc->buf )
|
||||
{
|
||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||
sc->T1 += 1;
|
||||
blake512_transform( sc->H, (uint64_t*)sc->buf, sc->T0, sc->T1 );
|
||||
sc->ptr = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void blake512_close( blake512_context *sc, void *dst )
|
||||
{
|
||||
unsigned char buf[128] __attribute__((aligned(32)));
|
||||
size_t ptr, k;
|
||||
unsigned bit_len;
|
||||
uint64_t th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
buf[ptr] = 0x80;
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
|
||||
if ( ptr == 0 )
|
||||
{
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00;
|
||||
sc->T1 = 0xFFFFFFFFFFFFFFFF;
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00 + bit_len;
|
||||
sc->T1 -= 1;
|
||||
}
|
||||
else
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
|
||||
if ( bit_len <= 894 )
|
||||
{
|
||||
memset( buf + ptr + 1, 0, 111 - ptr );
|
||||
buf[111] |= 1;
|
||||
*((uint64_t*)(buf + 112)) = bswap_64( th );
|
||||
*((uint64_t*)(buf + 120)) = bswap_64( tl );
|
||||
blake512_update( sc, buf + ptr, 128 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset( buf + ptr + 1, 0, 127 - ptr );
|
||||
blake512_update( sc, buf + ptr, 128 - ptr );
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00;
|
||||
sc->T1 = 0xFFFFFFFFFFFFFFFF;
|
||||
memset( buf, 0, 112 );
|
||||
buf[111] = 1;
|
||||
*(uint64_t*)(buf + 112) = bswap_64( th );
|
||||
*(uint64_t*)(buf + 120) = bswap_64( tl );
|
||||
blake512_update( sc, buf, 128 );
|
||||
}
|
||||
|
||||
for ( k = 0; k < 8; k ++ )
|
||||
((uint64_t*)dst)[k] = bswap_64( sc->H[k] );
|
||||
}
|
||||
|
||||
/*
 * One-shot 1-way Blake-512.
 *
 * Runs blake512_init on sc, feeds len bytes from data through
 * blake512_update, then has blake512_close write the digest to dst.
 * Any previous contents of sc are discarded by the init call.
 */
void blake512_full( blake512_context *sc, void *dst, const void *data,
                    size_t len )
{
   blake512_init( sc );
   blake512_update( sc, data, len );
   blake512_close( sc, dst );
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#define READ_STATE64(state) do { \
|
||||
H0 = (state)->H[0]; \
|
||||
H1 = (state)->H[1]; \
|
||||
@@ -277,12 +562,12 @@ static const uint64_t CB[16] = {
|
||||
#define GB_8WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
{ \
|
||||
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
|
||||
_mm512_set1_epi64( c1 ), m0 ), b ), a ); \
|
||||
v512_64( c1 ), m0 ), b ), a ); \
|
||||
d = mm512_swap64_32( _mm512_xor_si512( d, a ) ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \
|
||||
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
|
||||
_mm512_set1_epi64( c0 ), m1 ), b ), a ); \
|
||||
v512_64( c0 ), m1 ), b ), a ); \
|
||||
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \
|
||||
@@ -318,14 +603,14 @@ static const uint64_t CB[16] = {
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm512_set1_epi64( CB0 ); \
|
||||
V9 = _mm512_set1_epi64( CB1 ); \
|
||||
VA = _mm512_set1_epi64( CB2 ); \
|
||||
VB = _mm512_set1_epi64( CB3 ); \
|
||||
VC = _mm512_set1_epi64( T0 ^ CB4 ); \
|
||||
VD = _mm512_set1_epi64( T0 ^ CB5 ); \
|
||||
VE = _mm512_set1_epi64( T1 ^ CB6 ); \
|
||||
VF = _mm512_set1_epi64( T1 ^ CB7 ); \
|
||||
V8 = v512_64( CB0 ); \
|
||||
V9 = v512_64( CB1 ); \
|
||||
VA = v512_64( CB2 ); \
|
||||
VB = v512_64( CB3 ); \
|
||||
VC = v512_64( T0 ^ CB4 ); \
|
||||
VD = v512_64( T0 ^ CB5 ); \
|
||||
VE = v512_64( T1 ^ CB6 ); \
|
||||
VF = v512_64( T1 ^ CB7 ); \
|
||||
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
|
||||
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
||||
@@ -385,14 +670,14 @@ void blake512_8way_compress( blake_8way_big_context *sc )
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm512_set1_epi64( CB0 );
|
||||
V9 = _mm512_set1_epi64( CB1 );
|
||||
VA = _mm512_set1_epi64( CB2 );
|
||||
VB = _mm512_set1_epi64( CB3 );
|
||||
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
|
||||
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
|
||||
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
|
||||
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
|
||||
V8 = v512_64( CB0 );
|
||||
V9 = v512_64( CB1 );
|
||||
VA = v512_64( CB2 );
|
||||
VB = v512_64( CB3 );
|
||||
VC = v512_64( sc->T0 ^ CB4 );
|
||||
VD = v512_64( sc->T0 ^ CB5 );
|
||||
VE = v512_64( sc->T1 ^ CB6 );
|
||||
VF = v512_64( sc->T1 ^ CB7 );
|
||||
|
||||
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
|
||||
@@ -457,14 +742,14 @@ void blake512_8way_compress_le( blake_8way_big_context *sc )
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm512_set1_epi64( CB0 );
|
||||
V9 = _mm512_set1_epi64( CB1 );
|
||||
VA = _mm512_set1_epi64( CB2 );
|
||||
VB = _mm512_set1_epi64( CB3 );
|
||||
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
|
||||
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
|
||||
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
|
||||
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
|
||||
V8 = v512_64( CB0 );
|
||||
V9 = v512_64( CB1 );
|
||||
VA = v512_64( CB2 );
|
||||
VB = v512_64( CB3 );
|
||||
VC = v512_64( sc->T0 ^ CB4 );
|
||||
VD = v512_64( sc->T0 ^ CB5 );
|
||||
VE = v512_64( sc->T1 ^ CB6 );
|
||||
VF = v512_64( sc->T1 ^ CB7 );
|
||||
|
||||
M0 = sc->buf[ 0];
|
||||
M1 = sc->buf[ 1];
|
||||
@@ -519,23 +804,23 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
|
||||
|
||||
// initial hash
|
||||
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
// fill buffer
|
||||
memcpy_512( sc->buf, (__m512i*)data, 80>>3 );
|
||||
sc->buf[10] = _mm512_set1_epi64( 0x8000000000000000ULL );
|
||||
sc->buf[10] = v512_64( 0x8000000000000000ULL );
|
||||
sc->buf[11] =
|
||||
sc->buf[12] = m512_zero;
|
||||
sc->buf[13] = m512_one_64;
|
||||
sc->buf[13] = v512_64( 1 );
|
||||
sc->buf[14] = m512_zero;
|
||||
sc->buf[15] = _mm512_set1_epi64( 80*8 );
|
||||
sc->buf[15] = v512_64( 80*8 );
|
||||
|
||||
// build working variables
|
||||
V0 = sc->H[0];
|
||||
@@ -546,14 +831,14 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm512_set1_epi64( CB0 );
|
||||
V9 = _mm512_set1_epi64( CB1 );
|
||||
VA = _mm512_set1_epi64( CB2 );
|
||||
VB = _mm512_set1_epi64( CB3 );
|
||||
VC = _mm512_set1_epi64( CB4 ^ 0x280ULL );
|
||||
VD = _mm512_set1_epi64( CB5 ^ 0x280ULL );
|
||||
VE = _mm512_set1_epi64( CB6 );
|
||||
VF = _mm512_set1_epi64( CB7 );
|
||||
V8 = v512_64( CB0 );
|
||||
V9 = v512_64( CB1 );
|
||||
VA = v512_64( CB2 );
|
||||
VB = v512_64( CB3 );
|
||||
VC = v512_64( CB4 ^ 0x280ULL );
|
||||
VD = v512_64( CB5 ^ 0x280ULL );
|
||||
VE = v512_64( CB6 );
|
||||
VF = v512_64( CB7 );
|
||||
|
||||
// round 0
|
||||
GB_8WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
|
||||
@@ -565,7 +850,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
// GB_8WAY( sc->buf[ 8], sc->buf[ 9], CBx(0, 8), CBx(0, 9), V0, V5, VA, VF );
|
||||
|
||||
V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512(
|
||||
_mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
|
||||
v512_64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
|
||||
VF = mm512_swap64_32( _mm512_xor_si512( VF, V0 ) );
|
||||
VA = _mm512_add_epi64( VA, VF );
|
||||
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 );
|
||||
@@ -578,7 +863,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
// round 1
|
||||
// G1
|
||||
// GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD);
|
||||
V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( CB8 ),
|
||||
V1 = _mm512_add_epi64( V1, _mm512_xor_si512( v512_64( CB8 ),
|
||||
sc->buf[ 4] ) );
|
||||
|
||||
// G2
|
||||
@@ -588,7 +873,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
// G3
|
||||
// GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF);
|
||||
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
|
||||
_mm512_set1_epi64( CB6 ), sc->buf[13] ), V7 ) );
|
||||
v512_64( CB6 ), sc->buf[13] ), V7 ) );
|
||||
|
||||
// save midstate for second part
|
||||
midstate[ 0] = V0;
|
||||
@@ -656,7 +941,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
|
||||
// finish round 0 with the nonce now available
|
||||
V0 = _mm512_add_epi64( V0, _mm512_xor_si512(
|
||||
_mm512_set1_epi64( CB8 ), M9 ) );
|
||||
v512_64( CB8 ), M9 ) );
|
||||
VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 16 );
|
||||
VA = _mm512_add_epi64( VA, VF );
|
||||
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 11 );
|
||||
@@ -667,14 +952,14 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
|
||||
// G1
|
||||
// GB_8WAY(Mx(1, 2), Mx(1, 3), CBx(1, 2), CBx(1, 3), V1, V5, V9, VD);
|
||||
// V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 );
|
||||
// V1 = _mm512_add_epi64( V1, _mm512_xor_si512( v512_64( c1 ), m0 );
|
||||
|
||||
V1 = _mm512_add_epi64( V1, V5 );
|
||||
VD = mm512_swap64_32( _mm512_xor_si512( VD, V1 ) );
|
||||
V9 = _mm512_add_epi64( V9, VD );
|
||||
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 );
|
||||
V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512(
|
||||
_mm512_set1_epi64( CBx(1,2) ), Mx(1,3) ), V5 ) );
|
||||
v512_64( CBx(1,2) ), Mx(1,3) ), V5 ) );
|
||||
VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 16 );
|
||||
V9 = _mm512_add_epi64( V9, VD );
|
||||
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 11 );
|
||||
@@ -683,12 +968,12 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
// GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
|
||||
// V2 = _mm512_add_epi64( V2, V6 );
|
||||
V2 = _mm512_add_epi64( V2, _mm512_xor_si512(
|
||||
_mm512_set1_epi64( CBF ), M9 ) );
|
||||
v512_64( CBF ), M9 ) );
|
||||
VE = mm512_swap64_32( _mm512_xor_si512( VE, V2 ) );
|
||||
VA = _mm512_add_epi64( VA, VE );
|
||||
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 );
|
||||
V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512(
|
||||
_mm512_set1_epi64( CB9 ), MF ), V6 ) );
|
||||
v512_64( CB9 ), MF ), V6 ) );
|
||||
VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 16 );
|
||||
VA = _mm512_add_epi64( VA, VE );
|
||||
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 11 );
|
||||
@@ -696,13 +981,13 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
// G3
|
||||
// GB_8WAY(Mx(1, 6), Mx(1, 7), CBx(1, 6), CBx(1, 7), V3, V7, VB, VF);
|
||||
// V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
|
||||
// _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );
|
||||
// v512_64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );
|
||||
|
||||
VF = mm512_swap64_32( _mm512_xor_si512( VF, V3 ) );
|
||||
VB = _mm512_add_epi64( VB, VF );
|
||||
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 );
|
||||
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
|
||||
_mm512_set1_epi64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
|
||||
v512_64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
|
||||
VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 16 );
|
||||
VB = _mm512_add_epi64( VB, VF );
|
||||
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 11 );
|
||||
@@ -744,14 +1029,14 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
|
||||
void blake512_8way_init( blake_8way_big_context *sc )
|
||||
{
|
||||
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
@@ -815,7 +1100,7 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
buf[ptr>>3] = _mm512_set1_epi64( 0x80 );
|
||||
buf[ptr>>3] = v512_64( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
@@ -836,9 +1121,9 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
|
||||
{
|
||||
memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||
buf[104>>3] = _mm512_or_si512( buf[104>>3],
|
||||
_mm512_set1_epi64( 0x0100000000000000ULL ) );
|
||||
buf[112>>3] = _mm512_set1_epi64( bswap_64( th ) );
|
||||
buf[120>>3] = _mm512_set1_epi64( bswap_64( tl ) );
|
||||
v512_64( 0x0100000000000000ULL ) );
|
||||
buf[112>>3] = v512_64( bswap_64( th ) );
|
||||
buf[120>>3] = v512_64( bswap_64( tl ) );
|
||||
|
||||
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
}
|
||||
@@ -850,9 +1135,9 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
|
||||
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
|
||||
memset_zero_512( buf, 112>>3 );
|
||||
buf[104>>3] = _mm512_set1_epi64( 0x0100000000000000ULL );
|
||||
buf[112>>3] = _mm512_set1_epi64( bswap_64( th ) );
|
||||
buf[120>>3] = _mm512_set1_epi64( bswap_64( tl ) );
|
||||
buf[104>>3] = v512_64( 0x0100000000000000ULL );
|
||||
buf[112>>3] = v512_64( bswap_64( th ) );
|
||||
buf[120>>3] = v512_64( bswap_64( tl ) );
|
||||
|
||||
blake64_8way( sc, buf, 128 );
|
||||
}
|
||||
@@ -866,14 +1151,14 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
|
||||
// init
|
||||
|
||||
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
@@ -897,7 +1182,7 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
uint64_t th, tl;
|
||||
|
||||
bit_len = sc->ptr << 3;
|
||||
sc->buf[ptr64] = _mm512_set1_epi64( 0x80 );
|
||||
sc->buf[ptr64] = v512_64( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
|
||||
@@ -915,9 +1200,9 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
|
||||
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
|
||||
sc->buf[13] = _mm512_set1_epi64( 0x0100000000000000ULL );
|
||||
sc->buf[14] = _mm512_set1_epi64( bswap_64( th ) );
|
||||
sc->buf[15] = _mm512_set1_epi64( bswap_64( tl ) );
|
||||
sc->buf[13] = v512_64( 0x0100000000000000ULL );
|
||||
sc->buf[14] = v512_64( bswap_64( th ) );
|
||||
sc->buf[15] = v512_64( bswap_64( tl ) );
|
||||
|
||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||
sc->T1 = sc->T1 + 1;
|
||||
@@ -933,14 +1218,14 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
|
||||
|
||||
// init
|
||||
|
||||
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
@@ -964,7 +1249,7 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
|
||||
uint64_t th, tl;
|
||||
|
||||
bit_len = sc->ptr << 3;
|
||||
sc->buf[ptr64] = _mm512_set1_epi64( 0x8000000000000000ULL );
|
||||
sc->buf[ptr64] = v512_64( 0x8000000000000000ULL );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
|
||||
@@ -982,9 +1267,9 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
|
||||
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
|
||||
sc->buf[13] = m512_one_64;
|
||||
sc->buf[14] = _mm512_set1_epi64( th );
|
||||
sc->buf[15] = _mm512_set1_epi64( tl );
|
||||
sc->buf[13] = v512_64( 1 );
|
||||
sc->buf[14] = v512_64( th );
|
||||
sc->buf[15] = v512_64( tl );
|
||||
|
||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||
sc->T1 = sc->T1 + 1;
|
||||
@@ -1013,12 +1298,12 @@ blake512_8way_close(void *cc, void *dst)
|
||||
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) \
|
||||
{ \
|
||||
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
|
||||
_mm256_set1_epi64x( c1 ), m0 ), b ), a ); \
|
||||
v256_64( c1 ), m0 ), b ), a ); \
|
||||
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
|
||||
c = _mm256_add_epi64( c, d ); \
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
|
||||
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
|
||||
_mm256_set1_epi64x( c0 ), m1 ), b ), a ); \
|
||||
v256_64( c0 ), m1 ), b ), a ); \
|
||||
d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
|
||||
c = _mm256_add_epi64( c, d ); \
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
|
||||
@@ -1054,14 +1339,14 @@ blake512_8way_close(void *cc, void *dst)
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_set1_epi64x( CB0 ); \
|
||||
V9 = _mm256_set1_epi64x( CB1 ); \
|
||||
VA = _mm256_set1_epi64x( CB2 ); \
|
||||
VB = _mm256_set1_epi64x( CB3 ); \
|
||||
VC = _mm256_set1_epi64x( T0 ^ CB4 ); \
|
||||
VD = _mm256_set1_epi64x( T0 ^ CB5 ); \
|
||||
VE = _mm256_set1_epi64x( T1 ^ CB6 ); \
|
||||
VF = _mm256_set1_epi64x( T1 ^ CB7 ); \
|
||||
V8 = v256_64( CB0 ); \
|
||||
V9 = v256_64( CB1 ); \
|
||||
VA = v256_64( CB2 ); \
|
||||
VB = v256_64( CB3 ); \
|
||||
VC = v256_64( T0 ^ CB4 ); \
|
||||
VD = v256_64( T0 ^ CB5 ); \
|
||||
VE = v256_64( T1 ^ CB6 ); \
|
||||
VF = v256_64( T1 ^ CB7 ); \
|
||||
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x( \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
|
||||
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
||||
@@ -1122,18 +1407,18 @@ void blake512_4way_compress( blake_4way_big_context *sc )
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm256_set1_epi64x( CB0 );
|
||||
V9 = _mm256_set1_epi64x( CB1 );
|
||||
VA = _mm256_set1_epi64x( CB2 );
|
||||
VB = _mm256_set1_epi64x( CB3 );
|
||||
VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
|
||||
_mm256_set1_epi64x( CB4 ) );
|
||||
VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
|
||||
_mm256_set1_epi64x( CB5 ) );
|
||||
VE = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
|
||||
_mm256_set1_epi64x( CB6 ) );
|
||||
VF = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
|
||||
_mm256_set1_epi64x( CB7 ) );
|
||||
V8 = v256_64( CB0 );
|
||||
V9 = v256_64( CB1 );
|
||||
VA = v256_64( CB2 );
|
||||
VB = v256_64( CB3 );
|
||||
VC = _mm256_xor_si256( v256_64( sc->T0 ),
|
||||
v256_64( CB4 ) );
|
||||
VD = _mm256_xor_si256( v256_64( sc->T0 ),
|
||||
v256_64( CB5 ) );
|
||||
VE = _mm256_xor_si256( v256_64( sc->T1 ),
|
||||
v256_64( CB6 ) );
|
||||
VF = _mm256_xor_si256( v256_64( sc->T1 ),
|
||||
v256_64( CB7 ) );
|
||||
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
|
||||
|
||||
@@ -1188,23 +1473,23 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
|
||||
|
||||
// initial hash
|
||||
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
|
||||
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
|
||||
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
|
||||
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
|
||||
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = v256_64( 0x3C6EF372FE94F82B );
|
||||
casti_m256i( sc->H, 3 ) = v256_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m256i( sc->H, 4 ) = v256_64( 0x510E527FADE682D1 );
|
||||
casti_m256i( sc->H, 5 ) = v256_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = v256_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = v256_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
// fill buffer
|
||||
memcpy_256( sc->buf, (__m256i*)data, 80>>3 );
|
||||
sc->buf[10] = _mm256_set1_epi64x( 0x8000000000000000ULL );
|
||||
sc->buf[10] = v256_64( 0x8000000000000000ULL );
|
||||
sc->buf[11] = m256_zero;
|
||||
sc->buf[12] = m256_zero;
|
||||
sc->buf[13] = m256_one_64;
|
||||
sc->buf[13] = v256_64( 1 );
|
||||
sc->buf[14] = m256_zero;
|
||||
sc->buf[15] = _mm256_set1_epi64x( 80*8 );
|
||||
sc->buf[15] = v256_64( 80*8 );
|
||||
|
||||
// build working variables
|
||||
V0 = sc->H[0];
|
||||
@@ -1215,14 +1500,14 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm256_set1_epi64x( CB0 );
|
||||
V9 = _mm256_set1_epi64x( CB1 );
|
||||
VA = _mm256_set1_epi64x( CB2 );
|
||||
VB = _mm256_set1_epi64x( CB3 );
|
||||
VC = _mm256_set1_epi64x( CB4 ^ 0x280ULL );
|
||||
VD = _mm256_set1_epi64x( CB5 ^ 0x280ULL );
|
||||
VE = _mm256_set1_epi64x( CB6 );
|
||||
VF = _mm256_set1_epi64x( CB7 );
|
||||
V8 = v256_64( CB0 );
|
||||
V9 = v256_64( CB1 );
|
||||
VA = v256_64( CB2 );
|
||||
VB = v256_64( CB3 );
|
||||
VC = v256_64( CB4 ^ 0x280ULL );
|
||||
VD = v256_64( CB5 ^ 0x280ULL );
|
||||
VE = v256_64( CB6 );
|
||||
VF = v256_64( CB7 );
|
||||
|
||||
// round 0
|
||||
GB_4WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
|
||||
@@ -1232,7 +1517,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
|
||||
// G4 skip nonce
|
||||
V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256(
|
||||
_mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 );
|
||||
v256_64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
|
||||
VF = mm256_swap64_32( _mm256_xor_si256( VF, V0 ) );
|
||||
VA = _mm256_add_epi64( VA, VF );
|
||||
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 );
|
||||
@@ -1244,7 +1529,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
|
||||
// round 1
|
||||
// G1
|
||||
V1 = _mm256_add_epi64( V1, _mm256_xor_si256( _mm256_set1_epi64x( CB8 ),
|
||||
V1 = _mm256_add_epi64( V1, _mm256_xor_si256( v256_64( CB8 ),
|
||||
sc->buf[ 4] ) );
|
||||
|
||||
// G2
|
||||
@@ -1252,7 +1537,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
|
||||
// G3
|
||||
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
|
||||
_mm256_set1_epi64x( CB6 ), sc->buf[13] ), V7 ) );
|
||||
v256_64( CB6 ), sc->buf[13] ), V7 ) );
|
||||
|
||||
// save midstate for second part
|
||||
midstate[ 0] = V0;
|
||||
@@ -1319,7 +1604,7 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||
|
||||
// finish round 0, with the nonce now available
|
||||
V0 = _mm256_add_epi64( V0, _mm256_xor_si256(
|
||||
_mm256_set1_epi64x( CB8 ), M9 ) );
|
||||
v256_64( CB8 ), M9 ) );
|
||||
VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V0 ) );
|
||||
VA = _mm256_add_epi64( VA, VF );
|
||||
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 );
|
||||
@@ -1334,19 +1619,19 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||
V9 = _mm256_add_epi64( V9, VD );
|
||||
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 );
|
||||
V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256(
|
||||
_mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) );
|
||||
v256_64( CBx(1,2) ), Mx(1,3) ), V5 ) );
|
||||
VD = mm256_shuflr64_16( _mm256_xor_si256( VD, V1 ) );
|
||||
V9 = _mm256_add_epi64( V9, VD );
|
||||
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 );
|
||||
|
||||
// G2
|
||||
V2 = _mm256_add_epi64( V2, _mm256_xor_si256(
|
||||
_mm256_set1_epi64x( CBF ), M9 ) );
|
||||
v256_64( CBF ), M9 ) );
|
||||
VE = mm256_swap64_32( _mm256_xor_si256( VE, V2 ) );
|
||||
VA = _mm256_add_epi64( VA, VE );
|
||||
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 );
|
||||
V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256(
|
||||
_mm256_set1_epi64x( CB9 ), MF ), V6 ) );
|
||||
v256_64( CB9 ), MF ), V6 ) );
|
||||
VE = mm256_shuflr64_16( _mm256_xor_si256( VE, V2 ) );
|
||||
VA = _mm256_add_epi64( VA, VE );
|
||||
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 );
|
||||
@@ -1356,7 +1641,7 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||
VB = _mm256_add_epi64( VB, VF );
|
||||
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 );
|
||||
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
|
||||
_mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
|
||||
v256_64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
|
||||
VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V3 ) );
|
||||
VB = _mm256_add_epi64( VB, VF );
|
||||
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 );
|
||||
@@ -1398,14 +1683,14 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||
|
||||
void blake512_4way_init( blake_4way_big_context *sc )
|
||||
{
|
||||
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
|
||||
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
|
||||
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
|
||||
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
|
||||
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = v256_64( 0x3C6EF372FE94F82B );
|
||||
casti_m256i( sc->H, 3 ) = v256_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m256i( sc->H, 4 ) = v256_64( 0x510E527FADE682D1 );
|
||||
casti_m256i( sc->H, 5 ) = v256_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = v256_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = v256_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
@@ -1465,7 +1750,7 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
|
||||
buf[ptr>>3] = v256_64( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
@@ -1487,9 +1772,9 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||
buf[104>>3] = _mm256_or_si256( buf[104>>3],
|
||||
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
|
||||
buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
|
||||
buf[120>>3] = _mm256_set1_epi64x( bswap_64( tl ) );
|
||||
v256_64( 0x0100000000000000ULL ) );
|
||||
buf[112>>3] = v256_64( bswap_64( th ) );
|
||||
buf[120>>3] = v256_64( bswap_64( tl ) );
|
||||
|
||||
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
}
|
||||
@@ -1501,9 +1786,9 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
|
||||
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
|
||||
memset_zero_256( buf, 112>>3 );
|
||||
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
|
||||
buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
|
||||
buf[120>>3] = _mm256_set1_epi64x( bswap_64( tl ) );
|
||||
buf[104>>3] = v256_64( 0x0100000000000000ULL );
|
||||
buf[112>>3] = v256_64( bswap_64( th ) );
|
||||
buf[120>>3] = v256_64( bswap_64( tl ) );
|
||||
|
||||
blake64_4way( sc, buf, 128 );
|
||||
}
|
||||
@@ -1517,14 +1802,14 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
|
||||
// init
|
||||
|
||||
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
|
||||
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
|
||||
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
|
||||
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
|
||||
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = v256_64( 0x3C6EF372FE94F82B );
|
||||
casti_m256i( sc->H, 3 ) = v256_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m256i( sc->H, 4 ) = v256_64( 0x510E527FADE682D1 );
|
||||
casti_m256i( sc->H, 5 ) = v256_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = v256_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = v256_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
@@ -1548,7 +1833,7 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
uint64_t th, tl;
|
||||
|
||||
bit_len = sc->ptr << 3;
|
||||
sc->buf[ptr64] = _mm256_set1_epi64x( 0x80 );
|
||||
sc->buf[ptr64] = v256_64( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if ( sc->ptr == 0 )
|
||||
@@ -1565,9 +1850,9 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
|
||||
memset_zero_256( sc->buf + ptr64 + 1, 13 - ptr64 );
|
||||
sc->buf[13] = _mm256_set1_epi64x( 0x0100000000000000ULL );
|
||||
sc->buf[14] = _mm256_set1_epi64x( bswap_64( th ) );
|
||||
sc->buf[15] = _mm256_set1_epi64x( bswap_64( tl ) );
|
||||
sc->buf[13] = v256_64( 0x0100000000000000ULL );
|
||||
sc->buf[14] = v256_64( bswap_64( th ) );
|
||||
sc->buf[15] = v256_64( bswap_64( tl ) );
|
||||
|
||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||
sc->T1 = sc->T1 + 1;
|
||||
@@ -1589,4 +1874,4 @@ blake512_4way_close(void *cc, void *dst)
|
||||
blake64_4way_close( cc, dst );
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // AVX2
|
83
algo/blake/blake512-hash.h
Normal file
83
algo/blake/blake512-hash.h
Normal file
@@ -0,0 +1,83 @@
|
||||
#ifndef BLAKE512_HASH__
|
||||
#define BLAKE512_HASH__ 1
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
/////////////////////////
|
||||
//
|
||||
// Blake-512 1 way SSE2 & AVX2
|
||||
|
||||
// Hashing context for the 1-way Blake-512 API declared below
// (blake512_init / blake512_update / blake512_close / blake512_full).
// The struct as a whole is aligned to 32 bytes.
// NOTE(review): the field roles noted below are inferred by analogy with the
// 4-way context; the 1-way implementation is not visible here -- confirm
// against blake512-hash.c.
typedef struct {
|
||||
unsigned char buf[128]; /* first field, for alignment */
|
||||
uint64_t H[8];   // presumably the eight 64-bit chaining-state words -- TODO confirm
|
||||
uint64_t T0, T1; // presumably low/high halves of the bit counter -- TODO confirm
|
||||
size_t ptr;      // presumably the number of bytes currently held in buf -- TODO confirm
|
||||
} blake512_context __attribute__ ((aligned (32)));
|
||||
|
||||
void blake512_transform( uint64_t *H, const uint64_t *buf,
|
||||
const uint64_t T0, const uint64_t T1 );
|
||||
void blake512_init( blake512_context *sc );
|
||||
void blake512_update( blake512_context *sc, const void *data, size_t len );
|
||||
void blake512_close( blake512_context *sc, void *dst );
|
||||
void blake512_full( blake512_context *sc, void *dst, const void *data,
|
||||
size_t len );
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
// Blake-512 4 way AVX2
|
||||
|
||||
// Hashing context for the 4-way (AVX2) Blake-512 functions declared below.
// Every vector member is a __m256i; the 4-way code fills them with v256_64(),
// i.e. one 64-bit word per lane. The struct is aligned to 64 bytes.
typedef struct {
|
||||
__m256i buf[16]; // message block; blake512_4way_full writes the 0x80 pad
                 // marker at buf[ptr64] and the byte-swapped counter
                 // words th/tl into buf[14]/buf[15]
|
||||
__m256i H[8];    // state words; set to eight broadcast 64-bit constants
                 // by the 4-way initialisation code
|
||||
__m256i S[4];    // NOTE(review): presumably the salt; not referenced in the
                 // code visible here -- confirm
|
||||
size_t ptr;      // bytes buffered so far (blake512_4way_full derives
                 // bit_len as ptr << 3)
|
||||
uint64_t T0, T1; // bit counter: T0 is advanced by 1024 per block and a
                 // wrap-around carries into T1
|
||||
} blake_4way_big_context __attribute__ ((aligned (64)));
|
||||
|
||||
typedef blake_4way_big_context blake512_4way_context;
|
||||
|
||||
void blake512_4way_init( blake_4way_big_context *sc );
|
||||
void blake512_4way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_4way_close( void *cc, void *dst );
|
||||
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
const void *data );
|
||||
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||
const __m256i nonce, const __m256i *midstate );
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
////////////////////////////
|
||||
//
|
||||
//// Blake-512 8 way AVX512
|
||||
|
||||
// Hashing context for the 8-way (AVX512) Blake-512 functions declared below.
// Same member layout as the 4-way context but with __m512i vectors; the
// struct is aligned to 128 bytes.
// NOTE(review): the 8-way implementation is not visible here; the field roles
// noted below are assumed to mirror the 4-way context -- confirm.
typedef struct {
|
||||
__m512i buf[16]; // presumably the interleaved message block -- TODO confirm
|
||||
__m512i H[8];    // presumably the chaining-state words -- TODO confirm
|
||||
__m512i S[4];    // presumably the salt -- TODO confirm
|
||||
size_t ptr;      // presumably the count of buffered bytes -- TODO confirm
|
||||
uint64_t T0, T1; // presumably low/high halves of the bit counter -- TODO confirm
|
||||
} blake_8way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_8way_big_context blake512_8way_context;
|
||||
|
||||
void blake512_8way_init( blake_8way_big_context *sc );
|
||||
void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_8way_close( void *cc, void *dst );
|
||||
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
const void *data );
|
||||
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
const __m512i nonce, const __m512i *midstate );
|
||||
|
||||
#endif // AVX512
|
||||
#endif // AVX2
|
||||
|
||||
#endif // BLAKE512_HASH__
|
@@ -1,5 +1,5 @@
|
||||
#include "blakecoin-gate.h"
|
||||
#include "blake-hash-4way.h"
|
||||
#include "blake256-hash.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
@@ -30,25 +30,25 @@ int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
const __m512i sixteen = v512_32( 16 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, rounds );
|
||||
|
||||
block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm512_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm512_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm512_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm512_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm512_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm512_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm512_set1_epi32( phash[7] );
|
||||
block0_hash[0] = v512_32( phash[0] );
|
||||
block0_hash[1] = v512_32( phash[1] );
|
||||
block0_hash[2] = v512_32( phash[2] );
|
||||
block0_hash[3] = v512_32( phash[3] );
|
||||
block0_hash[4] = v512_32( phash[4] );
|
||||
block0_hash[5] = v512_32( phash[5] );
|
||||
block0_hash[6] = v512_32( phash[6] );
|
||||
block0_hash[7] = v512_32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
block_buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
block_buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
block_buf[0] = v512_32( pdata[16] );
|
||||
block_buf[1] = v512_32( pdata[17] );
|
||||
block_buf[2] = v512_32( pdata[18] );
|
||||
block_buf[3] =
|
||||
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
@@ -101,25 +101,25 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
const __m256i eight = v256_32( 8 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, rounds );
|
||||
|
||||
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
block0_hash[0] = v256_32( phash[0] );
|
||||
block0_hash[1] = v256_32( phash[1] );
|
||||
block0_hash[2] = v256_32( phash[2] );
|
||||
block0_hash[3] = v256_32( phash[3] );
|
||||
block0_hash[4] = v256_32( phash[4] );
|
||||
block0_hash[5] = v256_32( phash[5] );
|
||||
block0_hash[6] = v256_32( phash[6] );
|
||||
block0_hash[7] = v256_32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[0] = _mm256_set1_epi32( pdata[16] );
|
||||
block_buf[1] = _mm256_set1_epi32( pdata[17] );
|
||||
block_buf[2] = _mm256_set1_epi32( pdata[18] );
|
||||
block_buf[0] = v256_32( pdata[16] );
|
||||
block_buf[1] = v256_32( pdata[17] );
|
||||
block_buf[2] = v256_32( pdata[18] );
|
||||
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||
|
||||
// Partially prehash second block without touching nonces in block_buf[3].
|
||||
@@ -203,142 +203,3 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
//#if defined(BLAKECOIN_8WAY)
|
||||
|
||||
blake256r8_8way_context blakecoin_8w_ctx;
|
||||
|
||||
void blakecoin_8way_hash( void *state, const void *input )
|
||||
{
|
||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||
blake256r8_8way_context ctx;
|
||||
|
||||
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
|
||||
blake256r8_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake256r8_8way_close( &ctx, vhash );
|
||||
|
||||
dintrlv_8x32( state, state+ 32, state+ 64, state+ 96, state+128,
|
||||
state+160, state+192, state+224, vhash, 256 );
|
||||
}
|
||||
|
||||
/*
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash32[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
|
||||
__m256i block0_hash[8] __attribute__ ((aligned (64)));
|
||||
__m256i block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = (uint32_t*)work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, 8 );
|
||||
|
||||
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[0] = _mm256_set1_epi32( pdata[16] );
|
||||
block_buf[1] = _mm256_set1_epi32( pdata[17] );
|
||||
block_buf[2] = _mm256_set1_epi32( pdata[18] );
|
||||
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||
|
||||
// Partialy prehash second block without touching nonces
|
||||
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
|
||||
block_buf );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash32_d7[ lane ] <= targ32_d7 )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
block_buf[3] = _mm256_add_epi32( block_buf[3], eight );
|
||||
n += 8;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash32[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
blake256r8_8way_context ctx __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( ((__m256i*)hash32)[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t HTarget = ptarget[7];
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
if ( opt_benchmark )
|
||||
HTarget = 0x7f;
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake256r8_8way_init( &blakecoin_8w_ctx );
|
||||
blake256r8_8way_update( &blakecoin_8w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
|
||||
blake256r8_8way_update( &ctx, (const void*)vdata + (64<<3), 16 );
|
||||
blake256r8_8way_close( &ctx, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash32_d7[ lane ] <= HTarget )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
|
||||
n += 8;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -6,9 +6,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "blake-hash-4way.h"
|
||||
#include "sph_blake.h"
|
||||
#include "blake512-hash.h"
|
||||
|
||||
extern void pentablakehash_4way( void *output, const void *input )
|
||||
{
|
||||
|
@@ -14,8 +14,9 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include "compat/sph_types.h"
|
||||
#include "compat.h"
|
||||
#include "sph-blake2s.h"
|
||||
|
||||
static const uint32_t blake2s_IV[8] =
|
||||
@@ -208,8 +209,8 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c
|
||||
|
||||
int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
|
||||
{
|
||||
uint32_t m[16];
|
||||
uint32_t v[16];
|
||||
uint32_t _ALIGN(32) m[16];
|
||||
uint32_t _ALIGN(32) v[16];
|
||||
|
||||
for( size_t i = 0; i < 16; ++i )
|
||||
m[i] = load32( block + i * sizeof( m[i] ) );
|
||||
@@ -225,6 +226,58 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
|
||||
v[13] = S->t[1] ^ blake2s_IV[5];
|
||||
v[14] = S->f[0] ^ blake2s_IV[6];
|
||||
v[15] = S->f[1] ^ blake2s_IV[7];
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
__m128i *V = (__m128i*)v;
|
||||
|
||||
#define BLAKE2S_ROUND( r ) \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
|
||||
m[blake2s_sigma[r][ 6]], m[blake2s_sigma[r][ 4]], \
|
||||
m[blake2s_sigma[r][ 2]], m[blake2s_sigma[r][ 0]] ) ) ); \
|
||||
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
|
||||
m[blake2s_sigma[r][ 7]], m[blake2s_sigma[r][ 5]], \
|
||||
m[blake2s_sigma[r][ 3]], m[blake2s_sigma[r][ 1]] ) ) ); \
|
||||
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
|
||||
V[0] = mm128_shufll_32( V[0] ); \
|
||||
V[3] = mm128_swap_64( V[3] ); \
|
||||
V[2] = mm128_shuflr_32( V[2] ); \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
|
||||
m[blake2s_sigma[r][12]], m[blake2s_sigma[r][10]], \
|
||||
m[blake2s_sigma[r][ 8]], m[blake2s_sigma[r][14]] ) ) ); \
|
||||
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
|
||||
m[blake2s_sigma[r][13]], m[blake2s_sigma[r][11]], \
|
||||
m[blake2s_sigma[r][ 9]], m[blake2s_sigma[r][15]] ) ) ); \
|
||||
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
|
||||
V[0] = mm128_shuflr_32( V[0] ); \
|
||||
V[3] = mm128_swap_64( V[3] ); \
|
||||
V[2] = mm128_shufll_32( V[2] )
|
||||
|
||||
BLAKE2S_ROUND(0);
|
||||
BLAKE2S_ROUND(1);
|
||||
BLAKE2S_ROUND(2);
|
||||
BLAKE2S_ROUND(3);
|
||||
BLAKE2S_ROUND(4);
|
||||
BLAKE2S_ROUND(5);
|
||||
BLAKE2S_ROUND(6);
|
||||
BLAKE2S_ROUND(7);
|
||||
BLAKE2S_ROUND(8);
|
||||
BLAKE2S_ROUND(9);
|
||||
|
||||
#undef BLAKE2S_ROUND
|
||||
|
||||
#else
|
||||
|
||||
#define G(r,i,a,b,c,d) \
|
||||
do { \
|
||||
a = a + b + m[blake2s_sigma[r][2*i+0]]; \
|
||||
@@ -236,6 +289,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
|
||||
c = c + d; \
|
||||
b = SPH_ROTR32(b ^ c, 7); \
|
||||
} while(0)
|
||||
|
||||
#define ROUND(r) \
|
||||
do { \
|
||||
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
|
||||
@@ -247,7 +301,8 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
|
||||
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
|
||||
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
|
||||
} while(0)
|
||||
ROUND( 0 );
|
||||
|
||||
ROUND( 0 );
|
||||
ROUND( 1 );
|
||||
ROUND( 2 );
|
||||
ROUND( 3 );
|
||||
@@ -258,6 +313,8 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
|
||||
ROUND( 8 );
|
||||
ROUND( 9 );
|
||||
|
||||
#endif
|
||||
|
||||
for( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
|
||||
|
||||
|
@@ -383,65 +383,77 @@ static const uint32_t T512[64][16] = {
|
||||
#define S1E ME
|
||||
#define S1F MF
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Hamsi 8 way AVX512
|
||||
|
||||
// Intel says _mm512_movepi64_mask has (1L/1T) timimg while
|
||||
// _mm512_cmplt_epi64_mask as (3L/1T) timing, however, when tested hashing X13
|
||||
// on i9-9940x cmplt with zero was 3% faster than movepi.
|
||||
// Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have the same
|
||||
// timing. However, when tested hashing X13 on i9-9940x using cmplt with zero
|
||||
// had a 3% faster overall hashrate than using movepi.
|
||||
|
||||
#define INPUT_BIG8 \
|
||||
do { \
|
||||
{ \
|
||||
__m512i db = _mm512_ror_epi64( *buf, 1 ); \
|
||||
const __m512i zero = m512_zero; \
|
||||
const uint64_t *tp = (const uint64_t*)T512; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
|
||||
for ( int u = 0; u < 64; u++ ) \
|
||||
for ( int i = 0; i < 64*8; i += 8, db = _mm512_ror_epi64( db, 1 ) ) \
|
||||
{ \
|
||||
const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \
|
||||
m0 = _mm512_mask_xor_epi64( m0, dm, m0, v512_64( tp[0] ) ); \
|
||||
m1 = _mm512_mask_xor_epi64( m1, dm, m1, v512_64( tp[1] ) ); \
|
||||
m2 = _mm512_mask_xor_epi64( m2, dm, m2, v512_64( tp[2] ) ); \
|
||||
m3 = _mm512_mask_xor_epi64( m3, dm, m3, v512_64( tp[3] ) ); \
|
||||
m4 = _mm512_mask_xor_epi64( m4, dm, m4, v512_64( tp[4] ) ); \
|
||||
m5 = _mm512_mask_xor_epi64( m5, dm, m5, v512_64( tp[5] ) ); \
|
||||
m6 = _mm512_mask_xor_epi64( m6, dm, m6, v512_64( tp[6] ) ); \
|
||||
m7 = _mm512_mask_xor_epi64( m7, dm, m7, v512_64( tp[7] ) ); \
|
||||
db = _mm512_ror_epi64( db, 1 ); \
|
||||
tp += 8; \
|
||||
m0 = _mm512_mask_xor_epi64( m0, dm, m0, v512_64( tp[i+0] ) ); \
|
||||
m1 = _mm512_mask_xor_epi64( m1, dm, m1, v512_64( tp[i+1] ) ); \
|
||||
m2 = _mm512_mask_xor_epi64( m2, dm, m2, v512_64( tp[i+2] ) ); \
|
||||
m3 = _mm512_mask_xor_epi64( m3, dm, m3, v512_64( tp[i+3] ) ); \
|
||||
m4 = _mm512_mask_xor_epi64( m4, dm, m4, v512_64( tp[i+4] ) ); \
|
||||
m5 = _mm512_mask_xor_epi64( m5, dm, m5, v512_64( tp[i+5] ) ); \
|
||||
m6 = _mm512_mask_xor_epi64( m6, dm, m6, v512_64( tp[i+6] ) ); \
|
||||
m7 = _mm512_mask_xor_epi64( m7, dm, m7, v512_64( tp[i+7] ) ); \
|
||||
} \
|
||||
} while (0)
|
||||
}
|
||||
|
||||
#define SBOX8( a, b, c, d ) \
|
||||
{ \
|
||||
__m512i tb, td; \
|
||||
td = mm512_xorand( d, a, c ); \
|
||||
c = mm512_xor3( c, td, b ); \
|
||||
tb = mm512_xoror( b, d, a ); \
|
||||
a = _mm512_xor_si512( a, c ); \
|
||||
b = mm512_xoror( td, tb, a ); \
|
||||
td = mm512_xorand( a, td, tb ); \
|
||||
a = c; \
|
||||
c = mm512_xor3( tb, b, td ); \
|
||||
d = mm512_not( td ); \
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
#define SBOX8( a, b, c, d ) \
|
||||
do { \
|
||||
__m512i t; \
|
||||
t = a; \
|
||||
a = mm512_xorand( d, a, c ); \
|
||||
c = mm512_xor3( a, b, c ); \
|
||||
b = mm512_xoror( b, d, t ); \
|
||||
t = _mm512_xor_si512( t, c ); \
|
||||
d = mm512_xoror( a, b, t ); \
|
||||
t = mm512_xorand( t, a, b ); \
|
||||
__m512i t = mm512_xorand( d, a, c ); \
|
||||
c = mm512_xor3( c, t, b ); \
|
||||
b = mm512_xoror( b, d, a ); \
|
||||
a = _mm512_xor_si512( a, c ); \
|
||||
d = mm512_xoror( t, b, a ); \
|
||||
t = mm512_xorand( a, t, b ); \
|
||||
a = c; \
|
||||
c = mm512_xor3( b, d, t ); \
|
||||
b = d; \
|
||||
d = mm512_not( t ); \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
#define L8( a, b, c, d ) \
|
||||
do { \
|
||||
a = mm512_rol_32( a, 13 ); \
|
||||
c = mm512_rol_32( c, 3 ); \
|
||||
b = mm512_xor3( a, b, c ); \
|
||||
d = mm512_xor3( d, c, _mm512_slli_epi32( a, 3 ) ); \
|
||||
b = mm512_rol_32( b, 1 ); \
|
||||
b = mm512_xor3( a, b, c ); \
|
||||
d = mm512_rol_32( d, 7 ); \
|
||||
a = mm512_xor3( a, b, d ); \
|
||||
b = mm512_rol_32( b, 1 ); \
|
||||
c = mm512_xor3( c, d, _mm512_slli_epi32( b, 7 ) ); \
|
||||
a = mm512_rol_32( a, 5 ); \
|
||||
a = mm512_xor3( a, b, d ); \
|
||||
c = mm512_rol_32( c, 22 ); \
|
||||
} while (0)
|
||||
a = mm512_rol_32( a, 5 );
|
||||
|
||||
#define DECL_STATE_BIG8 \
|
||||
__m512i c0, c1, c2, c3, c4, c5, c6, c7; \
|
||||
@@ -628,7 +640,7 @@ do { /* order is important */ \
|
||||
|
||||
///////////////////////
|
||||
//
|
||||
// Experimental
|
||||
// Found to be slower than running 8x64 twice.
|
||||
|
||||
// Hamsi 16 way 32 bit.
|
||||
|
||||
@@ -674,75 +686,128 @@ do { /* order is important */ \
|
||||
|
||||
#define INPUT_16X32 \
|
||||
{ \
|
||||
const __m512i zero = _mm512_setzero_si512(); \
|
||||
const uint32_t *tp = (const uint32_t*)T512; \
|
||||
const __m512i zero = (const __m512i)_mm512_setzero_si512(); \
|
||||
const uint64_t *tp = (const uint64_t*)T512; \
|
||||
M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = \
|
||||
M8 = M9 = MA = MB = MC = MD = ME = MF = zero; \
|
||||
for ( int v = 0; v < 2; v++ ) \
|
||||
__m512i db = _mm512_ror_epi32( buf[0], 1 ); \
|
||||
for ( int u = 0; u < 32; u++ ) \
|
||||
{ \
|
||||
__m512i db = _mm512_ror_epi32( buf[v], 1 ); \
|
||||
for ( int u = 0; u < 32; u++ ) \
|
||||
{ \
|
||||
__mmask16 dm = _mm512_cmplt_epi32_mask( db, zero ); \
|
||||
M0 = _mm512_mask_xor_epi32( M0, dm, M0, v512_32( tp[ 0] ) ); \
|
||||
M1 = _mm512_mask_xor_epi32( M1, dm, M1, v512_32( tp[ 1] ) ); \
|
||||
M2 = _mm512_mask_xor_epi32( M2, dm, M2, v512_32( tp[ 2] ) ); \
|
||||
M3 = _mm512_mask_xor_epi32( M3, dm, M3, v512_32( tp[ 3] ) ); \
|
||||
M4 = _mm512_mask_xor_epi32( M4, dm, M4, v512_32( tp[ 4] ) ); \
|
||||
M5 = _mm512_mask_xor_epi32( M5, dm, M5, v512_32( tp[ 5] ) ); \
|
||||
M6 = _mm512_mask_xor_epi32( M6, dm, M6, v512_32( tp[ 6] ) ); \
|
||||
M7 = _mm512_mask_xor_epi32( M7, dm, M7, v512_32( tp[ 7] ) ); \
|
||||
M8 = _mm512_mask_xor_epi32( M8, dm, M8, v512_32( tp[ 8] ) ); \
|
||||
M9 = _mm512_mask_xor_epi32( M9, dm, M9, v512_32( tp[ 9] ) ); \
|
||||
MA = _mm512_mask_xor_epi32( MA, dm, MA, v512_32( tp[10] ) ); \
|
||||
MB = _mm512_mask_xor_epi32( MB, dm, MB, v512_32( tp[11] ) ); \
|
||||
MC = _mm512_mask_xor_epi32( MC, dm, MC, v512_32( tp[12] ) ); \
|
||||
MD = _mm512_mask_xor_epi32( MD, dm, MD, v512_32( tp[13] ) ); \
|
||||
ME = _mm512_mask_xor_epi32( ME, dm, ME, v512_32( tp[14] ) ); \
|
||||
MF = _mm512_mask_xor_epi32( MF, dm, MF, v512_32( tp[15] ) ); \
|
||||
db = _mm512_ror_epi32( db, 1 ); \
|
||||
tp += 16; \
|
||||
} \
|
||||
const __mmask16 dm = (const __mmask16)_mm512_cmplt_epi32_mask( db, zero );\
|
||||
M0 = _mm512_mask_xor_epi32( M0, dm, M0,\
|
||||
v512_32( (const uint32_t)(tp[0] & 0xffffffffull) ) );\
|
||||
M1 = _mm512_mask_xor_epi32( M1, dm, M1, \
|
||||
v512_32( (const uint32_t)(tp[0] >> 32) ) ); \
|
||||
M2 = _mm512_mask_xor_epi32( M2, dm, M2, \
|
||||
v512_32( (const uint32_t)(tp[1] & 0xffffffffull) ) );\
|
||||
M3 = _mm512_mask_xor_epi32( M3, dm, M3, \
|
||||
v512_32( (const uint32_t)(tp[1] >> 32) ) ); \
|
||||
M4 = _mm512_mask_xor_epi32( M4, dm, M4, \
|
||||
v512_32( (const uint32_t)(tp[2] & 0xffffffffull) ) );\
|
||||
M5 = _mm512_mask_xor_epi32( M5, dm, M5, \
|
||||
v512_32( (const uint32_t)(tp[2] >> 32) ) ); \
|
||||
M6 = _mm512_mask_xor_epi32( M6, dm, M6, \
|
||||
v512_32( (const uint32_t)(tp[3] & 0xffffffffull) ) );\
|
||||
M7 = _mm512_mask_xor_epi32( M7, dm, M7, \
|
||||
v512_32( (const uint32_t)(tp[3] >> 32) ) ); \
|
||||
M8 = _mm512_mask_xor_epi32( M8, dm, M8, \
|
||||
v512_32( (const uint32_t)(tp[4] & 0xffffffffull) ) );\
|
||||
M9 = _mm512_mask_xor_epi32( M9, dm, M9, \
|
||||
v512_32( (const uint32_t)(tp[4] >> 32) ) ); \
|
||||
MA = _mm512_mask_xor_epi32( MA, dm, MA, \
|
||||
v512_32( (const uint32_t)(tp[5] & 0xffffffffull) ) );\
|
||||
MB = _mm512_mask_xor_epi32( MB, dm, MB, \
|
||||
v512_32( (const uint32_t)(tp[5] >> 32) ) ); \
|
||||
MC = _mm512_mask_xor_epi32( MC, dm, MC, \
|
||||
v512_32( (const uint32_t)(tp[6] & 0xffffffffull) ) );\
|
||||
MD = _mm512_mask_xor_epi32( MD, dm, MD, \
|
||||
v512_32( (const uint32_t)(tp[6] >> 32) ) ); \
|
||||
ME = _mm512_mask_xor_epi32( ME, dm, ME, \
|
||||
v512_32( (const uint32_t)(tp[7] & 0xffffffffull) ) );\
|
||||
MF = _mm512_mask_xor_epi32( MF, dm, MF, \
|
||||
v512_32( (const uint32_t)(tp[7] >> 32) ) ); \
|
||||
db = _mm512_ror_epi32( db, 1 ); \
|
||||
tp += 8; \
|
||||
} \
|
||||
db = _mm512_ror_epi32( buf[1], 1 ); \
|
||||
for ( int u = 0; u < 32; u++ ) \
|
||||
{ \
|
||||
const __mmask16 dm = (const __mmask16)_mm512_cmplt_epi32_mask( db, zero ); \
|
||||
M0 = _mm512_mask_xor_epi32( M0, dm, M0,\
|
||||
v512_32( (const uint32_t)(tp[0] & 0xffffffffull) ) );\
|
||||
M1 = _mm512_mask_xor_epi32( M1, dm, M1, \
|
||||
v512_32( (const uint32_t)(tp[0] >> 32) ) ); \
|
||||
M2 = _mm512_mask_xor_epi32( M2, dm, M2, \
|
||||
v512_32( (const uint32_t)(tp[1] & 0xffffffffull) ) );\
|
||||
M3 = _mm512_mask_xor_epi32( M3, dm, M3, \
|
||||
v512_32( (const uint32_t)(tp[1] >> 32) ) ); \
|
||||
M4 = _mm512_mask_xor_epi32( M4, dm, M4, \
|
||||
v512_32( (const uint32_t)(tp[2] & 0xffffffffull) ) );\
|
||||
M5 = _mm512_mask_xor_epi32( M5, dm, M5, \
|
||||
v512_32( (const uint32_t)(tp[2] >> 32) ) ); \
|
||||
M6 = _mm512_mask_xor_epi32( M6, dm, M6, \
|
||||
v512_32( (const uint32_t)(tp[3] & 0xffffffffull) ) );\
|
||||
M7 = _mm512_mask_xor_epi32( M7, dm, M7, \
|
||||
v512_32( (const uint32_t)(tp[3] >> 32) ) ); \
|
||||
M8 = _mm512_mask_xor_epi32( M8, dm, M8, \
|
||||
v512_32( (const uint32_t)(tp[4] & 0xffffffffull) ) );\
|
||||
M9 = _mm512_mask_xor_epi32( M9, dm, M9, \
|
||||
v512_32( (const uint32_t)(tp[4] >> 32) ) ); \
|
||||
MA = _mm512_mask_xor_epi32( MA, dm, MA, \
|
||||
v512_32( (const uint32_t)(tp[5] & 0xffffffffull) ) );\
|
||||
MB = _mm512_mask_xor_epi32( MB, dm, MB, \
|
||||
v512_32( (const uint32_t)(tp[5] >> 32) ) ); \
|
||||
MC = _mm512_mask_xor_epi32( MC, dm, MC, \
|
||||
v512_32( (const uint32_t)(tp[6] & 0xffffffffull) ) );\
|
||||
MD = _mm512_mask_xor_epi32( MD, dm, MD, \
|
||||
v512_32( (const uint32_t)(tp[6] >> 32) ) ); \
|
||||
ME = _mm512_mask_xor_epi32( ME, dm, ME, \
|
||||
v512_32( (const uint32_t)(tp[7] & 0xffffffffull) ) );\
|
||||
MF = _mm512_mask_xor_epi32( MF, dm, MF, \
|
||||
v512_32( (const uint32_t)(tp[7] >> 32) ) ); \
|
||||
db = _mm512_ror_epi32( db, 1 ); \
|
||||
tp += 8; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
#define SBOX_16X32 SBOX8
|
||||
#define L_16X32 L8
|
||||
|
||||
#define ROUND_16X32( rc, alpha ) \
|
||||
#define ROUND_16X32( alpha ) \
|
||||
{ \
|
||||
S00 = _mm512_xor_si512( S00, v512_32( alpha[ 0] ) ); \
|
||||
S01 = _mm512_xor_si512( S01, v512_32( alpha[ 1] ^ rc ) ); \
|
||||
S02 = _mm512_xor_si512( S02, v512_32( alpha[ 2] ) ); \
|
||||
S03 = _mm512_xor_si512( S03, v512_32( alpha[ 3] ) ); \
|
||||
S04 = _mm512_xor_si512( S04, v512_32( alpha[ 4] ) ); \
|
||||
S05 = _mm512_xor_si512( S05, v512_32( alpha[ 5] ) ); \
|
||||
S06 = _mm512_xor_si512( S06, v512_32( alpha[ 6] ) ); \
|
||||
S07 = _mm512_xor_si512( S07, v512_32( alpha[ 7] ) ); \
|
||||
S08 = _mm512_xor_si512( S08, v512_32( alpha[ 8] ) ); \
|
||||
S09 = _mm512_xor_si512( S09, v512_32( alpha[ 9] ) ); \
|
||||
S0A = _mm512_xor_si512( S0A, v512_32( alpha[10] ) ); \
|
||||
S0B = _mm512_xor_si512( S0B, v512_32( alpha[11] ) ); \
|
||||
S0C = _mm512_xor_si512( S0C, v512_32( alpha[12] ) ); \
|
||||
S0D = _mm512_xor_si512( S0D, v512_32( alpha[13] ) ); \
|
||||
S0E = _mm512_xor_si512( S0E, v512_32( alpha[14] ) ); \
|
||||
S0F = _mm512_xor_si512( S0F, v512_32( alpha[15] ) ); \
|
||||
S10 = _mm512_xor_si512( S10, v512_32( alpha[16] ) ); \
|
||||
S11 = _mm512_xor_si512( S11, v512_32( alpha[17] ) ); \
|
||||
S12 = _mm512_xor_si512( S12, v512_32( alpha[18] ) ); \
|
||||
S13 = _mm512_xor_si512( S13, v512_32( alpha[19] ) ); \
|
||||
S14 = _mm512_xor_si512( S14, v512_32( alpha[20] ) ); \
|
||||
S15 = _mm512_xor_si512( S15, v512_32( alpha[21] ) ); \
|
||||
S16 = _mm512_xor_si512( S16, v512_32( alpha[22] ) ); \
|
||||
S17 = _mm512_xor_si512( S17, v512_32( alpha[23] ) ); \
|
||||
S18 = _mm512_xor_si512( S18, v512_32( alpha[24] ) ); \
|
||||
S19 = _mm512_xor_si512( S19, v512_32( alpha[25] ) ); \
|
||||
S1A = _mm512_xor_si512( S1A, v512_32( alpha[26] ) ); \
|
||||
S1B = _mm512_xor_si512( S1B, v512_32( alpha[27] ) ); \
|
||||
S1C = _mm512_xor_si512( S1C, v512_32( alpha[28] ) ); \
|
||||
S1D = _mm512_xor_si512( S1D, v512_32( alpha[29] ) ); \
|
||||
S1E = _mm512_xor_si512( S1E, v512_32( alpha[30] ) ); \
|
||||
S1F = _mm512_xor_si512( S1F, v512_32( alpha[31] ) ); \
|
||||
S00 = _mm512_xor_si512( S00, alpha[ 0] ); \
|
||||
S01 = _mm512_xor_si512( S01, alpha[ 1] ); \
|
||||
S02 = _mm512_xor_si512( S02, alpha[ 2] ); \
|
||||
S03 = _mm512_xor_si512( S03, alpha[ 3] ); \
|
||||
S04 = _mm512_xor_si512( S04, alpha[ 4] ); \
|
||||
S05 = _mm512_xor_si512( S05, alpha[ 5] ); \
|
||||
S06 = _mm512_xor_si512( S06, alpha[ 6] ); \
|
||||
S07 = _mm512_xor_si512( S07, alpha[ 7] ); \
|
||||
S08 = _mm512_xor_si512( S08, alpha[ 8] ); \
|
||||
S09 = _mm512_xor_si512( S09, alpha[ 9] ); \
|
||||
S0A = _mm512_xor_si512( S0A, alpha[10] ); \
|
||||
S0B = _mm512_xor_si512( S0B, alpha[11] ); \
|
||||
S0C = _mm512_xor_si512( S0C, alpha[12] ); \
|
||||
S0D = _mm512_xor_si512( S0D, alpha[13] ); \
|
||||
S0E = _mm512_xor_si512( S0E, alpha[14] ); \
|
||||
S0F = _mm512_xor_si512( S0F, alpha[15] ); \
|
||||
S10 = _mm512_xor_si512( S10, alpha[16] ); \
|
||||
S11 = _mm512_xor_si512( S11, alpha[17] ); \
|
||||
S12 = _mm512_xor_si512( S12, alpha[18] ); \
|
||||
S13 = _mm512_xor_si512( S13, alpha[19] ); \
|
||||
S14 = _mm512_xor_si512( S14, alpha[20] ); \
|
||||
S15 = _mm512_xor_si512( S15, alpha[21] ); \
|
||||
S16 = _mm512_xor_si512( S16, alpha[22] ); \
|
||||
S17 = _mm512_xor_si512( S17, alpha[23] ); \
|
||||
S18 = _mm512_xor_si512( S18, alpha[24] ); \
|
||||
S19 = _mm512_xor_si512( S19, alpha[25] ); \
|
||||
S1A = _mm512_xor_si512( S1A, alpha[26] ); \
|
||||
S1B = _mm512_xor_si512( S1B, alpha[27] ); \
|
||||
S1C = _mm512_xor_si512( S1C, alpha[28] ); \
|
||||
S1D = _mm512_xor_si512( S1D, alpha[29] ); \
|
||||
S1E = _mm512_xor_si512( S1E, alpha[30] ); \
|
||||
S1F = _mm512_xor_si512( S1F, alpha[31] ); \
|
||||
SBOX_16X32( S00, S08, S10, S18 ); \
|
||||
SBOX_16X32( S01, S09, S11, S19 ); \
|
||||
SBOX_16X32( S02, S0A, S12, S1A ); \
|
||||
@@ -766,26 +831,54 @@ do { /* order is important */ \
|
||||
}
|
||||
|
||||
#define P_16X32 \
|
||||
ROUND_16X32( 0, alpha_n ); \
|
||||
ROUND_16X32( 1, alpha_n ); \
|
||||
ROUND_16X32( 2, alpha_n ); \
|
||||
ROUND_16X32( 3, alpha_n ); \
|
||||
ROUND_16X32( 4, alpha_n ); \
|
||||
ROUND_16X32( 5, alpha_n );
|
||||
{ \
|
||||
__m512i alpha[32]; \
|
||||
const uint32_t A1 = ( (const uint32_t*)alpha_n )[1]; \
|
||||
for( int i = 0; i < 32; i++ ) \
|
||||
alpha[i] = v512_32( ( (uint32_t*)alpha_n )[i] ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 1 ^ (A1) ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 2 ^ (A1) ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 3 ^ (A1) ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 4 ^ (A1) ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 5 ^ (A1) ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
}
|
||||
|
||||
#define PF_16X32 \
|
||||
ROUND_16X32( 0, alpha_f ); \
|
||||
ROUND_16X32( 1, alpha_f ); \
|
||||
ROUND_16X32( 2, alpha_f ); \
|
||||
ROUND_16X32( 3, alpha_f ); \
|
||||
ROUND_16X32( 4, alpha_f ); \
|
||||
ROUND_16X32( 5, alpha_f ); \
|
||||
ROUND_16X32( 6, alpha_f ); \
|
||||
ROUND_16X32( 7, alpha_f ); \
|
||||
ROUND_16X32( 8, alpha_f ); \
|
||||
ROUND_16X32( 9, alpha_f ); \
|
||||
ROUND_16X32( 10, alpha_f ); \
|
||||
ROUND_16X32( 11, alpha_f );
|
||||
{ \
|
||||
__m512i alpha[32]; \
|
||||
const uint32_t A1 = ( (const uint32_t*)alpha_f )[1]; \
|
||||
for( int i = 0; i < 32; i++ ) \
|
||||
alpha[i] = v512_32( ( (uint32_t*)alpha_f )[i] ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 1 ^ A1 ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 2 ^ A1 ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 3 ^ A1 ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 4 ^ A1 ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 5 ^ A1 ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 6 ^ A1 ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 7 ^ A1 ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 8 ^ A1 ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 9 ^ A1 ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 10 ^ A1 ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
alpha[1] = v512_32( 11 ^ A1 ); \
|
||||
ROUND_16X32( alpha ); \
|
||||
}
|
||||
|
||||
#define T_16X32 \
|
||||
/* order is important */ \
|
||||
@@ -841,7 +934,7 @@ void hamsi_16x32_big_final( hamsi_16x32_big_context *sc, __m512i *buf )
|
||||
WRITE_STATE_16X32( sc );
|
||||
}
|
||||
|
||||
void hamsi512_16x32_init( hamsi_16x32_big_context *sc )
|
||||
void hamsi512_16x32_init( hamsi512_16x32_context *sc )
|
||||
{
|
||||
sc->partial_len = 0;
|
||||
sc->count_high = sc->count_low = 0;
|
||||
@@ -863,7 +956,7 @@ void hamsi512_16x32_init( hamsi_16x32_big_context *sc )
|
||||
sc->h[15] = v512_32( HAMSI_IV512[15] );
|
||||
}
|
||||
|
||||
void hamsi512_16x32_update( hamsi_16x32_big_context *sc, const void *data,
|
||||
void hamsi512_16x32_update( hamsi512_16x32_context *sc, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
@@ -875,7 +968,7 @@ void hamsi512_16x32_update( hamsi_16x32_big_context *sc, const void *data,
|
||||
sc->partial_len = len;
|
||||
}
|
||||
|
||||
void hamsi512_16x32_close( hamsi_16x32_big_context *sc, void *dst )
|
||||
void hamsi512_16x32_close( hamsi512_16x32_context *sc, void *dst )
|
||||
{
|
||||
__m512i pad[2];
|
||||
uint32_t ch, cl;
|
||||
@@ -893,7 +986,7 @@ void hamsi512_16x32_close( hamsi_16x32_big_context *sc, void *dst )
|
||||
mm512_block_bswap_32( (__m512i*)dst + 8, sc->h + 8 );
|
||||
}
|
||||
|
||||
void hamsi512_16x32_full( hamsi_16x32_big_context *sc, void *dst,
|
||||
void hamsi512_16x32_full( hamsi512_16x32_context *sc, void *dst,
|
||||
const void *data, size_t len )
|
||||
{
|
||||
// init
|
||||
@@ -996,16 +1089,6 @@ void hamsi512_8way_init( hamsi_8way_big_context *sc )
|
||||
sc->h[5] = v512_64( iv[5] );
|
||||
sc->h[6] = v512_64( iv[6] );
|
||||
sc->h[7] = v512_64( iv[7] );
|
||||
/*
|
||||
sc->h[0] = v512_64( 0x6c70617273746565 );
|
||||
sc->h[1] = v512_64( 0x656e62656b204172 );
|
||||
sc->h[2] = v512_64( 0x302c206272672031 );
|
||||
sc->h[3] = v512_64( 0x3434362c75732032 );
|
||||
sc->h[4] = v512_64( 0x3030312020422d33 );
|
||||
sc->h[5] = v512_64( 0x656e2d484c657576 );
|
||||
sc->h[6] = v512_64( 0x6c65652c65766572 );
|
||||
sc->h[7] = v512_64( 0x6769756d2042656c );
|
||||
*/
|
||||
}
|
||||
|
||||
void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
|
||||
@@ -1047,22 +1130,34 @@ do { \
|
||||
const __m256i zero = m256_zero; \
|
||||
const uint64_t *tp = (const uint64_t*)T512; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
|
||||
for ( int u = 0; u < 64; u++ ) \
|
||||
for ( int i = 0; i < 64*8; i+=8, db = _mm256_ror_epi64( db, 1 ) ) \
|
||||
{ \
|
||||
const __mmask8 dm = _mm256_cmplt_epi64_mask( db, zero ); \
|
||||
m0 = _mm256_mask_xor_epi64( m0, dm, m0, v256_64( tp[0] ) ); \
|
||||
m1 = _mm256_mask_xor_epi64( m1, dm, m1, v256_64( tp[1] ) ); \
|
||||
m2 = _mm256_mask_xor_epi64( m2, dm, m2, v256_64( tp[2] ) ); \
|
||||
m3 = _mm256_mask_xor_epi64( m3, dm, m3, v256_64( tp[3] ) ); \
|
||||
m4 = _mm256_mask_xor_epi64( m4, dm, m4, v256_64( tp[4] ) ); \
|
||||
m5 = _mm256_mask_xor_epi64( m5, dm, m5, v256_64( tp[5] ) ); \
|
||||
m6 = _mm256_mask_xor_epi64( m6, dm, m6, v256_64( tp[6] ) ); \
|
||||
m7 = _mm256_mask_xor_epi64( m7, dm, m7, v256_64( tp[7] ) ); \
|
||||
db = _mm256_ror_epi64( db, 1 ); \
|
||||
tp += 8; \
|
||||
m0 = _mm256_mask_xor_epi64( m0, dm, m0, v256_64( tp[i+0] ) ); \
|
||||
m1 = _mm256_mask_xor_epi64( m1, dm, m1, v256_64( tp[i+1] ) ); \
|
||||
m2 = _mm256_mask_xor_epi64( m2, dm, m2, v256_64( tp[i+2] ) ); \
|
||||
m3 = _mm256_mask_xor_epi64( m3, dm, m3, v256_64( tp[i+3] ) ); \
|
||||
m4 = _mm256_mask_xor_epi64( m4, dm, m4, v256_64( tp[i+4] ) ); \
|
||||
m5 = _mm256_mask_xor_epi64( m5, dm, m5, v256_64( tp[i+5] ) ); \
|
||||
m6 = _mm256_mask_xor_epi64( m6, dm, m6, v256_64( tp[i+6] ) ); \
|
||||
m7 = _mm256_mask_xor_epi64( m7, dm, m7, v256_64( tp[i+7] ) ); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// v3 ternary logic, 8 instructions, 2 local vars
// In-place 4-input S-box on the __m256i values a, b, c, d (all four are
// both read and overwritten). Statement order matters: td and tb capture
// intermediate values that are still needed after a and c are overwritten.
// The two ternary-logic immediates fold a final NOT into the operation:
//   0x87 -> ~( A ^ ( B & C ) )      0x69 -> ~( A ^ B ^ C )
// NOTE(review): expands to a bare brace block rather than do { } while (0),
// so "SBOX(...);" is a block followed by an empty statement.
|
||||
#define SBOX( a, b, c, d ) \
|
||||
{ \
|
||||
__m256i tb, td; \
|
||||
td = mm256_xorand( d, a, c ); \
|
||||
tb = mm256_xoror( b, d, a ); \
|
||||
c = mm256_xor3( c, td, b ); \
|
||||
a = _mm256_xor_si256( a, c ); \
|
||||
b = mm256_xoror( td, tb, a ); \
|
||||
d = _mm256_ternarylogic_epi64( a, td, tb, 0x87 );/* mm256_not( mm256_xorand( a, td, tb ) ); */ \
|
||||
a = c; \
|
||||
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /*mm256_not( mm256_xor3( tb, b, d ) );*/ \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define INPUT_BIG \
|
||||
@@ -1071,46 +1166,54 @@ do { \
|
||||
const __m256i zero = m256_zero; \
|
||||
const uint64_t *tp = (const uint64_t*)T512; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
|
||||
for ( int u = 63; u >= 0; u-- ) \
|
||||
for ( int i = 63; i >= 0; i-- ) \
|
||||
{ \
|
||||
__m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \
|
||||
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
|
||||
v256_64( tp[0] ) ) ); \
|
||||
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
|
||||
v256_64( tp[1] ) ) ); \
|
||||
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \
|
||||
v256_64( tp[2] ) ) ); \
|
||||
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \
|
||||
v256_64( tp[3] ) ) ); \
|
||||
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \
|
||||
v256_64( tp[4] ) ) ); \
|
||||
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \
|
||||
v256_64( tp[5] ) ) ); \
|
||||
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \
|
||||
v256_64( tp[6] ) ) ); \
|
||||
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
|
||||
v256_64( tp[7] ) ) ); \
|
||||
__m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, i ) ); \
|
||||
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
|
||||
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
|
||||
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
|
||||
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
|
||||
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
|
||||
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
|
||||
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
|
||||
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
|
||||
tp += 8; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
// In-place 4-input S-box on the __m256i values a, b, c, d for builds without
// the ternary-logic path. Statement order matters: td is reused as a scratch
// value (it is recomputed before "a = c"), and d receives the complement of
// that recomputed td as the very last step.
// NOTE(review): expands to a bare brace block rather than do { } while (0),
// so "SBOX(...);" is a block followed by an empty statement.
|
||||
#define SBOX( a, b, c, d ) \
|
||||
{ \
|
||||
__m256i tb, td; \
|
||||
td = mm256_xorand( d, a, c ); \
|
||||
tb = mm256_xoror( b, d, a ); \
|
||||
c = mm256_xor3( c, td, b ); \
|
||||
a = _mm256_xor_si256( a, c ); \
|
||||
b = mm256_xoror( td, tb, a ); \
|
||||
td = mm256_xorand( a, td, tb ); \
|
||||
a = c; \
|
||||
c = mm256_xor3( tb, b, td ); \
|
||||
d = mm256_not( td ); \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
/ v2, 16 instructions, 10 TL equivalent instructions
|
||||
#define SBOX( a, b, c, d ) \
|
||||
do { \
|
||||
__m256i t; \
|
||||
t = a; \
|
||||
a = mm256_xorand( d, a, c ); \
|
||||
c = mm256_xor3( a, b, c ); \
|
||||
b = mm256_xoror( b, d, t ); \
|
||||
t = _mm256_xor_si256( t, c ); \
|
||||
d = mm256_xoror( a, b, t ); \
|
||||
t = mm256_xorand( t, a, b ); \
|
||||
{ \
|
||||
__m256i t = mm256_xorand( d, a, c ); \
|
||||
c = mm256_xor3( t, b, c ); \
|
||||
b = mm256_xoror( b, d, a); \
|
||||
a = _mm256_xor_si256( a, c ); \
|
||||
d = mm256_xoror( t, b, a ); \
|
||||
t = mm256_xorand( a, t, b ); \
|
||||
a = c; \
|
||||
c = mm256_xor3( b, d, t ); \
|
||||
b = d; \
|
||||
d = mm256_not( t ); \
|
||||
} while (0)
|
||||
}
|
||||
*/
|
||||
|
||||
#define L( a, b, c, d ) \
|
||||
do { \
|
||||
@@ -1127,6 +1230,7 @@ do { \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
// original, 18 instructions
|
||||
#define SBOX( a, b, c, d ) \
|
||||
do { \
|
||||
__m256i t; \
|
||||
@@ -1405,31 +1509,51 @@ do { /* order is important */ \
|
||||
const uint32_t *tp = (const uint32_t*)T512; \
|
||||
M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = \
|
||||
M8 = M9 = MA = MB = MC = MD = ME = MF = zero; \
|
||||
for ( int v = 0; v < 2; v++ ) \
|
||||
__m256i db = _mm256_ror_epi32( buf[0], 1 ); \
|
||||
for ( int u = 0; u < 32; u++ ) \
|
||||
{ \
|
||||
__m256i db = _mm256_ror_epi32( buf[v], 1 ); \
|
||||
for ( int u = 0; u < 32; u++ ) \
|
||||
{ \
|
||||
__mmask8 dm = _mm256_cmplt_epi32_mask( db, zero ); \
|
||||
M0 = _mm256_mask_xor_epi32( M0, dm, M0, v256_32( tp[ 0] ) ); \
|
||||
M1 = _mm256_mask_xor_epi32( M1, dm, M1, v256_32( tp[ 1] ) ); \
|
||||
M2 = _mm256_mask_xor_epi32( M2, dm, M2, v256_32( tp[ 2] ) ); \
|
||||
M3 = _mm256_mask_xor_epi32( M3, dm, M3, v256_32( tp[ 3] ) ); \
|
||||
M4 = _mm256_mask_xor_epi32( M4, dm, M4, v256_32( tp[ 4] ) ); \
|
||||
M5 = _mm256_mask_xor_epi32( M5, dm, M5, v256_32( tp[ 5] ) ); \
|
||||
M6 = _mm256_mask_xor_epi32( M6, dm, M6, v256_32( tp[ 6] ) ); \
|
||||
M7 = _mm256_mask_xor_epi32( M7, dm, M7, v256_32( tp[ 7] ) ); \
|
||||
M8 = _mm256_mask_xor_epi32( M8, dm, M8, v256_32( tp[ 8] ) ); \
|
||||
M9 = _mm256_mask_xor_epi32( M9, dm, M9, v256_32( tp[ 9] ) ); \
|
||||
MA = _mm256_mask_xor_epi32( MA, dm, MA, v256_32( tp[10] ) ); \
|
||||
MB = _mm256_mask_xor_epi32( MB, dm, MB, v256_32( tp[11] ) ); \
|
||||
MC = _mm256_mask_xor_epi32( MC, dm, MC, v256_32( tp[12] ) ); \
|
||||
MD = _mm256_mask_xor_epi32( MD, dm, MD, v256_32( tp[13] ) ); \
|
||||
ME = _mm256_mask_xor_epi32( ME, dm, ME, v256_32( tp[14] ) ); \
|
||||
MF = _mm256_mask_xor_epi32( MF, dm, MF, v256_32( tp[15] ) ); \
|
||||
db = _mm256_ror_epi32( db, 1 ); \
|
||||
tp += 16; \
|
||||
} \
|
||||
__mmask8 dm = _mm256_cmplt_epi32_mask( db, zero ); \
|
||||
M0 = _mm256_mask_xor_epi32( M0, dm, M0, v256_32( tp[ 0] ) ); \
|
||||
M1 = _mm256_mask_xor_epi32( M1, dm, M1, v256_32( tp[ 1] ) ); \
|
||||
M2 = _mm256_mask_xor_epi32( M2, dm, M2, v256_32( tp[ 2] ) ); \
|
||||
M3 = _mm256_mask_xor_epi32( M3, dm, M3, v256_32( tp[ 3] ) ); \
|
||||
M4 = _mm256_mask_xor_epi32( M4, dm, M4, v256_32( tp[ 4] ) ); \
|
||||
M5 = _mm256_mask_xor_epi32( M5, dm, M5, v256_32( tp[ 5] ) ); \
|
||||
M6 = _mm256_mask_xor_epi32( M6, dm, M6, v256_32( tp[ 6] ) ); \
|
||||
M7 = _mm256_mask_xor_epi32( M7, dm, M7, v256_32( tp[ 7] ) ); \
|
||||
M8 = _mm256_mask_xor_epi32( M8, dm, M8, v256_32( tp[ 8] ) ); \
|
||||
M9 = _mm256_mask_xor_epi32( M9, dm, M9, v256_32( tp[ 9] ) ); \
|
||||
MA = _mm256_mask_xor_epi32( MA, dm, MA, v256_32( tp[10] ) ); \
|
||||
MB = _mm256_mask_xor_epi32( MB, dm, MB, v256_32( tp[11] ) ); \
|
||||
MC = _mm256_mask_xor_epi32( MC, dm, MC, v256_32( tp[12] ) ); \
|
||||
MD = _mm256_mask_xor_epi32( MD, dm, MD, v256_32( tp[13] ) ); \
|
||||
ME = _mm256_mask_xor_epi32( ME, dm, ME, v256_32( tp[14] ) ); \
|
||||
MF = _mm256_mask_xor_epi32( MF, dm, MF, v256_32( tp[15] ) ); \
|
||||
db = _mm256_ror_epi32( db, 1 ); \
|
||||
tp += 16; \
|
||||
} \
|
||||
db = _mm256_ror_epi32( buf[1], 1 ); \
|
||||
for ( int u = 0; u < 32; u++ ) \
|
||||
{ \
|
||||
__mmask8 dm = _mm256_cmplt_epi32_mask( db, zero ); \
|
||||
M0 = _mm256_mask_xor_epi32( M0, dm, M0, v256_32( tp[ 0] ) ); \
|
||||
M1 = _mm256_mask_xor_epi32( M1, dm, M1, v256_32( tp[ 1] ) ); \
|
||||
M2 = _mm256_mask_xor_epi32( M2, dm, M2, v256_32( tp[ 2] ) ); \
|
||||
M3 = _mm256_mask_xor_epi32( M3, dm, M3, v256_32( tp[ 3] ) ); \
|
||||
M4 = _mm256_mask_xor_epi32( M4, dm, M4, v256_32( tp[ 4] ) ); \
|
||||
M5 = _mm256_mask_xor_epi32( M5, dm, M5, v256_32( tp[ 5] ) ); \
|
||||
M6 = _mm256_mask_xor_epi32( M6, dm, M6, v256_32( tp[ 6] ) ); \
|
||||
M7 = _mm256_mask_xor_epi32( M7, dm, M7, v256_32( tp[ 7] ) ); \
|
||||
M8 = _mm256_mask_xor_epi32( M8, dm, M8, v256_32( tp[ 8] ) ); \
|
||||
M9 = _mm256_mask_xor_epi32( M9, dm, M9, v256_32( tp[ 9] ) ); \
|
||||
MA = _mm256_mask_xor_epi32( MA, dm, MA, v256_32( tp[10] ) ); \
|
||||
MB = _mm256_mask_xor_epi32( MB, dm, MB, v256_32( tp[11] ) ); \
|
||||
MC = _mm256_mask_xor_epi32( MC, dm, MC, v256_32( tp[12] ) ); \
|
||||
MD = _mm256_mask_xor_epi32( MD, dm, MD, v256_32( tp[13] ) ); \
|
||||
ME = _mm256_mask_xor_epi32( ME, dm, ME, v256_32( tp[14] ) ); \
|
||||
MF = _mm256_mask_xor_epi32( MF, dm, MF, v256_32( tp[15] ) ); \
|
||||
db = _mm256_ror_epi32( db, 1 ); \
|
||||
tp += 16; \
|
||||
} \
|
||||
}
|
||||
|
||||
@@ -1441,30 +1565,49 @@ do { /* order is important */ \
|
||||
const uint32_t *tp = (const uint32_t*)T512; \
|
||||
M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = \
|
||||
M8 = M9 = MA = MB = MC = MD = ME = MF = zero; \
|
||||
for ( int v = 0; v < 2; v++ ) \
|
||||
__m256i db = buf[0]; \
|
||||
for ( int u = 31; u >= 0; u-- ) \
|
||||
{ \
|
||||
__m256i db = buf[v]; \
|
||||
for ( int u = 31; u >= 0; u-- ) \
|
||||
{ \
|
||||
__m256i dm = _mm256_cmpgt_epi32( zero, _mm256_slli_epi32( db, u ) ); \
|
||||
M0 = _mm256_xor_si256( M0, _mm256_and_si256( dm, v256_32( tp[ 0] ) ) ); \
|
||||
M1 = _mm256_xor_si256( M1, _mm256_and_si256( dm, v256_32( tp[ 1] ) ) ); \
|
||||
M2 = _mm256_xor_si256( M2, _mm256_and_si256( dm, v256_32( tp[ 2] ) ) ); \
|
||||
M3 = _mm256_xor_si256( M3, _mm256_and_si256( dm, v256_32( tp[ 3] ) ) ); \
|
||||
M4 = _mm256_xor_si256( M4, _mm256_and_si256( dm, v256_32( tp[ 4] ) ) ); \
|
||||
M5 = _mm256_xor_si256( M5, _mm256_and_si256( dm, v256_32( tp[ 5] ) ) ); \
|
||||
M6 = _mm256_xor_si256( M6, _mm256_and_si256( dm, v256_32( tp[ 6] ) ) ); \
|
||||
M7 = _mm256_xor_si256( M7, _mm256_and_si256( dm, v256_32( tp[ 7] ) ) ); \
|
||||
M8 = _mm256_xor_si256( M8, _mm256_and_si256( dm, v256_32( tp[ 8] ) ) ); \
|
||||
M9 = _mm256_xor_si256( M9, _mm256_and_si256( dm, v256_32( tp[ 9] ) ) ); \
|
||||
MA = _mm256_xor_si256( MA, _mm256_and_si256( dm, v256_32( tp[10] ) ) ); \
|
||||
MB = _mm256_xor_si256( MB, _mm256_and_si256( dm, v256_32( tp[11] ) ) ); \
|
||||
MC = _mm256_xor_si256( MC, _mm256_and_si256( dm, v256_32( tp[12] ) ) ); \
|
||||
MD = _mm256_xor_si256( MD, _mm256_and_si256( dm, v256_32( tp[13] ) ) ); \
|
||||
ME = _mm256_xor_si256( ME, _mm256_and_si256( dm, v256_32( tp[14] ) ) ); \
|
||||
MF = _mm256_xor_si256( MF, _mm256_and_si256( dm, v256_32( tp[15] ) ) ); \
|
||||
tp += 16; \
|
||||
} \
|
||||
__m256i dm = _mm256_cmpgt_epi32( zero, _mm256_slli_epi32( db, u ) ); \
|
||||
M0 = _mm256_xor_si256( M0, _mm256_and_si256( dm, v256_32( tp[ 0] ) ) ); \
|
||||
M1 = _mm256_xor_si256( M1, _mm256_and_si256( dm, v256_32( tp[ 1] ) ) ); \
|
||||
M2 = _mm256_xor_si256( M2, _mm256_and_si256( dm, v256_32( tp[ 2] ) ) ); \
|
||||
M3 = _mm256_xor_si256( M3, _mm256_and_si256( dm, v256_32( tp[ 3] ) ) ); \
|
||||
M4 = _mm256_xor_si256( M4, _mm256_and_si256( dm, v256_32( tp[ 4] ) ) ); \
|
||||
M5 = _mm256_xor_si256( M5, _mm256_and_si256( dm, v256_32( tp[ 5] ) ) ); \
|
||||
M6 = _mm256_xor_si256( M6, _mm256_and_si256( dm, v256_32( tp[ 6] ) ) ); \
|
||||
M7 = _mm256_xor_si256( M7, _mm256_and_si256( dm, v256_32( tp[ 7] ) ) ); \
|
||||
M8 = _mm256_xor_si256( M8, _mm256_and_si256( dm, v256_32( tp[ 8] ) ) ); \
|
||||
M9 = _mm256_xor_si256( M9, _mm256_and_si256( dm, v256_32( tp[ 9] ) ) ); \
|
||||
MA = _mm256_xor_si256( MA, _mm256_and_si256( dm, v256_32( tp[10] ) ) ); \
|
||||
MB = _mm256_xor_si256( MB, _mm256_and_si256( dm, v256_32( tp[11] ) ) ); \
|
||||
MC = _mm256_xor_si256( MC, _mm256_and_si256( dm, v256_32( tp[12] ) ) ); \
|
||||
MD = _mm256_xor_si256( MD, _mm256_and_si256( dm, v256_32( tp[13] ) ) ); \
|
||||
ME = _mm256_xor_si256( ME, _mm256_and_si256( dm, v256_32( tp[14] ) ) ); \
|
||||
MF = _mm256_xor_si256( MF, _mm256_and_si256( dm, v256_32( tp[15] ) ) ); \
|
||||
tp += 16; \
|
||||
} \
|
||||
db = buf[1]; \
|
||||
for ( int u = 31; u >= 0; u-- ) \
|
||||
{ \
|
||||
__m256i dm = _mm256_cmpgt_epi32( zero, _mm256_slli_epi32( db, u ) ); \
|
||||
M0 = _mm256_xor_si256( M0, _mm256_and_si256( dm, v256_32( tp[ 0] ) ) ); \
|
||||
M1 = _mm256_xor_si256( M1, _mm256_and_si256( dm, v256_32( tp[ 1] ) ) ); \
|
||||
M2 = _mm256_xor_si256( M2, _mm256_and_si256( dm, v256_32( tp[ 2] ) ) ); \
|
||||
M3 = _mm256_xor_si256( M3, _mm256_and_si256( dm, v256_32( tp[ 3] ) ) ); \
|
||||
M4 = _mm256_xor_si256( M4, _mm256_and_si256( dm, v256_32( tp[ 4] ) ) ); \
|
||||
M5 = _mm256_xor_si256( M5, _mm256_and_si256( dm, v256_32( tp[ 5] ) ) ); \
|
||||
M6 = _mm256_xor_si256( M6, _mm256_and_si256( dm, v256_32( tp[ 6] ) ) ); \
|
||||
M7 = _mm256_xor_si256( M7, _mm256_and_si256( dm, v256_32( tp[ 7] ) ) ); \
|
||||
M8 = _mm256_xor_si256( M8, _mm256_and_si256( dm, v256_32( tp[ 8] ) ) ); \
|
||||
M9 = _mm256_xor_si256( M9, _mm256_and_si256( dm, v256_32( tp[ 9] ) ) ); \
|
||||
MA = _mm256_xor_si256( MA, _mm256_and_si256( dm, v256_32( tp[10] ) ) ); \
|
||||
MB = _mm256_xor_si256( MB, _mm256_and_si256( dm, v256_32( tp[11] ) ) ); \
|
||||
MC = _mm256_xor_si256( MC, _mm256_and_si256( dm, v256_32( tp[12] ) ) ); \
|
||||
MD = _mm256_xor_si256( MD, _mm256_and_si256( dm, v256_32( tp[13] ) ) ); \
|
||||
ME = _mm256_xor_si256( ME, _mm256_and_si256( dm, v256_32( tp[14] ) ) ); \
|
||||
MF = _mm256_xor_si256( MF, _mm256_and_si256( dm, v256_32( tp[15] ) ) ); \
|
||||
tp += 16; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
115
algo/haval/haval-16way-helper.c
Normal file
115
algo/haval/haval-16way-helper.c
Normal file
@@ -0,0 +1,115 @@
|
||||
/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
|
||||
/*
|
||||
* Helper code, included (three times !) by HAVAL implementation.
|
||||
*
|
||||
* TODO: try to merge this with md_helper.c.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#undef SPH_XCAT
|
||||
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
|
||||
#undef SPH_XCAT_
|
||||
#define SPH_XCAT_(a, b) a ## b
|
||||
|
||||
static void
|
||||
SPH_XCAT(SPH_XCAT(haval, PASSES), _16way_update)
|
||||
( haval_16way_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
unsigned current;
|
||||
|
||||
current = (unsigned)sc->count_low & 127U;
|
||||
while ( len > 0 )
|
||||
{
|
||||
unsigned clen;
|
||||
uint32_t clow, clow2;
|
||||
|
||||
clen = 128U - current;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_512( sc->buf + (current>>2), vdata, clen>>2 );
|
||||
vdata += clen>>2;
|
||||
current += clen;
|
||||
len -= clen;
|
||||
if ( current == 128U )
|
||||
{
|
||||
DSTATE_16W;
|
||||
IN_PREPARE_16W(sc->buf);
|
||||
RSTATE_16W;
|
||||
SPH_XCAT(CORE_16W, PASSES)(INW_16W);
|
||||
WSTATE_16W;
|
||||
current = 0;
|
||||
}
|
||||
clow = sc->count_low;
|
||||
clow2 = clow + clen;
|
||||
sc->count_low = clow2;
|
||||
if ( clow2 < clow )
|
||||
sc->count_high ++;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
SPH_XCAT(SPH_XCAT(haval, PASSES), _16way_close)( haval_16way_context *sc,
|
||||
void *dst)
|
||||
{
|
||||
unsigned current;
|
||||
DSTATE_16W;
|
||||
|
||||
current = (unsigned)sc->count_low & 127UL;
|
||||
|
||||
sc->buf[ current>>2 ] = v512_32( 1 );
|
||||
current += 4;
|
||||
RSTATE_16W;
|
||||
if ( current > 116UL )
|
||||
{
|
||||
memset_zero_512( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
|
||||
do
|
||||
{
|
||||
IN_PREPARE_16W(sc->buf);
|
||||
SPH_XCAT(CORE_16W, PASSES)(INW_16W);
|
||||
} while (0);
|
||||
current = 0;
|
||||
}
|
||||
|
||||
uint32_t t1, t2;
|
||||
memset_zero_512( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
|
||||
t1 = 0x01 | (PASSES << 3);
|
||||
t2 = sc->olen << 3;
|
||||
sc->buf[ 116>>2 ] = v512_32( ( t1 << 16 ) | ( t2 << 24 ) );
|
||||
sc->buf[ 120>>2 ] = v512_32( sc->count_low << 3 );
|
||||
sc->buf[ 124>>2 ] = v512_32( (sc->count_high << 3)
|
||||
| (sc->count_low >> 29) );
|
||||
do
|
||||
{
|
||||
IN_PREPARE_16W(sc->buf);
|
||||
SPH_XCAT(CORE_16W, PASSES)(INW_16W);
|
||||
} while (0);
|
||||
WSTATE_16W;
|
||||
haval_16way_out( sc, dst );
|
||||
}
|
@@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
|
||||
|
||||
current = (unsigned)sc->count_low & 127UL;
|
||||
|
||||
sc->buf[ current>>2 ] = m128_one_32;
|
||||
sc->buf[ current>>2 ] = v128_32( 1 );
|
||||
current += 4;
|
||||
RSTATE;
|
||||
if ( current > 116UL )
|
||||
|
@@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
|
||||
|
||||
current = (unsigned)sc->count_low & 127UL;
|
||||
|
||||
sc->buf[ current>>2 ] = m256_one_32;
|
||||
sc->buf[ current>>2 ] = v256_32( 1 );
|
||||
current += 4;
|
||||
RSTATE_8W;
|
||||
if ( current > 116UL )
|
||||
@@ -101,9 +101,9 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
|
||||
memset_zero_256( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
|
||||
t1 = 0x01 | (PASSES << 3);
|
||||
t2 = sc->olen << 3;
|
||||
sc->buf[ 116>>2 ] = _mm256_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
|
||||
sc->buf[ 120>>2 ] = _mm256_set1_epi32( sc->count_low << 3 );
|
||||
sc->buf[ 124>>2 ] = _mm256_set1_epi32( (sc->count_high << 3)
|
||||
sc->buf[ 116>>2 ] = v256_32( ( t1 << 16 ) | ( t2 << 24 ) );
|
||||
sc->buf[ 120>>2 ] = v256_32( sc->count_low << 3 );
|
||||
sc->buf[ 124>>2 ] = v256_32( (sc->count_high << 3)
|
||||
| (sc->count_low >> 29) );
|
||||
do
|
||||
{
|
||||
|
@@ -189,7 +189,7 @@ do { \
|
||||
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
|
||||
mm128_ror_32( x7, 11 ) ), \
|
||||
_mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
|
||||
_mm_add_epi32( w, v128_32( c ) ) ); \
|
||||
} while (0)
|
||||
|
||||
#define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
|
||||
@@ -471,14 +471,14 @@ do { \
|
||||
static void
|
||||
haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes )
|
||||
{
|
||||
sc->s0 = _mm_set1_epi32( 0x243F6A88UL );
|
||||
sc->s1 = _mm_set1_epi32( 0x85A308D3UL );
|
||||
sc->s2 = _mm_set1_epi32( 0x13198A2EUL );
|
||||
sc->s3 = _mm_set1_epi32( 0x03707344UL );
|
||||
sc->s4 = _mm_set1_epi32( 0xA4093822UL );
|
||||
sc->s5 = _mm_set1_epi32( 0x299F31D0UL );
|
||||
sc->s6 = _mm_set1_epi32( 0x082EFA98UL );
|
||||
sc->s7 = _mm_set1_epi32( 0xEC4E6C89UL );
|
||||
sc->s0 = v128_32( 0x243F6A88UL );
|
||||
sc->s1 = v128_32( 0x85A308D3UL );
|
||||
sc->s2 = v128_32( 0x13198A2EUL );
|
||||
sc->s3 = v128_32( 0x03707344UL );
|
||||
sc->s4 = v128_32( 0xA4093822UL );
|
||||
sc->s5 = v128_32( 0x299F31D0UL );
|
||||
sc->s6 = v128_32( 0x082EFA98UL );
|
||||
sc->s7 = v128_32( 0xEC4E6C89UL );
|
||||
sc->olen = olen;
|
||||
sc->passes = passes;
|
||||
sc->count_high = 0;
|
||||
@@ -662,7 +662,7 @@ do { \
|
||||
__m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
|
||||
mm256_ror_32( x7, 11 ) ), \
|
||||
_mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
|
||||
_mm256_add_epi32( w, v256_32( c ) ) ); \
|
||||
} while (0)
|
||||
|
||||
#define STEP1_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
|
||||
@@ -793,14 +793,14 @@ do { \
|
||||
static void
|
||||
haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
|
||||
{
|
||||
sc->s0 = _mm256_set1_epi32( 0x243F6A88UL );
|
||||
sc->s1 = _mm256_set1_epi32( 0x85A308D3UL );
|
||||
sc->s2 = _mm256_set1_epi32( 0x13198A2EUL );
|
||||
sc->s3 = _mm256_set1_epi32( 0x03707344UL );
|
||||
sc->s4 = _mm256_set1_epi32( 0xA4093822UL );
|
||||
sc->s5 = _mm256_set1_epi32( 0x299F31D0UL );
|
||||
sc->s6 = _mm256_set1_epi32( 0x082EFA98UL );
|
||||
sc->s7 = _mm256_set1_epi32( 0xEC4E6C89UL );
|
||||
sc->s0 = v256_32( 0x243F6A88UL );
|
||||
sc->s1 = v256_32( 0x85A308D3UL );
|
||||
sc->s2 = v256_32( 0x13198A2EUL );
|
||||
sc->s3 = v256_32( 0x03707344UL );
|
||||
sc->s4 = v256_32( 0xA4093822UL );
|
||||
sc->s5 = v256_32( 0x299F31D0UL );
|
||||
sc->s6 = v256_32( 0x082EFA98UL );
|
||||
sc->s7 = v256_32( 0xEC4E6C89UL );
|
||||
sc->olen = olen;
|
||||
sc->passes = passes;
|
||||
sc->count_high = 0;
|
||||
@@ -879,10 +879,300 @@ do { \
|
||||
|
||||
#define INMSG_8W(i) msg[i]
|
||||
|
||||
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// ( ~( a ^ b ) ) & c
// Done in one ternary-logic operation: immediate 0x82 has result bits set
// only for the input combinations (a,b,c) = (0,0,1) and (1,1,1), i.e. the
// result bit is c wherever a and b agree, and 0 elsewhere.
|
||||
#define mm512_andnotxor( a, b, c ) \
|
||||
_mm512_ternarylogic_epi32( a, b, c, 0x82 )
|
||||
|
||||
// Boolean function used by pass 1 (reached through the FPn_1_16W wrappers).
// Assuming mm512_xor3( a, b, c ) is a ^ b ^ c and mm512_andxor( a, b, c ) is
// a & ( b ^ c ) (both defined elsewhere -- confirm), this evaluates
//   x0 ^ ( x1 & ( x0 ^ x4 ) ) ^ ( x2 & x5 ) ^ ( x3 & x6 )
// The definition ends on its last token: no trailing line continuation, so
// the macro does not depend on a blank line following it.
#define F1_16W(x6, x5, x4, x3, x2, x1, x0) \
   mm512_xor3( x0, mm512_andxor( x1, x0, x4 ), \
               _mm512_xor_si512( _mm512_and_si512( x2, x5 ), \
                                 _mm512_and_si512( x3, x6 ) ) )
|
||||
|
||||
// Boolean function used by pass 2 (reached through the FPn_2_16W wrappers).
// Assuming mm512_xor3( a, b, c ) is a ^ b ^ c, mm512_andxor( a, b, c ) is
// a & ( b ^ c ) and mm512_xorand( a, b, c ) is a ^ ( b & c ) (all defined
// elsewhere -- confirm), this evaluates
//   ( x2 & ( ( ~x3 & x1 ) ^ ( x4 & x5 ) ^ x6 ^ x0 ) )
//     ^ ( x4 & ( x1 ^ x5 ) ) ^ x0 ^ ( x3 & x5 )
// The definition ends on its last token: no trailing line continuation, so
// the macro does not depend on a blank line following it.
#define F2_16W(x6, x5, x4, x3, x2, x1, x0) \
   mm512_xor3( mm512_andxor( x2, _mm512_andnot_si512( x3, x1 ), \
                     mm512_xor3( _mm512_and_si512( x4, x5 ), x6, x0 ) ), \
               mm512_andxor( x4, x1, x5 ), \
               mm512_xorand( x0, x3, x5 ) )
|
||||
|
||||
// Boolean function used by pass 3 (reached through the FPn_3_16W wrappers).
// Assuming mm512_xor3( a, b, c ) is a ^ b ^ c (defined elsewhere -- confirm),
// this evaluates
//   x0 ^ ( x3 & ( ( x1 & x2 ) ^ x6 ^ x0 ) ) ^ ( x1 & x4 ) ^ ( x2 & x5 )
#define F3_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
mm512_xor3( x0, \
|
||||
_mm512_and_si512( x3, \
|
||||
mm512_xor3( _mm512_and_si512( x1, x2 ), x6, x0 ) ), \
|
||||
_mm512_xor_si512( _mm512_and_si512( x1, x4 ), \
|
||||
_mm512_and_si512( x2, x5 ) ) )
|
||||
|
||||
// Boolean function used by pass 4 (reached through the FPn_4_16W wrappers).
// XOR of three terms built from the helpers mm512_andxor, mm512_xor3 and
// mm512_xorand (defined elsewhere). Assuming andxor( a, b, c ) is
// a & ( b ^ c ) and xorand( a, b, c ) is a ^ ( b & c ) -- confirm -- the
// terms are:
//   x3 & ( x5 ^ ( x1 & x2 ) ^ ( x4 | x6 ) )
//   x4 & ( x0 ^ ( ~x2 & x5 ) ^ x1 ^ x6 )
//   x0 ^ ( x2 & x6 )
#define F4_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
mm512_xor3( \
|
||||
mm512_andxor( x3, x5, \
|
||||
_mm512_xor_si512( _mm512_and_si512( x1, x2 ), \
|
||||
_mm512_or_si512( x4, x6 ) ) ), \
|
||||
_mm512_and_si512( x4, \
|
||||
mm512_xor3( x0, _mm512_andnot_si512( x2, x5 ), \
|
||||
_mm512_xor_si512( x1, x6 ) ) ), \
|
||||
mm512_xorand( x0, x2, x6 ) )
|
||||
|
||||
// Boolean function used by pass 5 (reached through FP5_5_16W).
// mm512_andnotxor( a, b, c ) is ( ~( a ^ b ) ) & c. Assuming mm512_and3 is a
// 3-way AND and mm512_xor3 a 3-way XOR (defined elsewhere -- confirm), this
// evaluates
//   ( x0 & ~( ( x1 & x2 & x3 ) ^ x5 ) ) ^ ( x1 & x4 ) ^ ( x2 & x5 ) ^ ( x3 & x6 )
#define F5_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm512_xor_si512( \
|
||||
mm512_andnotxor( mm512_and3( x1, x2, x3 ), x5, x0 ), \
|
||||
mm512_xor3( _mm512_and_si512( x1, x4 ), \
|
||||
_mm512_and_si512( x2, x5 ), \
|
||||
_mm512_and_si512( x3, x6 ) ) )
|
||||
|
||||
// FP3_<p>_16W: operand permutation for pass p when the hash runs 3 passes.
// Each wrapper only reorders its seven arguments and forwards them to
// F<p>_16W; STEP_16W / STEP1_16W select the wrapper by token pasting.
#define FP3_1_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F1_16W(x1, x0, x3, x5, x6, x2, x4)
|
||||
#define FP3_2_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F2_16W(x4, x2, x1, x0, x5, x3, x6)
|
||||
#define FP3_3_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F3_16W(x6, x1, x2, x3, x4, x5, x0)
|
||||
|
||||
// FP4_<p>_16W: operand permutation for pass p when the hash runs 4 passes.
// Each wrapper only reorders its seven arguments and forwards them to
// F<p>_16W; STEP_16W / STEP1_16W select the wrapper by token pasting.
#define FP4_1_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F1_16W(x2, x6, x1, x4, x5, x3, x0)
|
||||
#define FP4_2_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F2_16W(x3, x5, x2, x0, x1, x6, x4)
|
||||
#define FP4_3_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F3_16W(x1, x4, x3, x6, x0, x2, x5)
|
||||
#define FP4_4_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F4_16W(x6, x4, x0, x5, x2, x1, x3)
|
||||
|
||||
// FP5_<p>_16W: operand permutation for pass p when the hash runs 5 passes
// (the variant CORE_16W5 uses). Each wrapper only reorders its seven
// arguments and forwards them to F<p>_16W; STEP_16W / STEP1_16W select the
// wrapper by token pasting.
#define FP5_1_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F1_16W(x3, x4, x1, x0, x5, x2, x6)
|
||||
#define FP5_2_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F2_16W(x6, x2, x1, x0, x3, x4, x5)
|
||||
#define FP5_3_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F3_16W(x2, x6, x0, x4, x3, x1, x5)
|
||||
#define FP5_4_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F4_16W(x1, x5, x3, x2, x0, x4, x6)
|
||||
#define FP5_5_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F5_16W(x2, x5, x0, x6, x4, x3, x1)
|
||||
|
||||
// One step with a round constant. n = total passes, p = current pass; the
// two are pasted into the name FP<n>_<p>_16W, which is evaluated on x6..x0.
// x7 is then replaced by the 32-bit lane-wise sum
//   mm512_ror_32( t, 7 ) + mm512_ror_32( x7, 11 ) + w + c
// where w is a message word vector and c a scalar constant broadcast with
// v512_32. x7 is both read and written; the other x arguments are only read.
#define STEP_16W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
|
||||
do { \
|
||||
__m512i t = FP ## n ## _ ## p ## _16W(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm512_add_epi32( _mm512_add_epi32( mm512_ror_32( t, 7 ), \
|
||||
mm512_ror_32( x7, 11 ) ), \
|
||||
_mm512_add_epi32( w, v512_32( c ) ) ); \
|
||||
} while (0)
|
||||
|
||||
// Same as STEP_16W but without a round constant (used by PASS1_16W):
// x7 becomes mm512_ror_32( t, 7 ) + mm512_ror_32( x7, 11 ) + w, where
// t = FP<n>_<p>_16W( x6..x0 ) and additions are 32-bit lane-wise.
#define STEP1_16W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
|
||||
do { \
|
||||
__m512i t = FP ## n ## _ ## p ## _16W(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm512_add_epi32( _mm512_add_epi32( mm512_ror_32( t, 7 ), \
|
||||
mm512_ror_32( x7, 11 ) ), w ); \
|
||||
} while (0)
|
||||
|
||||
// Pass 1: 32 constant-free steps (4 loop iterations x 8 unrolled steps).
// n is the total pass count, in(i) yields message word vector i; words are
// consumed in natural order 0..31. The state variables s0..s7 must be in
// scope at the expansion site. Instead of moving data, each unrolled step
// rotates the roles of s7..s0 by one, so the updated variable cycles
// s7, s6, ..., s0 and the assignment is back in place after 8 steps.
#define PASS1_16W(n, in) do { \
|
||||
unsigned pass_count; \
|
||||
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
|
||||
STEP1_16W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(pass_count + 0) ); \
|
||||
STEP1_16W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(pass_count + 1) ); \
|
||||
STEP1_16W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(pass_count + 2) ); \
|
||||
STEP1_16W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(pass_count + 3) ); \
|
||||
STEP1_16W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(pass_count + 4) ); \
|
||||
STEP1_16W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(pass_count + 5) ); \
|
||||
STEP1_16W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(pass_count + 6) ); \
|
||||
STEP1_16W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(pass_count + 7) ); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Generic pass p (2..5) of an n-pass hash: 32 steps (4 loop iterations x 8
// unrolled steps) using STEP_16W. For step i the message word index is
// looked up in the table MP<p>[i] and the round constant in RK<p>[i]; both
// tables are named by token pasting and defined elsewhere. The state
// variables s0..s7 must be in scope at the expansion site, and the updated
// variable cycles s7, s6, ..., s0 exactly as in PASS1_16W.
#define PASSG_16W(p, n, in) do { \
|
||||
unsigned pass_count; \
|
||||
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
|
||||
STEP_16W(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(MP ## p[pass_count + 0]), \
|
||||
RK ## p[pass_count + 0]); \
|
||||
STEP_16W(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(MP ## p[pass_count + 1]), \
|
||||
RK ## p[pass_count + 1]); \
|
||||
STEP_16W(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(MP ## p[pass_count + 2]), \
|
||||
RK ## p[pass_count + 2]); \
|
||||
STEP_16W(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(MP ## p[pass_count + 3]), \
|
||||
RK ## p[pass_count + 3]); \
|
||||
STEP_16W(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(MP ## p[pass_count + 4]), \
|
||||
RK ## p[pass_count + 4]); \
|
||||
STEP_16W(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(MP ## p[pass_count + 5]), \
|
||||
RK ## p[pass_count + 5]); \
|
||||
STEP_16W(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(MP ## p[pass_count + 6]), \
|
||||
RK ## p[pass_count + 6]); \
|
||||
STEP_16W(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(MP ## p[pass_count + 7]), \
|
||||
RK ## p[pass_count + 7]); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Passes 2 to 5 are all instances of the generic pass PASSG_16W; they differ
// only in the pass number handed to it. n and in are forwarded unchanged.
#define PASS2_16W(n, in) PASSG_16W(2, n, in)
|
||||
#define PASS3_16W(n, in) PASSG_16W(3, n, in)
|
||||
#define PASS4_16W(n, in) PASSG_16W(4, n, in)
|
||||
#define PASS5_16W(n, in) PASSG_16W(5, n, in)
|
||||
|
||||
/*
 * SAVE_STATE_16W: declare u0..u7 in the enclosing scope and copy the working
 * state s0..s7 into them.
 *
 * The declaration deliberately sits outside the do/while so that u0..u7 stay
 * visible to the statements that follow the macro in the same scope.  For the
 * same reason the macro can be expanded at most once per scope.
 */
#define SAVE_STATE_16W \
   __m512i u0, u1, u2, u3, u4, u5, u6, u7; \
   do { \
      u0 = s0;   u1 = s1;   u2 = s2;   u3 = s3; \
      u4 = s4;   u5 = s5;   u6 = s6;   u7 = s7; \
   } while (0)
|
||||
|
||||
/*
 * UPDATE_STATE_16W: add the saved copies u0..u7 into the working state
 * s0..s7, using one packed 32-bit addition per state word.
 * Expects s0..s7 and u0..u7 to be in scope.
 */
#define UPDATE_STATE_16W \
   do { \
      s0 = _mm512_add_epi32( s0, u0 ); \
      s1 = _mm512_add_epi32( s1, u1 ); \
      s2 = _mm512_add_epi32( s2, u2 ); \
      s3 = _mm512_add_epi32( s3, u3 ); \
      s4 = _mm512_add_epi32( s4, u4 ); \
      s5 = _mm512_add_epi32( s5, u5 ); \
      s6 = _mm512_add_epi32( s6, u6 ); \
      s7 = _mm512_add_epi32( s7, u7 ); \
   } while (0)
|
||||
|
||||
/*
 * CORE_16W5( in ): process one block with five passes.
 *
 * Saves the working state, runs passes 1 through 5 (each given the pass count
 * 5 and the message accessor `in`), then adds the saved state back in.
 * Expects s0..s7 to be declared in the enclosing scope.
 */
#define CORE_16W5(in) do { \
   SAVE_STATE_16W; \
   PASS1_16W(5, in); \
   PASS2_16W(5, in); \
   PASS3_16W(5, in); \
   PASS4_16W(5, in); \
   PASS5_16W(5, in); \
   UPDATE_STATE_16W; \
} while (0)
|
||||
|
||||
// Declares the eight local working-state vectors s0..s7 that the other
// *_16W macros read and write. No trailing semicolon: the user supplies it.
#define DSTATE_16W __m512i s0, s1, s2, s3, s4, s5, s6, s7
|
||||
|
||||
/*
 * RSTATE_16W: load the working state s0..s7 from the context.
 * Expects a context pointer named `sc` and the locals s0..s7 to be in scope.
 */
#define RSTATE_16W \
   do { \
      s0 = sc->s0;   s1 = sc->s1;   s2 = sc->s2;   s3 = sc->s3; \
      s4 = sc->s4;   s5 = sc->s5;   s6 = sc->s6;   s7 = sc->s7; \
   } while (0)
|
||||
|
||||
/*
 * WSTATE_16W: store the working state s0..s7 back into the context.
 * Expects a context pointer named `sc` and the locals s0..s7 to be in scope.
 */
#define WSTATE_16W \
   do { \
      sc->s0 = s0;   sc->s1 = s1;   sc->s2 = s2;   sc->s3 = s3; \
      sc->s4 = s4;   sc->s5 = s5;   sc->s6 = s6;   sc->s7 = s7; \
   } while (0)
|
||||
|
||||
static void
|
||||
haval_16way_init( haval_16way_context *sc, unsigned olen, unsigned passes )
|
||||
{
|
||||
sc->s0 = v512_32( 0x243F6A88UL );
|
||||
sc->s1 = v512_32( 0x85A308D3UL );
|
||||
sc->s2 = v512_32( 0x13198A2EUL );
|
||||
sc->s3 = v512_32( 0x03707344UL );
|
||||
sc->s4 = v512_32( 0xA4093822UL );
|
||||
sc->s5 = v512_32( 0x299F31D0UL );
|
||||
sc->s6 = v512_32( 0x082EFA98UL );
|
||||
sc->s7 = v512_32( 0xEC4E6C89UL );
|
||||
sc->olen = olen;
|
||||
sc->passes = passes;
|
||||
sc->count_high = 0;
|
||||
sc->count_low = 0;
|
||||
|
||||
}
|
||||
// IN_PREPARE_16W(indata) declares load_ptr_16w, a const pointer to const
// __m512i initialised from indata; INW_16W(i) then reads the i-th vector
// through that pointer. IN_PREPARE_16W must be expanded in scope first.
#define IN_PREPARE_16W(indata) const __m512i *const load_ptr_16w = (indata)
|
||||
|
||||
#define INW_16W(i) load_ptr_16w[ i ]
|
||||
|
||||
static void
|
||||
haval_16way_out( haval_16way_context *sc, void *dst )
|
||||
{
|
||||
__m512i *buf = (__m512i*)dst;
|
||||
DSTATE_16W;
|
||||
RSTATE_16W;
|
||||
|
||||
buf[0] = s0;
|
||||
buf[1] = s1;
|
||||
buf[2] = s2;
|
||||
buf[3] = s3;
|
||||
buf[4] = s4;
|
||||
buf[5] = s5;
|
||||
buf[6] = s6;
|
||||
buf[7] = s7;
|
||||
}
|
||||
|
||||
#undef PASSES
|
||||
#define PASSES 5
|
||||
#include "haval-16way-helper.c"
|
||||
|
||||
/*
 * API_16W( xxx, y ): generate the three public entry points
 *
 *    haval<xxx>_<y>_16way_init( cc )
 *    haval<xxx>_<y>_16way_update( cc, data, len )
 *    haval<xxx>_<y>_16way_close( cc, dst )
 *
 *   xxx - output size in bits; init passes xxx / 32 (xxx >> 5) as `olen`.
 *   y   - number of passes; passed to init and token-pasted to select the
 *         pass-specific haval<y>_16way_update / haval<y>_16way_close
 *         (presumably provided by the helper included above with PASSES
 *         set to y -- confirm).
 *
 * The generated functions take the context as void * and simply forward.
 *
 * The macro body ends on the final closing brace with no trailing line
 * continuation, so it cannot absorb whatever line happens to follow it.
 */
#define API_16W(xxx, y) \
void \
haval ## xxx ## _ ## y ## _16way_init(void *cc) \
{ \
   haval_16way_init(cc, (xxx) >> 5, y); \
} \
 \
void \
haval ## xxx ## _ ## y ## _16way_update (void *cc, const void *data, size_t len) \
{ \
   haval ## y ## _16way_update(cc, data, len); \
} \
 \
void \
haval ## xxx ## _ ## y ## _16way_close(void *cc, void *dst) \
{ \
   haval ## y ## _16way_close(cc, dst); \
}

API_16W(256, 5)
|
||||
|
||||
/*
 * RVAL_16W: load the working state s0..s7 from val[0..7].
 * Expects an array or pointer named `val` and the locals s0..s7 in scope.
 */
#define RVAL_16W \
   do { \
      s0 = val[0];   s1 = val[1];   s2 = val[2];   s3 = val[3]; \
      s4 = val[4];   s5 = val[5];   s6 = val[6];   s7 = val[7]; \
   } while (0)
|
||||
|
||||
/*
 * WVAL_16W: store the working state s0..s7 into val[0..7].
 * Expects an array or pointer named `val` and the locals s0..s7 in scope.
 */
#define WVAL_16W \
   do { \
      val[0] = s0;   val[1] = s1;   val[2] = s2;   val[3] = s3; \
      val[4] = s4;   val[5] = s5;   val[6] = s6;   val[7] = s7; \
   } while (0)
|
||||
|
||||
// Message accessor that reads element i of an array or pointer named `msg`,
// which must be in scope where the macro is expanded.
#define INMSG_16W(i) msg[i]
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -107,6 +107,25 @@ void haval256_5_8way_close( void *cc, void *dst );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[32];
|
||||
__m512i s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
unsigned olen, passes;
|
||||
uint32_t count_high, count_low;
|
||||
} haval_16way_context __attribute__ ((aligned (64)));
|
||||
|
||||
typedef haval_16way_context haval256_5_16way_context;
|
||||
|
||||
// Public 16-way HAVAL-256/5 interface: init, update, close.
// cc is the context, passed as void *; callers are expected to supply a
// haval256_5_16way_context.
// NOTE(review): data and dst are presumably 16-lane interleaved buffers and
// len a per-lane byte count -- confirm against the implementation.
void haval256_5_16way_init( void *cc );
|
||||
|
||||
void haval256_5_16way_update( void *cc, const void *data, size_t len );
|
||||
|
||||
void haval256_5_16way_close( void *cc, void *dst );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -6,7 +6,7 @@
|
||||
|
||||
#if defined(JHA_4WAY)
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
|
@@ -33,8 +33,7 @@
|
||||
|
||||
/*
 * MULT2( a0, a1 ): in-place update of the 128-bit pair (a0, a1).
 *
 *   b  = a0 ^ shuffle( a1 )   shuffle control 0x10 with zeroing mask 0xb, so
 *                             32-bit elements 0, 1 and 3 of the shuffled
 *                             operand are a1[0] and element 2 is zero
 *   a0 = the 16 bytes starting 4 bytes into the concatenation a1:b (b low)
 *   a1 = the 16 bytes starting 4 bytes into the concatenation b:a1 (a1 low)
 *
 * a0 and a1 must be modifiable __m128i lvalues; both are evaluated more than
 * once.  _mm_maskz_shuffle_epi32 requires AVX512VL.
 *
 * The temporary b is declared exactly once; a second declaration in the same
 * block would be a redefinition error.  The body is a bare brace block (not
 * do/while), so existing call sites with or without a trailing semicolon
 * keep working.
 */
#define MULT2( a0, a1 ) \
{ \
   __m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
   a0 = _mm_alignr_epi8( a1, b, 4 ); \
   a1 = _mm_alignr_epi8( b, a1, 4 ); \
}
|
||||
|
@@ -1,7 +1,7 @@
|
||||
#include "lyra2-gate.h"
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
|
@@ -5,8 +5,7 @@
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "lyra2.h"
|
||||
//#include "algo/blake/sph_blake.h"
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
|
||||
__thread uint64_t* lyra2h_4way_matrix;
|
||||
|
||||
|
@@ -1,6 +1,6 @@
|
||||
#include "lyra2-gate.h"
|
||||
#include <memory.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
|
@@ -1,7 +1,7 @@
|
||||
#include "lyra2-gate.h"
|
||||
#include <memory.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
|
@@ -2,8 +2,7 @@
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "lyra2.h"
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
|
||||
#if defined(LYRA2Z_16WAY)
|
||||
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
|
@@ -98,7 +98,7 @@ do { \
|
||||
( g ## n0 = _mm_xor_si128( a ## n0, _mm_xor_si128( a ## n1, a ## n4 ) ) )
|
||||
|
||||
#define SIGMA_ALL_4W do { \
|
||||
a0 = _mm_xor_si128( g0, m128_one_32 ); \
|
||||
a0 = _mm_xor_si128( g0, v128_32( 1 ) ); \
|
||||
a1 = _mm_xor_si128( g1, INW2( 0 ) ); \
|
||||
a2 = _mm_xor_si128( g2, INW2( 1 ) ); \
|
||||
a3 = _mm_xor_si128( g3, INW2( 2 ) ); \
|
||||
@@ -268,7 +268,7 @@ panama_4way_close( void *cc, void *dst )
|
||||
|
||||
sc = cc;
|
||||
current = sc->data_ptr;
|
||||
*(__m128i*)( sc->data + current ) = m128_one_32;
|
||||
*(__m128i*)( sc->data + current ) = v128_32( 1 );
|
||||
current++;
|
||||
memset_zero_128( (__m128i*)sc->data + current, 32 - current );
|
||||
panama_4way_push( sc, sc->data, 1 );
|
||||
@@ -354,7 +354,7 @@ do { \
|
||||
|
||||
|
||||
#define SIGMA_ALL_8W do { \
|
||||
a0 = _mm256_xor_si256( g0, m256_one_32 ); \
|
||||
a0 = _mm256_xor_si256( g0, v256_32( 1 ) ); \
|
||||
a1 = _mm256_xor_si256( g1, INW2( 0 ) ); \
|
||||
a2 = _mm256_xor_si256( g2, INW2( 1 ) ); \
|
||||
a3 = _mm256_xor_si256( g3, INW2( 2 ) ); \
|
||||
@@ -521,7 +521,7 @@ panama_8way_close( void *cc, void *dst )
|
||||
|
||||
sc = cc;
|
||||
current = sc->data_ptr;
|
||||
*(__m256i*)( sc->data + current ) = m256_one_32;
|
||||
*(__m256i*)( sc->data + current ) = v256_32( 1 );
|
||||
current++;
|
||||
memset_zero_256( (__m256i*)sc->data + current, 32 - current );
|
||||
panama_8way_push( sc, sc->data, 1 );
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
|
@@ -1,7 +1,7 @@
|
||||
#include "hmq1725-gate.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
|
@@ -6,13 +6,12 @@
|
||||
#include "sha256-hash.h"
|
||||
#include "compat.h"
|
||||
|
||||
/*
|
||||
static const uint32_t H256[8] =
|
||||
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
*/
|
||||
|
||||
|
||||
static const uint32_t K256[64] =
|
||||
{
|
||||
@@ -83,7 +82,7 @@ static const uint32_t K256[64] =
|
||||
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
|
||||
do { \
|
||||
__m128i T1, T2; \
|
||||
__m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \
|
||||
__m128i K = v128_32( K256[( (j)+(i) )] ); \
|
||||
T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
|
||||
K, W[i] ) ); \
|
||||
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
|
||||
@@ -358,19 +357,19 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 );
|
||||
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 48 );
|
||||
|
||||
T0 = _mm_add_epi32( _mm_set1_epi32( K256[58] ),
|
||||
T0 = _mm_add_epi32( v128_32( K256[58] ),
|
||||
mm128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
|
||||
B = _mm_add_epi32( B, T0 );
|
||||
|
||||
T1 = _mm_add_epi32( _mm_set1_epi32( K256[59] ),
|
||||
T1 = _mm_add_epi32( v128_32( K256[59] ),
|
||||
mm128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
|
||||
A = _mm_add_epi32( A, T1 );
|
||||
|
||||
T2 = _mm_add_epi32( _mm_set1_epi32( K256[60] ),
|
||||
T2 = _mm_add_epi32( v128_32( K256[60] ),
|
||||
mm128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
|
||||
H = _mm_add_epi32( H, T2 );
|
||||
|
||||
targ = _mm_set1_epi32( target[7] );
|
||||
targ = v128_32( target[7] );
|
||||
hash = mm128_bswap_32( _mm_add_epi32( H, IV7 ) );
|
||||
|
||||
flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ mm128_movmask_32( hash );
|
||||
@@ -386,13 +385,13 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
|
||||
// round 61 part 1
|
||||
W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] );
|
||||
T0 = _mm_add_epi32( _mm_set1_epi32( K256[61] ),
|
||||
T0 = _mm_add_epi32( v128_32( K256[61] ),
|
||||
mm128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
|
||||
G = _mm_add_epi32( G, T0 );
|
||||
|
||||
if ( t6_mask )
|
||||
{
|
||||
targ = _mm_and_si128( vmask, _mm_set1_epi32( target[6] ) );
|
||||
targ = _mm_and_si128( vmask, v128_32( target[6] ) );
|
||||
hash = mm128_bswap_32( _mm_add_epi32( G, IV6 ) );
|
||||
|
||||
if ( ( 0 != ( t6_mask & mm128_movmask_32(
|
||||
@@ -440,14 +439,14 @@ return 1;
|
||||
/*
 * Reset a 4-way SHA-256 context: clear both halves of the byte counter and
 * load the eight state words from sha256_iv[0..7] via v128_32.
 *
 * Each state word is written exactly once.  An earlier store of a 64-bit
 * literal to the same word was immediately overwritten and has been removed
 * as dead code; the resulting context is unchanged.
 */
void sha256_4way_init( sha256_4way_context *sc )
{
   sc->count_high = sc->count_low = 0;

   for ( int i = 0; i < 8; i++ )
      sc->val[i] = v128_32( sha256_iv[i] );
}
|
||||
|
||||
void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
|
||||
@@ -490,7 +489,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
|
||||
const int pad = buf_size - 8;
|
||||
|
||||
ptr = (unsigned)sc->count_low & (buf_size - 1U);
|
||||
sc->buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
|
||||
sc->buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
|
||||
if ( ptr > pad )
|
||||
@@ -506,8 +505,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] = _mm_set1_epi32( bswap_32( high ) );
|
||||
sc->buf[( pad+4 ) >> 2 ] = _mm_set1_epi32( bswap_32( low ) );
|
||||
sc->buf[ pad >> 2 ] = v128_32( bswap_32( high ) );
|
||||
sc->buf[( pad+4 ) >> 2 ] = v128_32( bswap_32( low ) );
|
||||
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
|
||||
|
||||
mm128_block_bswap_32( dst, sc->val );
|
||||
@@ -580,7 +579,7 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
|
||||
|
||||
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \
|
||||
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i) ] ), \
|
||||
W[ i ] ); \
|
||||
__m256i T1 = BSG2_1x( E ); \
|
||||
__m256i T2 = BSG2_0x( A ); \
|
||||
@@ -614,7 +613,7 @@ do { \
|
||||
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
||||
{ \
|
||||
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
|
||||
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
|
||||
v256_32( K256[(i)+(j)] ) ); \
|
||||
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
|
||||
D = _mm256_add_epi32( D, T1 ); \
|
||||
H = _mm256_add_epi32( T1, T2 ); \
|
||||
@@ -634,7 +633,7 @@ do { \
|
||||
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
||||
{ \
|
||||
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
|
||||
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
|
||||
v256_32( K256[(i)+(j)] ) ); \
|
||||
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = _mm256_add_epi32( D, T1 ); \
|
||||
@@ -643,7 +642,7 @@ do { \
|
||||
|
||||
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \
|
||||
__m256i T0 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \
|
||||
__m256i T1 = BSG2_1x( E ); \
|
||||
__m256i T2 = BSG2_0x( A ); \
|
||||
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
|
||||
@@ -666,7 +665,7 @@ do { \
|
||||
|
||||
#define SHA256_8WAY_2ROUNDS( A, B, C, D, E, F, G, H, i0, i1, j ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i0) ] ), \
|
||||
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \
|
||||
W[ i0 ] ); \
|
||||
__m256i T1 = BSG2_1x( E ); \
|
||||
__m256i T2 = BSG2_0x( A ); \
|
||||
@@ -677,7 +676,7 @@ do { \
|
||||
D = _mm256_add_epi32( D, T1 ); \
|
||||
H = _mm256_add_epi32( T1, T2 ); \
|
||||
\
|
||||
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i1) ] ), \
|
||||
T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \
|
||||
W[ (i1) ] ); \
|
||||
T1 = BSG2_1x( D ); \
|
||||
T2 = BSG2_0x( H ); \
|
||||
@@ -790,7 +789,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
||||
|
||||
// round 3 part 1, avoid nonces W[3]
|
||||
T1 = mm256_add4_32( E, BSG2_1x(B), CHx(B, C, D),
|
||||
_mm256_set1_epi32( K256[3] ) );
|
||||
v256_32( K256[3] ) );
|
||||
A = _mm256_add_epi32( A, T1 );
|
||||
E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x(F),
|
||||
MAJx(F, G, H) ) );
|
||||
@@ -910,12 +909,11 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const uint32_t *target )
|
||||
{
|
||||
__m256i A, B, C, D, E, F, G, H, T0, T1, T2;
|
||||
int flip;
|
||||
int t6_mask;
|
||||
__m256i vmask, targ, hash;
|
||||
__m256i W[16]; memcpy_256( W, data, 16 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
uint8_t flip, t6_mask;
|
||||
|
||||
A = _mm256_load_si256( state_in );
|
||||
B = _mm256_load_si256( state_in+1 );
|
||||
@@ -991,26 +989,28 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
|
||||
|
||||
// round 58 to 60 part 1
|
||||
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[58] ),
|
||||
T0 = _mm256_add_epi32( v256_32( K256[58] ),
|
||||
mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), W[10], F ) );
|
||||
B = _mm256_add_epi32( B, T0 );
|
||||
|
||||
T1 = _mm256_add_epi32( _mm256_set1_epi32( K256[59] ),
|
||||
T1 = _mm256_add_epi32( v256_32( K256[59] ),
|
||||
mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), W[11], E ) );
|
||||
A = _mm256_add_epi32( A, T1 );
|
||||
|
||||
T2 = _mm256_add_epi32( _mm256_set1_epi32( K256[60] ),
|
||||
T2 = _mm256_add_epi32( v256_32( K256[60] ),
|
||||
mm256_add4_32( BSG2_1x( A ), CHx( A, B, C ), W[12], D ) );
|
||||
H = _mm256_add_epi32( H, T2 );
|
||||
|
||||
// Got H, test it.
|
||||
targ = _mm256_set1_epi32( target[7] );
|
||||
targ = v256_32( target[7] );
|
||||
hash = _mm256_shuffle_epi8( _mm256_add_epi32( H, IV7 ), bswap_shuf );
|
||||
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
|
||||
if ( likely( 0xff == ( flip ^
|
||||
if ( target[7] )
|
||||
{
|
||||
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
|
||||
if ( likely( 0xff == ( flip ^
|
||||
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ))
|
||||
return 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) );
|
||||
|
||||
// round 58 part 2
|
||||
@@ -1018,14 +1018,14 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
MAJx( G, H, A ) ) );
|
||||
// round 61 part 1
|
||||
W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] );
|
||||
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[61] ),
|
||||
T0 = _mm256_add_epi32( v256_32( K256[61] ),
|
||||
mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) );
|
||||
G = _mm256_add_epi32( G, T0 );
|
||||
|
||||
if ( t6_mask )
|
||||
{
|
||||
// Testing H was inconclusive: hash7 == target7, need to test G
|
||||
targ = _mm256_and_si256( vmask, _mm256_set1_epi32( target[6] ) );
|
||||
targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
|
||||
hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
|
||||
|
||||
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
|
||||
@@ -1078,14 +1078,14 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
/*
 * Reset an 8-way SHA-256 context: clear both halves of the byte counter and
 * load the eight state words from sha256_iv[0..7] via v256_32.
 *
 * Each state word is written exactly once.  An earlier store of a 64-bit
 * literal to the same word was immediately overwritten and has been removed
 * as dead code; the resulting context is unchanged.
 */
void sha256_8way_init( sha256_8way_context *sc )
{
   sc->count_high = sc->count_low = 0;

   for ( int i = 0; i < 8; i++ )
      sc->val[i] = v256_32( sha256_iv[i] );
}
|
||||
|
||||
// need to handle odd byte length for yespower.
|
||||
@@ -1131,7 +1131,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
|
||||
const int pad = buf_size - 8;
|
||||
|
||||
ptr = (unsigned)sc->count_low & (buf_size - 1U);
|
||||
sc->buf[ ptr>>2 ] = _mm256_set1_epi64x( 0x0000008000000080 );
|
||||
sc->buf[ ptr>>2 ] = v256_64( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
|
||||
if ( ptr > pad )
|
||||
@@ -1147,8 +1147,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] = _mm256_set1_epi32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = _mm256_set1_epi32( bswap_32( low ) );
|
||||
sc->buf[ pad >> 2 ] = v256_32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = v256_32( bswap_32( low ) );
|
||||
|
||||
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
|
||||
|
||||
@@ -1210,7 +1210,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
|
||||
|
||||
#define SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
|
||||
do { \
|
||||
__m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[(j)+(i)] ), W[i] ); \
|
||||
__m512i T0 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \
|
||||
__m512i T1 = BSG2_1x16( E ); \
|
||||
__m512i T2 = BSG2_0x16( A ); \
|
||||
T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \
|
||||
@@ -1224,7 +1224,7 @@ do { \
|
||||
#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
||||
{ \
|
||||
__m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
|
||||
_mm512_set1_epi32( K256[(i)+(j)] ) ); \
|
||||
v512_32( K256[(i)+(j)] ) ); \
|
||||
__m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
|
||||
D = _mm512_add_epi32( D, T1 ); \
|
||||
H = _mm512_add_epi32( T1, T2 ); \
|
||||
@@ -1234,7 +1234,7 @@ do { \
|
||||
#define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \
|
||||
do { \
|
||||
__m512i T1, T2; \
|
||||
__m512i K = _mm512_set1_epi32( K256[( (j)+(i) )] ); \
|
||||
__m512i K = v512_32( K256[( (j)+(i) )] ); \
|
||||
T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \
|
||||
K, W[i] ) ); \
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
|
||||
@@ -1345,7 +1345,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||
|
||||
// round 3 part 1, avoid nonces W[3]
|
||||
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
|
||||
_mm512_set1_epi32( K256[3] ) );
|
||||
v512_32( K256[3] ) );
|
||||
A = _mm512_add_epi32( A, T1 );
|
||||
E = _mm512_add_epi32( T1, _mm512_add_epi32( BSG2_0x16(F),
|
||||
MAJx16(F, G, H) ) );
|
||||
@@ -1566,21 +1566,22 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
|
||||
|
||||
// rounds 58 to 60 part 1
|
||||
T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[58] ),
|
||||
T0 = _mm512_add_epi32( v512_32( K256[58] ),
|
||||
mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) );
|
||||
B = _mm512_add_epi32( B, T0 );
|
||||
|
||||
T1 = _mm512_add_epi32( _mm512_set1_epi32( K256[59] ),
|
||||
T1 = _mm512_add_epi32( v512_32( K256[59] ),
|
||||
mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) );
|
||||
A = _mm512_add_epi32( A, T1 );
|
||||
|
||||
T2 = _mm512_add_epi32( _mm512_set1_epi32( K256[60] ),
|
||||
T2 = _mm512_add_epi32( v512_32( K256[60] ),
|
||||
mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ) );
|
||||
H = _mm512_add_epi32( H, T2 );
|
||||
|
||||
// got H, test it against target[7]
|
||||
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
|
||||
targ = _mm512_set1_epi32( target[7] );
|
||||
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
|
||||
targ = v512_32( target[7] );
|
||||
if ( target[7] )
|
||||
if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) ))
|
||||
return 0;
|
||||
t6_mask = _mm512_cmpeq_epi32_mask( hash, targ );
|
||||
@@ -1591,15 +1592,15 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
|
||||
// round 61 part 1
|
||||
W[13] = SHA256_16WAY_MEXP( W[11], W[ 6], W[14], W[13] );
|
||||
T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[61] ),
|
||||
T0 = _mm512_add_epi32( v512_32( K256[61] ),
|
||||
mm512_add4_32( BSG2_1x16( H ), CHx16( H, A, B ), W[13], C ) );
|
||||
G = _mm512_add_epi32( G, T0 );
|
||||
|
||||
// got G, test it against target[6] if indicated
|
||||
if ( t6_mask != 0 )
|
||||
if ( (uint16_t)t6_mask )
|
||||
{
|
||||
hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
|
||||
targ = _mm512_set1_epi32( target[6] );
|
||||
targ = v512_32( target[6] );
|
||||
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
|
||||
return 0;
|
||||
}
|
||||
@@ -1637,14 +1638,14 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
/*
 * Reset a 16-way SHA-256 context: clear both halves of the byte counter and
 * load the eight state words from sha256_iv[0..7] via v512_32.
 *
 * Each state word is written exactly once.  An earlier store of a 64-bit
 * literal to the same word was immediately overwritten and has been removed
 * as dead code; the resulting context is unchanged.
 */
void sha256_16way_init( sha256_16way_context *sc )
{
   sc->count_high = sc->count_low = 0;

   for ( int i = 0; i < 8; i++ )
      sc->val[i] = v512_32( sha256_iv[i] );
}
|
||||
|
||||
void sha256_16way_update( sha256_16way_context *sc, const void *data,
|
||||
@@ -1688,7 +1689,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
|
||||
const int pad = buf_size - 8;
|
||||
|
||||
ptr = (unsigned)sc->count_low & (buf_size - 1U);
|
||||
sc->buf[ ptr>>2 ] = _mm512_set1_epi64( 0x0000008000000080 );
|
||||
sc->buf[ ptr>>2 ] = v512_64( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
|
||||
if ( ptr > pad )
|
||||
@@ -1704,8 +1705,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] = _mm512_set1_epi32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = _mm512_set1_epi32( bswap_32( low ) );
|
||||
sc->buf[ pad >> 2 ] = v512_32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = v512_32( bswap_32( low ) );
|
||||
|
||||
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -118,10 +118,10 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
const __m512i last_byte = v512_32( 0x80000000 );
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
const __m512i sixteen = v512_32( 16 );
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
@@ -130,42 +130,42 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_transform_le( phash, pdata, sha256_iv );
|
||||
|
||||
// vectorize block 0 hash for second block
|
||||
mstate1[0] = _mm512_set1_epi32( phash[0] );
|
||||
mstate1[1] = _mm512_set1_epi32( phash[1] );
|
||||
mstate1[2] = _mm512_set1_epi32( phash[2] );
|
||||
mstate1[3] = _mm512_set1_epi32( phash[3] );
|
||||
mstate1[4] = _mm512_set1_epi32( phash[4] );
|
||||
mstate1[5] = _mm512_set1_epi32( phash[5] );
|
||||
mstate1[6] = _mm512_set1_epi32( phash[6] );
|
||||
mstate1[7] = _mm512_set1_epi32( phash[7] );
|
||||
mstate1[0] = v512_32( phash[0] );
|
||||
mstate1[1] = v512_32( phash[1] );
|
||||
mstate1[2] = v512_32( phash[2] );
|
||||
mstate1[3] = v512_32( phash[3] );
|
||||
mstate1[4] = v512_32( phash[4] );
|
||||
mstate1[5] = v512_32( phash[5] );
|
||||
mstate1[6] = v512_32( phash[6] );
|
||||
mstate1[7] = v512_32( phash[7] );
|
||||
|
||||
// second message block data, with nonce & padding
|
||||
buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
buf[0] = v512_32( pdata[16] );
|
||||
buf[1] = v512_32( pdata[17] );
|
||||
buf[2] = v512_32( pdata[18] );
|
||||
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
buf[4] = last_byte;
|
||||
memset_zero_512( buf+5, 10 );
|
||||
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
|
||||
buf[15] = v512_32( 80*8 ); // bit count
|
||||
|
||||
// partially pre-expand & prehash second message block, avoiding the nonces
|
||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||
|
||||
// vectorize IV for second hash
|
||||
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
|
||||
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
|
||||
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
|
||||
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
|
||||
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
|
||||
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
|
||||
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
|
||||
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
|
||||
istate[0] = v512_32( sha256_iv[0] );
|
||||
istate[1] = v512_32( sha256_iv[1] );
|
||||
istate[2] = v512_32( sha256_iv[2] );
|
||||
istate[3] = v512_32( sha256_iv[3] );
|
||||
istate[4] = v512_32( sha256_iv[4] );
|
||||
istate[5] = v512_32( sha256_iv[5] );
|
||||
istate[6] = v512_32( sha256_iv[6] );
|
||||
istate[7] = v512_32( sha256_iv[7] );
|
||||
|
||||
// initialize padding for second hash
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block+9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
|
||||
block[15] = v512_32( 32*8 ); // bit count
|
||||
|
||||
do
|
||||
{
|
||||
@@ -216,33 +216,33 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
const __m256i last_byte = v256_32( 0x80000000 );
|
||||
const __m256i eight = v256_32( 8 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm256_set1_epi32( pdata[i] );
|
||||
vdata[i] = v256_32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm256_set1_epi32( 80*8 );
|
||||
vdata[16+15] = v256_32( 80*8 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = _mm256_set1_epi32( 32*8 );
|
||||
block[15] = v256_32( 32*8 );
|
||||
|
||||
// initialize state for second hash
|
||||
istate[0] = _mm256_set1_epi32( sha256_iv[0] );
|
||||
istate[1] = _mm256_set1_epi32( sha256_iv[1] );
|
||||
istate[2] = _mm256_set1_epi32( sha256_iv[2] );
|
||||
istate[3] = _mm256_set1_epi32( sha256_iv[3] );
|
||||
istate[4] = _mm256_set1_epi32( sha256_iv[4] );
|
||||
istate[5] = _mm256_set1_epi32( sha256_iv[5] );
|
||||
istate[6] = _mm256_set1_epi32( sha256_iv[6] );
|
||||
istate[7] = _mm256_set1_epi32( sha256_iv[7] );
|
||||
istate[0] = v256_32( sha256_iv[0] );
|
||||
istate[1] = v256_32( sha256_iv[1] );
|
||||
istate[2] = v256_32( sha256_iv[2] );
|
||||
istate[3] = v256_32( sha256_iv[3] );
|
||||
istate[4] = v256_32( sha256_iv[4] );
|
||||
istate[5] = v256_32( sha256_iv[5] );
|
||||
istate[6] = v256_32( sha256_iv[6] );
|
||||
istate[7] = v256_32( sha256_iv[7] );
|
||||
|
||||
sha256_8way_transform_le( mstate1, vdata, istate );
|
||||
|
||||
@@ -298,31 +298,31 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
|
||||
const __m128i four = _mm_set1_epi32( 4 );
|
||||
const __m128i last_byte = v128_32( 0x80000000 );
|
||||
const __m128i four = v128_32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm_set1_epi32( pdata[i] );
|
||||
vdata[i] = v128_32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm_set1_epi32( 80*8 );
|
||||
vdata[16+15] = v128_32( 80*8 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = _mm_set1_epi32( 32*8 );
|
||||
block[15] = v128_32( 32*8 );
|
||||
|
||||
// initialize state
|
||||
istate[0] = _mm_set1_epi32( sha256_iv[0] );
|
||||
istate[1] = _mm_set1_epi32( sha256_iv[1] );
|
||||
istate[2] = _mm_set1_epi32( sha256_iv[2] );
|
||||
istate[3] = _mm_set1_epi32( sha256_iv[3] );
|
||||
istate[4] = _mm_set1_epi32( sha256_iv[4] );
|
||||
istate[5] = _mm_set1_epi32( sha256_iv[5] );
|
||||
istate[6] = _mm_set1_epi32( sha256_iv[6] );
|
||||
istate[7] = _mm_set1_epi32( sha256_iv[7] );
|
||||
istate[0] = v128_32( sha256_iv[0] );
|
||||
istate[1] = v128_32( sha256_iv[1] );
|
||||
istate[2] = v128_32( sha256_iv[2] );
|
||||
istate[3] = v128_32( sha256_iv[3] );
|
||||
istate[4] = v128_32( sha256_iv[4] );
|
||||
istate[5] = v128_32( sha256_iv[5] );
|
||||
istate[6] = v128_32( sha256_iv[6] );
|
||||
istate[7] = v128_32( sha256_iv[7] );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( mstate, vdata, istate );
|
||||
|
@@ -51,8 +51,7 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
|
||||
// fill & pad second block without nonce
|
||||
memcpy( block1a, pdata + 16, 12 );
|
||||
memcpy( block1b, pdata + 16, 12 );
|
||||
block1a[ 3] = 0;
|
||||
block1b[ 3] = 0;
|
||||
block1a[ 3] = block1b[ 3] = 0;
|
||||
block1a[ 4] = block1b[ 4] = 0x80000000;
|
||||
memset( block1a + 5, 0, 40 );
|
||||
memset( block1b + 5, 0, 40 );
|
||||
@@ -128,10 +127,10 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
const __m512i last_byte = v512_32( 0x80000000 );
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
const __m512i sixteen = v512_32( 16 );
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
@@ -140,42 +139,42 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_transform_le( phash, pdata, sha256dt_iv );
|
||||
|
||||
// vectorize block 0 hash for second block
|
||||
mstate1[0] = _mm512_set1_epi32( phash[0] );
|
||||
mstate1[1] = _mm512_set1_epi32( phash[1] );
|
||||
mstate1[2] = _mm512_set1_epi32( phash[2] );
|
||||
mstate1[3] = _mm512_set1_epi32( phash[3] );
|
||||
mstate1[4] = _mm512_set1_epi32( phash[4] );
|
||||
mstate1[5] = _mm512_set1_epi32( phash[5] );
|
||||
mstate1[6] = _mm512_set1_epi32( phash[6] );
|
||||
mstate1[7] = _mm512_set1_epi32( phash[7] );
|
||||
mstate1[0] = v512_32( phash[0] );
|
||||
mstate1[1] = v512_32( phash[1] );
|
||||
mstate1[2] = v512_32( phash[2] );
|
||||
mstate1[3] = v512_32( phash[3] );
|
||||
mstate1[4] = v512_32( phash[4] );
|
||||
mstate1[5] = v512_32( phash[5] );
|
||||
mstate1[6] = v512_32( phash[6] );
|
||||
mstate1[7] = v512_32( phash[7] );
|
||||
|
||||
// second message block data, with nonce & padding
|
||||
buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
buf[0] = v512_32( pdata[16] );
|
||||
buf[1] = v512_32( pdata[17] );
|
||||
buf[2] = v512_32( pdata[18] );
|
||||
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
buf[4] = last_byte;
|
||||
memset_zero_512( buf+5, 10 );
|
||||
buf[15] = _mm512_set1_epi32( 0x480 ); // sha256dt funky bit count
|
||||
buf[15] = v512_32( 0x480 ); // sha256dt funky bit count
|
||||
|
||||
// partially pre-expand & prehash second message block, avoiding the nonces
|
||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||
|
||||
// vectorize IV for second hash
|
||||
istate[0] = _mm512_set1_epi32( sha256dt_iv[0] );
|
||||
istate[1] = _mm512_set1_epi32( sha256dt_iv[1] );
|
||||
istate[2] = _mm512_set1_epi32( sha256dt_iv[2] );
|
||||
istate[3] = _mm512_set1_epi32( sha256dt_iv[3] );
|
||||
istate[4] = _mm512_set1_epi32( sha256dt_iv[4] );
|
||||
istate[5] = _mm512_set1_epi32( sha256dt_iv[5] );
|
||||
istate[6] = _mm512_set1_epi32( sha256dt_iv[6] );
|
||||
istate[7] = _mm512_set1_epi32( sha256dt_iv[7] );
|
||||
istate[0] = v512_32( sha256dt_iv[0] );
|
||||
istate[1] = v512_32( sha256dt_iv[1] );
|
||||
istate[2] = v512_32( sha256dt_iv[2] );
|
||||
istate[3] = v512_32( sha256dt_iv[3] );
|
||||
istate[4] = v512_32( sha256dt_iv[4] );
|
||||
istate[5] = v512_32( sha256dt_iv[5] );
|
||||
istate[6] = v512_32( sha256dt_iv[6] );
|
||||
istate[7] = v512_32( sha256dt_iv[7] );
|
||||
|
||||
// initialize padding for second hash
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block+9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 0x300 ); // bit count
|
||||
block[15] = v512_32( 0x300 ); // bit count
|
||||
|
||||
do
|
||||
{
|
||||
@@ -226,33 +225,33 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
const __m256i last_byte = v256_32( 0x80000000 );
|
||||
const __m256i eight = v256_32( 8 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm256_set1_epi32( pdata[i] );
|
||||
vdata[i] = v256_32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm256_set1_epi32( 0x480 );
|
||||
vdata[16+15] = v256_32( 0x480 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = _mm256_set1_epi32( 0x300 );
|
||||
block[15] = v256_32( 0x300 );
|
||||
|
||||
// initialize state for swecond hash
|
||||
istate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
|
||||
istate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
|
||||
istate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
|
||||
istate[3] = _mm256_set1_epi64x( 0xd338e869d338e869 );
|
||||
istate[4] = _mm256_set1_epi64x( 0xaa3ff126aa3ff126 );
|
||||
istate[5] = _mm256_set1_epi64x( 0x475bbf30475bbf30 );
|
||||
istate[6] = _mm256_set1_epi64x( 0x8fd52e5b8fd52e5b );
|
||||
istate[7] = _mm256_set1_epi64x( 0x9f75c9ad9f75c9ad );
|
||||
// initialize state for second hash
|
||||
istate[0] = v256_32( sha256dt_iv[0] );
|
||||
istate[1] = v256_32( sha256dt_iv[1] );
|
||||
istate[2] = v256_32( sha256dt_iv[2] );
|
||||
istate[3] = v256_32( sha256dt_iv[3] );
|
||||
istate[4] = v256_32( sha256dt_iv[4] );
|
||||
istate[5] = v256_32( sha256dt_iv[5] );
|
||||
istate[6] = v256_32( sha256dt_iv[6] );
|
||||
istate[7] = v256_32( sha256dt_iv[7] );
|
||||
|
||||
sha256_8way_transform_le( mstate1, vdata, istate );
|
||||
|
||||
@@ -308,31 +307,31 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
|
||||
const __m128i four = _mm_set1_epi32( 4 );
|
||||
const __m128i last_byte = v128_32( 0x80000000 );
|
||||
const __m128i four = v128_32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm_set1_epi32( pdata[i] );
|
||||
vdata[i] = v128_32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm_set1_epi32( 0x480 );
|
||||
vdata[16+15] = v128_32( 0x480 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = _mm_set1_epi32( 0x300 );
|
||||
block[15] = v128_32( 0x300 );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = _mm_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
|
||||
initstate[1] = _mm_set1_epi64x( 0xb72074d4b72074d4 );
|
||||
initstate[2] = _mm_set1_epi64x( 0x6bb011226bb01122 );
|
||||
initstate[3] = _mm_set1_epi64x( 0xd338e869d338e869 );
|
||||
initstate[4] = _mm_set1_epi64x( 0xaa3ff126aa3ff126 );
|
||||
initstate[5] = _mm_set1_epi64x( 0x475bbf30475bbf30 );
|
||||
initstate[6] = _mm_set1_epi64x( 0x8fd52e5b8fd52e5b );
|
||||
initstate[7] = _mm_set1_epi64x( 0x9f75c9ad9f75c9ad );
|
||||
initstate[0] = v128_32( sha256dt_iv[0] );
|
||||
initstate[1] = v128_32( sha256dt_iv[1] );
|
||||
initstate[2] = v128_32( sha256dt_iv[2] );
|
||||
initstate[3] = v128_32( sha256dt_iv[3] );
|
||||
initstate[4] = v128_32( sha256dt_iv[4] );
|
||||
initstate[5] = v128_32( sha256dt_iv[5] );
|
||||
initstate[6] = v128_32( sha256dt_iv[6] );
|
||||
initstate[7] = v128_32( sha256dt_iv[7] );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( midstate, vdata, initstate );
|
||||
@@ -342,21 +341,18 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_4way_transform_le( block, vdata+16, midstate );
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
|
||||
// if ( sha256_4way_transform_le_short( hash32, block, initstate, ptarget ) )
|
||||
// {
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
// }
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
@@ -30,10 +30,10 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
const __m512i last_byte = v512_32( 0x80000000 );
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
const __m512i sixteen = v512_32( 16 );
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
@@ -42,42 +42,42 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_transform_le( phash, pdata, sha256_iv );
|
||||
|
||||
// vectorize block 0 hash for second block
|
||||
mstate1[0] = _mm512_set1_epi32( phash[0] );
|
||||
mstate1[1] = _mm512_set1_epi32( phash[1] );
|
||||
mstate1[2] = _mm512_set1_epi32( phash[2] );
|
||||
mstate1[3] = _mm512_set1_epi32( phash[3] );
|
||||
mstate1[4] = _mm512_set1_epi32( phash[4] );
|
||||
mstate1[5] = _mm512_set1_epi32( phash[5] );
|
||||
mstate1[6] = _mm512_set1_epi32( phash[6] );
|
||||
mstate1[7] = _mm512_set1_epi32( phash[7] );
|
||||
mstate1[0] = v512_32( phash[0] );
|
||||
mstate1[1] = v512_32( phash[1] );
|
||||
mstate1[2] = v512_32( phash[2] );
|
||||
mstate1[3] = v512_32( phash[3] );
|
||||
mstate1[4] = v512_32( phash[4] );
|
||||
mstate1[5] = v512_32( phash[5] );
|
||||
mstate1[6] = v512_32( phash[6] );
|
||||
mstate1[7] = v512_32( phash[7] );
|
||||
|
||||
// second message block data, with nonce & padding
|
||||
buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
buf[0] = v512_32( pdata[16] );
|
||||
buf[1] = v512_32( pdata[17] );
|
||||
buf[2] = v512_32( pdata[18] );
|
||||
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
buf[4] = last_byte;
|
||||
memset_zero_512( buf+5, 10 );
|
||||
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
|
||||
buf[15] = v512_32( 80*8 ); // bit count
|
||||
|
||||
// partially pre-expand & prehash second message block, avoiding the nonces
|
||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||
|
||||
// vectorize IV for 2nd & 3rd sha256
|
||||
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
|
||||
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
|
||||
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
|
||||
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
|
||||
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
|
||||
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
|
||||
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
|
||||
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
|
||||
istate[0] = v512_32( sha256_iv[0] );
|
||||
istate[1] = v512_32( sha256_iv[1] );
|
||||
istate[2] = v512_32( sha256_iv[2] );
|
||||
istate[3] = v512_32( sha256_iv[3] );
|
||||
istate[4] = v512_32( sha256_iv[4] );
|
||||
istate[5] = v512_32( sha256_iv[5] );
|
||||
istate[6] = v512_32( sha256_iv[6] );
|
||||
istate[7] = v512_32( sha256_iv[7] );
|
||||
|
||||
// initialize padding for 2nd & 3rd sha256
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
|
||||
block[15] = v512_32( 32*8 ); // bit count
|
||||
|
||||
do
|
||||
{
|
||||
@@ -222,33 +222,33 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
const __m256i last_byte = v256_32( 0x80000000 );
|
||||
const __m256i eight = v256_32( 8 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm256_set1_epi32( pdata[i] );
|
||||
vdata[i] = v256_32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
|
||||
vdata[16+15] = v256_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
|
||||
block[15] = v256_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
istate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
istate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
istate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
istate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
istate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
|
||||
istate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
|
||||
istate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
istate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
istate[0] = v256_32( sha256_iv[0] );
|
||||
istate[1] = v256_32( sha256_iv[1] );
|
||||
istate[2] = v256_32( sha256_iv[2] );
|
||||
istate[3] = v256_32( sha256_iv[3] );
|
||||
istate[4] = v256_32( sha256_iv[4] );
|
||||
istate[5] = v256_32( sha256_iv[5] );
|
||||
istate[6] = v256_32( sha256_iv[6] );
|
||||
istate[7] = v256_32( sha256_iv[7] );
|
||||
|
||||
sha256_8way_transform_le( mstate1, vdata, istate );
|
||||
|
||||
@@ -313,31 +313,31 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
|
||||
const __m128i four = _mm_set1_epi32( 4 );
|
||||
const __m128i last_byte = v128_32( 0x80000000 );
|
||||
const __m128i four = v128_32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm_set1_epi32( pdata[i] );
|
||||
vdata[i] = v128_32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
|
||||
vdata[16+15] = v128_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = _mm_set1_epi32( 32*8 ); // bit count
|
||||
block[15] = v128_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
istate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
istate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
istate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
istate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
istate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
|
||||
istate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
|
||||
istate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
istate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
istate[0] = v128_32( sha256_iv[0] );
|
||||
istate[1] = v128_32( sha256_iv[1] );
|
||||
istate[2] = v128_32( sha256_iv[2] );
|
||||
istate[3] = v128_32( sha256_iv[3] );
|
||||
istate[4] = v128_32( sha256_iv[4] );
|
||||
istate[5] = v128_32( sha256_iv[5] );
|
||||
istate[6] = v128_32( sha256_iv[6] );
|
||||
istate[7] = v128_32( sha256_iv[7] );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( mstate, vdata, istate );
|
||||
|
@@ -39,57 +39,429 @@
|
||||
/*
|
||||
static const uint64_t H512[8] =
|
||||
{
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
|
||||
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
|
||||
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||
};
|
||||
*/
|
||||
|
||||
static const uint64_t K512[80] =
|
||||
{
|
||||
0x428A2F98D728AE22, 0x7137449123EF65CD,
|
||||
0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
|
||||
0x3956C25BF348B538, 0x59F111F1B605D019,
|
||||
0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
|
||||
0xD807AA98A3030242, 0x12835B0145706FBE,
|
||||
0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
|
||||
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1,
|
||||
0x9BDC06A725C71235, 0xC19BF174CF692694,
|
||||
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3,
|
||||
0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
|
||||
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483,
|
||||
0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
|
||||
0x983E5152EE66DFAB, 0xA831C66D2DB43210,
|
||||
0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
|
||||
0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
|
||||
0x06CA6351E003826F, 0x142929670A0E6E70,
|
||||
0x27B70A8546D22FFC, 0x2E1B21385C26C926,
|
||||
0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
|
||||
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8,
|
||||
0x81C2C92E47EDAEE6, 0x92722C851482353B,
|
||||
0xA2BFE8A14CF10364, 0xA81A664BBC423001,
|
||||
0xC24B8B70D0F89791, 0xC76C51A30654BE30,
|
||||
0xD192E819D6EF5218, 0xD69906245565A910,
|
||||
0xF40E35855771202A, 0x106AA07032BBD1B8,
|
||||
0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
|
||||
0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
|
||||
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB,
|
||||
0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
|
||||
0x748F82EE5DEFB2FC, 0x78A5636F43172F60,
|
||||
0x84C87814A1F0AB72, 0x8CC702081A6439EC,
|
||||
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9,
|
||||
0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
|
||||
0xCA273ECEEA26619C, 0xD186B8C721C0C207,
|
||||
0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
|
||||
0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
|
||||
0x113F9804BEF90DAE, 0x1B710B35131C471B,
|
||||
0x28DB77F523047D84, 0x32CAAB7B40C72493,
|
||||
0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
|
||||
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A,
|
||||
0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
|
||||
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
|
||||
0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
|
||||
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
|
||||
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
|
||||
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
|
||||
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
|
||||
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
|
||||
0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
|
||||
0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
|
||||
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
|
||||
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
|
||||
0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
|
||||
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
|
||||
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
|
||||
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
|
||||
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
|
||||
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
|
||||
0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
|
||||
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
|
||||
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
|
||||
};
|
||||
|
||||
#if defined(__AVX2__) && defined(__SHA512__)
|
||||
|
||||
// SHA-512 implemented using SHA512 CPU extension.
|
||||
|
||||
// Experimental. Not tested. Not reviewed. Compile tested only.
|
||||
|
||||
// Needs GCC-13 for compilation.
|
||||
// Needs Intel Lunar Lake or Arrow Lake CPU, or AMD Zen-{5,6}? for execution.
|
||||
// Modelled after noloader sha256 implementation.
|
||||
|
||||
// It's not clear how SHA512 will be supported before AVX10 considering how
|
||||
// dependent it is on _mm256_alignr_epi64 which is only available with AVX512VL
|
||||
// until AVX10-256.
|
||||
|
||||
// mm256_alignr_1x64( v1, v0 )
//
// Helper used by the SHA-512 message schedule below to build a vector that
// straddles two consecutive 4x64-bit message vectors.
//
// With AVX512VL this maps directly onto _mm256_alignr_epi64( v1, v0, 1 ).
// Without AVX512VL an AVX2-only emulation built from mm256_shuflr_64 and a
// constant lane mask is used instead.
//
// The macro expands to a plain expression with no trailing semicolon, so it
// can be used as a statement's right-hand side or nested inside another call.
// Both arguments are evaluated exactly once.

#if defined(__AVX512VL__)

#define mm256_alignr_1x64( v1, v0 )   _mm256_alignr_epi64( v1, v0, 1 )

#else
// Ugly workaround to make it work with AVX2

// Selects only the first 64-bit element of a 256-bit vector.
static const __m256i mask __attribute__ ((aligned (32)))
   = { 0xffffffffffffffffull, 0ull, 0ull, 0ull };

// Takes the masked element from the shuffled v1 and the remaining three
// elements from the shuffled v0.
// NOTE(review): _mm256_alignr_epi64( v1, v0, 1 ) yields
// { v0[1], v0[2], v0[3], v1[0] }. If mm256_shuflr_64 rotates elements right
// by one (its definition is not visible here), this emulation takes element 0
// from v1 rather than element 3 and would not match the AVX512VL path --
// verify against the mm256_shuflr_64 definition before relying on it.
#define mm256_alignr_1x64( v1, v0 ) \
   _mm256_or_si256( _mm256_and_si256( mm256_shuflr_64( v1 ), mask ), \
                    _mm256_and_si256( mm256_shuflr_64( v0 ), mm256_not(mask) ) )

#endif
|
||||
|
||||
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
|
||||
const uint64_t *state_in )
|
||||
{
|
||||
__m256i STATE0, STATE1;
|
||||
__m256i MSG, TMP, BSWAP64;
|
||||
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
|
||||
__m256i ABEF_SAVE, CDGH_SAVE;
|
||||
|
||||
// Load initial values
|
||||
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
|
||||
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
|
||||
BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f,
|
||||
0x0001020304050607 ) )
|
||||
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
|
||||
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
|
||||
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
|
||||
STATE1 = _mm256_blend_epi32( STATE1, TMP, 0xF0 ); // CDGH
|
||||
|
||||
// Save initial state
|
||||
ABEF_SAVE = STATE0;
|
||||
CDGH_SAVE = STATE1;
|
||||
|
||||
// Rounds 0-3
|
||||
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
|
||||
TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 );
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
|
||||
TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 );
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
|
||||
|
||||
// Rounds 8-11
|
||||
TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
|
||||
TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 );
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
|
||||
TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 );
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
|
||||
TMSG0 = _mm256_add_epi32( TMSG0, TMP );
|
||||
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
|
||||
|
||||
// Rounds 16-19
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
|
||||
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
|
||||
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
|
||||
|
||||
// Rounds 20-23
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
|
||||
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
|
||||
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
|
||||
|
||||
// Rounds 24-27
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
|
||||
TMSG3 = _mm256_add_epi32( TMSG3, TMP );
|
||||
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
|
||||
|
||||
// Rounds 28-31
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG) ;
|
||||
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
|
||||
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
|
||||
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
|
||||
|
||||
// Rounds 32-35
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
|
||||
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
|
||||
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
|
||||
|
||||
// Rounds 36-39
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
|
||||
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
|
||||
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
|
||||
|
||||
// Rounds 40-43
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
|
||||
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
|
||||
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
|
||||
|
||||
// Rounds 44-47
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
|
||||
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
|
||||
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
|
||||
|
||||
// Rounds 48-51
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
|
||||
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
|
||||
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
|
||||
|
||||
// Rounds 52-55
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
|
||||
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
|
||||
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
|
||||
// Rounds 56-59
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG2, TMSG1 ) ;
|
||||
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
|
||||
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
|
||||
// Rounds 60-63
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
|
||||
STATE1 = _mm256_sha512nds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
|
||||
// Add initial state
|
||||
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
|
||||
STATE1 = _mm256_add_epi64( STATE1, CDGH_SAVE );
|
||||
|
||||
TMP = _mm256_permute4x64_epi64( STATE0, 0x1B ); // FEBA
|
||||
STATE1 = _mm256_permute4x64_epi64( STATE1, 0xB1 ); // DCHG
|
||||
STATE0 = _mm256_blend_epi32( TMP, STATE1, 0xF0 ); // DCBA
|
||||
STATE1 = _mm256_permute2x128_si256( STATE1, TMP, 0x21 ); // ABEF
|
||||
|
||||
// Save state
|
||||
_mm256_store_si256((__m256i*) &state_out[0], STATE0 );
|
||||
_mm256_store_si256((__m256i*) &state_out[4], STATE1 );
|
||||
}
|
||||
|
||||
void sha512_opt_transform_le( uint64_t *state_out, const void *input,
|
||||
const uint64_t *state_in )
|
||||
{
|
||||
__m256i STATE0, STATE1;
|
||||
__m256i MSG, TMP, BSWAP64;
|
||||
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
|
||||
__m256i ABEF_SAVE, CDGH_SAVE;
|
||||
|
||||
// Load initial values
|
||||
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
|
||||
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
|
||||
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
|
||||
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
|
||||
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
|
||||
STATE1 = _mm256_blend_epi32( STATE1, TMP, 0xF0 ); // CDGH
|
||||
|
||||
// Save initial state
|
||||
ABEF_SAVE = STATE0;
|
||||
CDGH_SAVE = STATE1;
|
||||
|
||||
// Rounds 0-3
|
||||
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
|
||||
|
||||
// Rounds 8-11
|
||||
TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
|
||||
TMSG0 = _mm256_add_epi32( TMSG0, TMP );
|
||||
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
|
||||
|
||||
// Rounds 16-19
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
|
||||
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
|
||||
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
|
||||
|
||||
// Rounds 20-23
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
|
||||
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
|
||||
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
|
||||
|
||||
// Rounds 24-27
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
|
||||
TMSG3 = _mm256_add_epi32( TMSG3, TMP );
|
||||
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
|
||||
|
||||
// Rounds 28-31
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG) ;
|
||||
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
|
||||
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
|
||||
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
|
||||
|
||||
// Rounds 32-35
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
|
||||
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
|
||||
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
|
||||
|
||||
// Rounds 36-39
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
|
||||
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
|
||||
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
|
||||
|
||||
// Rounds 40-43
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
|
||||
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
|
||||
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
|
||||
|
||||
// Rounds 44-47
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
|
||||
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
|
||||
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
|
||||
|
||||
// Rounds 48-51
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
|
||||
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
|
||||
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
|
||||
|
||||
// Rounds 56-59
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG2, TMSG1 ) ;
|
||||
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
|
||||
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
|
||||
// Rounds 60-63
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
|
||||
STATE1 = _mm256_sha512nds2_epi64( STATE1, STATE0, MSG );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
|
||||
// Add initial state
|
||||
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
|
||||
STATE1 = _mm256_add_epi64( STATE1, CDGH_SAVE );
|
||||
|
||||
TMP = _mm256_permute4x64_epi64( STATE0, 0x1B ); // FEBA
|
||||
STATE1 = _mm256_permute4x64_epi64( STATE1, 0xB1 ); // DCHG
|
||||
STATE0 = _mm256_blend_epi32( TMP, STATE1, 0xF0 ); // DCBA
|
||||
STATE1 = _mm256_permute2x128_si256( STATE1, TMP, 0x21 ); // ABEF
|
||||
|
||||
// Save state
|
||||
_mm256_store_si256((__m256i*) &state_out[0], STATE0 );
|
||||
_mm256_store_si256((__m256i*) &state_out[4], STATE1 );
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
@@ -117,7 +489,7 @@ static const uint64_t K512[80] =
|
||||
|
||||
#define SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i ) \
|
||||
do { \
|
||||
__m512i T0 = _mm512_add_epi64( _mm512_set1_epi64( K512[i] ), W[ i ] ); \
|
||||
__m512i T0 = _mm512_add_epi64( v512_64( K512[i] ), W[ i ] ); \
|
||||
__m512i T1 = BSG8W_5_1( E ); \
|
||||
__m512i T2 = BSG8W_5_0( A ); \
|
||||
T0 = _mm512_add_epi64( T0, CH8W( E, F, G ) ); \
|
||||
@@ -155,14 +527,14 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
|
||||
}
|
||||
else
|
||||
{
|
||||
A = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
B = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
C = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
D = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
E = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
F = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
G = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
H = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
A = v512_64( 0x6A09E667F3BCC908 );
|
||||
B = v512_64( 0xBB67AE8584CAA73B );
|
||||
C = v512_64( 0x3C6EF372FE94F82B );
|
||||
D = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
E = v512_64( 0x510E527FADE682D1 );
|
||||
F = v512_64( 0x9B05688C2B3E6C1F );
|
||||
G = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
H = v512_64( 0x5BE0CD19137E2179 );
|
||||
}
|
||||
|
||||
for ( i = 0; i < 80; i += 8 )
|
||||
@@ -191,14 +563,14 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
|
||||
else
|
||||
{
|
||||
ctx->initialized = true;
|
||||
r[0] = _mm512_add_epi64( A, _mm512_set1_epi64( 0x6A09E667F3BCC908 ) );
|
||||
r[1] = _mm512_add_epi64( B, _mm512_set1_epi64( 0xBB67AE8584CAA73B ) );
|
||||
r[2] = _mm512_add_epi64( C, _mm512_set1_epi64( 0x3C6EF372FE94F82B ) );
|
||||
r[3] = _mm512_add_epi64( D, _mm512_set1_epi64( 0xA54FF53A5F1D36F1 ) );
|
||||
r[4] = _mm512_add_epi64( E, _mm512_set1_epi64( 0x510E527FADE682D1 ) );
|
||||
r[5] = _mm512_add_epi64( F, _mm512_set1_epi64( 0x9B05688C2B3E6C1F ) );
|
||||
r[6] = _mm512_add_epi64( G, _mm512_set1_epi64( 0x1F83D9ABFB41BD6B ) );
|
||||
r[7] = _mm512_add_epi64( H, _mm512_set1_epi64( 0x5BE0CD19137E2179 ) );
|
||||
r[0] = _mm512_add_epi64( A, v512_64( 0x6A09E667F3BCC908 ) );
|
||||
r[1] = _mm512_add_epi64( B, v512_64( 0xBB67AE8584CAA73B ) );
|
||||
r[2] = _mm512_add_epi64( C, v512_64( 0x3C6EF372FE94F82B ) );
|
||||
r[3] = _mm512_add_epi64( D, v512_64( 0xA54FF53A5F1D36F1 ) );
|
||||
r[4] = _mm512_add_epi64( E, v512_64( 0x510E527FADE682D1 ) );
|
||||
r[5] = _mm512_add_epi64( F, v512_64( 0x9B05688C2B3E6C1F ) );
|
||||
r[6] = _mm512_add_epi64( G, v512_64( 0x1F83D9ABFB41BD6B ) );
|
||||
r[7] = _mm512_add_epi64( H, v512_64( 0x5BE0CD19137E2179 ) );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -243,7 +615,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
|
||||
|
||||
ptr = (unsigned)sc->count & (buf_size - 1U);
|
||||
sc->buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
|
||||
sc->buf[ ptr>>3 ] = v512_64( 0x80 );
|
||||
ptr += 8;
|
||||
if ( ptr > pad )
|
||||
{
|
||||
@@ -255,9 +627,9 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
||||
memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
|
||||
|
||||
sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
|
||||
_mm512_set1_epi64( sc->count >> 61 ), shuff_bswap64 );
|
||||
v512_64( sc->count >> 61 ), shuff_bswap64 );
|
||||
sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
|
||||
_mm512_set1_epi64( sc->count << 3 ), shuff_bswap64 );
|
||||
v512_64( sc->count << 3 ), shuff_bswap64 );
|
||||
sha512_8way_round( sc, sc->buf, sc->val );
|
||||
|
||||
mm512_block_bswap_64( dst, sc->val );
|
||||
@@ -295,7 +667,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
||||
|
||||
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
|
||||
__m256i T0 = _mm256_add_epi64( v256_64( K512[i] ), W[i] ); \
|
||||
__m256i T1 = BSG5_1( E ); \
|
||||
__m256i T2 = BSG5_0( A ); \
|
||||
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
|
||||
@@ -317,7 +689,7 @@ do { \
|
||||
|
||||
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
|
||||
__m256i T0 = _mm256_add_epi64( v256_64( K512[i] ), W[i] ); \
|
||||
__m256i T1 = BSG5_1( E ); \
|
||||
__m256i T2 = BSG5_0( A ); \
|
||||
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
|
||||
@@ -364,14 +736,14 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
|
||||
}
|
||||
else
|
||||
{
|
||||
A = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
|
||||
B = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
|
||||
C = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
|
||||
D = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
|
||||
E = _mm256_set1_epi64x( 0x510E527FADE682D1 );
|
||||
F = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
|
||||
G = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
|
||||
H = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
|
||||
A = v256_64( 0x6A09E667F3BCC908 );
|
||||
B = v256_64( 0xBB67AE8584CAA73B );
|
||||
C = v256_64( 0x3C6EF372FE94F82B );
|
||||
D = v256_64( 0xA54FF53A5F1D36F1 );
|
||||
E = v256_64( 0x510E527FADE682D1 );
|
||||
F = v256_64( 0x9B05688C2B3E6C1F );
|
||||
G = v256_64( 0x1F83D9ABFB41BD6B );
|
||||
H = v256_64( 0x5BE0CD19137E2179 );
|
||||
}
|
||||
|
||||
#if !defined(__AVX512VL__)
|
||||
@@ -405,14 +777,14 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
|
||||
else
|
||||
{
|
||||
ctx->initialized = true;
|
||||
r[0] = _mm256_add_epi64( A, _mm256_set1_epi64x( 0x6A09E667F3BCC908 ) );
|
||||
r[1] = _mm256_add_epi64( B, _mm256_set1_epi64x( 0xBB67AE8584CAA73B ) );
|
||||
r[2] = _mm256_add_epi64( C, _mm256_set1_epi64x( 0x3C6EF372FE94F82B ) );
|
||||
r[3] = _mm256_add_epi64( D, _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 ) );
|
||||
r[4] = _mm256_add_epi64( E, _mm256_set1_epi64x( 0x510E527FADE682D1 ) );
|
||||
r[5] = _mm256_add_epi64( F, _mm256_set1_epi64x( 0x9B05688C2B3E6C1F ) );
|
||||
r[6] = _mm256_add_epi64( G, _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B ) );
|
||||
r[7] = _mm256_add_epi64( H, _mm256_set1_epi64x( 0x5BE0CD19137E2179 ) );
|
||||
r[0] = _mm256_add_epi64( A, v256_64( 0x6A09E667F3BCC908 ) );
|
||||
r[1] = _mm256_add_epi64( B, v256_64( 0xBB67AE8584CAA73B ) );
|
||||
r[2] = _mm256_add_epi64( C, v256_64( 0x3C6EF372FE94F82B ) );
|
||||
r[3] = _mm256_add_epi64( D, v256_64( 0xA54FF53A5F1D36F1 ) );
|
||||
r[4] = _mm256_add_epi64( E, v256_64( 0x510E527FADE682D1 ) );
|
||||
r[5] = _mm256_add_epi64( F, v256_64( 0x9B05688C2B3E6C1F ) );
|
||||
r[6] = _mm256_add_epi64( G, v256_64( 0x1F83D9ABFB41BD6B ) );
|
||||
r[7] = _mm256_add_epi64( H, v256_64( 0x5BE0CD19137E2179 ) );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -457,7 +829,7 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
|
||||
|
||||
ptr = (unsigned)sc->count & (buf_size - 1U);
|
||||
sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
|
||||
sc->buf[ ptr>>3 ] = v256_64( 0x80 );
|
||||
ptr += 8;
|
||||
if ( ptr > pad )
|
||||
{
|
||||
@@ -469,9 +841,9 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
|
||||
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
|
||||
|
||||
sc->buf[ pad >> 3 ] = _mm256_shuffle_epi8(
|
||||
_mm256_set1_epi64x( sc->count >> 61 ), shuff_bswap64 );
|
||||
v256_64( sc->count >> 61 ), shuff_bswap64 );
|
||||
sc->buf[ ( pad+8 ) >> 3 ] = _mm256_shuffle_epi8(
|
||||
_mm256_set1_epi64x( sc->count << 3 ), shuff_bswap64 );
|
||||
v256_64( sc->count << 3 ), shuff_bswap64 );
|
||||
sha512_4way_round( sc, sc->buf, sc->val );
|
||||
|
||||
mm256_block_bswap_64( dst, sc->val );
|
||||
|
@@ -5,11 +5,32 @@
|
||||
#include "simd-utils.h"
|
||||
#include "sph_sha2.h"
|
||||
|
||||
#if defined(__SHA512__) && defined(__AVX2__)
|
||||
|
||||
// Experimental, untested
|
||||
// Need to substitute for sph_sha512
|
||||
|
||||
// Context for the scalar-layout SHA-512 that uses the SHA512 extension,
// declared only when __SHA512__ and __AVX2__ are both defined.
// buf:   16 x 64 bit words = 128 bytes, the size of one SHA-512 block.
// val:   8 x 64 bit words, presumably the chaining state - TODO confirm.
// count: presumably the running length of the hashed data - TODO confirm,
//        no function using this context is visible here.
// The type is declared with 64 byte alignment.
typedef struct
|
||||
{
|
||||
uint64_t buf[128>>3];
|
||||
uint64_t val[8];
|
||||
uint64_t count;
|
||||
} sha512_context __attribute__ ((aligned (64)));
|
||||
|
||||
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
|
||||
const uint64_t *state_in );
|
||||
|
||||
void sha512_opt_transform_le( uint64_t *state_out, const void *input,
|
||||
const uint64_t *state_in );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// SHA-512 8 way
|
||||
|
||||
typedef struct {
|
||||
typedef struct
|
||||
{
|
||||
__m512i buf[128>>3];
|
||||
__m512i val[8];
|
||||
uint64_t count;
|
||||
@@ -28,7 +49,8 @@ void sha512_8way_full( void *dst, const void *data, size_t len );
|
||||
|
||||
// SHA-512 4 way
|
||||
|
||||
typedef struct {
|
||||
typedef struct
|
||||
{
|
||||
__m256i buf[128>>3];
|
||||
__m256i val[8];
|
||||
uint64_t count;
|
||||
|
@@ -16,14 +16,14 @@ static void sha512256d_8way_init( sha512_8way_context *ctx )
|
||||
{
|
||||
ctx->count = 0;
|
||||
ctx->initialized = true;
|
||||
ctx->val[0] = _mm512_set1_epi64( 0x22312194FC2BF72C );
|
||||
ctx->val[1] = _mm512_set1_epi64( 0x9F555FA3C84C64C2 );
|
||||
ctx->val[2] = _mm512_set1_epi64( 0x2393B86B6F53B151 );
|
||||
ctx->val[3] = _mm512_set1_epi64( 0x963877195940EABD );
|
||||
ctx->val[4] = _mm512_set1_epi64( 0x96283EE2A88EFFE3 );
|
||||
ctx->val[5] = _mm512_set1_epi64( 0xBE5E1E2553863992 );
|
||||
ctx->val[6] = _mm512_set1_epi64( 0x2B0199FC2C85B8AA );
|
||||
ctx->val[7] = _mm512_set1_epi64( 0x0EB72DDC81C52CA2 );
|
||||
ctx->val[0] = v512_64( 0x22312194FC2BF72C );
|
||||
ctx->val[1] = v512_64( 0x9F555FA3C84C64C2 );
|
||||
ctx->val[2] = v512_64( 0x2393B86B6F53B151 );
|
||||
ctx->val[3] = v512_64( 0x963877195940EABD );
|
||||
ctx->val[4] = v512_64( 0x96283EE2A88EFFE3 );
|
||||
ctx->val[5] = v512_64( 0xBE5E1E2553863992 );
|
||||
ctx->val[6] = v512_64( 0x2B0199FC2C85B8AA );
|
||||
ctx->val[7] = v512_64( 0x0EB72DDC81C52CA2 );
|
||||
}
|
||||
|
||||
int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
|
||||
@@ -43,7 +43,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
|
||||
__m512i *noncev = (__m512i*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i eight = _mm512_set1_epi64( 0x0000000800000000 );
|
||||
const __m512i eight = v512_64( 0x0000000800000000 );
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
*noncev = mm512_intrlv_blend_32(
|
||||
@@ -84,14 +84,14 @@ static void sha512256d_4way_init( sha512_4way_context *ctx )
|
||||
{
|
||||
ctx->count = 0;
|
||||
ctx->initialized = true;
|
||||
ctx->val[0] = _mm256_set1_epi64x( 0x22312194FC2BF72C );
|
||||
ctx->val[1] = _mm256_set1_epi64x( 0x9F555FA3C84C64C2 );
|
||||
ctx->val[2] = _mm256_set1_epi64x( 0x2393B86B6F53B151 );
|
||||
ctx->val[3] = _mm256_set1_epi64x( 0x963877195940EABD );
|
||||
ctx->val[4] = _mm256_set1_epi64x( 0x96283EE2A88EFFE3 );
|
||||
ctx->val[5] = _mm256_set1_epi64x( 0xBE5E1E2553863992 );
|
||||
ctx->val[6] = _mm256_set1_epi64x( 0x2B0199FC2C85B8AA );
|
||||
ctx->val[7] = _mm256_set1_epi64x( 0x0EB72DDC81C52CA2 );
|
||||
ctx->val[0] = v256_64( 0x22312194FC2BF72C );
|
||||
ctx->val[1] = v256_64( 0x9F555FA3C84C64C2 );
|
||||
ctx->val[2] = v256_64( 0x2393B86B6F53B151 );
|
||||
ctx->val[3] = v256_64( 0x963877195940EABD );
|
||||
ctx->val[4] = v256_64( 0x96283EE2A88EFFE3 );
|
||||
ctx->val[5] = v256_64( 0xBE5E1E2553863992 );
|
||||
ctx->val[6] = v256_64( 0x2B0199FC2C85B8AA );
|
||||
ctx->val[7] = v256_64( 0x0EB72DDC81C52CA2 );
|
||||
}
|
||||
|
||||
int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
|
||||
@@ -111,7 +111,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
|
||||
__m256i *noncev = (__m256i*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i four = _mm256_set1_epi64x( 0x0000000400000000 );
|
||||
const __m256i four = v256_64( 0x0000000400000000 );
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
*noncev = mm256_intrlv_blend_32(
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -10,6 +10,33 @@
|
||||
|
||||
#define SPH_SIZE_shabal512 512
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Context for the 16 way Shabal hash built on 512 bit vectors. The same
// structure is aliased as both shabal256_16way_context and
// shabal512_16way_context. The type is declared with 64 byte alignment.
// buf:          16 vectors of buffered input.
// A, B, C:      12, 16 and 16 vectors, presumably the Shabal state words
//               - TODO confirm against the implementation file.
// Whigh, Wlow:  presumably the two halves of the block counter - TODO confirm.
// ptr:          presumably the fill position in buf - TODO confirm.
// state_loaded: NOTE(review): meaning not visible from this header.
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i A[12], B[16], C[16];
|
||||
uint32_t Whigh, Wlow;
|
||||
size_t ptr;
|
||||
bool state_loaded;
|
||||
} shabal_16way_context __attribute__ ((aligned (64)));
|
||||
|
||||
typedef shabal_16way_context shabal256_16way_context;
|
||||
typedef shabal_16way_context shabal512_16way_context;
|
||||
|
||||
void shabal256_16way_init( void *cc );
|
||||
void shabal256_16way_update( void *cc, const void *data, size_t len );
|
||||
void shabal256_16way_close( void *cc, void *dst );
|
||||
void shabal256_16way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
void shabal512_16way_init( void *cc );
|
||||
void shabal512_16way_update( void *cc, const void *data, size_t len );
|
||||
void shabal512_16way_close( void *cc, void *dst );
|
||||
void shabal512_16way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
typedef struct {
|
||||
|
@@ -2,7 +2,7 @@
|
||||
#include "c11-gate.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
|
@@ -6,7 +6,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
|
@@ -6,7 +6,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
|
@@ -2,7 +2,7 @@
|
||||
#include "x11-gate.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
|
@@ -6,7 +6,7 @@
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <compat/portable_endian.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
|
@@ -2,7 +2,7 @@
|
||||
#include "x11gost-gate.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
|
@@ -6,7 +6,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
|
@@ -41,9 +41,9 @@ int hex_hash( void* output, const void* input, int thrid )
|
||||
switch ( algo )
|
||||
{
|
||||
case BLAKE:
|
||||
sph_blake512_init( &ctx.blake );
|
||||
sph_blake512( &ctx.blake, in, size );
|
||||
sph_blake512_close( &ctx.blake, hash );
|
||||
blake512_init( &ctx.blake );
|
||||
blake512_update( &ctx.blake, in, size );
|
||||
blake512_close( &ctx.blake, hash );
|
||||
break;
|
||||
case BMW:
|
||||
sph_bmw512_init( &ctx.bmw );
|
||||
|
@@ -5,7 +5,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
@@ -50,7 +50,7 @@ struct TortureGarden
|
||||
sph_groestl512_context groestl;
|
||||
sph_fugue512_context fugue;
|
||||
#endif
|
||||
sph_blake512_context blake;
|
||||
blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
sph_skein512_context skein;
|
||||
sph_jh512_context jh;
|
||||
@@ -80,9 +80,9 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
|
||||
switch ( algo )
|
||||
{
|
||||
case 0:
|
||||
sph_blake512_init(&garden->blake);
|
||||
sph_blake512(&garden->blake, input, 64);
|
||||
sph_blake512_close(&garden->blake, hash);
|
||||
blake512_init(&garden->blake);
|
||||
blake512_update(&garden->blake, input, 64);
|
||||
blake512_close(&garden->blake, hash);
|
||||
break;
|
||||
case 1:
|
||||
sph_bmw512_init(&garden->bmw);
|
||||
|
@@ -5,7 +5,7 @@
|
||||
#include "simd-utils.h"
|
||||
#include <stdint.h>
|
||||
#include <unistd.h>
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
@@ -30,7 +30,6 @@
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
@@ -199,7 +198,7 @@ union _x16r_context_overlay
|
||||
sph_echo512_context echo;
|
||||
sph_fugue512_context fugue;
|
||||
#endif
|
||||
sph_blake512_context blake;
|
||||
blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
sph_skein512_context skein;
|
||||
sph_jh512_context jh;
|
||||
|
@@ -64,9 +64,9 @@ int x16r_hash_generic( void* output, const void* input, int thrid )
|
||||
switch ( algo )
|
||||
{
|
||||
case BLAKE:
|
||||
sph_blake512_init( &ctx.blake );
|
||||
sph_blake512( &ctx.blake, in, size );
|
||||
sph_blake512_close( &ctx.blake, hash );
|
||||
blake512_init( &ctx.blake );
|
||||
blake512_update( &ctx.blake, in, size );
|
||||
blake512_close( &ctx.blake, hash );
|
||||
break;
|
||||
case BMW:
|
||||
sph_bmw512_init( &ctx.bmw );
|
||||
@@ -219,13 +219,13 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce,
|
||||
mm128_bswap32_80( edata, pdata );
|
||||
|
||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||
if ( s_ntime != pdata[17] )
|
||||
uint32_t ntime = bswap_32( pdata[17] );
|
||||
if ( s_ntime != ntime )
|
||||
{
|
||||
uint32_t ntime = swab32(pdata[17]);
|
||||
x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
|
||||
s_ntime = ntime;
|
||||
if ( opt_debug && !thr_id )
|
||||
applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
|
||||
if ( !opt_quiet && !thr_id )
|
||||
applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
|
||||
}
|
||||
|
||||
x16r_prehash( edata, pdata );
|
||||
|
@@ -21,7 +21,7 @@ union _x16rv2_context_overlay
|
||||
sph_echo512_context echo;
|
||||
sph_fugue512_context fugue;
|
||||
#endif
|
||||
sph_blake512_context blake;
|
||||
blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
sph_skein512_context skein;
|
||||
sph_jh512_context jh;
|
||||
@@ -58,9 +58,9 @@ int x16rv2_hash( void* output, const void* input, int thrid )
|
||||
switch ( algo )
|
||||
{
|
||||
case BLAKE:
|
||||
sph_blake512_init( &ctx.blake );
|
||||
sph_blake512( &ctx.blake, in, size );
|
||||
sph_blake512_close( &ctx.blake, hash );
|
||||
blake512_init( &ctx.blake );
|
||||
blake512_update( &ctx.blake, in, size );
|
||||
blake512_close( &ctx.blake, hash );
|
||||
break;
|
||||
case BMW:
|
||||
sph_bmw512_init( &ctx.bmw );
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
@@ -27,6 +27,431 @@
|
||||
#include "algo/haval/haval-hash-4way.h"
|
||||
#include "algo/sha/sha512-hash.h"
|
||||
|
||||
#if defined(X17_16X32)
|
||||
|
||||
union _x17_16way_context_overlay
|
||||
{
|
||||
blake512_8way_context blake;
|
||||
bmw512_8way_context bmw;
|
||||
skein512_8way_context skein;
|
||||
jh512_8way_context jh;
|
||||
keccak512_8way_context keccak;
|
||||
luffa_4way_context luffa;
|
||||
cube_4way_2buf_context cube;
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_context groestl;
|
||||
shavite512_4way_context shavite;
|
||||
echo_4way_context echo;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_echo echo;
|
||||
#endif
|
||||
simd_4way_context simd;
|
||||
hamsi512_8way_context hamsi;
|
||||
// hamsi512_16x32_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
shabal512_16way_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
sha512_8way_context sha512;
|
||||
haval256_5_16way_context haval;
|
||||
} __attribute__ ((aligned (64)));
|
||||
typedef union _x17_16way_context_overlay x17_16way_context_overlay;
|
||||
|
||||
static __thread __m512i x17_16way_midstate[16] __attribute__((aligned(64)));
|
||||
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
|
||||
|
||||
int x17_16way_hash( void *state, const __m512i nonceA, const __m512i nonceB,
|
||||
int thr_id )
|
||||
{
|
||||
uint64_t vhashA[8*16] __attribute__ ((aligned (128)));
|
||||
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashC[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashD[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t hash00[8] __attribute__ ((aligned (32)));
|
||||
uint64_t hash01[8] __attribute__ ((aligned (32)));
|
||||
uint64_t hash02[8] __attribute__ ((aligned (32)));
|
||||
uint64_t hash03[8] __attribute__ ((aligned (32)));
|
||||
uint64_t hash04[8] __attribute__ ((aligned (32)));
|
||||
uint64_t hash05[8] __attribute__ ((aligned (32)));
|
||||
uint64_t hash06[8] __attribute__ ((aligned (32)));
|
||||
uint64_t hash07[8] __attribute__ ((aligned (32)));
|
||||
uint64_t hash08[8] __attribute__ ((aligned (32)));
|
||||
uint64_t hash09[8] __attribute__ ((aligned (32)));
|
||||
uint64_t hash10[8] __attribute__ ((aligned (32)));
|
||||
uint64_t hash11[8] __attribute__ ((aligned (32)));
|
||||
uint64_t hash12[8] __attribute__ ((aligned (32)));
|
||||
uint64_t hash13[8] __attribute__ ((aligned (32)));
|
||||
uint64_t hash14[8] __attribute__ ((aligned (32)));
|
||||
uint64_t hash15[8] __attribute__ ((aligned (32)));
|
||||
x17_16way_context_overlay ctx;
|
||||
|
||||
memcpy( &ctx.blake, &blake512_8way_ctx, sizeof (blake512_8way_ctx) );
|
||||
blake512_8way_final_le( &blake512_8way_ctx, vhashA, nonceA,
|
||||
x17_16way_midstate );
|
||||
blake512_8way_final_le( &ctx.blake, vhashB, nonceB,
|
||||
x17_16way_midstate );
|
||||
|
||||
bmw512_8way_full( &ctx.bmw, vhashA, vhashA, 64 );
|
||||
bmw512_8way_full( &ctx.bmw, vhashB, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_8x64_4x128( vhashC, vhashD, vhashA, 512 );
|
||||
groestl512_4way_full( &ctx.groestl, vhashC, vhashC, 64 );
|
||||
groestl512_4way_full( &ctx.groestl, vhashD, vhashD, 64 );
|
||||
rintrlv_4x128_8x64( vhashA, vhashC, vhashD, 512 );
|
||||
|
||||
rintrlv_8x64_4x128( vhashC, vhashD, vhashB, 512 );
|
||||
groestl512_4way_full( &ctx.groestl, vhashC, vhashC, 64 );
|
||||
groestl512_4way_full( &ctx.groestl, vhashD, vhashD, 64 );
|
||||
rintrlv_4x128_8x64( vhashA, vhashC, vhashD, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_8x64_512( hash00, hash01, hash02, hash03,
|
||||
hash04, hash05, hash06, hash07, vhashA );
|
||||
dintrlv_8x64_512( hash08, hash09, hash10, hash11,
|
||||
hash12, hash13, hash14, hash15, vhashB );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash00, (char*)hash00, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash01, (char*)hash01, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash02, (char*)hash02, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash03, (char*)hash03, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash04, (char*)hash04, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash05, (char*)hash05, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash06, (char*)hash06, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash07, (char*)hash07, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash08, (char*)hash08, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash09, (char*)hash09, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash10, (char*)hash10, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash11, (char*)hash11, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash12, (char*)hash12, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash13, (char*)hash13, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash14, (char*)hash14, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash15, (char*)hash15, 512 );
|
||||
|
||||
intrlv_8x64_512( vhashA, hash00, hash01, hash02, hash03,
|
||||
hash04, hash05, hash06, hash07 );
|
||||
intrlv_8x64_512( vhashB, hash08, hash09, hash10, hash11,
|
||||
hash12, hash13, hash14, hash15 );
|
||||
|
||||
#endif
|
||||
|
||||
skein512_8way_full( &ctx.skein, vhashA, vhashA, 64 );
|
||||
skein512_8way_full( &ctx.skein, vhashB, vhashB, 64 );
|
||||
|
||||
jh512_8way_init( &ctx.jh );
|
||||
jh512_8way_update( &ctx.jh, vhashA, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhashA );
|
||||
jh512_8way_init( &ctx.jh );
|
||||
jh512_8way_update( &ctx.jh, vhashB, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhashB );
|
||||
|
||||
keccak512_8way_init( &ctx.keccak );
|
||||
keccak512_8way_update( &ctx.keccak, vhashA, 64 );
|
||||
keccak512_8way_close( &ctx.keccak, vhashA );
|
||||
keccak512_8way_init( &ctx.keccak );
|
||||
keccak512_8way_update( &ctx.keccak, vhashB, 64 );
|
||||
keccak512_8way_close( &ctx.keccak, vhashB );
|
||||
|
||||
//
|
||||
rintrlv_8x64_4x128( vhashC, vhashD, vhashA, 512 );
|
||||
|
||||
luffa512_4way_full( &ctx.luffa, vhashC, vhashC, 64 );
|
||||
luffa512_4way_full( &ctx.luffa, vhashD, vhashD, 64 );
|
||||
|
||||
cube_4way_2buf_full( &ctx.cube, vhashC, vhashD, 512, vhashC, vhashD, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
shavite512_4way_full( &ctx.shavite, vhashC, vhashC, 64 );
|
||||
shavite512_4way_full( &ctx.shavite, vhashD, vhashD, 64 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x128_512( hash00, hash01, hash02, hash03, vhashC );
|
||||
dintrlv_4x128_512( hash04, hash05, hash06, hash07, vhashD );
|
||||
|
||||
shavite512_full( &ctx.shavite, hash00, hash00, 64 );
|
||||
shavite512_full( &ctx.shavite, hash01, hash01, 64 );
|
||||
shavite512_full( &ctx.shavite, hash02, hash02, 64 );
|
||||
shavite512_full( &ctx.shavite, hash03, hash03, 64 );
|
||||
shavite512_full( &ctx.shavite, hash04, hash04, 64 );
|
||||
shavite512_full( &ctx.shavite, hash05, hash05, 64 );
|
||||
shavite512_full( &ctx.shavite, hash06, hash06, 64 );
|
||||
shavite512_full( &ctx.shavite, hash07, hash07, 64 );
|
||||
|
||||
intrlv_4x128_512( vhashC, hash00, hash01, hash02, hash03 );
|
||||
intrlv_4x128_512( vhashD, hash04, hash05, hash06, hash07 );
|
||||
|
||||
#endif
|
||||
|
||||
simd512_4way_full( &ctx.simd, vhashC, vhashC, 64 );
|
||||
simd512_4way_full( &ctx.simd, vhashD, vhashD, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_4way_full( &ctx.echo, vhashC, 512, vhashC, 64 );
|
||||
echo_4way_full( &ctx.echo, vhashD, 512, vhashD, 64 );
|
||||
|
||||
rintrlv_4x128_8x64( vhashA, vhashC, vhashD, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x128_512( hash00, hash01, hash02, hash03, vhashC );
|
||||
dintrlv_4x128_512( hash04, hash05, hash06, hash07, vhashD );
|
||||
|
||||
echo_full( &ctx.echo, (BitSequence *)hash00, 512,
|
||||
(const BitSequence *)hash00, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash01, 512,
|
||||
(const BitSequence *)hash01, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash02, 512,
|
||||
(const BitSequence *)hash02, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash03, 512,
|
||||
(const BitSequence *)hash03, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash04, 512,
|
||||
(const BitSequence *)hash04, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash05, 512,
|
||||
(const BitSequence *)hash05, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash06, 512,
|
||||
(const BitSequence *)hash06, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash07, 512,
|
||||
(const BitSequence *)hash07, 64 );
|
||||
|
||||
intrlv_8x64_512( vhashA, hash00, hash01, hash02, hash03,
|
||||
hash04, hash05, hash06, hash07 );
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
rintrlv_8x64_4x128( vhashC, vhashD, vhashB, 512 );
|
||||
|
||||
luffa512_4way_full( &ctx.luffa, vhashC, vhashC, 64 );
|
||||
luffa512_4way_full( &ctx.luffa, vhashD, vhashD, 64 );
|
||||
|
||||
cube_4way_2buf_full( &ctx.cube, vhashC, vhashD, 512, vhashC, vhashD, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
shavite512_4way_full( &ctx.shavite, vhashC, vhashC, 64 );
|
||||
shavite512_4way_full( &ctx.shavite, vhashD, vhashD, 64 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x128_512( hash08, hash09, hash10, hash11, vhashC );
|
||||
dintrlv_4x128_512( hash12, hash13, hash14, hash15, vhashD );
|
||||
|
||||
shavite512_full( &ctx.shavite, hash08, hash08, 64 );
|
||||
shavite512_full( &ctx.shavite, hash09, hash09, 64 );
|
||||
shavite512_full( &ctx.shavite, hash10, hash10, 64 );
|
||||
shavite512_full( &ctx.shavite, hash11, hash11, 64 );
|
||||
shavite512_full( &ctx.shavite, hash12, hash12, 64 );
|
||||
shavite512_full( &ctx.shavite, hash13, hash13, 64 );
|
||||
shavite512_full( &ctx.shavite, hash14, hash14, 64 );
|
||||
shavite512_full( &ctx.shavite, hash15, hash15, 64 );
|
||||
|
||||
intrlv_4x128_512( vhashC, hash08, hash09, hash10, hash11 );
|
||||
intrlv_4x128_512( vhashD, hash12, hash13, hash14, hash15 );
|
||||
|
||||
#endif
|
||||
|
||||
simd512_4way_full( &ctx.simd, vhashC, vhashC, 64 );
|
||||
simd512_4way_full( &ctx.simd, vhashD, vhashD, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_4way_full( &ctx.echo, vhashC, 512, vhashC, 64 );
|
||||
echo_4way_full( &ctx.echo, vhashD, 512, vhashD, 64 );
|
||||
|
||||
rintrlv_4x128_8x64( vhashB, vhashC, vhashD, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x128_512( hash08, hash09, hash10, hash11, vhashC );
|
||||
dintrlv_4x128_512( hash12, hash13, hash14, hash15, vhashD );
|
||||
|
||||
echo_full( &ctx.echo, (BitSequence *)hash08, 512,
|
||||
(const BitSequence *)hash08, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash09, 512,
|
||||
(const BitSequence *)hash09, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash10, 512,
|
||||
(const BitSequence *)hash10, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash11, 512,
|
||||
(const BitSequence *)hash11, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash12, 512,
|
||||
(const BitSequence *)hash12, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash13, 512,
|
||||
(const BitSequence *)hash13, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash14, 512,
|
||||
(const BitSequence *)hash14, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash15, 512,
|
||||
(const BitSequence *)hash15, 64 );
|
||||
|
||||
intrlv_8x64_512( vhashB, hash08, hash09, hash10, hash11,
|
||||
hash12, hash13, hash14, hash15 );
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
/*
|
||||
intrlv_16x32( vhashA, hash00, hash01, hash02, hash03,
|
||||
hash04, hash05, hash06, hash07,
|
||||
hash08, hash09, hash10, hash11,
|
||||
hash12, hash13, hash14, hash15, 512 );
|
||||
hamsi512_16x32_full( &ctx.hamsi, vhashA, vhashA, 64 );
|
||||
dintrlv_16x32( hash00, hash01, hash02, hash03,
|
||||
hash04, hash05, hash06, hash07,
|
||||
hash08, hash09, hash10, hash11,
|
||||
hash12, hash13, hash14, hash15, vhashA, 512 );
|
||||
*/
|
||||
|
||||
|
||||
hamsi512_8way_init( &ctx.hamsi );
|
||||
hamsi512_8way_update( &ctx.hamsi, vhashA, 64 );
|
||||
hamsi512_8way_close( &ctx.hamsi, vhashA );
|
||||
dintrlv_8x64_512( hash00, hash01, hash02, hash03,
|
||||
hash04, hash05, hash06, hash07, vhashA );
|
||||
hamsi512_8way_init( &ctx.hamsi );
|
||||
hamsi512_8way_update( &ctx.hamsi, vhashB, 64 );
|
||||
hamsi512_8way_close( &ctx.hamsi, vhashB );
|
||||
dintrlv_8x64_512( hash08, hash09, hash10, hash11,
|
||||
hash12, hash13, hash14, hash15, vhashB );
|
||||
|
||||
|
||||
fugue512_full( &ctx.fugue, hash00, hash00, 64 );
|
||||
fugue512_full( &ctx.fugue, hash01, hash01, 64 );
|
||||
fugue512_full( &ctx.fugue, hash02, hash02, 64 );
|
||||
fugue512_full( &ctx.fugue, hash03, hash03, 64 );
|
||||
fugue512_full( &ctx.fugue, hash04, hash04, 64 );
|
||||
fugue512_full( &ctx.fugue, hash05, hash05, 64 );
|
||||
fugue512_full( &ctx.fugue, hash06, hash06, 64 );
|
||||
fugue512_full( &ctx.fugue, hash07, hash07, 64 );
|
||||
fugue512_full( &ctx.fugue, hash08, hash08, 64 );
|
||||
fugue512_full( &ctx.fugue, hash09, hash09, 64 );
|
||||
fugue512_full( &ctx.fugue, hash10, hash10, 64 );
|
||||
fugue512_full( &ctx.fugue, hash11, hash11, 64 );
|
||||
fugue512_full( &ctx.fugue, hash12, hash12, 64 );
|
||||
fugue512_full( &ctx.fugue, hash13, hash13, 64 );
|
||||
fugue512_full( &ctx.fugue, hash14, hash14, 64 );
|
||||
fugue512_full( &ctx.fugue, hash15, hash15, 64 );
|
||||
|
||||
intrlv_16x32_512( vhashA, hash00, hash01, hash02, hash03,
|
||||
hash04, hash05, hash06, hash07,
|
||||
hash08, hash09, hash10, hash11,
|
||||
hash12, hash13, hash14, hash15 );
|
||||
|
||||
shabal512_16way_init( &ctx.shabal );
|
||||
shabal512_16way_update( &ctx.shabal, vhashA, 64 );
|
||||
shabal512_16way_close( &ctx.shabal, vhashA );
|
||||
|
||||
dintrlv_16x32_512( hash00, hash01, hash02, hash03,
|
||||
hash04, hash05, hash06, hash07,
|
||||
hash08, hash09, hash10, hash11,
|
||||
hash12, hash13, hash14, hash15, vhashA );
|
||||
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash00, hash00, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash01, hash01, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash02, hash02, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash03, hash03, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash04, hash04, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash05, hash05, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash06, hash06, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash07, hash07, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash08, hash08, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash09, hash09, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash10, hash10, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash11, hash11, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash12, hash12, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash13, hash13, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash14, hash14, 64 );
|
||||
sph_whirlpool512_full( &ctx.whirlpool, hash15, hash15, 64 );
|
||||
|
||||
intrlv_8x64_512( vhashA, hash00, hash01, hash02, hash03,
|
||||
hash04, hash05, hash06, hash07 );
|
||||
intrlv_8x64_512( vhashB, hash08, hash09, hash10, hash11,
|
||||
hash12, hash13, hash14, hash15 );
|
||||
|
||||
sha512_8way_init( &ctx.sha512 );
|
||||
sha512_8way_update( &ctx.sha512, vhashA, 64 );
|
||||
sha512_8way_close( &ctx.sha512, vhashA );
|
||||
sha512_8way_init( &ctx.sha512 );
|
||||
sha512_8way_update( &ctx.sha512, vhashB, 64 );
|
||||
sha512_8way_close( &ctx.sha512, vhashB );
|
||||
|
||||
dintrlv_8x64_512( hash00, hash01, hash02, hash03,
|
||||
hash04, hash05, hash06, hash07, vhashA );
|
||||
dintrlv_8x64_512( hash08, hash09, hash10, hash11,
|
||||
hash12, hash13, hash14, hash15, vhashB );
|
||||
intrlv_16x32_512( vhashA, hash00, hash01, hash02, hash03,
|
||||
hash04, hash05, hash06, hash07,
|
||||
hash08, hash09, hash10, hash11,
|
||||
hash12, hash13, hash14, hash15 );
|
||||
|
||||
haval256_5_16way_init( &ctx.haval );
|
||||
haval256_5_16way_update( &ctx.haval, vhashA, 64 );
|
||||
haval256_5_16way_close( &ctx.haval, state );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int scanhash_x17_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash32[8*16] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
__m128i edata[5] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
uint32_t *hash32_d7 = &(hash32[7*16]);
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
__m512i nonceA, nonceB;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const __m512i sixteen = v512_64( 16 );
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
// convert LE32 to LE64
|
||||
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
|
||||
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
|
||||
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
|
||||
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
|
||||
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
|
||||
|
||||
mm512_intrlv80_8x64( vdata, edata );
|
||||
blake512_8way_prehash_le( &blake512_8way_ctx, x17_16way_midstate, vdata );
|
||||
|
||||
nonceA = _mm512_add_epi32( casti_m512i( vdata, 9 ),
|
||||
_mm512_set_epi64( 7, 6, 5, 4, 3, 2, 1, 0 ) );
|
||||
nonceB = _mm512_add_epi32( nonceA, v512_64( 8 ) );
|
||||
do
|
||||
{
|
||||
if ( likely( x17_16way_hash( hash32, nonceA, nonceB, thr_id ) ) )
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
nonceA = _mm512_add_epi32( nonceA, sixteen );
|
||||
nonceB = _mm512_add_epi32( nonceB, sixteen );
|
||||
n += 16;
|
||||
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(X17_8WAY)
|
||||
|
||||
union _x17_8way_context_overlay
|
||||
|
@@ -2,7 +2,10 @@
|
||||
|
||||
bool register_x17_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X17_8WAY)
|
||||
#if defined (X17_16X32)
|
||||
gate->scanhash = (void*)&scanhash_x17_16way;
|
||||
// gate->hash = (void*)&x17_16way_hash;
|
||||
#elif defined (X17_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_x17_8way;
|
||||
gate->hash = (void*)&x17_8way_hash;
|
||||
#elif defined (X17_4WAY)
|
||||
|
@@ -6,13 +6,21 @@
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define X17_8WAY 1
|
||||
#define X17_16X32 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
#define X17_4WAY 1
|
||||
#define X17_8X32 1
|
||||
#endif
|
||||
|
||||
bool register_x17_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(X17_8WAY)
|
||||
|
||||
#if defined(X17_8WAY) || defined(X17_16X32)
|
||||
|
||||
int scanhash_x17_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
//int x17_16way_hash( void *state, const void *input, int thr_id );
|
||||
|
||||
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
@@ -6,7 +6,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
@@ -32,8 +32,8 @@
|
||||
|
||||
union _x17_context_overlay
|
||||
{
|
||||
sph_blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
#if defined(__AES__)
|
||||
hashState_groestl groestl;
|
||||
hashState_echo echo;
|
||||
@@ -60,13 +60,10 @@ typedef union _x17_context_overlay x17_context_overlay;
|
||||
|
||||
int x17_hash(void *output, const void *input, int thr_id )
|
||||
{
|
||||
// unsigned char hash[64 * 4] __attribute__((aligned(64))) = {0};
|
||||
unsigned char hash[64] __attribute__((aligned(64)));
|
||||
x17_context_overlay ctx;
|
||||
|
||||
sph_blake512_init(&ctx.blake);
|
||||
sph_blake512(&ctx.blake, input, 80);
|
||||
sph_blake512_close(&ctx.blake, hash);
|
||||
blake512_full( &ctx.blake, hash, input, 80 );
|
||||
|
||||
sph_bmw512_init(&ctx.bmw);
|
||||
sph_bmw512(&ctx.bmw, (const void*) hash, 64);
|
||||
@@ -95,19 +92,15 @@ int x17_hash(void *output, const void *input, int thr_id )
|
||||
luffa_full( &ctx.luffa, (BitSequence*)hash, 512,
|
||||
(const BitSequence*)hash, 64 );
|
||||
|
||||
// 8 Cube
|
||||
cubehash_full( &ctx.cube, (byte*) hash, 512, (const byte*)hash, 64 );
|
||||
|
||||
// 9 Shavite
|
||||
sph_shavite512_init( &ctx.shavite );
|
||||
sph_shavite512( &ctx.shavite, hash, 64);
|
||||
sph_shavite512_close( &ctx.shavite, hash);
|
||||
|
||||
// 10 Simd
|
||||
simd_full( &ctx.simd, (BitSequence*)hash,
|
||||
(const BitSequence*)hash, 512 );
|
||||
|
||||
//11---echo---
|
||||
#if defined(__AES__)
|
||||
echo_full( &ctx.echo, (BitSequence *)hash, 512,
|
||||
(const BitSequence *)hash, 64 );
|
||||
@@ -117,25 +110,20 @@ int x17_hash(void *output, const void *input, int thr_id )
|
||||
sph_echo512_close( &ctx.echo, hash );
|
||||
#endif
|
||||
|
||||
// X13 algos
|
||||
// 12 Hamsi
|
||||
sph_hamsi512_init( &ctx.hamsi );
|
||||
sph_hamsi512( &ctx.hamsi, hash, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash );
|
||||
|
||||
// 13 Fugue
|
||||
#if defined(__AES__)
|
||||
fugue512_full( &ctx.fugue, hash, hash, 64 );
|
||||
#else
|
||||
sph_fugue512_full( &ctx.fugue, hash, hash, 64 );
|
||||
#endif
|
||||
|
||||
// X14 Shabal
|
||||
sph_shabal512_init( &ctx.shabal );
|
||||
sph_shabal512(&ctx.shabal, hash, 64);
|
||||
sph_shabal512_close( &ctx.shabal, hash );
|
||||
|
||||
// X15 Whirlpool
|
||||
sph_whirlpool_init( &ctx.whirlpool );
|
||||
sph_whirlpool( &ctx.whirlpool, hash, 64 );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash );
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
|
@@ -1,5 +1,5 @@
|
||||
#include "x22i-gate.h"
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
|
@@ -2,7 +2,7 @@
|
||||
|
||||
#if !( defined(X22I_8WAY) || defined(X22I_4WAY) )
|
||||
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
@@ -33,8 +33,8 @@
|
||||
|
||||
union _x22i_context_overlay
|
||||
{
|
||||
sph_blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
#if defined(__AES__)
|
||||
hashState_groestl groestl;
|
||||
hashState_echo echo;
|
||||
@@ -67,9 +67,9 @@ int x22i_hash( void *output, const void *input, int thrid )
|
||||
unsigned char hash2[65] __attribute__((aligned(64))) = {0};
|
||||
x22i_context_overlay ctx;
|
||||
|
||||
sph_blake512_init(&ctx.blake);
|
||||
sph_blake512(&ctx.blake, input, 80);
|
||||
sph_blake512_close(&ctx.blake, hash);
|
||||
blake512_init(&ctx.blake);
|
||||
blake512_update(&ctx.blake, input, 80);
|
||||
blake512_close(&ctx.blake, hash);
|
||||
|
||||
sph_bmw512_init(&ctx.bmw);
|
||||
sph_bmw512(&ctx.bmw, (const void*) hash, 64);
|
||||
|
@@ -1,5 +1,5 @@
|
||||
#include "x22i-gate.h"
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -9,7 +9,7 @@
|
||||
#include "algo/sha/sha512-hash.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#include "algo/haval/haval-hash-4way.h"
|
||||
#include "algo/blake/blake2s-hash-4way.h"
|
||||
#include "algo/blake/blake2s-hash.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
|
@@ -2,7 +2,7 @@
|
||||
|
||||
#if !( defined(X25X_8WAY) || defined(X25X_4WAY) )
|
||||
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
@@ -35,7 +35,7 @@
|
||||
|
||||
union _x25x_context_overlay
|
||||
{
|
||||
sph_blake512_context blake;
|
||||
blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
#if defined(__AES__)
|
||||
hashState_groestl groestl;
|
||||
@@ -71,9 +71,9 @@ int x25x_hash( void *output, const void *input, int thrid )
|
||||
unsigned char hash[25][64] __attribute__((aligned(64))) = {0};
|
||||
x25x_context_overlay ctx;
|
||||
|
||||
sph_blake512_init(&ctx.blake);
|
||||
sph_blake512(&ctx.blake, input, 80);
|
||||
sph_blake512_close(&ctx.blake, &hash[0] );
|
||||
blake512_init(&ctx.blake);
|
||||
blake512_update(&ctx.blake, input, 80);
|
||||
blake512_close(&ctx.blake, &hash[0] );
|
||||
|
||||
sph_bmw512_init(&ctx.bmw);
|
||||
sph_bmw512(&ctx.bmw, (const void*) &hash[0], 64);
|
||||
|
@@ -5,6 +5,6 @@
|
||||
make distclean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
CFLAGS="-O3 --param=evrp-mode=legacy -march=native -Wall -D_WIN32_WINNT=0x0601" ./configure --with-curl
|
||||
CFLAGS="-O3 -march=native -Wall -D_WIN32_WINNT=0x0601" ./configure --with-curl
|
||||
make -j 4
|
||||
strip -s cpuminer
|
||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.2.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.3.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
|
||||
@@ -608,8 +608,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.23.2'
|
||||
PACKAGE_STRING='cpuminer-opt 3.23.2'
|
||||
PACKAGE_VERSION='3.23.3'
|
||||
PACKAGE_STRING='cpuminer-opt 3.23.3'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.23.2 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.23.3 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1432,7 +1432,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.23.2:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.23.3:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1538,7 +1538,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.23.2
|
||||
cpuminer-opt configure 3.23.3
|
||||
generated by GNU Autoconf 2.71
|
||||
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.23.2, which was
|
||||
It was created by cpuminer-opt $as_me 3.23.3, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
$ $0$ac_configure_args_raw
|
||||
@@ -3593,7 +3593,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.23.2'
|
||||
VERSION='3.23.3'
|
||||
|
||||
|
||||
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
|
||||
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.23.2, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.23.3, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config='$ac_cs_config_escaped'
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.23.2
|
||||
cpuminer-opt config.status 3.23.3
|
||||
configured by $0, generated by GNU Autoconf 2.71,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.23.2])
|
||||
AC_INIT([cpuminer-opt], [3.23.3])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
20
configure~
20
configure~
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.23.2.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.23.3.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.23.2'
|
||||
PACKAGE_STRING='cpuminer-opt 3.23.2'
|
||||
PACKAGE_VERSION='3.23.3'
|
||||
PACKAGE_STRING='cpuminer-opt 3.23.3'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.23.2 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.23.3 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1404,7 +1404,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.23.2:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.23.3:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1509,7 +1509,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.23.2
|
||||
cpuminer-opt configure 3.23.3
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.23.2, which was
|
||||
It was created by cpuminer-opt $as_me 3.23.3, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2993,7 +2993,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.23.2'
|
||||
VERSION='3.23.3'
|
||||
|
||||
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
@@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.23.2, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.23.3, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -6784,7 +6784,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.23.2
|
||||
cpuminer-opt config.status 3.23.3
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
127
cpu-miner.c
127
cpu-miner.c
@@ -1586,39 +1586,43 @@ start:
|
||||
else
|
||||
new_work = false;
|
||||
|
||||
if ( new_work && !opt_quiet )
|
||||
if ( new_work )
|
||||
{
|
||||
double miner_hr = 0.;
|
||||
double net_hr = net_hashrate;
|
||||
double nd = net_diff * exp32;
|
||||
char net_hr_units[4] = {0};
|
||||
char miner_hr_units[4] = {0};
|
||||
char net_ttf[32];
|
||||
char miner_ttf[32];
|
||||
if ( !opt_quiet )
|
||||
{
|
||||
double miner_hr = 0.;
|
||||
double net_hr = net_hashrate;
|
||||
double nd = net_diff * exp32;
|
||||
char net_hr_units[4] = {0};
|
||||
char miner_hr_units[4] = {0};
|
||||
char net_ttf[32];
|
||||
char miner_ttf[32];
|
||||
|
||||
pthread_mutex_lock( &stats_lock );
|
||||
pthread_mutex_lock( &stats_lock );
|
||||
|
||||
for ( int i = 0; i < opt_n_threads; i++ )
|
||||
miner_hr += thr_hashrates[i];
|
||||
global_hashrate = miner_hr;
|
||||
for ( int i = 0; i < opt_n_threads; i++ )
|
||||
miner_hr += thr_hashrates[i];
|
||||
global_hashrate = miner_hr;
|
||||
|
||||
pthread_mutex_unlock( &stats_lock );
|
||||
pthread_mutex_unlock( &stats_lock );
|
||||
|
||||
if ( net_hr > 0. )
|
||||
sprintf_et( net_ttf, nd / net_hr );
|
||||
else
|
||||
sprintf( net_ttf, "NA" );
|
||||
if ( miner_hr > 0. )
|
||||
sprintf_et( miner_ttf, nd / miner_hr );
|
||||
else
|
||||
sprintf( miner_ttf, "NA" );
|
||||
if ( net_hr > 0. )
|
||||
sprintf_et( net_ttf, nd / net_hr );
|
||||
else
|
||||
sprintf( net_ttf, "NA" );
|
||||
if ( miner_hr > 0. )
|
||||
sprintf_et( miner_ttf, nd / miner_hr );
|
||||
else
|
||||
sprintf( miner_ttf, "NA" );
|
||||
|
||||
scale_hash_for_display ( &miner_hr, miner_hr_units );
|
||||
scale_hash_for_display ( &net_hr, net_hr_units );
|
||||
applog2( LOG_INFO,
|
||||
scale_hash_for_display ( &miner_hr, miner_hr_units );
|
||||
scale_hash_for_display ( &net_hr, net_hr_units );
|
||||
applog2( LOG_INFO,
|
||||
"Miner TTF @ %.2f %sh/s %s, Net TTF @ %.2f %sh/s %s",
|
||||
miner_hr, miner_hr_units, miner_ttf, net_hr,
|
||||
net_hr_units, net_ttf );
|
||||
}
|
||||
restart_threads();
|
||||
}
|
||||
} // rc
|
||||
|
||||
@@ -1872,10 +1876,14 @@ bool submit_solution( struct work *work, const void *hash,
|
||||
uint32_t* t = (uint32_t*)work->target;
|
||||
uint32_t* d = (uint32_t*)work->data;
|
||||
|
||||
applog( LOG_INFO, "Data[ 0: 9]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
|
||||
applog( LOG_INFO, "Data[10:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x", d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19] );
|
||||
applog( LOG_INFO, "Hash[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0] );
|
||||
applog( LOG_INFO, "Targ[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x", t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0] );
|
||||
applog( LOG_INFO, "Data[ 0: 9]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x",
|
||||
d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9] );
|
||||
applog( LOG_INFO, "Data[10:19]: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x",
|
||||
d[10],d[11],d[12],d[13],d[14],d[15],d[16],d[17],d[18],d[19] );
|
||||
applog( LOG_INFO, "Hash[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x",
|
||||
h[7],h[6],h[5],h[4],h[3],h[2],h[1],h[0] );
|
||||
applog( LOG_INFO, "Targ[ 7: 0]: %08x %08x %08x %08x %08x %08x %08x %08x",
|
||||
t[7],t[6],t[5],t[4],t[3],t[2],t[1],t[0] );
|
||||
}
|
||||
}
|
||||
return true;
|
||||
@@ -2226,7 +2234,7 @@ static void *miner_thread( void *userdata )
|
||||
goto out;
|
||||
}
|
||||
g_work_time = time(NULL);
|
||||
restart_threads();
|
||||
// restart_threads();
|
||||
}
|
||||
|
||||
pthread_rwlock_unlock( &g_work_lock );
|
||||
@@ -2823,44 +2831,45 @@ static bool cpu_capability( bool display_only )
|
||||
{
|
||||
char cpu_brand[0x40];
|
||||
bool cpu_has_sse2 = has_sse2();
|
||||
bool cpu_has_aes = has_aes_ni();
|
||||
bool cpu_has_sse42 = has_sse42();
|
||||
bool cpu_has_avx = has_avx();
|
||||
bool cpu_has_avx2 = has_avx2();
|
||||
bool cpu_has_sha = has_sha();
|
||||
bool cpu_has_avx512 = has_avx512();
|
||||
bool cpu_has_vaes = has_vaes();
|
||||
bool cpu_has_avx10 = has_avx10();
|
||||
bool sw_has_aes = false;
|
||||
bool cpu_has_aes = has_aes_ni();
|
||||
bool cpu_has_vaes = has_vaes();
|
||||
bool cpu_has_sha = has_sha();
|
||||
bool cpu_has_sha512 = has_sha512();
|
||||
bool sw_has_sse2 = false;
|
||||
bool sw_has_sse42 = false;
|
||||
bool sw_has_avx = false;
|
||||
bool sw_has_avx2 = false;
|
||||
bool sw_has_avx512 = false;
|
||||
bool sw_has_sha = false;
|
||||
bool sw_has_aes = false;
|
||||
bool sw_has_vaes = false;
|
||||
bool sw_has_sha = false;
|
||||
bool sw_has_sha512 = false;
|
||||
set_t algo_features = algo_gate.optimizations;
|
||||
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
|
||||
bool algo_has_aes = set_incl( AES_OPT, algo_features );
|
||||
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
|
||||
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
|
||||
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
|
||||
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
|
||||
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
|
||||
bool algo_has_aes = set_incl( AES_OPT, algo_features );
|
||||
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
|
||||
bool use_aes;
|
||||
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
|
||||
bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features );
|
||||
bool use_sse2;
|
||||
bool use_sse42;
|
||||
bool use_avx;
|
||||
bool use_avx2;
|
||||
bool use_avx512;
|
||||
bool use_sha;
|
||||
bool use_aes;
|
||||
bool use_vaes;
|
||||
bool use_sha;
|
||||
bool use_sha512;
|
||||
bool use_none;
|
||||
|
||||
#ifdef __AES__
|
||||
sw_has_aes = true;
|
||||
#endif
|
||||
#ifdef __SSE2__
|
||||
sw_has_sse2 = true;
|
||||
#endif
|
||||
@@ -2876,17 +2885,18 @@ static bool cpu_capability( bool display_only )
|
||||
#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
|
||||
sw_has_avx512 = true;
|
||||
#endif
|
||||
#ifdef __SHA__
|
||||
sw_has_sha = true;
|
||||
#ifdef __AES__
|
||||
sw_has_aes = true;
|
||||
#endif
|
||||
#ifdef __VAES__
|
||||
sw_has_vaes = true;
|
||||
#endif
|
||||
|
||||
|
||||
// #if !((__AES__) || (__SSE2__))
|
||||
// printf("Neither __AES__ nor __SSE2__ defined.\n");
|
||||
// #endif
|
||||
#ifdef __SHA__
|
||||
sw_has_sha = true;
|
||||
#endif
|
||||
#ifdef __SHA512__
|
||||
sw_has_sha512 = true;
|
||||
#endif
|
||||
|
||||
cpu_brand_string( cpu_brand );
|
||||
printf( "CPU: %s\n", cpu_brand );
|
||||
@@ -2909,7 +2919,8 @@ static bool cpu_capability( bool display_only )
|
||||
else if ( cpu_has_sse2 ) printf( " SSE2 " );
|
||||
if ( cpu_has_vaes ) printf( " VAES" );
|
||||
else if ( cpu_has_aes ) printf( " AES" );
|
||||
if ( cpu_has_sha ) printf( " SHA" );
|
||||
if ( cpu_has_sha512 ) printf( " SHA512" );
|
||||
else if ( cpu_has_sha ) printf( " SHA" );
|
||||
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d",
|
||||
avx10_version(), avx10_vector_length() );
|
||||
|
||||
@@ -2921,7 +2932,8 @@ static bool cpu_capability( bool display_only )
|
||||
else if ( sw_has_sse2 ) printf( " SSE2 " );
|
||||
if ( sw_has_vaes ) printf( " VAES" );
|
||||
else if ( sw_has_aes ) printf( " AES" );
|
||||
if ( sw_has_sha ) printf( " SHA" );
|
||||
if ( sw_has_sha512 ) printf( " SHA512" );
|
||||
else if ( sw_has_sha ) printf( " SHA" );
|
||||
|
||||
if ( !display_only )
|
||||
{
|
||||
@@ -2935,7 +2947,8 @@ static bool cpu_capability( bool display_only )
|
||||
else if ( algo_has_sse2 ) printf( " SSE2 " );
|
||||
if ( algo_has_vaes ) printf( " VAES" );
|
||||
else if ( algo_has_aes ) printf( " AES" );
|
||||
if ( algo_has_sha ) printf( " SHA" );
|
||||
if ( algo_has_sha512 ) printf( " SHA512" );
|
||||
else if ( algo_has_sha ) printf( " SHA" );
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
@@ -2970,16 +2983,17 @@ static bool cpu_capability( bool display_only )
|
||||
}
|
||||
|
||||
// Determine mining options
|
||||
use_sse2 = cpu_has_sse2 && algo_has_sse2;
|
||||
use_sse2 = cpu_has_sse2 && sw_has_sse2 && algo_has_sse2;
|
||||
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
|
||||
use_avx = cpu_has_avx && sw_has_avx && algo_has_avx;
|
||||
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
|
||||
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
|
||||
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
|
||||
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
|
||||
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
|
||||
use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes;
|
||||
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
|
||||
use_sha512 = cpu_has_sha512 && sw_has_sha512 && algo_has_sha512;
|
||||
use_none = !( use_sse2 || use_sse42 || use_avx || use_aes || use_avx512
|
||||
|| use_avx2 || use_sha || use_vaes );
|
||||
|| use_avx2 || use_sha || use_vaes || use_sha512 );
|
||||
|
||||
// Display best options
|
||||
printf( "\nStarting miner with" );
|
||||
@@ -2993,7 +3007,8 @@ static bool cpu_capability( bool display_only )
|
||||
else if ( use_sse2 ) printf( " SSE2" );
|
||||
if ( use_vaes ) printf( " VAES" );
|
||||
else if ( use_aes ) printf( " AES" );
|
||||
if ( use_sha ) printf( " SHA" );
|
||||
if ( use_sha512 ) printf( " SHA512" );
|
||||
else if ( use_sha ) printf( " SHA" );
|
||||
}
|
||||
printf( "...\n\n" );
|
||||
|
||||
|
@@ -738,10 +738,10 @@ static inline void extr_lane_8x32( void *d, const void *s,
|
||||
// Combine byte swap & broadcast in one permute
|
||||
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
|
||||
{
|
||||
const __m256i c0 = _mm256_set1_epi32( 0x00010203 );
|
||||
const __m256i c1 = _mm256_set1_epi32( 0x04050607 );
|
||||
const __m256i c2 = _mm256_set1_epi32( 0x08090a0b );
|
||||
const __m256i c3 = _mm256_set1_epi32( 0x0c0d0e0f );
|
||||
const __m256i c0 = v256_32( 0x00010203 );
|
||||
const __m256i c1 = v256_32( 0x04050607 );
|
||||
const __m256i c2 = v256_32( 0x08090a0b );
|
||||
const __m256i c3 = v256_32( 0x0c0d0e0f );
|
||||
const __m128i s0 = casti_m128i( src,0 );
|
||||
const __m128i s1 = casti_m128i( src,1 );
|
||||
const __m128i s2 = casti_m128i( src,2 );
|
||||
@@ -796,7 +796,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
|
||||
{
|
||||
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
const __m256i c1 = _mm256_set1_epi32( 1 );
|
||||
const __m256i c1 = v256_32( 1 );
|
||||
const __m256i c2 = _mm256_add_epi32( c1, c1 );
|
||||
const __m256i c3 = _mm256_add_epi32( c2, c1 );
|
||||
|
||||
@@ -1244,10 +1244,10 @@ static inline void extr_lane_16x32( void *d, const void *s,
|
||||
// Combine byte swap & broadcast in one permute
|
||||
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
|
||||
{
|
||||
const __m512i c0 = _mm512_set1_epi32( 0x00010203 );
|
||||
const __m512i c1 = _mm512_set1_epi32( 0x04050607 );
|
||||
const __m512i c2 = _mm512_set1_epi32( 0x08090a0b );
|
||||
const __m512i c3 = _mm512_set1_epi32( 0x0c0d0e0f );
|
||||
const __m512i c0 = v512_32( 0x00010203 );
|
||||
const __m512i c1 = v512_32( 0x04050607 );
|
||||
const __m512i c2 = v512_32( 0x08090a0b );
|
||||
const __m512i c3 = v512_32( 0x0c0d0e0f );
|
||||
const __m128i s0 = casti_m128i( src,0 );
|
||||
const __m128i s1 = casti_m128i( src,1 );
|
||||
const __m128i s2 = casti_m128i( src,2 );
|
||||
@@ -1302,7 +1302,7 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
|
||||
{
|
||||
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
const __m512i c1 = _mm512_set1_epi32( 1 );
|
||||
const __m512i c1 = v512_32( 1 );
|
||||
const __m512i c2 = _mm512_add_epi32( c1, c1 );
|
||||
const __m512i c3 = _mm512_add_epi32( c2, c1 );
|
||||
__m128i s0 = casti_m128i( src,0 );
|
||||
@@ -1566,8 +1566,8 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
|
||||
|
||||
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
|
||||
{
|
||||
const __m256i c0 = _mm256_set1_epi64x( 0x0405060700010203 );
|
||||
const __m256i c1 = _mm256_set1_epi64x( 0x0c0d0e0f08090a0b );
|
||||
const __m256i c0 = v256_64( 0x0405060700010203 );
|
||||
const __m256i c1 = v256_64( 0x0c0d0e0f08090a0b );
|
||||
const __m128i s0 = casti_m128i( src,0 );
|
||||
const __m128i s1 = casti_m128i( src,1 );
|
||||
const __m128i s2 = casti_m128i( src,2 );
|
||||
@@ -1958,16 +1958,16 @@ static inline void mm512_intrlv80_8x64( void *dst, const void *src )
|
||||
__m512i *d = (__m512i*)dst;
|
||||
const uint64_t *s = (const uint64_t*)src;
|
||||
|
||||
d[0] = _mm512_set1_epi64( s[0] );
|
||||
d[1] = _mm512_set1_epi64( s[1] );
|
||||
d[2] = _mm512_set1_epi64( s[2] );
|
||||
d[3] = _mm512_set1_epi64( s[3] );
|
||||
d[4] = _mm512_set1_epi64( s[4] );
|
||||
d[5] = _mm512_set1_epi64( s[5] );
|
||||
d[6] = _mm512_set1_epi64( s[6] );
|
||||
d[7] = _mm512_set1_epi64( s[7] );
|
||||
d[8] = _mm512_set1_epi64( s[8] );
|
||||
d[9] = _mm512_set1_epi64( s[9] );
|
||||
d[0] = v512_64( s[0] );
|
||||
d[1] = v512_64( s[1] );
|
||||
d[2] = v512_64( s[2] );
|
||||
d[3] = v512_64( s[3] );
|
||||
d[4] = v512_64( s[4] );
|
||||
d[5] = v512_64( s[5] );
|
||||
d[6] = v512_64( s[6] );
|
||||
d[7] = v512_64( s[7] );
|
||||
d[8] = v512_64( s[8] );
|
||||
d[9] = v512_64( s[9] );
|
||||
}
|
||||
|
||||
// byte swap and broadcast to all lanes
|
||||
@@ -1977,8 +1977,8 @@ static inline void mm512_intrlv80_8x64( void *dst, const void *src )
|
||||
// Combine byte swap & broadcast in one permute
|
||||
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
|
||||
{
|
||||
const __m512i c0 = _mm512_set1_epi64( 0x0405060700010203 );
|
||||
const __m512i c1 = _mm512_set1_epi64( 0x0c0d0e0f08090a0b );
|
||||
const __m512i c0 = v512_64( 0x0405060700010203 );
|
||||
const __m512i c1 = v512_64( 0x0c0d0e0f08090a0b );
|
||||
const __m128i s0 = casti_m128i( src,0 );
|
||||
const __m128i s1 = casti_m128i( src,1 );
|
||||
const __m128i s2 = casti_m128i( src,2 );
|
||||
@@ -2013,7 +2013,7 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
|
||||
{
|
||||
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
const __m512i c1 = _mm512_set1_epi64( 1 );
|
||||
const __m512i c1 = v512_64( 1 );
|
||||
__m128i s0 = casti_m128i( src,0 );
|
||||
__m128i s1 = casti_m128i( src,1 );
|
||||
__m128i s2 = casti_m128i( src,2 );
|
||||
|
@@ -3,7 +3,7 @@
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// 128 bit SSE vectors
|
||||
//
|
||||
@@ -20,9 +20,9 @@
|
||||
// define a local const for repeated references to the same constant.
|
||||
//
|
||||
// One common use for simd constants is as a control index for vector
|
||||
// instructions like blend and shuffle. Alhough the ultimate instruction
|
||||
// may execute in a single clock cycle, generating the control index adds
|
||||
// several more cycles to the entire operation.
|
||||
// shuffle instructions. Alhough the ultimate instruction may execute in a
|
||||
// single clock cycle, generating the control index adds several more cycles
|
||||
// to the entire operation.
|
||||
//
|
||||
// All of the utilities here assume all data is in registers except
|
||||
// in rare cases where arguments are pointers.
|
||||
@@ -32,7 +32,7 @@
|
||||
// Intrinsics automatically promote from REX to VEX when AVX is available
|
||||
// but ASM needs to be done manually.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
// Used instead if casting.
|
||||
@@ -43,8 +43,8 @@ typedef union
|
||||
} __attribute__ ((aligned (16))) m128_ovly;
|
||||
|
||||
|
||||
#define v128_64(i) _mm_set1_epi64x(i)
|
||||
#define v128_32(i) _mm_set1_epi32(i)
|
||||
#define v128_64(i64) _mm_set1_epi64x(i64)
|
||||
#define v128_32(i32) _mm_set1_epi32(i32)
|
||||
|
||||
// Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements
|
||||
// that make these functions either unnecessary or inefficient.
|
||||
@@ -81,8 +81,6 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
|
||||
// Pseudo constants
|
||||
#define m128_zero _mm_setzero_si128()
|
||||
#define m128_one_128 mm128_mov64_128( 1 )
|
||||
//#define m128_one_64 _mm_set1_epi64x( 1 )
|
||||
#define m128_one_32 _mm_set1_epi32( 1 )
|
||||
|
||||
// ASM avoids the need to initialize return variable to avoid compiler warning.
|
||||
// Macro abstracts function parentheses to look like an identifier.
|
||||
@@ -100,7 +98,7 @@ static inline __m128i mm128_neg1_fn()
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
|
||||
/////////////////////////////
|
||||
/////////////////////////////////////////////////////////////
|
||||
//
|
||||
// _mm_insert_ps( _mm128i v1, __m128i v2, imm8 c )
|
||||
//
|
||||
|
@@ -90,10 +90,7 @@ typedef union
|
||||
// code and therefore can't be used as compile time initializers.
|
||||
|
||||
#define m256_zero _mm256_setzero_si256()
|
||||
//#define m256_one_256 mm256_mov64_256( 1 )
|
||||
#define m256_one_128 mm256_bcast_m128( m128_one_128 )
|
||||
#define m256_one_64 _mm256_set1_epi64x( 1 )
|
||||
#define m256_one_32 _mm256_set1_epi32( 1 )
|
||||
|
||||
static inline __m256i mm256_neg1_fn()
|
||||
{
|
||||
|
@@ -97,8 +97,8 @@ typedef union
|
||||
uint64_t u64[8];
|
||||
} __attribute__ ((aligned (64))) m512_ovly;
|
||||
|
||||
#define v512_64(i) _mm512_set1_epi64(i)
|
||||
#define v512_32(i) _mm512_set1_epi32(i)
|
||||
#define v512_64(i64) _mm512_set1_epi64(i64)
|
||||
#define v512_32(i32) _mm512_set1_epi32(i32)
|
||||
|
||||
// A simple 128 bit permute, using function instead of macro avoids
|
||||
// problems if the v arg passed as an expression.
|
||||
@@ -118,9 +118,6 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
|
||||
|
||||
// Pseudo constants.
|
||||
#define m512_zero _mm512_setzero_si512()
|
||||
// Deprecated
|
||||
#define m512_one_64 _mm512_set1_epi64( 1 )
|
||||
#define m512_one_32 _mm512_set1_epi32( 1 )
|
||||
|
||||
// use asm to avoid compiler warning for unitialized local
|
||||
static inline __m512i mm512_neg1_fn()
|
||||
|
11
sysinfos.c
11
sysinfos.c
@@ -516,6 +516,17 @@ static inline bool has_sha()
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_sha512()
|
||||
{
|
||||
#ifdef __arm__
|
||||
return false;
|
||||
#else
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( EXTENDED_FEATURES, 1, cpu_info );
|
||||
return cpu_info[ EAX_Reg ] & SHA512_Flag;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_sse2()
|
||||
{
|
||||
#ifdef __arm__
|
||||
|
Reference in New Issue
Block a user