mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
2 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
bc5a5c6df8 | ||
![]() |
be88afc349 |
19
Makefile.am
19
Makefile.am
@@ -36,21 +36,17 @@ cpuminer_SOURCES = \
|
||||
algo/argon2/argon2d/argon2d/argon2d_thread.c \
|
||||
algo/argon2/argon2d/argon2d/encoding.c \
|
||||
algo/blake/sph_blake.c \
|
||||
algo/blake/blake256-hash-4way.c \
|
||||
algo/blake/blake512-hash-4way.c \
|
||||
algo/blake/blake256-hash.c \
|
||||
algo/blake/blake512-hash.c \
|
||||
algo/blake/blake-gate.c \
|
||||
algo/blake/blake.c \
|
||||
algo/blake/blake-4way.c \
|
||||
algo/blake/sph_blake2b.c \
|
||||
algo/blake/sph-blake2s.c \
|
||||
algo/blake/blake2s-hash-4way.c \
|
||||
algo/blake/blake2s-hash.c \
|
||||
algo/blake/blake2s.c \
|
||||
algo/blake/blake2s-gate.c \
|
||||
algo/blake/blake2s-4way.c \
|
||||
algo/blake/blake2b-hash-4way.c \
|
||||
algo/blake/blake2b-hash.c \
|
||||
algo/blake/blake2b.c \
|
||||
algo/blake/blake2b-gate.c \
|
||||
algo/blake/blake2b-4way.c \
|
||||
algo/blake/blakecoin-gate.c \
|
||||
algo/blake/mod_blakecoin.c \
|
||||
algo/blake/blakecoin.c \
|
||||
@@ -163,8 +159,6 @@ cpuminer_SOURCES = \
|
||||
algo/sha/sph_sha2big.c \
|
||||
algo/sha/sha256-hash-4way.c \
|
||||
algo/sha/sha512-hash-4way.c \
|
||||
algo/sha/sha256-hash-opt.c \
|
||||
algo/sha/sha256-hash-2way-ni.c \
|
||||
algo/sha/hmac-sha256-hash.c \
|
||||
algo/sha/hmac-sha256-hash-4way.c \
|
||||
algo/sha/sha256d.c \
|
||||
@@ -172,7 +166,6 @@ cpuminer_SOURCES = \
|
||||
algo/sha/sha256d-4way.c \
|
||||
algo/sha/sha256t-gate.c \
|
||||
algo/sha/sha256t-4way.c \
|
||||
algo/sha/sha256t.c \
|
||||
algo/sha/sha256q-4way.c \
|
||||
algo/sha/sha256q.c \
|
||||
algo/sha/sha512256d-4way.c \
|
||||
@@ -294,10 +287,10 @@ disable_flags =
|
||||
if USE_ASM
|
||||
cpuminer_SOURCES += asm/neoscrypt_asm.S
|
||||
if ARCH_x86
|
||||
cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S asm/aesb-x86.S
|
||||
cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S
|
||||
endif
|
||||
if ARCH_x86_64
|
||||
cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S asm/aesb-x64.S
|
||||
cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S
|
||||
endif
|
||||
if ARCH_ARM
|
||||
cpuminer_SOURCES += asm/sha2-arm.S asm/scrypt-arm.S
|
||||
|
@@ -65,6 +65,19 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.23.3
|
||||
|
||||
#400: Removed excessive thread restarts when mining solo.
|
||||
Fixed build_msys2.sh for gcc-13 by removing unsupported option "--param=evrp-mode=legacy" from CFLAGS.
|
||||
Added CPUID detection and reporting of CPUs and SW builds supporting SHA512 extension.
|
||||
Added prototype of sha-512 using SHA512 intrinsics, untested.
|
||||
Other improvements and code cleanup.
|
||||
|
||||
v3.23.2
|
||||
|
||||
sha256dt, sha256t & sha256d +10% with SHA, small improvement with AVX2.
|
||||
Other small improvements and code cleanup.
|
||||
|
||||
v3.23.1
|
||||
|
||||
#349: Fix sha256t low difficulty shares and low effective hash rate.
|
||||
|
@@ -248,7 +248,7 @@ int null_hash()
|
||||
return 0;
|
||||
};
|
||||
|
||||
void init_algo_gate( algo_gate_t* gate )
|
||||
static void init_algo_gate( algo_gate_t* gate )
|
||||
{
|
||||
gate->miner_thread_init = (void*)&return_true;
|
||||
gate->scanhash = (void*)&scanhash_generic;
|
||||
|
@@ -97,6 +97,7 @@ typedef uint32_t set_t;
|
||||
#define SHA_OPT 0x20 // Zen1, Icelake (deprecated)
|
||||
#define AVX512_OPT 0x40 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
|
||||
#define VAES_OPT 0x80 // Icelake, Zen3
|
||||
#define SHA512_OPT 0x100 // Lunar Lake, Arrow Lake
|
||||
|
||||
// AVX10 does not have explicit algo features:
|
||||
// AVX10_512 is compatible with AVX512 + VAES
|
||||
@@ -269,7 +270,7 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
|
||||
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
|
||||
void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
|
||||
// OpenSSL sha256 deprecated
|
||||
void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
|
||||
//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
|
||||
|
||||
bool std_le_work_decode( struct work *work );
|
||||
bool std_be_work_decode( struct work *work );
|
||||
|
@@ -77,7 +77,7 @@ bool register_argon2_algo( algo_gate_t* gate )
|
||||
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_argon2;
|
||||
gate->hash = (void*)&argon2hash;
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
|
||||
opt_target_factor = 65536.0;
|
||||
|
||||
return true;
|
||||
|
@@ -1,5 +1,5 @@
|
||||
#include "blake-gate.h"
|
||||
#include "blake-hash-4way.h"
|
||||
#include "blake256-hash.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,5 @@
|
||||
#ifndef BLAKE_HASH_4WAY__
|
||||
#define BLAKE_HASH_4WAY__ 1
|
||||
#ifndef BLAKE256_HASH__
|
||||
#define BLAKE256_HASH__ 1
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
@@ -11,13 +11,6 @@
|
||||
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
const uint32_t T0, const uint32_t T1, int rounds );
|
||||
|
||||
/////////////////////////
|
||||
//
|
||||
// Blake-512 1 way SSE2
|
||||
|
||||
void blake512_transform_le( uint64_t *H, const uint64_t *buf,
|
||||
const uint64_t T0, const uint64_t T1 );
|
||||
|
||||
//////////////////////////
|
||||
//
|
||||
// Blake-256 4 way SSE2
|
||||
@@ -86,30 +79,6 @@ void blake256r8_8way_init(void *cc);
|
||||
void blake256r8_8way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_8way_close(void *cc, void *dst);
|
||||
|
||||
// Blake-512 4 way AVX2
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16];
|
||||
__m256i H[8];
|
||||
__m256i S[4];
|
||||
size_t ptr;
|
||||
uint64_t T0, T1;
|
||||
} blake_4way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_4way_big_context blake512_4way_context;
|
||||
|
||||
void blake512_4way_init( blake_4way_big_context *sc );
|
||||
void blake512_4way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_4way_close( void *cc, void *dst );
|
||||
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
const void *data );
|
||||
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||
const __m256i nonce, const __m256i *midstate );
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
////////////////////////////
|
||||
@@ -137,7 +106,6 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data, const int rounds );
|
||||
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_16way_small_context blake256r14_16way_context;
|
||||
void blake256r14_16way_init(void *cc);
|
||||
@@ -150,33 +118,7 @@ void blake256r8_16way_init(void *cc);
|
||||
void blake256r8_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_16way_close(void *cc, void *dst);
|
||||
|
||||
////////////////////////////
|
||||
//
|
||||
//// Blake-512 8 way AVX512
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[8];
|
||||
__m512i S[4];
|
||||
size_t ptr;
|
||||
uint64_t T0, T1;
|
||||
} blake_8way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_8way_big_context blake512_8way_context;
|
||||
|
||||
void blake512_8way_init( blake_8way_big_context *sc );
|
||||
void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_8way_close( void *cc, void *dst );
|
||||
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
const void *data );
|
||||
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
const __m512i nonce, const __m512i *midstate );
|
||||
|
||||
#endif // AVX512
|
||||
#endif // AVX2
|
||||
|
||||
#endif // BLAKE_HASH_4WAY_H__
|
||||
#endif // BLAKE256_HASH_H__
|
@@ -1,113 +0,0 @@
|
||||
/**
|
||||
* Blake2-B Implementation
|
||||
* tpruvot@github 2015-2016
|
||||
*/
|
||||
|
||||
#include "blake2b-gate.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "blake2b-hash-4way.h"
|
||||
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
blake2b_8way_init( &ctx );
|
||||
blake2b_8way_update( &ctx, vdata, 80 );
|
||||
blake2b_8way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
// Function not used, code inlined.
|
||||
void blake2b_4way_hash(void *output, const void *input)
|
||||
{
|
||||
blake2b_4way_ctx ctx;
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, input, 80 );
|
||||
blake2b_4way_final( &ctx, output );
|
||||
}
|
||||
|
||||
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, vdata, 80 );
|
||||
blake2b_4way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,20 +0,0 @@
|
||||
#include "blake2b-gate.h"
|
||||
|
||||
|
||||
bool register_blake2b_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2b_8way;
|
||||
// gate->hash = (void*)&blake2b_8way_hash;
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2b_4way;
|
||||
gate->hash = (void*)&blake2b_4way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_blake2b;
|
||||
gate->hash = (void*)&blake2b_hash;
|
||||
#endif
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -1,34 +0,0 @@
|
||||
#ifndef __BLAKE2B_GATE_H__
|
||||
#define __BLAKE2B_GATE_H__ 1
|
||||
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2B_8WAY
|
||||
#elif defined(__AVX2__)
|
||||
#define BLAKE2B_4WAY
|
||||
#endif
|
||||
|
||||
bool register_blake2b_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
//void blake2b_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
void blake2b_4way_hash( void *state, const void *input );
|
||||
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#else
|
||||
|
||||
void blake2b_hash( void *state, const void *input );
|
||||
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
@@ -31,7 +31,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "blake2b-hash-4way.h"
|
||||
#include "blake2b-hash.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
@@ -252,17 +252,17 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
|
||||
v[ 5] = ctx->h[5];
|
||||
v[ 6] = ctx->h[6];
|
||||
v[ 7] = ctx->h[7];
|
||||
v[ 8] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
v[10] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
v[11] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
v[13] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
v[ 8] = v512_64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = v512_64( 0xBB67AE8584CAA73B );
|
||||
v[10] = v512_64( 0x3C6EF372FE94F82B );
|
||||
v[11] = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = v512_64( 0x510E527FADE682D1 );
|
||||
v[13] = v512_64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = v512_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
|
||||
v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );
|
||||
v[12] = _mm512_xor_si512( v[12], v512_64( ctx->t[0] ) );
|
||||
v[13] = _mm512_xor_si512( v[13], v512_64( ctx->t[1] ) );
|
||||
|
||||
if ( last )
|
||||
v[14] = mm512_not( v[14] );
|
||||
@@ -310,16 +310,16 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
|
||||
{
|
||||
size_t i;
|
||||
|
||||
ctx->h[0] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
ctx->h[0] = v512_64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = v512_64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = v512_64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = v512_64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = v512_64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = v512_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
ctx->h[0] = _mm512_xor_si512( ctx->h[0], _mm512_set1_epi64( 0x01010020 ) );
|
||||
ctx->h[0] = _mm512_xor_si512( ctx->h[0], v512_64( 0x01010020 ) );
|
||||
|
||||
ctx->t[0] = 0;
|
||||
ctx->t[1] = 0;
|
||||
@@ -419,17 +419,17 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
|
||||
v[ 5] = ctx->h[5];
|
||||
v[ 6] = ctx->h[6];
|
||||
v[ 7] = ctx->h[7];
|
||||
v[ 8] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
|
||||
v[10] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
|
||||
v[11] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
|
||||
v[13] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
|
||||
v[14] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
|
||||
v[ 8] = v256_64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = v256_64( 0xBB67AE8584CAA73B );
|
||||
v[10] = v256_64( 0x3C6EF372FE94F82B );
|
||||
v[11] = v256_64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = v256_64( 0x510E527FADE682D1 );
|
||||
v[13] = v256_64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = v256_64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = v256_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
|
||||
v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );
|
||||
v[12] = _mm256_xor_si256( v[12], v256_64( ctx->t[0] ) );
|
||||
v[13] = _mm256_xor_si256( v[13], v256_64( ctx->t[1] ) );
|
||||
|
||||
if ( last )
|
||||
v[14] = mm256_not( v[14] );
|
||||
@@ -477,16 +477,16 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
|
||||
{
|
||||
size_t i;
|
||||
|
||||
ctx->h[0] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
|
||||
ctx->h[0] = v256_64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = v256_64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = v256_64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = v256_64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = v256_64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = v256_64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = v256_64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = v256_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
ctx->h[0] = _mm256_xor_si256( ctx->h[0], _mm256_set1_epi64x( 0x01010020 ) );
|
||||
ctx->h[0] = _mm256_xor_si256( ctx->h[0], v256_64( 0x01010020 ) );
|
||||
|
||||
ctx->t[0] = 0;
|
||||
ctx->t[1] = 0;
|
@@ -1,64 +1,175 @@
|
||||
/**
|
||||
* Blake2-B Implementation
|
||||
* tpruvot@github 2015-2016
|
||||
*/
|
||||
|
||||
#include "blake2b-gate.h"
|
||||
|
||||
#if !defined(BLAKE2B_8WAY) && !defined(BLAKE2B_4WAY)
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/sph_blake2b.h"
|
||||
#include "blake2b-hash.h"
|
||||
|
||||
#define MIDLEN 76
|
||||
#define A 64
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2B_8WAY
|
||||
#elif defined(__AVX2__)
|
||||
#define BLAKE2B_4WAY
|
||||
#endif
|
||||
|
||||
void blake2b_hash(void *output, const void *input)
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint8_t _ALIGN(A) hash[32];
|
||||
sph_blake2b_ctx ctx __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
sph_blake2b_init(&ctx, 32, NULL, 0);
|
||||
sph_blake2b_update(&ctx, input, 80);
|
||||
sph_blake2b_final(&ctx, hash);
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
blake2b_8way_init( &ctx );
|
||||
blake2b_8way_update( &ctx, vdata, 80 );
|
||||
blake2b_8way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
// Function not used, code inlined.
|
||||
void blake2b_4way_hash(void *output, const void *input)
|
||||
{
|
||||
uint32_t _ALIGN(A) vhashcpu[8];
|
||||
uint32_t _ALIGN(A) endiandata[20];
|
||||
blake2b_4way_ctx ctx;
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, input, 80 );
|
||||
blake2b_4way_final( &ctx, output );
|
||||
}
|
||||
|
||||
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
int thr_id = mythr->id;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
for (int i=0; i < 19; i++) {
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
}
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
blake2b_hash(vhashcpu, endiandata);
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, vdata, 80 );
|
||||
blake2b_4way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, vhashcpu, mythr );
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#include "algo/blake/sph_blake2b.h"
|
||||
|
||||
void blake2b_hash(void *output, const void *input)
|
||||
{
|
||||
uint8_t _ALIGN(32) hash[32];
|
||||
sph_blake2b_ctx ctx __attribute__ ((aligned (32)));
|
||||
|
||||
sph_blake2b_init(&ctx, 32, NULL, 0);
|
||||
sph_blake2b_update(&ctx, input, 80);
|
||||
sph_blake2b_final(&ctx, hash);
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(32) hash64[8];
|
||||
uint32_t _ALIGN(32) endiandata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm128_bswap32_80( endiandata, pdata );
|
||||
|
||||
do {
|
||||
endiandata[19] = n;
|
||||
blake2b_hash( hash64, endiandata );
|
||||
if ( unlikely( valid_hash( hash64, ptarget ) ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = bswap_32( n );
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
n++;
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
bool register_blake2b_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2b_8way;
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2b_4way;
|
||||
gate->hash = (void*)&blake2b_4way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_blake2b;
|
||||
gate->hash = (void*)&blake2b_hash;
|
||||
#endif
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -1,170 +0,0 @@
|
||||
#include "blake2s-gate.h"
|
||||
#include "blake2s-hash-4way.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
static __thread blake2s_16way_state blake2s_16w_ctx;
|
||||
|
||||
void blake2s_16way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_16way_state ctx;
|
||||
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
|
||||
blake2s_16way_update( &ctx, input + (64<<4), 16 );
|
||||
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*16] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[8*16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<4]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm512_bswap_32( _mm512_set_epi32(
|
||||
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_16way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 16;
|
||||
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
|
||||
static __thread blake2s_8way_state blake2s_8w_ctx;
|
||||
|
||||
void blake2s_8way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_8way_state ctx;
|
||||
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
|
||||
blake2s_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_8way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
|
||||
static __thread blake2s_4way_state blake2s_4w_ctx;
|
||||
|
||||
void blake2s_4way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_4way_state ctx;
|
||||
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
|
||||
blake2s_4way_update( &ctx, input + (64<<2), 16 );
|
||||
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_4way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,23 +0,0 @@
|
||||
#include "blake2s-gate.h"
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_16way;
|
||||
gate->hash = (void*)&blake2s_16way_hash;
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
//#if defined(BLAKE2S_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_8way;
|
||||
gate->hash = (void*)&blake2s_8way_hash;
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_4way;
|
||||
gate->hash = (void*)&blake2s_4way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_blake2s;
|
||||
gate->hash = (void*)&blake2s_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -1,46 +0,0 @@
|
||||
#ifndef __BLAKE2S_GATE_H__
|
||||
#define __BLAKE2S_GATE_H__ 1
|
||||
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if defined(__SSE2__)
|
||||
#define BLAKE2S_4WAY
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define BLAKE2S_8WAY
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2S_16WAY
|
||||
#endif
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
void blake2s_16way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined (BLAKE2S_8WAY)
|
||||
|
||||
void blake2s_8way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined (BLAKE2S_4WAY)
|
||||
|
||||
void blake2s_4way_hash( void *state, const void *input );
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#else
|
||||
|
||||
void blake2s_hash( void *state, const void *input );
|
||||
int scanhash_blake2s( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
@@ -11,7 +11,7 @@
|
||||
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include "blake2s-hash-4way.h"
|
||||
#include "blake2s-hash.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
@@ -62,23 +62,23 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
|
||||
|
||||
memset( S, 0, sizeof( blake2s_4way_state ) );
|
||||
|
||||
S->h[0] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = _mm_set1_epi64x( 0x510E527F510E527FULL );
|
||||
S->h[5] = _mm_set1_epi64x( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL );
|
||||
S->h[0] = v128_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = v128_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = v128_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = v128_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = v128_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = v128_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = v128_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = v128_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
// for( int i = 0; i < 8; ++i )
|
||||
// S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
|
||||
// S->h[i] = v128_32( blake2s_IV[i] );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
/* IV XOR ParamBlock */
|
||||
for ( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm_xor_si128( S->h[i], _mm_set1_epi32( p[i] ) );
|
||||
S->h[i] = _mm_xor_si128( S->h[i], v128_32( p[i] ) );
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -90,18 +90,18 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
|
||||
memcpy_128( m, block, 16 );
|
||||
memcpy_128( v, S->h, 8 );
|
||||
|
||||
v[ 8] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
|
||||
_mm_set1_epi64x( 0x510E527F510E527FULL ) );
|
||||
v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
|
||||
_mm_set1_epi64x( 0x9B05688C9B05688CULL ) );
|
||||
v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
|
||||
_mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
|
||||
_mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );
|
||||
v[ 8] = v128_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = v128_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = v128_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = v128_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm_xor_si128( v128_32( S->t[0] ),
|
||||
v128_64( 0x510E527F510E527FULL ) );
|
||||
v[13] = _mm_xor_si128( v128_32( S->t[1] ),
|
||||
v128_64( 0x9B05688C9B05688CULL ) );
|
||||
v[14] = _mm_xor_si128( v128_32( S->f[0] ),
|
||||
v128_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[15] = _mm_xor_si128( v128_32( S->f[1] ),
|
||||
v128_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
#define G4W( sigma0, sigma1, a, b, c, d ) \
|
||||
do { \
|
||||
@@ -269,35 +269,35 @@ int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
|
||||
memcpy_256( m, block, 16 );
|
||||
memcpy_256( v, S->h, 8 );
|
||||
|
||||
v[ 8] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
|
||||
_mm256_set1_epi64x( 0x510E527F510E527FULL ) );
|
||||
v[ 8] = v256_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = v256_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = v256_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = v256_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm256_xor_si256( v256_32( S->t[0] ),
|
||||
v256_64( 0x510E527F510E527FULL ) );
|
||||
|
||||
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
|
||||
_mm256_set1_epi64x( 0x9B05688C9B05688CULL ) );
|
||||
v[13] = _mm256_xor_si256( v256_32( S->t[1] ),
|
||||
v256_64( 0x9B05688C9B05688CULL ) );
|
||||
|
||||
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
|
||||
_mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[14] = _mm256_xor_si256( v256_32( S->f[0] ),
|
||||
v256_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
|
||||
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
|
||||
_mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );
|
||||
v[15] = _mm256_xor_si256( v256_32( S->f[1] ),
|
||||
v256_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
/*
|
||||
v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
|
||||
v[ 9] = _mm256_set1_epi32( blake2s_IV[1] );
|
||||
v[10] = _mm256_set1_epi32( blake2s_IV[2] );
|
||||
v[11] = _mm256_set1_epi32( blake2s_IV[3] );
|
||||
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
|
||||
_mm256_set1_epi32( blake2s_IV[4] ) );
|
||||
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
|
||||
_mm256_set1_epi32( blake2s_IV[5] ) );
|
||||
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
|
||||
_mm256_set1_epi32( blake2s_IV[6] ) );
|
||||
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
|
||||
_mm256_set1_epi32( blake2s_IV[7] ) );
|
||||
v[ 8] = v256_32( blake2s_IV[0] );
|
||||
v[ 9] = v256_32( blake2s_IV[1] );
|
||||
v[10] = v256_32( blake2s_IV[2] );
|
||||
v[11] = v256_32( blake2s_IV[3] );
|
||||
v[12] = _mm256_xor_si256( v256_32( S->t[0] ),
|
||||
v256_32( blake2s_IV[4] ) );
|
||||
v[13] = _mm256_xor_si256( v256_32( S->t[1] ),
|
||||
v256_32( blake2s_IV[5] ) );
|
||||
v[14] = _mm256_xor_si256( v256_32( S->f[0] ),
|
||||
v256_32( blake2s_IV[6] ) );
|
||||
v[15] = _mm256_xor_si256( v256_32( S->f[1] ),
|
||||
v256_32( blake2s_IV[7] ) );
|
||||
|
||||
|
||||
#define G8W(r,i,a,b,c,d) \
|
||||
@@ -391,24 +391,24 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
|
||||
memset( P->personal, 0, sizeof( P->personal ) );
|
||||
|
||||
memset( S, 0, sizeof( blake2s_8way_state ) );
|
||||
S->h[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = _mm256_set1_epi64x( 0x510E527F510E527FULL );
|
||||
S->h[5] = _mm256_set1_epi64x( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL );
|
||||
S->h[0] = v256_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = v256_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = v256_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = v256_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = v256_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = v256_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = v256_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = v256_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
|
||||
// for( int i = 0; i < 8; ++i )
|
||||
// S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
|
||||
// S->h[i] = v256_32( blake2s_IV[i] );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
/* IV XOR ParamBlock */
|
||||
for ( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm256_xor_si256( S->h[i], _mm256_set1_epi32( p[i] ) );
|
||||
S->h[i] = _mm256_xor_si256( S->h[i], v256_32( p[i] ) );
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -510,21 +510,21 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
|
||||
memcpy_512( m, block, 16 );
|
||||
memcpy_512( v, S->h, 8 );
|
||||
|
||||
v[ 8] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
|
||||
_mm512_set1_epi64( 0x510E527F510E527FULL ) );
|
||||
v[ 8] = v512_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = v512_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = v512_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = v512_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm512_xor_si512( v512_32( S->t[0] ),
|
||||
v512_64( 0x510E527F510E527FULL ) );
|
||||
|
||||
v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
|
||||
_mm512_set1_epi64( 0x9B05688C9B05688CULL ) );
|
||||
v[13] = _mm512_xor_si512( v512_32( S->t[1] ),
|
||||
v512_64( 0x9B05688C9B05688CULL ) );
|
||||
|
||||
v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
|
||||
_mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[14] = _mm512_xor_si512( v512_32( S->f[0] ),
|
||||
v512_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
|
||||
v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
|
||||
_mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
v[15] = _mm512_xor_si512( v512_32( S->f[1] ),
|
||||
v512_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
|
||||
#define G16W( sigma0, sigma1, a, b, c, d) \
|
||||
@@ -589,20 +589,20 @@ int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )
|
||||
memset( P->personal, 0, sizeof( P->personal ) );
|
||||
|
||||
memset( S, 0, sizeof( blake2s_16way_state ) );
|
||||
S->h[0] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = _mm512_set1_epi64( 0x510E527F510E527FULL );
|
||||
S->h[5] = _mm512_set1_epi64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL );
|
||||
S->h[0] = v512_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = v512_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = v512_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = v512_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = v512_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = v512_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = v512_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = v512_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
/* IV XOR ParamBlock */
|
||||
for ( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm512_xor_si512( S->h[i], _mm512_set1_epi32( p[i] ) );
|
||||
S->h[i] = _mm512_xor_si512( S->h[i], v512_32( p[i] ) );
|
||||
return 0;
|
||||
}
|
||||
|
@@ -1,75 +1,252 @@
|
||||
#include "blake2s-gate.h"
|
||||
|
||||
#if !defined(BLAKE2S_16WAY) && !defined(BLAKE2S_8WAY) && !defined(BLAKE2S)
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include "blake2s-hash.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2S_16WAY
|
||||
#elif defined(__AVX2__)
|
||||
#define BLAKE2S_8WAY
|
||||
#elif defined(__SSE2__)
|
||||
#define BLAKE2S_4WAY
|
||||
#endif
|
||||
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
static __thread blake2s_16way_state blake2s_16w_ctx;
|
||||
|
||||
void blake2s_16way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_16way_state ctx;
|
||||
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
|
||||
blake2s_16way_update( &ctx, input + (64<<4), 16 );
|
||||
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*16] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[8*16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<4]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm512_bswap_32( _mm512_set_epi32(
|
||||
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_16way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 16;
|
||||
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
|
||||
static __thread blake2s_8way_state blake2s_8w_ctx;
|
||||
|
||||
void blake2s_8way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_8way_state ctx;
|
||||
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
|
||||
blake2s_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_8way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
|
||||
static __thread blake2s_4way_state blake2s_4w_ctx;
|
||||
|
||||
void blake2s_4way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_4way_state ctx;
|
||||
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
|
||||
blake2s_4way_update( &ctx, input + (64<<2), 16 );
|
||||
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_4way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#include "sph-blake2s.h"
|
||||
|
||||
static __thread blake2s_state blake2s_ctx;
|
||||
//static __thread blake2s_state s_ctx;
|
||||
#define MIDLEN 76
|
||||
|
||||
void blake2s_hash( void *output, const void *input )
|
||||
{
|
||||
unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
|
||||
blake2s_state ctx __attribute__ ((aligned (64)));
|
||||
|
||||
unsigned char _ALIGN(32) hash[BLAKE2S_OUTBYTES];
|
||||
blake2s_state ctx __attribute__ ((aligned (32)));
|
||||
|
||||
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
|
||||
blake2s_update( &ctx, input+64, 16 );
|
||||
|
||||
// blake2s_init(&ctx, BLAKE2S_OUTBYTES);
|
||||
// blake2s_update(&ctx, input, 80);
|
||||
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
|
||||
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
/*
|
||||
static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
|
||||
|
||||
int scanhash_blake2s( struct work *work,uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
s_ctx.buflen = MIDLEN;
|
||||
memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
|
||||
blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
|
||||
blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
uint32_t _ALIGN(32) hash32[8];
|
||||
uint32_t _ALIGN(32) endiandata[20];
|
||||
const int thr_id = mythr->id;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm128_bswap32_80( endiandata, pdata );
|
||||
|
||||
// midstate
|
||||
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
|
||||
|
||||
do
|
||||
{
|
||||
endiandata[19] = n;
|
||||
blake2s_hash( hash32, endiandata );
|
||||
if ( unlikely( valid_hash( hash32, ptarget ) ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = bswap_32( n );
|
||||
submit_solution( work, hash32, mythr );
|
||||
}
|
||||
n++;
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
int scanhash_blake2s( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
|
||||
uint32_t _ALIGN(64) hash64[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
// midstate
|
||||
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
blake2s_hash( hash64, endiandata );
|
||||
if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return true;
|
||||
}
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool register_blake2s_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_16way;
|
||||
gate->hash = (void*)&blake2s_16way_hash;
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_8way;
|
||||
gate->hash = (void*)&blake2s_8way_hash;
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_blake2s_4way;
|
||||
gate->hash = (void*)&blake2s_4way_hash;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_blake2s;
|
||||
gate->hash = (void*)&blake2s_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -1,14 +1,12 @@
|
||||
#if defined (__AVX2__)
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include "blake-hash-4way.h"
|
||||
#include "blake512-hash.h"
|
||||
|
||||
// Blake-512 common
|
||||
|
||||
/*
|
||||
static const uint64_t IV512[8] =
|
||||
|
||||
static const uint64_t BLAKE512_IV[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
|
||||
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
@@ -16,6 +14,7 @@ static const uint64_t IV512[8] =
|
||||
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||
};
|
||||
|
||||
/*
|
||||
static const uint64_t salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
|
||||
static const unsigned sigma[16][16] = {
|
||||
@@ -36,8 +35,10 @@ static const unsigned sigma[16][16] = {
|
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
|
||||
};
|
||||
|
||||
static const uint64_t CB[16] = {
|
||||
*/
|
||||
/*
|
||||
static const uint64_t CB[16] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x243F6A8885A308D3, 0x13198A2E03707344,
|
||||
0xA4093822299F31D0, 0x082EFA98EC4E6C89,
|
||||
0x452821E638D01377, 0xBE5466CF34E90C6C,
|
||||
@@ -46,7 +47,7 @@ static const uint64_t CB[16] = {
|
||||
0x2FFD72DBD01ADFB7, 0xB8E1AFED6A267E96,
|
||||
0xBA7C9045F12C7F99, 0x24A19947B3916CF7,
|
||||
0x0801F2E2858EFC16, 0x636920D871574E69
|
||||
|
||||
}
|
||||
*/
|
||||
|
||||
#define Z00 0
|
||||
@@ -244,6 +245,290 @@ static const uint64_t CB[16] = {
|
||||
#define CBE 0x0801F2E2858EFC16
|
||||
#define CBF 0x636920D871574E69
|
||||
|
||||
// Blake-512 1 way AVX2 & SSE2
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#define BLAKE512_ROUND( r ) \
|
||||
{ \
|
||||
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
|
||||
_mm256_set_epi64x( CBx( r, 7 ) ^ Mx( r, 6 ), \
|
||||
CBx( r, 5 ) ^ Mx( r, 4 ), \
|
||||
CBx( r, 3 ) ^ Mx( r, 2 ), \
|
||||
CBx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
|
||||
V3 = mm256_swap64_32( _mm256_xor_si256( V3, V0 ) ); \
|
||||
V2 = _mm256_add_epi64( V2, V3 ); \
|
||||
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 25 ); \
|
||||
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
|
||||
_mm256_set_epi64x( CBx( r, 6 ) ^ Mx( r, 7 ), \
|
||||
CBx( r, 4 ) ^ Mx( r, 5 ), \
|
||||
CBx( r, 2 ) ^ Mx( r, 3 ), \
|
||||
CBx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
|
||||
V3 = mm256_shuflr64_16( _mm256_xor_si256( V3, V0 ) ); \
|
||||
V2 = _mm256_add_epi64( V2, V3 ); \
|
||||
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 11 ); \
|
||||
V0 = mm256_shufll_64( V0 ); \
|
||||
V3 = mm256_swap_128( V3 ); \
|
||||
V2 = mm256_shuflr_64( V2 ); \
|
||||
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
|
||||
_mm256_set_epi64x( CBx( r, D ) ^ Mx( r, C ), \
|
||||
CBx( r, B ) ^ Mx( r, A ), \
|
||||
CBx( r, 9 ) ^ Mx( r, 8 ), \
|
||||
CBx( r, F ) ^ Mx( r, E ) ) ) ); \
|
||||
V3 = mm256_swap64_32( _mm256_xor_si256( V3, V0 ) ); \
|
||||
V2 = _mm256_add_epi64( V2, V3 ); \
|
||||
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 25 ); \
|
||||
V0 = _mm256_add_epi64( V0, _mm256_add_epi64( V1, \
|
||||
_mm256_set_epi64x( CBx( r, C ) ^ Mx( r, D ), \
|
||||
CBx( r, A ) ^ Mx( r, B ), \
|
||||
CBx( r, 8 ) ^ Mx( r, 9 ), \
|
||||
CBx( r, E ) ^ Mx( r, F ) ) ) ); \
|
||||
V3 = mm256_shuflr64_16( _mm256_xor_si256( V3, V0 ) ); \
|
||||
V2 = _mm256_add_epi64( V2, V3 ); \
|
||||
V1 = mm256_ror_64( _mm256_xor_si256( V1, V2 ), 11 ); \
|
||||
V0 = mm256_shuflr_64( V0 ); \
|
||||
V3 = mm256_swap_128( V3 ); \
|
||||
V2 = mm256_shufll_64( V2 ); \
|
||||
}
|
||||
|
||||
void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
|
||||
const uint64_t T1 )
|
||||
{
|
||||
__m256i V0, V1, V2, V3;
|
||||
uint64_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
|
||||
|
||||
V0 = casti_m256i( H, 0 );
|
||||
V1 = casti_m256i( H, 1 );
|
||||
V2 = _mm256_set_epi64x( CB3, CB2, CB1, CB0 );
|
||||
V3 = _mm256_set_epi64x( T1 ^ CB7, T1 ^ CB6, T0 ^ CB5, T0 ^ CB4 );
|
||||
|
||||
M0 = bswap_64( buf[ 0] );
|
||||
M1 = bswap_64( buf[ 1] );
|
||||
M2 = bswap_64( buf[ 2] );
|
||||
M3 = bswap_64( buf[ 3] );
|
||||
M4 = bswap_64( buf[ 4] );
|
||||
M5 = bswap_64( buf[ 5] );
|
||||
M6 = bswap_64( buf[ 6] );
|
||||
M7 = bswap_64( buf[ 7] );
|
||||
M8 = bswap_64( buf[ 8] );
|
||||
M9 = bswap_64( buf[ 9] );
|
||||
MA = bswap_64( buf[10] );
|
||||
MB = bswap_64( buf[11] );
|
||||
MC = bswap_64( buf[12] );
|
||||
MD = bswap_64( buf[13] );
|
||||
ME = bswap_64( buf[14] );
|
||||
MF = bswap_64( buf[15] );
|
||||
|
||||
BLAKE512_ROUND( 0 );
|
||||
BLAKE512_ROUND( 1 );
|
||||
BLAKE512_ROUND( 2 );
|
||||
BLAKE512_ROUND( 3 );
|
||||
BLAKE512_ROUND( 4 );
|
||||
BLAKE512_ROUND( 5 );
|
||||
BLAKE512_ROUND( 6 );
|
||||
BLAKE512_ROUND( 7 );
|
||||
BLAKE512_ROUND( 8 );
|
||||
BLAKE512_ROUND( 9 );
|
||||
BLAKE512_ROUND( 0 );
|
||||
BLAKE512_ROUND( 1 );
|
||||
BLAKE512_ROUND( 2 );
|
||||
BLAKE512_ROUND( 3 );
|
||||
BLAKE512_ROUND( 4 );
|
||||
BLAKE512_ROUND( 5 );
|
||||
|
||||
casti_m256i( H, 0 ) = mm256_xor3( casti_m256i( H, 0 ), V0, V2 );
|
||||
casti_m256i( H, 1 ) = mm256_xor3( casti_m256i( H, 1 ), V1, V3 );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define BLAKE512_G( r, Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
|
||||
{ \
|
||||
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
|
||||
_mm_set_epi64x( CBx( r, Sd ) ^ Mx( r, Sc ), \
|
||||
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
|
||||
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
|
||||
Vc = _mm_add_epi64( Vc, Vd ); \
|
||||
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 25 ); \
|
||||
\
|
||||
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
|
||||
_mm_set_epi64x( CBx( r, Sc ) ^ Mx( r, Sd ), \
|
||||
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
|
||||
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
|
||||
Vc = _mm_add_epi64( Vc, Vd ); \
|
||||
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 11 ); \
|
||||
}
|
||||
|
||||
#define BLAKE512_ROUND( R ) \
|
||||
{ \
|
||||
__m128i V32, V23, V67, V76; \
|
||||
BLAKE512_G( R, V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
|
||||
BLAKE512_G( R, V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
|
||||
V32 = mm128_alignr_64( V[3], V[2], 1 ); \
|
||||
V23 = mm128_alignr_64( V[2], V[3], 1 ); \
|
||||
V67 = mm128_alignr_64( V[6], V[7], 1 ); \
|
||||
V76 = mm128_alignr_64( V[7], V[6], 1 ); \
|
||||
BLAKE512_G( R, V[0], V32, V[5], V67, 8, 9, A, B ); \
|
||||
BLAKE512_G( R, V[1], V23, V[4], V76, C, D, E, F ); \
|
||||
V[2] = mm128_alignr_64( V32, V23, 1 ); \
|
||||
V[3] = mm128_alignr_64( V23, V32, 1 ); \
|
||||
V[6] = mm128_alignr_64( V76, V67, 1 ); \
|
||||
V[7] = mm128_alignr_64( V67, V76, 1 ); \
|
||||
}
|
||||
|
||||
void blake512_transform( uint64_t *H, const uint64_t *buf,
|
||||
const uint64_t T0, const uint64_t T1 )
|
||||
{
|
||||
__m128i V[8];
|
||||
uint64_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
|
||||
|
||||
V[0] = casti_m128i( H, 0 );
|
||||
V[1] = casti_m128i( H, 1 );
|
||||
V[2] = casti_m128i( H, 2 );
|
||||
V[3] = casti_m128i( H, 3 );
|
||||
V[4] = _mm_set_epi64x( CB1, CB0 );
|
||||
V[5] = _mm_set_epi64x( CB3, CB2 );
|
||||
V[6] = _mm_set_epi64x( T0 ^ CB5, T0 ^ CB4 );
|
||||
V[7] = _mm_set_epi64x( T1 ^ CB7, T1 ^ CB6 );
|
||||
|
||||
M0 = bswap_64( buf[ 0] );
|
||||
M1 = bswap_64( buf[ 1] );
|
||||
M2 = bswap_64( buf[ 2] );
|
||||
M3 = bswap_64( buf[ 3] );
|
||||
M4 = bswap_64( buf[ 4] );
|
||||
M5 = bswap_64( buf[ 5] );
|
||||
M6 = bswap_64( buf[ 6] );
|
||||
M7 = bswap_64( buf[ 7] );
|
||||
M8 = bswap_64( buf[ 8] );
|
||||
M9 = bswap_64( buf[ 9] );
|
||||
MA = bswap_64( buf[10] );
|
||||
MB = bswap_64( buf[11] );
|
||||
MC = bswap_64( buf[12] );
|
||||
MD = bswap_64( buf[13] );
|
||||
ME = bswap_64( buf[14] );
|
||||
MF = bswap_64( buf[15] );
|
||||
|
||||
BLAKE512_ROUND( 0 );
|
||||
BLAKE512_ROUND( 1 );
|
||||
BLAKE512_ROUND( 2 );
|
||||
BLAKE512_ROUND( 3 );
|
||||
BLAKE512_ROUND( 4 );
|
||||
BLAKE512_ROUND( 5 );
|
||||
BLAKE512_ROUND( 6 );
|
||||
BLAKE512_ROUND( 7 );
|
||||
BLAKE512_ROUND( 8 );
|
||||
BLAKE512_ROUND( 9 );
|
||||
BLAKE512_ROUND( 0 );
|
||||
BLAKE512_ROUND( 1 );
|
||||
BLAKE512_ROUND( 2 );
|
||||
BLAKE512_ROUND( 3 );
|
||||
BLAKE512_ROUND( 4 );
|
||||
BLAKE512_ROUND( 5 );
|
||||
|
||||
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V[0], V[4] );
|
||||
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V[1], V[5] );
|
||||
casti_m128i( H, 2 ) = mm128_xor3( casti_m128i( H, 2 ), V[2], V[6] );
|
||||
casti_m128i( H, 3 ) = mm128_xor3( casti_m128i( H, 3 ), V[3], V[7] );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// Reset a BLAKE-512 context: load the standard IV and clear the
// 128-bit message bit counter (T1:T0) and the buffer fill pointer.
void blake512_init( blake512_context *sc )
{
   memcpy( sc->H, BLAKE512_IV, 8 * sizeof(uint64_t) );
   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
}
|
||||
|
||||
// Absorb len bytes of message data.  Data is staged in sc->buf; each time
// the 128-byte buffer fills, the bit counter is advanced by 1024 (with
// carry from T0 into T1) and one compression is run.
void blake512_update(blake512_context *sc, const void *data, size_t len)
{
   // Fast path: data fits in the buffer without filling it, so no
   // compression is needed yet.
   if ( len < (sizeof sc->buf) - sc->ptr )
   {
      memcpy( sc->buf + sc->ptr, data, len );
      sc->ptr += len;
      return;
   }

   while ( len > 0 )
   {
      size_t clen;

      // Copy as much as fits in the remaining buffer space.
      clen = (sizeof sc->buf) - sc->ptr;
      if ( clen > len ) clen = len;
      memcpy( sc->buf + sc->ptr, data, clen );
      sc->ptr += clen;
      data = (const unsigned char *)data + clen;
      len -= clen;
      if ( sc->ptr == sizeof sc->buf )
      {
         // Buffer full: bump the bit counter by one block (1024 bits),
         // propagating the carry into T1 on wraparound, then compress.
         if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
            sc->T1 += 1;
         blake512_transform( sc->H, (uint64_t*)sc->buf, sc->T0, sc->T1 );
         sc->ptr = 0;
      }
   }
}
|
||||
|
||||
// Finalize the hash: apply BLAKE-512 padding (0x80 marker bit, zero fill,
// a final '1' bit in byte 111, then the 128-bit big-endian bit length)
// and write the 64-byte big-endian digest to dst.
//
// The counter pre-adjustments below compensate for blake512_update
// advancing T0 by a full 1024 bits per compression: for padding-only
// blocks the counter must not count pad bits, which is achieved by
// pre-biasing T0/T1 so the +1024 inside update lands on the right value.
void blake512_close( blake512_context *sc, void *dst )
{
   unsigned char buf[128] __attribute__((aligned(32)));
   size_t ptr, k;
   unsigned bit_len;
   uint64_t th, tl;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
   buf[ptr] = 0x80;                    // first padding byte: high bit set
   tl = sc->T0 + bit_len;              // true total message length (low)
   th = sc->T1;                        // true total message length (high)

   if ( ptr == 0 )
   {
      // Empty pending block: the closing block contains no message bits.
      sc->T0 = 0xFFFFFFFFFFFFFC00;
      sc->T1 = 0xFFFFFFFFFFFFFFFF;
   }
   else if ( sc->T0 == 0 )
   {
      // T0 just wrapped; borrow from T1 for the partial final block.
      sc->T0 = 0xFFFFFFFFFFFFFC00 + bit_len;
      sc->T1 -= 1;
   }
   else
      sc->T0 -= 1024 - bit_len;

   if ( bit_len <= 894 )
   {
      // Message, padding and 128-bit length all fit in one final block.
      memset( buf + ptr + 1, 0, 111 - ptr );
      buf[111] |= 1;                   // closing '1' bit before the length
      *((uint64_t*)(buf + 112)) = bswap_64( th );
      *((uint64_t*)(buf + 120)) = bswap_64( tl );
      blake512_update( sc, buf + ptr, 128 - ptr );
   }
   else
   {
      // Not enough room for the length field: pad out this block, then
      // emit a second, length-only block with a zeroed bit counter.
      memset( buf + ptr + 1, 0, 127 - ptr );
      blake512_update( sc, buf + ptr, 128 - ptr );
      sc->T0 = 0xFFFFFFFFFFFFFC00;
      sc->T1 = 0xFFFFFFFFFFFFFFFF;
      memset( buf, 0, 112 );
      buf[111] = 1;
      *(uint64_t*)(buf + 112) = bswap_64( th );
      *(uint64_t*)(buf + 120) = bswap_64( tl );
      blake512_update( sc, buf, 128 );
   }

   // Digest is the chaining value, emitted big-endian.
   for ( k = 0; k < 8; k ++ )
      ((uint64_t*)dst)[k] = bswap_64( sc->H[k] );
}
|
||||
|
||||
// One-shot convenience wrapper: hash len bytes of data with BLAKE-512
// and write the 64-byte digest to dst.  sc is fully (re)initialized,
// so any prior context state is discarded.
void blake512_full( blake512_context *sc, void *dst, const void *data,
                    size_t len )
{
   blake512_init( sc );
   blake512_update( sc, data, len );
   blake512_close( sc, dst );
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#define READ_STATE64(state) do { \
|
||||
H0 = (state)->H[0]; \
|
||||
H1 = (state)->H[1]; \
|
||||
@@ -277,12 +562,12 @@ static const uint64_t CB[16] = {
|
||||
#define GB_8WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
{ \
|
||||
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
|
||||
_mm512_set1_epi64( c1 ), m0 ), b ), a ); \
|
||||
v512_64( c1 ), m0 ), b ), a ); \
|
||||
d = mm512_swap64_32( _mm512_xor_si512( d, a ) ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \
|
||||
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
|
||||
_mm512_set1_epi64( c0 ), m1 ), b ), a ); \
|
||||
v512_64( c0 ), m1 ), b ), a ); \
|
||||
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \
|
||||
@@ -318,14 +603,14 @@ static const uint64_t CB[16] = {
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm512_set1_epi64( CB0 ); \
|
||||
V9 = _mm512_set1_epi64( CB1 ); \
|
||||
VA = _mm512_set1_epi64( CB2 ); \
|
||||
VB = _mm512_set1_epi64( CB3 ); \
|
||||
VC = _mm512_set1_epi64( T0 ^ CB4 ); \
|
||||
VD = _mm512_set1_epi64( T0 ^ CB5 ); \
|
||||
VE = _mm512_set1_epi64( T1 ^ CB6 ); \
|
||||
VF = _mm512_set1_epi64( T1 ^ CB7 ); \
|
||||
V8 = v512_64( CB0 ); \
|
||||
V9 = v512_64( CB1 ); \
|
||||
VA = v512_64( CB2 ); \
|
||||
VB = v512_64( CB3 ); \
|
||||
VC = v512_64( T0 ^ CB4 ); \
|
||||
VD = v512_64( T0 ^ CB5 ); \
|
||||
VE = v512_64( T1 ^ CB6 ); \
|
||||
VF = v512_64( T1 ^ CB7 ); \
|
||||
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
|
||||
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
||||
@@ -385,14 +670,14 @@ void blake512_8way_compress( blake_8way_big_context *sc )
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm512_set1_epi64( CB0 );
|
||||
V9 = _mm512_set1_epi64( CB1 );
|
||||
VA = _mm512_set1_epi64( CB2 );
|
||||
VB = _mm512_set1_epi64( CB3 );
|
||||
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
|
||||
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
|
||||
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
|
||||
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
|
||||
V8 = v512_64( CB0 );
|
||||
V9 = v512_64( CB1 );
|
||||
VA = v512_64( CB2 );
|
||||
VB = v512_64( CB3 );
|
||||
VC = v512_64( sc->T0 ^ CB4 );
|
||||
VD = v512_64( sc->T0 ^ CB5 );
|
||||
VE = v512_64( sc->T1 ^ CB6 );
|
||||
VF = v512_64( sc->T1 ^ CB7 );
|
||||
|
||||
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
|
||||
@@ -457,14 +742,14 @@ void blake512_8way_compress_le( blake_8way_big_context *sc )
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm512_set1_epi64( CB0 );
|
||||
V9 = _mm512_set1_epi64( CB1 );
|
||||
VA = _mm512_set1_epi64( CB2 );
|
||||
VB = _mm512_set1_epi64( CB3 );
|
||||
VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
|
||||
VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
|
||||
VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
|
||||
VF = _mm512_set1_epi64( sc->T1 ^ CB7 );
|
||||
V8 = v512_64( CB0 );
|
||||
V9 = v512_64( CB1 );
|
||||
VA = v512_64( CB2 );
|
||||
VB = v512_64( CB3 );
|
||||
VC = v512_64( sc->T0 ^ CB4 );
|
||||
VD = v512_64( sc->T0 ^ CB5 );
|
||||
VE = v512_64( sc->T1 ^ CB6 );
|
||||
VF = v512_64( sc->T1 ^ CB7 );
|
||||
|
||||
M0 = sc->buf[ 0];
|
||||
M1 = sc->buf[ 1];
|
||||
@@ -519,23 +804,23 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
|
||||
|
||||
// initial hash
|
||||
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
// fill buffer
|
||||
memcpy_512( sc->buf, (__m512i*)data, 80>>3 );
|
||||
sc->buf[10] = _mm512_set1_epi64( 0x8000000000000000ULL );
|
||||
sc->buf[10] = v512_64( 0x8000000000000000ULL );
|
||||
sc->buf[11] =
|
||||
sc->buf[12] = m512_zero;
|
||||
sc->buf[13] = m512_one_64;
|
||||
sc->buf[13] = v512_64( 1 );
|
||||
sc->buf[14] = m512_zero;
|
||||
sc->buf[15] = _mm512_set1_epi64( 80*8 );
|
||||
sc->buf[15] = v512_64( 80*8 );
|
||||
|
||||
// build working variables
|
||||
V0 = sc->H[0];
|
||||
@@ -546,14 +831,14 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm512_set1_epi64( CB0 );
|
||||
V9 = _mm512_set1_epi64( CB1 );
|
||||
VA = _mm512_set1_epi64( CB2 );
|
||||
VB = _mm512_set1_epi64( CB3 );
|
||||
VC = _mm512_set1_epi64( CB4 ^ 0x280ULL );
|
||||
VD = _mm512_set1_epi64( CB5 ^ 0x280ULL );
|
||||
VE = _mm512_set1_epi64( CB6 );
|
||||
VF = _mm512_set1_epi64( CB7 );
|
||||
V8 = v512_64( CB0 );
|
||||
V9 = v512_64( CB1 );
|
||||
VA = v512_64( CB2 );
|
||||
VB = v512_64( CB3 );
|
||||
VC = v512_64( CB4 ^ 0x280ULL );
|
||||
VD = v512_64( CB5 ^ 0x280ULL );
|
||||
VE = v512_64( CB6 );
|
||||
VF = v512_64( CB7 );
|
||||
|
||||
// round 0
|
||||
GB_8WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
|
||||
@@ -565,7 +850,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
// GB_8WAY( sc->buf[ 8], sc->buf[ 9], CBx(0, 8), CBx(0, 9), V0, V5, VA, VF );
|
||||
|
||||
V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512(
|
||||
_mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
|
||||
v512_64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
|
||||
VF = mm512_swap64_32( _mm512_xor_si512( VF, V0 ) );
|
||||
VA = _mm512_add_epi64( VA, VF );
|
||||
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 );
|
||||
@@ -578,7 +863,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
// round 1
|
||||
// G1
|
||||
// GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD);
|
||||
V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( CB8 ),
|
||||
V1 = _mm512_add_epi64( V1, _mm512_xor_si512( v512_64( CB8 ),
|
||||
sc->buf[ 4] ) );
|
||||
|
||||
// G2
|
||||
@@ -588,7 +873,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
// G3
|
||||
// GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF);
|
||||
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
|
||||
_mm512_set1_epi64( CB6 ), sc->buf[13] ), V7 ) );
|
||||
v512_64( CB6 ), sc->buf[13] ), V7 ) );
|
||||
|
||||
// save midstate for second part
|
||||
midstate[ 0] = V0;
|
||||
@@ -656,7 +941,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
|
||||
// finish round 0 with the nonce now available
|
||||
V0 = _mm512_add_epi64( V0, _mm512_xor_si512(
|
||||
_mm512_set1_epi64( CB8 ), M9 ) );
|
||||
v512_64( CB8 ), M9 ) );
|
||||
VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 16 );
|
||||
VA = _mm512_add_epi64( VA, VF );
|
||||
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 11 );
|
||||
@@ -667,14 +952,14 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
|
||||
// G1
|
||||
// GB_8WAY(Mx(1, 2), Mx(1, 3), CBx(1, 2), CBx(1, 3), V1, V5, V9, VD);
|
||||
// V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 );
|
||||
// V1 = _mm512_add_epi64( V1, _mm512_xor_si512( v512_64( c1 ), m0 );
|
||||
|
||||
V1 = _mm512_add_epi64( V1, V5 );
|
||||
VD = mm512_swap64_32( _mm512_xor_si512( VD, V1 ) );
|
||||
V9 = _mm512_add_epi64( V9, VD );
|
||||
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 );
|
||||
V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512(
|
||||
_mm512_set1_epi64( CBx(1,2) ), Mx(1,3) ), V5 ) );
|
||||
v512_64( CBx(1,2) ), Mx(1,3) ), V5 ) );
|
||||
VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 16 );
|
||||
V9 = _mm512_add_epi64( V9, VD );
|
||||
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 11 );
|
||||
@@ -683,12 +968,12 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
// GB_8WAY(Mx(1, 4), Mx(1, 5), CBx(1, 4), CBx(1, 5), V2, V6, VA, VE);
|
||||
// V2 = _mm512_add_epi64( V2, V6 );
|
||||
V2 = _mm512_add_epi64( V2, _mm512_xor_si512(
|
||||
_mm512_set1_epi64( CBF ), M9 ) );
|
||||
v512_64( CBF ), M9 ) );
|
||||
VE = mm512_swap64_32( _mm512_xor_si512( VE, V2 ) );
|
||||
VA = _mm512_add_epi64( VA, VE );
|
||||
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 );
|
||||
V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512(
|
||||
_mm512_set1_epi64( CB9 ), MF ), V6 ) );
|
||||
v512_64( CB9 ), MF ), V6 ) );
|
||||
VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 16 );
|
||||
VA = _mm512_add_epi64( VA, VE );
|
||||
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 11 );
|
||||
@@ -696,13 +981,13 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
// G3
|
||||
// GB_8WAY(Mx(1, 6), Mx(1, 7), CBx(1, 6), CBx(1, 7), V3, V7, VB, VF);
|
||||
// V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
|
||||
// _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );
|
||||
// v512_64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );
|
||||
|
||||
VF = mm512_swap64_32( _mm512_xor_si512( VF, V3 ) );
|
||||
VB = _mm512_add_epi64( VB, VF );
|
||||
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 );
|
||||
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
|
||||
_mm512_set1_epi64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
|
||||
v512_64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
|
||||
VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 16 );
|
||||
VB = _mm512_add_epi64( VB, VF );
|
||||
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 11 );
|
||||
@@ -744,14 +1029,14 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
|
||||
void blake512_8way_init( blake_8way_big_context *sc )
|
||||
{
|
||||
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
@@ -815,7 +1100,7 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
buf[ptr>>3] = _mm512_set1_epi64( 0x80 );
|
||||
buf[ptr>>3] = v512_64( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
@@ -836,9 +1121,9 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
|
||||
{
|
||||
memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||
buf[104>>3] = _mm512_or_si512( buf[104>>3],
|
||||
_mm512_set1_epi64( 0x0100000000000000ULL ) );
|
||||
buf[112>>3] = _mm512_set1_epi64( bswap_64( th ) );
|
||||
buf[120>>3] = _mm512_set1_epi64( bswap_64( tl ) );
|
||||
v512_64( 0x0100000000000000ULL ) );
|
||||
buf[112>>3] = v512_64( bswap_64( th ) );
|
||||
buf[120>>3] = v512_64( bswap_64( tl ) );
|
||||
|
||||
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
}
|
||||
@@ -850,9 +1135,9 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
|
||||
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
|
||||
memset_zero_512( buf, 112>>3 );
|
||||
buf[104>>3] = _mm512_set1_epi64( 0x0100000000000000ULL );
|
||||
buf[112>>3] = _mm512_set1_epi64( bswap_64( th ) );
|
||||
buf[120>>3] = _mm512_set1_epi64( bswap_64( tl ) );
|
||||
buf[104>>3] = v512_64( 0x0100000000000000ULL );
|
||||
buf[112>>3] = v512_64( bswap_64( th ) );
|
||||
buf[120>>3] = v512_64( bswap_64( tl ) );
|
||||
|
||||
blake64_8way( sc, buf, 128 );
|
||||
}
|
||||
@@ -866,14 +1151,14 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
|
||||
// init
|
||||
|
||||
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
@@ -897,7 +1182,7 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
uint64_t th, tl;
|
||||
|
||||
bit_len = sc->ptr << 3;
|
||||
sc->buf[ptr64] = _mm512_set1_epi64( 0x80 );
|
||||
sc->buf[ptr64] = v512_64( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
|
||||
@@ -915,9 +1200,9 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
|
||||
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
|
||||
sc->buf[13] = _mm512_set1_epi64( 0x0100000000000000ULL );
|
||||
sc->buf[14] = _mm512_set1_epi64( bswap_64( th ) );
|
||||
sc->buf[15] = _mm512_set1_epi64( bswap_64( tl ) );
|
||||
sc->buf[13] = v512_64( 0x0100000000000000ULL );
|
||||
sc->buf[14] = v512_64( bswap_64( th ) );
|
||||
sc->buf[15] = v512_64( bswap_64( tl ) );
|
||||
|
||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||
sc->T1 = sc->T1 + 1;
|
||||
@@ -933,14 +1218,14 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
|
||||
|
||||
// init
|
||||
|
||||
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
|
||||
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = v512_64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = v512_64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = v512_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = v512_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
@@ -964,7 +1249,7 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
|
||||
uint64_t th, tl;
|
||||
|
||||
bit_len = sc->ptr << 3;
|
||||
sc->buf[ptr64] = _mm512_set1_epi64( 0x8000000000000000ULL );
|
||||
sc->buf[ptr64] = v512_64( 0x8000000000000000ULL );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
|
||||
@@ -982,9 +1267,9 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
|
||||
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
|
||||
sc->buf[13] = m512_one_64;
|
||||
sc->buf[14] = _mm512_set1_epi64( th );
|
||||
sc->buf[15] = _mm512_set1_epi64( tl );
|
||||
sc->buf[13] = v512_64( 1 );
|
||||
sc->buf[14] = v512_64( th );
|
||||
sc->buf[15] = v512_64( tl );
|
||||
|
||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||
sc->T1 = sc->T1 + 1;
|
||||
@@ -1013,12 +1298,12 @@ blake512_8way_close(void *cc, void *dst)
|
||||
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) \
|
||||
{ \
|
||||
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
|
||||
_mm256_set1_epi64x( c1 ), m0 ), b ), a ); \
|
||||
v256_64( c1 ), m0 ), b ), a ); \
|
||||
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
|
||||
c = _mm256_add_epi64( c, d ); \
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
|
||||
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
|
||||
_mm256_set1_epi64x( c0 ), m1 ), b ), a ); \
|
||||
v256_64( c0 ), m1 ), b ), a ); \
|
||||
d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
|
||||
c = _mm256_add_epi64( c, d ); \
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
|
||||
@@ -1054,14 +1339,14 @@ blake512_8way_close(void *cc, void *dst)
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_set1_epi64x( CB0 ); \
|
||||
V9 = _mm256_set1_epi64x( CB1 ); \
|
||||
VA = _mm256_set1_epi64x( CB2 ); \
|
||||
VB = _mm256_set1_epi64x( CB3 ); \
|
||||
VC = _mm256_set1_epi64x( T0 ^ CB4 ); \
|
||||
VD = _mm256_set1_epi64x( T0 ^ CB5 ); \
|
||||
VE = _mm256_set1_epi64x( T1 ^ CB6 ); \
|
||||
VF = _mm256_set1_epi64x( T1 ^ CB7 ); \
|
||||
V8 = v256_64( CB0 ); \
|
||||
V9 = v256_64( CB1 ); \
|
||||
VA = v256_64( CB2 ); \
|
||||
VB = v256_64( CB3 ); \
|
||||
VC = v256_64( T0 ^ CB4 ); \
|
||||
VD = v256_64( T0 ^ CB5 ); \
|
||||
VE = v256_64( T1 ^ CB6 ); \
|
||||
VF = v256_64( T1 ^ CB7 ); \
|
||||
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x( \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
|
||||
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
||||
@@ -1122,18 +1407,18 @@ void blake512_4way_compress( blake_4way_big_context *sc )
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm256_set1_epi64x( CB0 );
|
||||
V9 = _mm256_set1_epi64x( CB1 );
|
||||
VA = _mm256_set1_epi64x( CB2 );
|
||||
VB = _mm256_set1_epi64x( CB3 );
|
||||
VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
|
||||
_mm256_set1_epi64x( CB4 ) );
|
||||
VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
|
||||
_mm256_set1_epi64x( CB5 ) );
|
||||
VE = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
|
||||
_mm256_set1_epi64x( CB6 ) );
|
||||
VF = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
|
||||
_mm256_set1_epi64x( CB7 ) );
|
||||
V8 = v256_64( CB0 );
|
||||
V9 = v256_64( CB1 );
|
||||
VA = v256_64( CB2 );
|
||||
VB = v256_64( CB3 );
|
||||
VC = _mm256_xor_si256( v256_64( sc->T0 ),
|
||||
v256_64( CB4 ) );
|
||||
VD = _mm256_xor_si256( v256_64( sc->T0 ),
|
||||
v256_64( CB5 ) );
|
||||
VE = _mm256_xor_si256( v256_64( sc->T1 ),
|
||||
v256_64( CB6 ) );
|
||||
VF = _mm256_xor_si256( v256_64( sc->T1 ),
|
||||
v256_64( CB7 ) );
|
||||
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
|
||||
|
||||
@@ -1188,23 +1473,23 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
|
||||
|
||||
// initial hash
|
||||
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
|
||||
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
|
||||
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
|
||||
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
|
||||
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = v256_64( 0x3C6EF372FE94F82B );
|
||||
casti_m256i( sc->H, 3 ) = v256_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m256i( sc->H, 4 ) = v256_64( 0x510E527FADE682D1 );
|
||||
casti_m256i( sc->H, 5 ) = v256_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = v256_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = v256_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
// fill buffer
|
||||
memcpy_256( sc->buf, (__m256i*)data, 80>>3 );
|
||||
sc->buf[10] = _mm256_set1_epi64x( 0x8000000000000000ULL );
|
||||
sc->buf[10] = v256_64( 0x8000000000000000ULL );
|
||||
sc->buf[11] = m256_zero;
|
||||
sc->buf[12] = m256_zero;
|
||||
sc->buf[13] = m256_one_64;
|
||||
sc->buf[13] = v256_64( 1 );
|
||||
sc->buf[14] = m256_zero;
|
||||
sc->buf[15] = _mm256_set1_epi64x( 80*8 );
|
||||
sc->buf[15] = v256_64( 80*8 );
|
||||
|
||||
// build working variables
|
||||
V0 = sc->H[0];
|
||||
@@ -1215,14 +1500,14 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm256_set1_epi64x( CB0 );
|
||||
V9 = _mm256_set1_epi64x( CB1 );
|
||||
VA = _mm256_set1_epi64x( CB2 );
|
||||
VB = _mm256_set1_epi64x( CB3 );
|
||||
VC = _mm256_set1_epi64x( CB4 ^ 0x280ULL );
|
||||
VD = _mm256_set1_epi64x( CB5 ^ 0x280ULL );
|
||||
VE = _mm256_set1_epi64x( CB6 );
|
||||
VF = _mm256_set1_epi64x( CB7 );
|
||||
V8 = v256_64( CB0 );
|
||||
V9 = v256_64( CB1 );
|
||||
VA = v256_64( CB2 );
|
||||
VB = v256_64( CB3 );
|
||||
VC = v256_64( CB4 ^ 0x280ULL );
|
||||
VD = v256_64( CB5 ^ 0x280ULL );
|
||||
VE = v256_64( CB6 );
|
||||
VF = v256_64( CB7 );
|
||||
|
||||
// round 0
|
||||
GB_4WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC );
|
||||
@@ -1232,7 +1517,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
|
||||
// G4 skip nonce
|
||||
V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256(
|
||||
_mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 );
|
||||
v256_64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
|
||||
VF = mm256_swap64_32( _mm256_xor_si256( VF, V0 ) );
|
||||
VA = _mm256_add_epi64( VA, VF );
|
||||
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 );
|
||||
@@ -1244,7 +1529,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
|
||||
// round 1
|
||||
// G1
|
||||
V1 = _mm256_add_epi64( V1, _mm256_xor_si256( _mm256_set1_epi64x( CB8 ),
|
||||
V1 = _mm256_add_epi64( V1, _mm256_xor_si256( v256_64( CB8 ),
|
||||
sc->buf[ 4] ) );
|
||||
|
||||
// G2
|
||||
@@ -1252,7 +1537,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
|
||||
// G3
|
||||
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
|
||||
_mm256_set1_epi64x( CB6 ), sc->buf[13] ), V7 ) );
|
||||
v256_64( CB6 ), sc->buf[13] ), V7 ) );
|
||||
|
||||
// save midstate for second part
|
||||
midstate[ 0] = V0;
|
||||
@@ -1319,7 +1604,7 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||
|
||||
// finish round 0, with the nonce now available
|
||||
V0 = _mm256_add_epi64( V0, _mm256_xor_si256(
|
||||
_mm256_set1_epi64x( CB8 ), M9 ) );
|
||||
v256_64( CB8 ), M9 ) );
|
||||
VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V0 ) );
|
||||
VA = _mm256_add_epi64( VA, VF );
|
||||
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 );
|
||||
@@ -1334,19 +1619,19 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||
V9 = _mm256_add_epi64( V9, VD );
|
||||
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 );
|
||||
V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256(
|
||||
_mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) );
|
||||
v256_64( CBx(1,2) ), Mx(1,3) ), V5 ) );
|
||||
VD = mm256_shuflr64_16( _mm256_xor_si256( VD, V1 ) );
|
||||
V9 = _mm256_add_epi64( V9, VD );
|
||||
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 );
|
||||
|
||||
// G2
|
||||
V2 = _mm256_add_epi64( V2, _mm256_xor_si256(
|
||||
_mm256_set1_epi64x( CBF ), M9 ) );
|
||||
v256_64( CBF ), M9 ) );
|
||||
VE = mm256_swap64_32( _mm256_xor_si256( VE, V2 ) );
|
||||
VA = _mm256_add_epi64( VA, VE );
|
||||
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 );
|
||||
V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256(
|
||||
_mm256_set1_epi64x( CB9 ), MF ), V6 ) );
|
||||
v256_64( CB9 ), MF ), V6 ) );
|
||||
VE = mm256_shuflr64_16( _mm256_xor_si256( VE, V2 ) );
|
||||
VA = _mm256_add_epi64( VA, VE );
|
||||
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 );
|
||||
@@ -1356,7 +1641,7 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||
VB = _mm256_add_epi64( VB, VF );
|
||||
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 );
|
||||
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
|
||||
_mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
|
||||
v256_64( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
|
||||
VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V3 ) );
|
||||
VB = _mm256_add_epi64( VB, VF );
|
||||
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 );
|
||||
@@ -1398,14 +1683,14 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||
|
||||
void blake512_4way_init( blake_4way_big_context *sc )
|
||||
{
|
||||
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
|
||||
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
|
||||
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
|
||||
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
|
||||
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = v256_64( 0x3C6EF372FE94F82B );
|
||||
casti_m256i( sc->H, 3 ) = v256_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m256i( sc->H, 4 ) = v256_64( 0x510E527FADE682D1 );
|
||||
casti_m256i( sc->H, 5 ) = v256_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = v256_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = v256_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
@@ -1465,7 +1750,7 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
|
||||
buf[ptr>>3] = v256_64( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
@@ -1487,9 +1772,9 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||
buf[104>>3] = _mm256_or_si256( buf[104>>3],
|
||||
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
|
||||
buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
|
||||
buf[120>>3] = _mm256_set1_epi64x( bswap_64( tl ) );
|
||||
v256_64( 0x0100000000000000ULL ) );
|
||||
buf[112>>3] = v256_64( bswap_64( th ) );
|
||||
buf[120>>3] = v256_64( bswap_64( tl ) );
|
||||
|
||||
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
}
|
||||
@@ -1501,9 +1786,9 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
|
||||
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
|
||||
memset_zero_256( buf, 112>>3 );
|
||||
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
|
||||
buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
|
||||
buf[120>>3] = _mm256_set1_epi64x( bswap_64( tl ) );
|
||||
buf[104>>3] = v256_64( 0x0100000000000000ULL );
|
||||
buf[112>>3] = v256_64( bswap_64( th ) );
|
||||
buf[120>>3] = v256_64( bswap_64( tl ) );
|
||||
|
||||
blake64_4way( sc, buf, 128 );
|
||||
}
|
||||
@@ -1517,14 +1802,14 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
|
||||
// init
|
||||
|
||||
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
|
||||
casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
|
||||
casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
|
||||
casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
|
||||
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = v256_64( 0x3C6EF372FE94F82B );
|
||||
casti_m256i( sc->H, 3 ) = v256_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m256i( sc->H, 4 ) = v256_64( 0x510E527FADE682D1 );
|
||||
casti_m256i( sc->H, 5 ) = v256_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = v256_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = v256_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
@@ -1548,7 +1833,7 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
uint64_t th, tl;
|
||||
|
||||
bit_len = sc->ptr << 3;
|
||||
sc->buf[ptr64] = _mm256_set1_epi64x( 0x80 );
|
||||
sc->buf[ptr64] = v256_64( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if ( sc->ptr == 0 )
|
||||
@@ -1565,9 +1850,9 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
|
||||
memset_zero_256( sc->buf + ptr64 + 1, 13 - ptr64 );
|
||||
sc->buf[13] = _mm256_set1_epi64x( 0x0100000000000000ULL );
|
||||
sc->buf[14] = _mm256_set1_epi64x( bswap_64( th ) );
|
||||
sc->buf[15] = _mm256_set1_epi64x( bswap_64( tl ) );
|
||||
sc->buf[13] = v256_64( 0x0100000000000000ULL );
|
||||
sc->buf[14] = v256_64( bswap_64( th ) );
|
||||
sc->buf[15] = v256_64( bswap_64( tl ) );
|
||||
|
||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||
sc->T1 = sc->T1 + 1;
|
||||
@@ -1589,4 +1874,4 @@ blake512_4way_close(void *cc, void *dst)
|
||||
blake64_4way_close( cc, dst );
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // AVX2
|
83
algo/blake/blake512-hash.h
Normal file
83
algo/blake/blake512-hash.h
Normal file
@@ -0,0 +1,83 @@
|
||||
#ifndef BLAKE512_HASH__
|
||||
#define BLAKE512_HASH__ 1
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
/////////////////////////
|
||||
//
|
||||
// Blake-512 1 way SSE2 & AVX2
|
||||
|
||||
typedef struct {
|
||||
unsigned char buf[128]; /* first field, for alignment */
|
||||
uint64_t H[8];
|
||||
uint64_t T0, T1;
|
||||
size_t ptr;
|
||||
} blake512_context __attribute__ ((aligned (32)));
|
||||
|
||||
void blake512_transform( uint64_t *H, const uint64_t *buf,
|
||||
const uint64_t T0, const uint64_t T1 );
|
||||
void blake512_init( blake512_context *sc );
|
||||
void blake512_update( blake512_context *sc, const void *data, size_t len );
|
||||
void blake512_close( blake512_context *sc, void *dst );
|
||||
void blake512_full( blake512_context *sc, void *dst, const void *data,
|
||||
size_t len );
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
// Blake-512 4 way AVX2
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16];
|
||||
__m256i H[8];
|
||||
__m256i S[4];
|
||||
size_t ptr;
|
||||
uint64_t T0, T1;
|
||||
} blake_4way_big_context __attribute__ ((aligned (64)));
|
||||
|
||||
typedef blake_4way_big_context blake512_4way_context;
|
||||
|
||||
void blake512_4way_init( blake_4way_big_context *sc );
|
||||
void blake512_4way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_4way_close( void *cc, void *dst );
|
||||
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
const void *data );
|
||||
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||
const __m256i nonce, const __m256i *midstate );
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
////////////////////////////
|
||||
//
|
||||
//// Blake-512 8 way AVX512
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[8];
|
||||
__m512i S[4];
|
||||
size_t ptr;
|
||||
uint64_t T0, T1;
|
||||
} blake_8way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_8way_big_context blake512_8way_context;
|
||||
|
||||
void blake512_8way_init( blake_8way_big_context *sc );
|
||||
void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_8way_close( void *cc, void *dst );
|
||||
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
const void *data );
|
||||
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
const __m512i nonce, const __m512i *midstate );
|
||||
|
||||
#endif // AVX512
|
||||
#endif // AVX2
|
||||
|
||||
#endif // BLAKE512_HASH_H__
|
@@ -1,5 +1,5 @@
|
||||
#include "blakecoin-gate.h"
|
||||
#include "blake-hash-4way.h"
|
||||
#include "blake256-hash.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
@@ -30,25 +30,25 @@ int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
const __m512i sixteen = v512_32( 16 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, rounds );
|
||||
|
||||
block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm512_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm512_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm512_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm512_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm512_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm512_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm512_set1_epi32( phash[7] );
|
||||
block0_hash[0] = v512_32( phash[0] );
|
||||
block0_hash[1] = v512_32( phash[1] );
|
||||
block0_hash[2] = v512_32( phash[2] );
|
||||
block0_hash[3] = v512_32( phash[3] );
|
||||
block0_hash[4] = v512_32( phash[4] );
|
||||
block0_hash[5] = v512_32( phash[5] );
|
||||
block0_hash[6] = v512_32( phash[6] );
|
||||
block0_hash[7] = v512_32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
block_buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
block_buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
block_buf[0] = v512_32( pdata[16] );
|
||||
block_buf[1] = v512_32( pdata[17] );
|
||||
block_buf[2] = v512_32( pdata[18] );
|
||||
block_buf[3] =
|
||||
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
@@ -101,25 +101,25 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
const __m256i eight = v256_32( 8 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, rounds );
|
||||
|
||||
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
block0_hash[0] = v256_32( phash[0] );
|
||||
block0_hash[1] = v256_32( phash[1] );
|
||||
block0_hash[2] = v256_32( phash[2] );
|
||||
block0_hash[3] = v256_32( phash[3] );
|
||||
block0_hash[4] = v256_32( phash[4] );
|
||||
block0_hash[5] = v256_32( phash[5] );
|
||||
block0_hash[6] = v256_32( phash[6] );
|
||||
block0_hash[7] = v256_32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[0] = _mm256_set1_epi32( pdata[16] );
|
||||
block_buf[1] = _mm256_set1_epi32( pdata[17] );
|
||||
block_buf[2] = _mm256_set1_epi32( pdata[18] );
|
||||
block_buf[0] = v256_32( pdata[16] );
|
||||
block_buf[1] = v256_32( pdata[17] );
|
||||
block_buf[2] = v256_32( pdata[18] );
|
||||
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||
|
||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||
@@ -203,142 +203,3 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
//#if defined(BLAKECOIN_8WAY)
|
||||
|
||||
blake256r8_8way_context blakecoin_8w_ctx;
|
||||
|
||||
void blakecoin_8way_hash( void *state, const void *input )
|
||||
{
|
||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||
blake256r8_8way_context ctx;
|
||||
|
||||
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
|
||||
blake256r8_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake256r8_8way_close( &ctx, vhash );
|
||||
|
||||
dintrlv_8x32( state, state+ 32, state+ 64, state+ 96, state+128,
|
||||
state+160, state+192, state+224, vhash, 256 );
|
||||
}
|
||||
|
||||
/*
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash32[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
|
||||
__m256i block0_hash[8] __attribute__ ((aligned (64)));
|
||||
__m256i block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = (uint32_t*)work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, 8 );
|
||||
|
||||
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[0] = _mm256_set1_epi32( pdata[16] );
|
||||
block_buf[1] = _mm256_set1_epi32( pdata[17] );
|
||||
block_buf[2] = _mm256_set1_epi32( pdata[18] );
|
||||
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||
|
||||
// Partialy prehash second block without touching nonces
|
||||
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
|
||||
block_buf );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash32_d7[ lane ] <= targ32_d7 )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
block_buf[3] = _mm256_add_epi32( block_buf[3], eight );
|
||||
n += 8;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash32[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
blake256r8_8way_context ctx __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( ((__m256i*)hash32)[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t HTarget = ptarget[7];
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
if ( opt_benchmark )
|
||||
HTarget = 0x7f;
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake256r8_8way_init( &blakecoin_8w_ctx );
|
||||
blake256r8_8way_update( &blakecoin_8w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
|
||||
blake256r8_8way_update( &ctx, (const void*)vdata + (64<<3), 16 );
|
||||
blake256r8_8way_close( &ctx, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash32_d7[ lane ] <= HTarget )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
|
||||
n += 8;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -6,9 +6,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "blake-hash-4way.h"
|
||||
#include "sph_blake.h"
|
||||
#include "blake512-hash.h"
|
||||
|
||||
extern void pentablakehash_4way( void *output, const void *input )
|
||||
{
|
||||
|
@@ -14,8 +14,9 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "simd-utils.h"
|
||||
#include "compat/sph_types.h"
|
||||
#include "compat.h"
|
||||
#include "sph-blake2s.h"
|
||||
|
||||
static const uint32_t blake2s_IV[8] =
|
||||
@@ -208,8 +209,8 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c
|
||||
|
||||
int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
|
||||
{
|
||||
uint32_t m[16];
|
||||
uint32_t v[16];
|
||||
uint32_t _ALIGN(32) m[16];
|
||||
uint32_t _ALIGN(32) v[16];
|
||||
|
||||
for( size_t i = 0; i < 16; ++i )
|
||||
m[i] = load32( block + i * sizeof( m[i] ) );
|
||||
@@ -225,6 +226,58 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
|
||||
v[13] = S->t[1] ^ blake2s_IV[5];
|
||||
v[14] = S->f[0] ^ blake2s_IV[6];
|
||||
v[15] = S->f[1] ^ blake2s_IV[7];
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
__m128i *V = (__m128i*)v;
|
||||
|
||||
#define BLAKE2S_ROUND( r ) \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
|
||||
m[blake2s_sigma[r][ 6]], m[blake2s_sigma[r][ 4]], \
|
||||
m[blake2s_sigma[r][ 2]], m[blake2s_sigma[r][ 0]] ) ) ); \
|
||||
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
|
||||
m[blake2s_sigma[r][ 7]], m[blake2s_sigma[r][ 5]], \
|
||||
m[blake2s_sigma[r][ 3]], m[blake2s_sigma[r][ 1]] ) ) ); \
|
||||
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
|
||||
V[0] = mm128_shufll_32( V[0] ); \
|
||||
V[3] = mm128_swap_64( V[3] ); \
|
||||
V[2] = mm128_shuflr_32( V[2] ); \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
|
||||
m[blake2s_sigma[r][12]], m[blake2s_sigma[r][10]], \
|
||||
m[blake2s_sigma[r][ 8]], m[blake2s_sigma[r][14]] ) ) ); \
|
||||
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
|
||||
m[blake2s_sigma[r][13]], m[blake2s_sigma[r][11]], \
|
||||
m[blake2s_sigma[r][ 9]], m[blake2s_sigma[r][15]] ) ) ); \
|
||||
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
|
||||
V[0] = mm128_shuflr_32( V[0] ); \
|
||||
V[3] = mm128_swap_64( V[3] ); \
|
||||
V[2] = mm128_shufll_32( V[2] )
|
||||
|
||||
BLAKE2S_ROUND(0);
|
||||
BLAKE2S_ROUND(1);
|
||||
BLAKE2S_ROUND(2);
|
||||
BLAKE2S_ROUND(3);
|
||||
BLAKE2S_ROUND(4);
|
||||
BLAKE2S_ROUND(5);
|
||||
BLAKE2S_ROUND(6);
|
||||
BLAKE2S_ROUND(7);
|
||||
BLAKE2S_ROUND(8);
|
||||
BLAKE2S_ROUND(9);
|
||||
|
||||
#undef BLAKE2S_ROUND
|
||||
|
||||
#else
|
||||
|
||||
#define G(r,i,a,b,c,d) \
|
||||
do { \
|
||||
a = a + b + m[blake2s_sigma[r][2*i+0]]; \
|
||||
@@ -236,6 +289,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
|
||||
c = c + d; \
|
||||
b = SPH_ROTR32(b ^ c, 7); \
|
||||
} while(0)
|
||||
|
||||
#define ROUND(r) \
|
||||
do { \
|
||||
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
|
||||
@@ -247,7 +301,8 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
|
||||
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
|
||||
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
|
||||
} while(0)
|
||||
ROUND( 0 );
|
||||
|
||||
ROUND( 0 );
|
||||
ROUND( 1 );
|
||||
ROUND( 2 );
|
||||
ROUND( 3 );
|
||||
@@ -258,6 +313,8 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
|
||||
ROUND( 8 );
|
||||
ROUND( 9 );
|
||||
|
||||
#endif
|
||||
|
||||
for( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
|
||||
|
||||
|
@@ -42,7 +42,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for BLAKE-224.
|
||||
|
@@ -31,7 +31,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "simd-utils.h"
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
#include "sph_blake2b.h"
|
||||
|
||||
// Little-endian byte access.
|
||||
|
@@ -41,8 +41,6 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_bmw256 256
|
||||
@@ -57,7 +55,7 @@ typedef struct {
|
||||
__m128i buf[64];
|
||||
__m128i H[16];
|
||||
size_t ptr;
|
||||
sph_u32 bit_count; // assume bit_count fits in 32 bits
|
||||
uint32_t bit_count; // assume bit_count fits in 32 bits
|
||||
} bmw_4way_small_context;
|
||||
|
||||
typedef bmw_4way_small_context bmw256_4way_context;
|
||||
@@ -144,7 +142,7 @@ typedef struct {
|
||||
__m256i buf[16];
|
||||
__m256i H[16];
|
||||
size_t ptr;
|
||||
sph_u64 bit_count;
|
||||
uint64_t bit_count;
|
||||
} bmw_4way_big_context __attribute__((aligned(128)));
|
||||
|
||||
typedef bmw_4way_big_context bmw512_4way_context;
|
||||
|
@@ -109,7 +109,7 @@ static const uint32_t IV256[] = {
|
||||
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
|
||||
rol_off_32( M, j, 3 ) ), \
|
||||
rol_off_32( M, j, 10 ) ), \
|
||||
_mm_set1_epi32( ( (j)+16 ) * SPH_C32(0x05555555UL) ) ), \
|
||||
_mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \
|
||||
H[ ( (j)+7 ) & 0xF ] )
|
||||
|
||||
|
||||
@@ -485,7 +485,7 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
|
||||
size_t ptr;
|
||||
const int buf_size = 64; // bytes of one lane, compatible with len
|
||||
|
||||
sc->bit_count += (sph_u32)len << 3;
|
||||
sc->bit_count += (uint32_t)len << 3;
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
h1 = sc->H;
|
||||
|
@@ -45,15 +45,15 @@ extern "C"{
|
||||
|
||||
#define LPAR (
|
||||
|
||||
static const sph_u64 IV512[] = {
|
||||
SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
|
||||
SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
|
||||
SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
|
||||
SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
|
||||
SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
|
||||
SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
|
||||
SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
|
||||
SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
|
||||
static const uint64_t IV512[] = {
|
||||
0x8081828384858687, 0x88898A8B8C8D8E8F,
|
||||
0x9091929394959697, 0x98999A9B9C9D9E9F,
|
||||
0xA0A1A2A3A4A5A6A7, 0xA8A9AAABACADAEAF,
|
||||
0xB0B1B2B3B4B5B6B7, 0xB8B9BABBBCBDBEBF,
|
||||
0xC0C1C2C3C4C5C6C7, 0xC8C9CACBCCCDCECF,
|
||||
0xD0D1D2D3D4D5D6D7, 0xD8D9DADBDCDDDEDF,
|
||||
0xE0E1E2E3E4E5E6E7, 0xE8E9EAEBECEDEEEF,
|
||||
0xF0F1F2F3F4F5F6F7, 0xF8F9FAFBFCFDFEFF
|
||||
};
|
||||
|
||||
#if defined(__SSE2__)
|
||||
@@ -894,7 +894,7 @@ static const __m256i final_b[16] =
|
||||
};
|
||||
|
||||
static void
|
||||
bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
|
||||
bmw64_4way_init( bmw_4way_big_context *sc, const uint64_t *iv )
|
||||
{
|
||||
sc->H[ 0] = _mm256_set1_epi64x( 0x8081828384858687 );
|
||||
sc->H[ 1] = _mm256_set1_epi64x( 0x88898A8B8C8D8E8F );
|
||||
@@ -926,7 +926,7 @@ bmw64_4way( bmw_4way_big_context *sc, const void *data, size_t len )
|
||||
size_t ptr;
|
||||
const int buf_size = 128; // bytes of one lane, compatible with len
|
||||
|
||||
sc->bit_count += (sph_u64)len << 3;
|
||||
sc->bit_count += (uint64_t)len << 3;
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
h1 = sc->H;
|
||||
@@ -1377,7 +1377,7 @@ static const __m512i final_b8[16] =
|
||||
|
||||
|
||||
void bmw512_8way_init( bmw512_8way_context *ctx )
|
||||
//bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
|
||||
//bmw64_4way_init( bmw_4way_big_context *sc, const uint64_t *iv )
|
||||
{
|
||||
ctx->H[ 0] = _mm512_set1_epi64( 0x8081828384858687 );
|
||||
ctx->H[ 1] = _mm512_set1_epi64( 0x88898A8B8C8D8E8F );
|
||||
|
@@ -41,7 +41,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for BMW-224.
|
||||
|
@@ -9,7 +9,6 @@
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#include "cubehash_sse2.h"
|
||||
#include "algo/sha/sha3-defs.h"
|
||||
#include <stdbool.h>
|
||||
#include <unistd.h>
|
||||
#include <memory.h>
|
||||
|
@@ -3,7 +3,7 @@
|
||||
|
||||
#include "compat.h"
|
||||
#include <stdint.h>
|
||||
#include "algo/sha/sha3-defs.h"
|
||||
#include "compat/sha3-defs.h"
|
||||
|
||||
#define OPTIMIZE_SSE2
|
||||
|
||||
|
@@ -42,7 +42,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for CubeHash-224.
|
||||
|
@@ -22,7 +22,7 @@
|
||||
#endif
|
||||
|
||||
|
||||
#include "algo/sha/sha3_common.h"
|
||||
#include "compat/sha3_common.h"
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
|
@@ -73,7 +73,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#define AES_BIG_ENDIAN 0
|
||||
#include "algo/sha/aes_helper.c"
|
||||
#include "compat/aes_helper.c"
|
||||
|
||||
#if SPH_ECHO_64
|
||||
|
||||
|
@@ -43,7 +43,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for ECHO-224.
|
||||
|
@@ -20,7 +20,7 @@
|
||||
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
|
||||
#endif
|
||||
|
||||
#include "algo/sha/sha3_common.h"
|
||||
#include "compat/sha3_common.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
|
||||
|
@@ -2,7 +2,7 @@
|
||||
#define SPH_FUGUE_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
|
@@ -41,7 +41,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for GOST-256.
|
||||
|
@@ -20,8 +20,8 @@
|
||||
#define LENGTH (512)
|
||||
|
||||
#include "brg_endian.h"
|
||||
#define NEED_UINT_64T
|
||||
#include "algo/sha/brg_types.h"
|
||||
//#define NEED_UINT_64T
|
||||
#include "compat/brg_types.h"
|
||||
|
||||
/* some sizes (number of bytes) */
|
||||
#define ROWS (8)
|
||||
|
@@ -34,8 +34,7 @@ typedef crypto_uint64 u64;
|
||||
//#define LENGTH (512)
|
||||
|
||||
#include "brg_endian.h"
|
||||
#define NEED_UINT_64T
|
||||
#include "algo/sha/brg_types.h"
|
||||
#include "compat/brg_types.h"
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
#include IACA_MARKS
|
||||
|
@@ -17,7 +17,7 @@ bool register_dmd_gr_algo( algo_gate_t *gate )
|
||||
bool register_groestl_algo( algo_gate_t* gate )
|
||||
{
|
||||
register_dmd_gr_algo( gate );
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -22,10 +22,6 @@
|
||||
|
||||
#define LENGTH (256)
|
||||
|
||||
//#include "brg_endian.h"
|
||||
//#define NEED_UINT_64T
|
||||
//#include "algo/sha/brg_types.h"
|
||||
|
||||
/* some sizes (number of bytes) */
|
||||
#define ROWS (8)
|
||||
#define LENGTHFIELDLEN (ROWS)
|
||||
|
@@ -4,7 +4,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "aes_ni/hash-groestl.h"
|
||||
#include "algo/sha/sha-hash-4way.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#if defined(__VAES__)
|
||||
#include "groestl512-hash-4way.h"
|
||||
#endif
|
||||
|
@@ -40,7 +40,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
#if !defined(__AES__)
|
||||
/**
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -36,44 +36,64 @@
|
||||
#define HAMSI_4WAY_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#define SPH_SIZE_hamsi512 512
|
||||
// Hamsi-512 4x64
|
||||
|
||||
// Partial is only scalar but needs pointer ref for hamsi-helper
|
||||
// deprecate partial_len
|
||||
typedef struct {
|
||||
typedef struct
|
||||
{
|
||||
__m256i h[8];
|
||||
__m256i buf[1];
|
||||
size_t partial_len;
|
||||
sph_u32 count_high, count_low;
|
||||
uint32_t count_high, count_low;
|
||||
} hamsi_4way_big_context;
|
||||
|
||||
typedef hamsi_4way_big_context hamsi512_4way_context;
|
||||
|
||||
void hamsi512_4way_init( hamsi512_4way_context *sc );
|
||||
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
|
||||
size_t len );
|
||||
//#define hamsi512_4way hamsi512_4way_update
|
||||
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
|
||||
|
||||
#define hamsi512_4x64_context hamsi512_4way_context
|
||||
#define hamsi512_4x64_init hamsi512_4way_init
|
||||
#define hamsi512_4x64_update hamsi512_4way_update
|
||||
#define hamsi512_4x64_close hamsi512_4way_close
|
||||
|
||||
// Hamsi-512 8x32
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m256i h[16];
|
||||
__m256i buf[2];
|
||||
size_t partial_len;
|
||||
uint32_t count_high, count_low;
|
||||
} hamsi_8x32_big_context;
|
||||
typedef hamsi_8x32_big_context hamsi512_8x32_context;
|
||||
|
||||
void hamsi512_8x32_init( hamsi512_8x32_context *sc );
|
||||
void hamsi512_8x32_update( hamsi512_8x32_context *sc, const void *data,
|
||||
size_t len );
|
||||
void hamsi512_8x32_close( hamsi512_8x32_context *sc, void *dst );
|
||||
void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
|
||||
size_t len );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Hamsi-512 8x64
|
||||
|
||||
typedef struct {
|
||||
__m512i h[8];
|
||||
__m512i buf[1];
|
||||
size_t partial_len;
|
||||
sph_u32 count_high, count_low;
|
||||
uint32_t count_high, count_low;
|
||||
} hamsi_8way_big_context;
|
||||
|
||||
typedef hamsi_8way_big_context hamsi512_8way_context;
|
||||
|
||||
void hamsi512_8way_init( hamsi512_8way_context *sc );
|
||||
@@ -81,15 +101,29 @@ void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
|
||||
size_t len );
|
||||
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
|
||||
|
||||
#define hamsi512_8x64_context hamsi512_8way_context
|
||||
#define hamsi512_8x64_init hamsi512_8way_init
|
||||
#define hamsi512_8x64_update hamsi512_8way_update
|
||||
#define hamsi512_8x64_close hamsi512_8way_close
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
// Hamsi-512 16x32
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m512i h[16];
|
||||
__m512i buf[2];
|
||||
size_t partial_len;
|
||||
uint32_t count_high, count_low;
|
||||
} hamsi_16x32_big_context;
|
||||
typedef hamsi_16x32_big_context hamsi512_16x32_context;
|
||||
|
||||
void hamsi512_16x32_init( hamsi512_16x32_context *sc );
|
||||
void hamsi512_16x32_update( hamsi512_16x32_context *sc, const void *data,
|
||||
size_t len );
|
||||
void hamsi512_16way_close( hamsi512_16x32_context *sc, void *dst );
|
||||
void hamsi512_16x32_full( hamsi512_16x32_context *sc, void *dst,
|
||||
const void *data, size_t len );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#endif
|
||||
|
@@ -36,7 +36,7 @@
|
||||
#define SPH_HAMSI_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
|
115
algo/haval/haval-16way-helper.c
Normal file
115
algo/haval/haval-16way-helper.c
Normal file
@@ -0,0 +1,115 @@
|
||||
/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
|
||||
/*
|
||||
* Helper code, included (three times !) by HAVAL implementation.
|
||||
*
|
||||
* TODO: try to merge this with md_helper.c.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#undef SPH_XCAT
|
||||
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
|
||||
#undef SPH_XCAT_
|
||||
#define SPH_XCAT_(a, b) a ## b
|
||||
|
||||
static void
|
||||
SPH_XCAT(SPH_XCAT(haval, PASSES), _16way_update)
|
||||
( haval_16way_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
unsigned current;
|
||||
|
||||
current = (unsigned)sc->count_low & 127U;
|
||||
while ( len > 0 )
|
||||
{
|
||||
unsigned clen;
|
||||
uint32_t clow, clow2;
|
||||
|
||||
clen = 128U - current;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_512( sc->buf + (current>>2), vdata, clen>>2 );
|
||||
vdata += clen>>2;
|
||||
current += clen;
|
||||
len -= clen;
|
||||
if ( current == 128U )
|
||||
{
|
||||
DSTATE_16W;
|
||||
IN_PREPARE_16W(sc->buf);
|
||||
RSTATE_16W;
|
||||
SPH_XCAT(CORE_16W, PASSES)(INW_16W);
|
||||
WSTATE_16W;
|
||||
current = 0;
|
||||
}
|
||||
clow = sc->count_low;
|
||||
clow2 = clow + clen;
|
||||
sc->count_low = clow2;
|
||||
if ( clow2 < clow )
|
||||
sc->count_high ++;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
SPH_XCAT(SPH_XCAT(haval, PASSES), _16way_close)( haval_16way_context *sc,
|
||||
void *dst)
|
||||
{
|
||||
unsigned current;
|
||||
DSTATE_16W;
|
||||
|
||||
current = (unsigned)sc->count_low & 127UL;
|
||||
|
||||
sc->buf[ current>>2 ] = v512_32( 1 );
|
||||
current += 4;
|
||||
RSTATE_16W;
|
||||
if ( current > 116UL )
|
||||
{
|
||||
memset_zero_512( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
|
||||
do
|
||||
{
|
||||
IN_PREPARE_16W(sc->buf);
|
||||
SPH_XCAT(CORE_16W, PASSES)(INW_16W);
|
||||
} while (0);
|
||||
current = 0;
|
||||
}
|
||||
|
||||
uint32_t t1, t2;
|
||||
memset_zero_512( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
|
||||
t1 = 0x01 | (PASSES << 3);
|
||||
t2 = sc->olen << 3;
|
||||
sc->buf[ 116>>2 ] = v512_32( ( t1 << 16 ) | ( t2 << 24 ) );
|
||||
sc->buf[ 120>>2 ] = v512_32( sc->count_low << 3 );
|
||||
sc->buf[ 124>>2 ] = v512_32( (sc->count_high << 3)
|
||||
| (sc->count_low >> 29) );
|
||||
do
|
||||
{
|
||||
IN_PREPARE_16W(sc->buf);
|
||||
SPH_XCAT(CORE_16W, PASSES)(INW_16W);
|
||||
} while (0);
|
||||
WSTATE_16W;
|
||||
haval_16way_out( sc, dst );
|
||||
}
|
@@ -48,7 +48,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
|
||||
while ( len > 0 )
|
||||
{
|
||||
unsigned clen;
|
||||
sph_u32 clow, clow2;
|
||||
uint32_t clow, clow2;
|
||||
|
||||
clen = 128U - current;
|
||||
if ( clen > len )
|
||||
@@ -67,7 +67,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
|
||||
current = 0;
|
||||
}
|
||||
clow = sc->count_low;
|
||||
clow2 = SPH_T32(clow + clen);
|
||||
clow2 = clow + clen;
|
||||
sc->count_low = clow2;
|
||||
if ( clow2 < clow )
|
||||
sc->count_high ++;
|
||||
@@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
|
||||
|
||||
current = (unsigned)sc->count_low & 127UL;
|
||||
|
||||
sc->buf[ current>>2 ] = m128_one_32;
|
||||
sc->buf[ current>>2 ] = v128_32( 1 );
|
||||
current += 4;
|
||||
RSTATE;
|
||||
if ( current > 116UL )
|
||||
|
@@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
|
||||
|
||||
current = (unsigned)sc->count_low & 127UL;
|
||||
|
||||
sc->buf[ current>>2 ] = m256_one_32;
|
||||
sc->buf[ current>>2 ] = v256_32( 1 );
|
||||
current += 4;
|
||||
RSTATE_8W;
|
||||
if ( current > 116UL )
|
||||
@@ -101,9 +101,9 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
|
||||
memset_zero_256( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
|
||||
t1 = 0x01 | (PASSES << 3);
|
||||
t2 = sc->olen << 3;
|
||||
sc->buf[ 116>>2 ] = _mm256_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
|
||||
sc->buf[ 120>>2 ] = _mm256_set1_epi32( sc->count_low << 3 );
|
||||
sc->buf[ 124>>2 ] = _mm256_set1_epi32( (sc->count_high << 3)
|
||||
sc->buf[ 116>>2 ] = v256_32( ( t1 << 16 ) | ( t2 << 24 ) );
|
||||
sc->buf[ 120>>2 ] = v256_32( sc->count_low << 3 );
|
||||
sc->buf[ 124>>2 ] = v256_32( (sc->count_high << 3)
|
||||
| (sc->count_low >> 29) );
|
||||
do
|
||||
{
|
||||
|
@@ -189,7 +189,7 @@ do { \
|
||||
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
|
||||
mm128_ror_32( x7, 11 ) ), \
|
||||
_mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
|
||||
_mm_add_epi32( w, v128_32( c ) ) ); \
|
||||
} while (0)
|
||||
|
||||
#define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
|
||||
@@ -292,7 +292,9 @@ static const unsigned MP5[32] = {
|
||||
2, 23, 16, 22, 4, 1, 25, 15
|
||||
};
|
||||
|
||||
static const sph_u32 RK2[32] = {
|
||||
#define SPH_C32(x) (x)
|
||||
|
||||
static const uint32_t RK2[32] = {
|
||||
SPH_C32(0x452821E6), SPH_C32(0x38D01377),
|
||||
SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
|
||||
SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
|
||||
@@ -311,7 +313,7 @@ static const sph_u32 RK2[32] = {
|
||||
SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5)
|
||||
};
|
||||
|
||||
static const sph_u32 RK3[32] = {
|
||||
static const uint32_t RK3[32] = {
|
||||
SPH_C32(0x9C30D539), SPH_C32(0x2AF26013),
|
||||
SPH_C32(0xC5D1B023), SPH_C32(0x286085F0),
|
||||
SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF),
|
||||
@@ -330,7 +332,7 @@ static const sph_u32 RK3[32] = {
|
||||
SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C)
|
||||
};
|
||||
|
||||
static const sph_u32 RK4[32] = {
|
||||
static const uint32_t RK4[32] = {
|
||||
SPH_C32(0x7A325381), SPH_C32(0x28958677),
|
||||
SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF),
|
||||
SPH_C32(0xC4BFE81B), SPH_C32(0x66282193),
|
||||
@@ -349,7 +351,7 @@ static const sph_u32 RK4[32] = {
|
||||
SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4)
|
||||
};
|
||||
|
||||
static const sph_u32 RK5[32] = {
|
||||
static const uint32_t RK5[32] = {
|
||||
SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98),
|
||||
SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176),
|
||||
SPH_C32(0x66CA593E), SPH_C32(0x82430E88),
|
||||
@@ -469,14 +471,14 @@ do { \
|
||||
static void
|
||||
haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes )
|
||||
{
|
||||
sc->s0 = _mm_set1_epi32( 0x243F6A88UL );
|
||||
sc->s1 = _mm_set1_epi32( 0x85A308D3UL );
|
||||
sc->s2 = _mm_set1_epi32( 0x13198A2EUL );
|
||||
sc->s3 = _mm_set1_epi32( 0x03707344UL );
|
||||
sc->s4 = _mm_set1_epi32( 0xA4093822UL );
|
||||
sc->s5 = _mm_set1_epi32( 0x299F31D0UL );
|
||||
sc->s6 = _mm_set1_epi32( 0x082EFA98UL );
|
||||
sc->s7 = _mm_set1_epi32( 0xEC4E6C89UL );
|
||||
sc->s0 = v128_32( 0x243F6A88UL );
|
||||
sc->s1 = v128_32( 0x85A308D3UL );
|
||||
sc->s2 = v128_32( 0x13198A2EUL );
|
||||
sc->s3 = v128_32( 0x03707344UL );
|
||||
sc->s4 = v128_32( 0xA4093822UL );
|
||||
sc->s5 = v128_32( 0x299F31D0UL );
|
||||
sc->s6 = v128_32( 0x082EFA98UL );
|
||||
sc->s7 = v128_32( 0xEC4E6C89UL );
|
||||
sc->olen = olen;
|
||||
sc->passes = passes;
|
||||
sc->count_high = 0;
|
||||
@@ -660,7 +662,7 @@ do { \
|
||||
__m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
|
||||
mm256_ror_32( x7, 11 ) ), \
|
||||
_mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
|
||||
_mm256_add_epi32( w, v256_32( c ) ) ); \
|
||||
} while (0)
|
||||
|
||||
#define STEP1_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
|
||||
@@ -791,14 +793,14 @@ do { \
|
||||
static void
|
||||
haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
|
||||
{
|
||||
sc->s0 = _mm256_set1_epi32( 0x243F6A88UL );
|
||||
sc->s1 = _mm256_set1_epi32( 0x85A308D3UL );
|
||||
sc->s2 = _mm256_set1_epi32( 0x13198A2EUL );
|
||||
sc->s3 = _mm256_set1_epi32( 0x03707344UL );
|
||||
sc->s4 = _mm256_set1_epi32( 0xA4093822UL );
|
||||
sc->s5 = _mm256_set1_epi32( 0x299F31D0UL );
|
||||
sc->s6 = _mm256_set1_epi32( 0x082EFA98UL );
|
||||
sc->s7 = _mm256_set1_epi32( 0xEC4E6C89UL );
|
||||
sc->s0 = v256_32( 0x243F6A88UL );
|
||||
sc->s1 = v256_32( 0x85A308D3UL );
|
||||
sc->s2 = v256_32( 0x13198A2EUL );
|
||||
sc->s3 = v256_32( 0x03707344UL );
|
||||
sc->s4 = v256_32( 0xA4093822UL );
|
||||
sc->s5 = v256_32( 0x299F31D0UL );
|
||||
sc->s6 = v256_32( 0x082EFA98UL );
|
||||
sc->s7 = v256_32( 0xEC4E6C89UL );
|
||||
sc->olen = olen;
|
||||
sc->passes = passes;
|
||||
sc->count_high = 0;
|
||||
@@ -877,10 +879,300 @@ do { \
|
||||
|
||||
#define INMSG_8W(i) msg[i]
|
||||
|
||||
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// ( ~( a ^ b ) ) & c
|
||||
#define mm512_andnotxor( a, b, c ) \
|
||||
_mm512_ternarylogic_epi32( a, b, c, 0x82 )
|
||||
|
||||
#define F1_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
mm512_xor3( x0, mm512_andxor( x1, x0, x4 ), \
|
||||
_mm512_xor_si512( _mm512_and_si512( x2, x5 ), \
|
||||
_mm512_and_si512( x3, x6 ) ) ) \
|
||||
|
||||
#define F2_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
mm512_xor3( mm512_andxor( x2, _mm512_andnot_si512( x3, x1 ), \
|
||||
mm512_xor3( _mm512_and_si512( x4, x5 ), x6, x0 ) ), \
|
||||
mm512_andxor( x4, x1, x5 ), \
|
||||
mm512_xorand( x0, x3, x5 ) ) \
|
||||
|
||||
#define F3_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
mm512_xor3( x0, \
|
||||
_mm512_and_si512( x3, \
|
||||
mm512_xor3( _mm512_and_si512( x1, x2 ), x6, x0 ) ), \
|
||||
_mm512_xor_si512( _mm512_and_si512( x1, x4 ), \
|
||||
_mm512_and_si512( x2, x5 ) ) )
|
||||
|
||||
#define F4_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
mm512_xor3( \
|
||||
mm512_andxor( x3, x5, \
|
||||
_mm512_xor_si512( _mm512_and_si512( x1, x2 ), \
|
||||
_mm512_or_si512( x4, x6 ) ) ), \
|
||||
_mm512_and_si512( x4, \
|
||||
mm512_xor3( x0, _mm512_andnot_si512( x2, x5 ), \
|
||||
_mm512_xor_si512( x1, x6 ) ) ), \
|
||||
mm512_xorand( x0, x2, x6 ) )
|
||||
|
||||
#define F5_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm512_xor_si512( \
|
||||
mm512_andnotxor( mm512_and3( x1, x2, x3 ), x5, x0 ), \
|
||||
mm512_xor3( _mm512_and_si512( x1, x4 ), \
|
||||
_mm512_and_si512( x2, x5 ), \
|
||||
_mm512_and_si512( x3, x6 ) ) )
|
||||
|
||||
#define FP3_1_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F1_16W(x1, x0, x3, x5, x6, x2, x4)
|
||||
#define FP3_2_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F2_16W(x4, x2, x1, x0, x5, x3, x6)
|
||||
#define FP3_3_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F3_16W(x6, x1, x2, x3, x4, x5, x0)
|
||||
|
||||
#define FP4_1_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F1_16W(x2, x6, x1, x4, x5, x3, x0)
|
||||
#define FP4_2_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F2_16W(x3, x5, x2, x0, x1, x6, x4)
|
||||
#define FP4_3_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F3_16W(x1, x4, x3, x6, x0, x2, x5)
|
||||
#define FP4_4_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F4_16W(x6, x4, x0, x5, x2, x1, x3)
|
||||
|
||||
#define FP5_1_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F1_16W(x3, x4, x1, x0, x5, x2, x6)
|
||||
#define FP5_2_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F2_16W(x6, x2, x1, x0, x3, x4, x5)
|
||||
#define FP5_3_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F3_16W(x2, x6, x0, x4, x3, x1, x5)
|
||||
#define FP5_4_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F4_16W(x1, x5, x3, x2, x0, x4, x6)
|
||||
#define FP5_5_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F5_16W(x2, x5, x0, x6, x4, x3, x1)
|
||||
|
||||
#define STEP_16W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
|
||||
do { \
|
||||
__m512i t = FP ## n ## _ ## p ## _16W(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm512_add_epi32( _mm512_add_epi32( mm512_ror_32( t, 7 ), \
|
||||
mm512_ror_32( x7, 11 ) ), \
|
||||
_mm512_add_epi32( w, v512_32( c ) ) ); \
|
||||
} while (0)
|
||||
|
||||
#define STEP1_16W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
|
||||
do { \
|
||||
__m512i t = FP ## n ## _ ## p ## _16W(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm512_add_epi32( _mm512_add_epi32( mm512_ror_32( t, 7 ), \
|
||||
mm512_ror_32( x7, 11 ) ), w ); \
|
||||
} while (0)
|
||||
|
||||
#define PASS1_16W(n, in) do { \
|
||||
unsigned pass_count; \
|
||||
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
|
||||
STEP1_16W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(pass_count + 0) ); \
|
||||
STEP1_16W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(pass_count + 1) ); \
|
||||
STEP1_16W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(pass_count + 2) ); \
|
||||
STEP1_16W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(pass_count + 3) ); \
|
||||
STEP1_16W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(pass_count + 4) ); \
|
||||
STEP1_16W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(pass_count + 5) ); \
|
||||
STEP1_16W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(pass_count + 6) ); \
|
||||
STEP1_16W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(pass_count + 7) ); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define PASSG_16W(p, n, in) do { \
|
||||
unsigned pass_count; \
|
||||
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
|
||||
STEP_16W(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(MP ## p[pass_count + 0]), \
|
||||
RK ## p[pass_count + 0]); \
|
||||
STEP_16W(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(MP ## p[pass_count + 1]), \
|
||||
RK ## p[pass_count + 1]); \
|
||||
STEP_16W(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(MP ## p[pass_count + 2]), \
|
||||
RK ## p[pass_count + 2]); \
|
||||
STEP_16W(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(MP ## p[pass_count + 3]), \
|
||||
RK ## p[pass_count + 3]); \
|
||||
STEP_16W(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(MP ## p[pass_count + 4]), \
|
||||
RK ## p[pass_count + 4]); \
|
||||
STEP_16W(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(MP ## p[pass_count + 5]), \
|
||||
RK ## p[pass_count + 5]); \
|
||||
STEP_16W(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(MP ## p[pass_count + 6]), \
|
||||
RK ## p[pass_count + 6]); \
|
||||
STEP_16W(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(MP ## p[pass_count + 7]), \
|
||||
RK ## p[pass_count + 7]); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define PASS2_16W(n, in) PASSG_16W(2, n, in)
|
||||
#define PASS3_16W(n, in) PASSG_16W(3, n, in)
|
||||
#define PASS4_16W(n, in) PASSG_16W(4, n, in)
|
||||
#define PASS5_16W(n, in) PASSG_16W(5, n, in)
|
||||
|
||||
#define SAVE_STATE_16W \
|
||||
__m512i u0, u1, u2, u3, u4, u5, u6, u7; \
|
||||
do { \
|
||||
u0 = s0; \
|
||||
u1 = s1; \
|
||||
u2 = s2; \
|
||||
u3 = s3; \
|
||||
u4 = s4; \
|
||||
u5 = s5; \
|
||||
u6 = s6; \
|
||||
u7 = s7; \
|
||||
} while (0)
|
||||
|
||||
#define UPDATE_STATE_16W \
|
||||
do { \
|
||||
s0 = _mm512_add_epi32( s0, u0 ); \
|
||||
s1 = _mm512_add_epi32( s1, u1 ); \
|
||||
s2 = _mm512_add_epi32( s2, u2 ); \
|
||||
s3 = _mm512_add_epi32( s3, u3 ); \
|
||||
s4 = _mm512_add_epi32( s4, u4 ); \
|
||||
s5 = _mm512_add_epi32( s5, u5 ); \
|
||||
s6 = _mm512_add_epi32( s6, u6 ); \
|
||||
s7 = _mm512_add_epi32( s7, u7 ); \
|
||||
} while (0)
|
||||
|
||||
#define CORE_16W5(in) do { \
|
||||
SAVE_STATE_16W; \
|
||||
PASS1_16W(5, in); \
|
||||
PASS2_16W(5, in); \
|
||||
PASS3_16W(5, in); \
|
||||
PASS4_16W(5, in); \
|
||||
PASS5_16W(5, in); \
|
||||
UPDATE_STATE_16W; \
|
||||
} while (0)
|
||||
|
||||
#define DSTATE_16W __m512i s0, s1, s2, s3, s4, s5, s6, s7
|
||||
|
||||
#define RSTATE_16W \
|
||||
do { \
|
||||
s0 = sc->s0; \
|
||||
s1 = sc->s1; \
|
||||
s2 = sc->s2; \
|
||||
s3 = sc->s3; \
|
||||
s4 = sc->s4; \
|
||||
s5 = sc->s5; \
|
||||
s6 = sc->s6; \
|
||||
s7 = sc->s7; \
|
||||
} while (0)
|
||||
|
||||
#define WSTATE_16W \
|
||||
do { \
|
||||
sc->s0 = s0; \
|
||||
sc->s1 = s1; \
|
||||
sc->s2 = s2; \
|
||||
sc->s3 = s3; \
|
||||
sc->s4 = s4; \
|
||||
sc->s5 = s5; \
|
||||
sc->s6 = s6; \
|
||||
sc->s7 = s7; \
|
||||
} while (0)
|
||||
|
||||
static void
|
||||
haval_16way_init( haval_16way_context *sc, unsigned olen, unsigned passes )
|
||||
{
|
||||
sc->s0 = v512_32( 0x243F6A88UL );
|
||||
sc->s1 = v512_32( 0x85A308D3UL );
|
||||
sc->s2 = v512_32( 0x13198A2EUL );
|
||||
sc->s3 = v512_32( 0x03707344UL );
|
||||
sc->s4 = v512_32( 0xA4093822UL );
|
||||
sc->s5 = v512_32( 0x299F31D0UL );
|
||||
sc->s6 = v512_32( 0x082EFA98UL );
|
||||
sc->s7 = v512_32( 0xEC4E6C89UL );
|
||||
sc->olen = olen;
|
||||
sc->passes = passes;
|
||||
sc->count_high = 0;
|
||||
sc->count_low = 0;
|
||||
|
||||
}
|
||||
#define IN_PREPARE_16W(indata) const __m512i *const load_ptr_16w = (indata)
|
||||
|
||||
#define INW_16W(i) load_ptr_16w[ i ]
|
||||
|
||||
static void
|
||||
haval_16way_out( haval_16way_context *sc, void *dst )
|
||||
{
|
||||
__m512i *buf = (__m512i*)dst;
|
||||
DSTATE_16W;
|
||||
RSTATE_16W;
|
||||
|
||||
buf[0] = s0;
|
||||
buf[1] = s1;
|
||||
buf[2] = s2;
|
||||
buf[3] = s3;
|
||||
buf[4] = s4;
|
||||
buf[5] = s5;
|
||||
buf[6] = s6;
|
||||
buf[7] = s7;
|
||||
}
|
||||
|
||||
#undef PASSES
|
||||
#define PASSES 5
|
||||
#include "haval-16way-helper.c"
|
||||
|
||||
#define API_16W(xxx, y) \
|
||||
void \
|
||||
haval ## xxx ## _ ## y ## _16way_init(void *cc) \
|
||||
{ \
|
||||
haval_16way_init(cc, xxx >> 5, y); \
|
||||
} \
|
||||
\
|
||||
void \
|
||||
haval ## xxx ## _ ## y ## _16way_update (void *cc, const void *data, size_t len) \
|
||||
{ \
|
||||
haval ## y ## _16way_update(cc, data, len); \
|
||||
} \
|
||||
\
|
||||
void \
|
||||
haval ## xxx ## _ ## y ## _16way_close(void *cc, void *dst) \
|
||||
{ \
|
||||
haval ## y ## _16way_close(cc, dst); \
|
||||
} \
|
||||
|
||||
API_16W(256, 5)
|
||||
|
||||
#define RVAL_16W \
|
||||
do { \
|
||||
s0 = val[0]; \
|
||||
s1 = val[1]; \
|
||||
s2 = val[2]; \
|
||||
s3 = val[3]; \
|
||||
s4 = val[4]; \
|
||||
s5 = val[5]; \
|
||||
s6 = val[6]; \
|
||||
s7 = val[7]; \
|
||||
} while (0)
|
||||
|
||||
#define WVAL_16W \
|
||||
do { \
|
||||
val[0] = s0; \
|
||||
val[1] = s1; \
|
||||
val[2] = s2; \
|
||||
val[3] = s3; \
|
||||
val[4] = s4; \
|
||||
val[5] = s5; \
|
||||
val[6] = s6; \
|
||||
val[7] = s7; \
|
||||
} while (0)
|
||||
|
||||
#define INMSG_16W(i) msg[i]
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -68,7 +68,6 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_haval256_5 256
|
||||
@@ -77,7 +76,7 @@ typedef struct {
|
||||
__m128i buf[32];
|
||||
__m128i s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
unsigned olen, passes;
|
||||
sph_u32 count_high, count_low;
|
||||
uint32_t count_high, count_low;
|
||||
} haval_4way_context;
|
||||
|
||||
typedef haval_4way_context haval256_5_4way_context;
|
||||
@@ -108,6 +107,25 @@ void haval256_5_8way_close( void *cc, void *dst );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[32];
|
||||
__m512i s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
unsigned olen, passes;
|
||||
uint32_t count_high, count_low;
|
||||
} haval_16way_context __attribute__ ((aligned (64)));
|
||||
|
||||
typedef haval_16way_context haval256_5_16way_context;
|
||||
|
||||
void haval256_5_16way_init( void *cc );
|
||||
|
||||
void haval256_5_16way_update( void *cc, const void *data, size_t len );
|
||||
|
||||
void haval256_5_16way_close( void *cc, void *dst );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -66,7 +66,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for HAVAL-128/3.
|
||||
|
@@ -6,7 +6,7 @@
|
||||
|
||||
#if defined(JHA_4WAY)
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
|
@@ -41,7 +41,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for JH-224.
|
||||
|
@@ -2,7 +2,6 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "sph_keccak.h"
|
||||
#include "keccak-hash-4way.h"
|
||||
|
||||
#if defined(KECCAK_8WAY)
|
||||
|
@@ -9,7 +9,7 @@ int hard_coded_eb = 1;
|
||||
bool register_keccak_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
|
||||
opt_target_factor = 128.0;
|
||||
#if defined (KECCAK_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_keccak_8way;
|
||||
|
@@ -1,45 +1,6 @@
|
||||
/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */
|
||||
/**
|
||||
* Keccak interface. This is the interface for Keccak with the
|
||||
* recommended parameters for SHA-3, with output lengths 224, 256,
|
||||
* 384 and 512 bits.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_keccak.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef KECCAK_HASH_4WAY_H__
|
||||
#define KECCAK_HASH_4WAY_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include <stddef.h>
|
||||
@@ -100,8 +61,4 @@ void keccak512_4way_addbits_and_close(
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@@ -2,7 +2,6 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "sph_keccak.h"
|
||||
#include "keccak-hash-4way.h"
|
||||
|
||||
#if defined(KECCAK_8WAY)
|
||||
|
@@ -41,7 +41,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for Keccak-224.
|
||||
|
@@ -23,7 +23,6 @@
|
||||
#define LANE_H
|
||||
|
||||
#include <string.h>
|
||||
//#include "algo/sha/sha3-defs.h"
|
||||
#include <stdint.h>
|
||||
|
||||
typedef unsigned char BitSequence;
|
||||
|
@@ -7,8 +7,10 @@
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define uint32 uint32_t
|
||||
|
||||
/* initial values of chaining variables */
|
||||
static const uint32 IV[40] __attribute((aligned(64))) = {
|
||||
static const uint32_t IV[40] __attribute((aligned(64))) = {
|
||||
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
|
||||
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
|
||||
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
|
||||
@@ -22,7 +24,7 @@ static const uint32 IV[40] __attribute((aligned(64))) = {
|
||||
};
|
||||
|
||||
/* Round Constants */
|
||||
static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
|
||||
static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
|
||||
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
|
||||
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
|
||||
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
|
||||
|
@@ -23,7 +23,7 @@
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include <immintrin.h>
|
||||
#include "algo/sha/sha3-defs.h"
|
||||
//#include "algo/sha/sha3-defs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
/* The length of digests*/
|
||||
@@ -54,7 +54,7 @@
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
uint32 buffer[8*4];
|
||||
uint32_t buffer[8*4];
|
||||
__m512i chainv[10]; /* Chaining values */
|
||||
int hashbitlen;
|
||||
int rembytes;
|
||||
@@ -82,7 +82,7 @@ int luffa512_4way_update_close( luffa_4way_context *state, void *output,
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32 buffer[8*2];
|
||||
uint32_t buffer[8*2];
|
||||
__m256i chainv[10]; /* Chaining values */
|
||||
int hashbitlen;
|
||||
int rembytes;
|
||||
|
@@ -33,8 +33,7 @@
|
||||
|
||||
#define MULT2( a0, a1 ) \
|
||||
{ \
|
||||
__m128i b = _mm_xor_si128( a0, \
|
||||
_mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
|
||||
a0 = _mm_alignr_epi8( a1, b, 4 ); \
|
||||
a1 = _mm_alignr_epi8( b, a1, 4 ); \
|
||||
}
|
||||
|
@@ -22,7 +22,7 @@
|
||||
*/
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include "algo/sha/sha3-defs.h"
|
||||
#include "compat/sha3-defs.h"
|
||||
/* The length of digests*/
|
||||
#define DIGEST_BIT_LEN_224 224
|
||||
#define DIGEST_BIT_LEN_256 256
|
||||
|
@@ -41,7 +41,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for Luffa-224.
|
||||
|
@@ -1,7 +1,7 @@
|
||||
#include "lyra2-gate.h"
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
|
@@ -21,9 +21,8 @@
|
||||
#define LYRA2_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include "algo/sha/sha3-defs.h"
|
||||
|
||||
//typedef unsigned char byte;
|
||||
typedef unsigned char byte;
|
||||
|
||||
//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
|
||||
#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t)
|
||||
|
@@ -5,8 +5,7 @@
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "lyra2.h"
|
||||
//#include "algo/blake/sph_blake.h"
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
|
||||
__thread uint64_t* lyra2h_4way_matrix;
|
||||
|
||||
|
@@ -1,6 +1,6 @@
|
||||
#include "lyra2-gate.h"
|
||||
#include <memory.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
|
@@ -4,7 +4,6 @@
|
||||
|
||||
#include <memory.h>
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
|
@@ -1,7 +1,7 @@
|
||||
#include "lyra2-gate.h"
|
||||
#include <memory.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
|
@@ -4,7 +4,6 @@
|
||||
|
||||
#include <memory.h>
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
//#include "lyra2.h"
|
||||
|
@@ -2,8 +2,7 @@
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "lyra2.h"
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
|
||||
#if defined(LYRA2Z_16WAY)
|
||||
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
|
@@ -98,7 +98,7 @@ do { \
|
||||
( g ## n0 = _mm_xor_si128( a ## n0, _mm_xor_si128( a ## n1, a ## n4 ) ) )
|
||||
|
||||
#define SIGMA_ALL_4W do { \
|
||||
a0 = _mm_xor_si128( g0, m128_one_32 ); \
|
||||
a0 = _mm_xor_si128( g0, v128_32( 1 ) ); \
|
||||
a1 = _mm_xor_si128( g1, INW2( 0 ) ); \
|
||||
a2 = _mm_xor_si128( g2, INW2( 1 ) ); \
|
||||
a3 = _mm_xor_si128( g3, INW2( 2 ) ); \
|
||||
@@ -268,7 +268,7 @@ panama_4way_close( void *cc, void *dst )
|
||||
|
||||
sc = cc;
|
||||
current = sc->data_ptr;
|
||||
*(__m128i*)( sc->data + current ) = m128_one_32;
|
||||
*(__m128i*)( sc->data + current ) = v128_32( 1 );
|
||||
current++;
|
||||
memset_zero_128( (__m128i*)sc->data + current, 32 - current );
|
||||
panama_4way_push( sc, sc->data, 1 );
|
||||
@@ -354,7 +354,7 @@ do { \
|
||||
|
||||
|
||||
#define SIGMA_ALL_8W do { \
|
||||
a0 = _mm256_xor_si256( g0, m256_one_32 ); \
|
||||
a0 = _mm256_xor_si256( g0, v256_32( 1 ) ); \
|
||||
a1 = _mm256_xor_si256( g1, INW2( 0 ) ); \
|
||||
a2 = _mm256_xor_si256( g2, INW2( 1 ) ); \
|
||||
a3 = _mm256_xor_si256( g3, INW2( 2 ) ); \
|
||||
@@ -521,7 +521,7 @@ panama_8way_close( void *cc, void *dst )
|
||||
|
||||
sc = cc;
|
||||
current = sc->data_ptr;
|
||||
*(__m256i*)( sc->data + current ) = m256_one_32;
|
||||
*(__m256i*)( sc->data + current ) = v256_32( 1 );
|
||||
current++;
|
||||
memset_zero_256( (__m256i*)sc->data + current, 32 - current );
|
||||
panama_8way_push( sc, sc->data, 1 );
|
||||
|
@@ -58,7 +58,7 @@
|
||||
#define SPH_PANAMA_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for PANAMA.
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
|
@@ -1,7 +1,7 @@
|
||||
#include "hmq1725-gate.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
@@ -21,7 +21,7 @@
|
||||
#include "algo/shabal/shabal-hash-4way.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/haval/haval-hash-4way.h"
|
||||
#include "algo/sha/sha-hash-4way.h"
|
||||
#include "algo/sha/sha512-hash.h"
|
||||
#if defined(__VAES__)
|
||||
#include "algo/groestl/groestl512-hash-4way.h"
|
||||
#include "algo/shavite/shavite-hash-4way.h"
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
|
@@ -3,7 +3,8 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/sha/sha-hash-4way.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#include "algo/sha/sha512-hash.h"
|
||||
#include "ripemd-hash-4way.h"
|
||||
|
||||
#define LBRY_INPUT_SIZE 112
|
||||
|
@@ -2,7 +2,6 @@
|
||||
#define RIPEMD_HASH_4WAY_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
|
||||
|
@@ -57,7 +57,7 @@
|
||||
#define SPH_RIPEMD_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for RIPEMD.
|
||||
|
@@ -31,7 +31,6 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <inttypes.h>
|
||||
#include "algo/sha/sha-hash-4way.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#include <mm_malloc.h>
|
||||
#include "malloc-huge.h"
|
||||
|
@@ -36,7 +36,7 @@
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include "simd-utils.h"
|
||||
#include "sha-hash-4way.h"
|
||||
#include "sha256-hash.h"
|
||||
|
||||
typedef struct _hmac_sha256_4way_context
|
||||
{
|
||||
|
@@ -1,168 +0,0 @@
|
||||
/* $Id: sph_sha2.h 216 2010-06-08 09:46:57Z tp $ */
|
||||
/**
|
||||
* SHA-224, SHA-256, SHA-384 and SHA-512 interface.
|
||||
*
|
||||
* SHA-256 has been published in FIPS 180-2, now amended with a change
|
||||
* notice to include SHA-224 as well (which is a simple variation on
|
||||
* SHA-256). SHA-384 and SHA-512 are also defined in FIPS 180-2. FIPS
|
||||
* standards can be found at:
|
||||
* http://csrc.nist.gov/publications/fips/
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_sha2.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef SHA2_HASH_4WAY_H__
|
||||
#define SHA2_HASH_4WAY_H__ 1
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
// SHA-256 4 way
|
||||
|
||||
typedef struct {
|
||||
__m128i buf[64>>2];
|
||||
__m128i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
} sha256_4way_context __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256_4way_init( sha256_4way_context *sc );
|
||||
void sha256_4way_update( sha256_4way_context *sc, const void *data,
|
||||
size_t len );
|
||||
void sha256_4way_close( sha256_4way_context *sc, void *dst );
|
||||
void sha256_4way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
|
||||
const __m128i *W, const __m128i *state_in );
|
||||
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
|
||||
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in, const uint32_t *target );
|
||||
|
||||
#endif // SSE2
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
// SHA-256 8 way
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[64>>2];
|
||||
__m256i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
} sha256_8way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha256_8way_init( sha256_8way_context *sc );
|
||||
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
|
||||
void sha256_8way_close( sha256_8way_context *sc, void *dst );
|
||||
void sha256_8way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
|
||||
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
||||
const __m256i *W, const __m256i *state_in );
|
||||
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
|
||||
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const uint32_t *target );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// SHA-256 16 way
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[64>>2];
|
||||
__m512i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
} sha256_16way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha256_16way_init( sha256_16way_context *sc );
|
||||
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
|
||||
void sha256_16way_close( sha256_16way_context *sc, void *dst );
|
||||
void sha256_16way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||
const __m512i *W, const __m512i *state_in );
|
||||
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
|
||||
|
||||
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in, const uint32_t *target );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
// SHA-512 4 way
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[128>>3];
|
||||
__m256i val[8];
|
||||
uint64_t count;
|
||||
bool initialized;
|
||||
} sha512_4way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha512_4way_init( sha512_4way_context *sc);
|
||||
void sha512_4way_update( sha512_4way_context *sc, const void *data,
|
||||
size_t len );
|
||||
void sha512_4way_close( sha512_4way_context *sc, void *dst );
|
||||
void sha512_4way_full( void *dst, const void *data, size_t len );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// SHA-512 8 way
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[128>>3];
|
||||
__m512i val[8];
|
||||
uint64_t count;
|
||||
bool initialized;
|
||||
} sha512_8way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha512_8way_init( sha512_8way_context *sc);
|
||||
void sha512_8way_update( sha512_8way_context *sc, const void *data,
|
||||
size_t len );
|
||||
void sha512_8way_close( sha512_8way_context *sc, void *dst );
|
||||
void sha512_8way_full( void *dst, const void *data, size_t len );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#endif // SHA256_4WAY_H__
|
@@ -1,689 +0,0 @@
|
||||
/* Intel SHA extensions using C intrinsics */
|
||||
/* Written and place in public domain by Jeffrey Walton */
|
||||
/* Based on code from Intel, and by Sean Gulley for */
|
||||
/* the miTLS project. */
|
||||
|
||||
// A stripped down version with byte swapping removed.
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
#include "sha256-hash.h"
|
||||
|
||||
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *in_X, const uint32_t *in_Y )
|
||||
{
|
||||
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
|
||||
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y;
|
||||
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
|
||||
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
|
||||
__m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y;
|
||||
|
||||
// Load initial values
|
||||
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
|
||||
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
|
||||
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
|
||||
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
|
||||
|
||||
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
|
||||
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
|
||||
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
|
||||
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
|
||||
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
|
||||
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
|
||||
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
|
||||
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
|
||||
|
||||
// Save current hash
|
||||
ABEF_SAVE_X = STATE0_X;
|
||||
ABEF_SAVE_Y = STATE0_Y;
|
||||
CDGH_SAVE_X = STATE1_X;
|
||||
CDGH_SAVE_Y = STATE1_Y;
|
||||
|
||||
// Rounds 0-3
|
||||
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
|
||||
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
|
||||
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
|
||||
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
|
||||
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
|
||||
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 8-11
|
||||
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
|
||||
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
|
||||
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
|
||||
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
|
||||
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 16-19
|
||||
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 20-23
|
||||
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 24-27
|
||||
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 28-31
|
||||
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 32-35
|
||||
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 36-39
|
||||
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 40-43
|
||||
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 44-47
|
||||
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 48-51
|
||||
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 52-55
|
||||
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 56-59
|
||||
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 60-63
|
||||
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Add values back to state
|
||||
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
|
||||
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
|
||||
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
|
||||
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
|
||||
|
||||
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
|
||||
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
|
||||
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
|
||||
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
|
||||
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
|
||||
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
|
||||
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
|
||||
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
|
||||
|
||||
// Save state
|
||||
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
|
||||
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
|
||||
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
|
||||
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
|
||||
}
|
||||
|
||||
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *in_X, const uint32_t *in_Y )
|
||||
{
|
||||
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
|
||||
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK;
|
||||
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
|
||||
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
|
||||
__m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y;
|
||||
|
||||
// Load initial values
|
||||
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
|
||||
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
|
||||
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
|
||||
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
|
||||
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
||||
|
||||
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
|
||||
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
|
||||
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
|
||||
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
|
||||
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
|
||||
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
|
||||
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
|
||||
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
|
||||
|
||||
// Save current hash
|
||||
ABEF_SAVE_X = STATE0_X;
|
||||
ABEF_SAVE_Y = STATE0_Y;
|
||||
CDGH_SAVE_X = STATE1_X;
|
||||
CDGH_SAVE_Y = STATE1_Y;
|
||||
|
||||
// Rounds 0-3
|
||||
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
|
||||
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
|
||||
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
|
||||
TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK );
|
||||
TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK );
|
||||
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
|
||||
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
|
||||
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
|
||||
TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK );
|
||||
TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK );
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 8-11
|
||||
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
|
||||
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
|
||||
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
|
||||
TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK );
|
||||
TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK );
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
|
||||
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
|
||||
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
|
||||
TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK );
|
||||
TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK );
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 16-19
|
||||
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 20-23
|
||||
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 24-27
|
||||
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 28-31
|
||||
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 32-35
|
||||
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 36-39
|
||||
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 40-43
|
||||
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 44-47
|
||||
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 48-51
|
||||
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 52-55
|
||||
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 56-59
|
||||
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 60-63
|
||||
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Add values back to state
|
||||
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
|
||||
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
|
||||
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
|
||||
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
|
||||
|
||||
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
|
||||
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
|
||||
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
|
||||
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
|
||||
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
|
||||
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
|
||||
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
|
||||
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
|
||||
|
||||
// Save state
|
||||
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
|
||||
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
|
||||
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
|
||||
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
@@ -3,16 +3,15 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include "sha-hash-4way.h"
|
||||
#include "sha256-hash.h"
|
||||
#include "compat.h"
|
||||
|
||||
/*
|
||||
static const uint32_t H256[8] =
|
||||
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
*/
|
||||
|
||||
|
||||
static const uint32_t K256[64] =
|
||||
{
|
||||
@@ -83,7 +82,7 @@ static const uint32_t K256[64] =
|
||||
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
|
||||
do { \
|
||||
__m128i T1, T2; \
|
||||
__m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \
|
||||
__m128i K = v128_32( K256[( (j)+(i) )] ); \
|
||||
T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
|
||||
K, W[i] ) ); \
|
||||
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
|
||||
@@ -358,19 +357,19 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 );
|
||||
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 48 );
|
||||
|
||||
T0 = _mm_add_epi32( _mm_set1_epi32( K256[58] ),
|
||||
T0 = _mm_add_epi32( v128_32( K256[58] ),
|
||||
mm128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
|
||||
B = _mm_add_epi32( B, T0 );
|
||||
|
||||
T1 = _mm_add_epi32( _mm_set1_epi32( K256[59] ),
|
||||
T1 = _mm_add_epi32( v128_32( K256[59] ),
|
||||
mm128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
|
||||
A = _mm_add_epi32( A, T1 );
|
||||
|
||||
T2 = _mm_add_epi32( _mm_set1_epi32( K256[60] ),
|
||||
T2 = _mm_add_epi32( v128_32( K256[60] ),
|
||||
mm128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
|
||||
H = _mm_add_epi32( H, T2 );
|
||||
|
||||
targ = _mm_set1_epi32( target[7] );
|
||||
targ = v128_32( target[7] );
|
||||
hash = mm128_bswap_32( _mm_add_epi32( H, IV7 ) );
|
||||
|
||||
flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ mm128_movmask_32( hash );
|
||||
@@ -386,13 +385,13 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
|
||||
// round 61 part 1
|
||||
W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] );
|
||||
T0 = _mm_add_epi32( _mm_set1_epi32( K256[61] ),
|
||||
T0 = _mm_add_epi32( v128_32( K256[61] ),
|
||||
mm128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
|
||||
G = _mm_add_epi32( G, T0 );
|
||||
|
||||
if ( t6_mask )
|
||||
{
|
||||
targ = _mm_and_si128( vmask, _mm_set1_epi32( target[6] ) );
|
||||
targ = _mm_and_si128( vmask, v128_32( target[6] ) );
|
||||
hash = mm128_bswap_32( _mm_add_epi32( G, IV6 ) );
|
||||
|
||||
if ( ( 0 != ( t6_mask & mm128_movmask_32(
|
||||
@@ -440,14 +439,14 @@ return 1;
|
||||
void sha256_4way_init( sha256_4way_context *sc )
|
||||
{
|
||||
sc->count_high = sc->count_low = 0;
|
||||
sc->val[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
sc->val[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
sc->val[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
sc->val[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
sc->val[4] = _mm_set1_epi64x( 0x510E527F510E527F );
|
||||
sc->val[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
|
||||
sc->val[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
sc->val[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
sc->val[0] = v128_32( sha256_iv[0] );
|
||||
sc->val[1] = v128_32( sha256_iv[1] );
|
||||
sc->val[2] = v128_32( sha256_iv[2] );
|
||||
sc->val[3] = v128_32( sha256_iv[3] );
|
||||
sc->val[4] = v128_32( sha256_iv[4] );
|
||||
sc->val[5] = v128_32( sha256_iv[5] );
|
||||
sc->val[6] = v128_32( sha256_iv[6] );
|
||||
sc->val[7] = v128_32( sha256_iv[7] );
|
||||
}
|
||||
|
||||
void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
|
||||
@@ -490,7 +489,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
|
||||
const int pad = buf_size - 8;
|
||||
|
||||
ptr = (unsigned)sc->count_low & (buf_size - 1U);
|
||||
sc->buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
|
||||
sc->buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
|
||||
if ( ptr > pad )
|
||||
@@ -506,8 +505,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] = _mm_set1_epi32( bswap_32( high ) );
|
||||
sc->buf[( pad+4 ) >> 2 ] = _mm_set1_epi32( bswap_32( low ) );
|
||||
sc->buf[ pad >> 2 ] = v128_32( bswap_32( high ) );
|
||||
sc->buf[( pad+4 ) >> 2 ] = v128_32( bswap_32( low ) );
|
||||
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
|
||||
|
||||
mm128_block_bswap_32( dst, sc->val );
|
||||
@@ -580,7 +579,7 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
|
||||
|
||||
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \
|
||||
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i) ] ), \
|
||||
W[ i ] ); \
|
||||
__m256i T1 = BSG2_1x( E ); \
|
||||
__m256i T2 = BSG2_0x( A ); \
|
||||
@@ -610,6 +609,16 @@ do { \
|
||||
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, j ); \
|
||||
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, j );
|
||||
|
||||
// Not used with AVX512, needed to satisfy the compiler
|
||||
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
||||
{ \
|
||||
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
|
||||
v256_32( K256[(i)+(j)] ) ); \
|
||||
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
|
||||
D = _mm256_add_epi32( D, T1 ); \
|
||||
H = _mm256_add_epi32( T1, T2 ); \
|
||||
}
|
||||
|
||||
#else // AVX2
|
||||
|
||||
#define CHx(X, Y, Z) \
|
||||
@@ -621,9 +630,19 @@ do { \
|
||||
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
|
||||
Y_xor_Z ) )
|
||||
|
||||
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
||||
{ \
|
||||
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
|
||||
v256_32( K256[(i)+(j)] ) ); \
|
||||
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = _mm256_add_epi32( D, T1 ); \
|
||||
H = _mm256_add_epi32( T1, T2 ); \
|
||||
}
|
||||
|
||||
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \
|
||||
__m256i T0 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \
|
||||
__m256i T1 = BSG2_1x( E ); \
|
||||
__m256i T2 = BSG2_0x( A ); \
|
||||
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
|
||||
@@ -635,7 +654,6 @@ do { \
|
||||
H = _mm256_add_epi32( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
// read Y_xor_Z, update X_xor_Y
|
||||
#define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \
|
||||
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
|
||||
@@ -647,7 +665,7 @@ do { \
|
||||
|
||||
#define SHA256_8WAY_2ROUNDS( A, B, C, D, E, F, G, H, i0, i1, j ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i0) ] ), \
|
||||
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \
|
||||
W[ i0 ] ); \
|
||||
__m256i T1 = BSG2_1x( E ); \
|
||||
__m256i T2 = BSG2_0x( A ); \
|
||||
@@ -658,7 +676,7 @@ do { \
|
||||
D = _mm256_add_epi32( D, T1 ); \
|
||||
H = _mm256_add_epi32( T1, T2 ); \
|
||||
\
|
||||
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i1) ] ), \
|
||||
T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \
|
||||
W[ (i1) ] ); \
|
||||
T1 = BSG2_1x( D ); \
|
||||
T2 = BSG2_0x( H ); \
|
||||
@@ -769,9 +787,9 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
|
||||
// round 3 part 1, ignore nonces W[3]
|
||||
// round 3 part 1, avoid nonces W[3]
|
||||
T1 = mm256_add4_32( E, BSG2_1x(B), CHx(B, C, D),
|
||||
_mm256_set1_epi32( K256[3] ) );
|
||||
v256_32( K256[3] ) );
|
||||
A = _mm256_add_epi32( A, T1 );
|
||||
E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x(F),
|
||||
MAJx(F, G, H) ) );
|
||||
@@ -807,23 +825,22 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
|
||||
#endif
|
||||
|
||||
// round 3 part 2, inject nonces
|
||||
// round 3 part 2, add nonces
|
||||
A = _mm256_add_epi32( A, W[3] );
|
||||
E = _mm256_add_epi32( E, W[3] );
|
||||
|
||||
// SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, 0 );
|
||||
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 0 );
|
||||
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 0 );
|
||||
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 0 );
|
||||
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 5, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 6, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( B, C, D, E, F, G, H, A, 7, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
|
||||
W[ 0] = X[ 0];
|
||||
W[ 1] = X[ 1];
|
||||
@@ -865,6 +882,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
|
||||
|
||||
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
|
||||
|
||||
SHA256_8WAY_MEXP_16ROUNDS( W );
|
||||
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
|
||||
|
||||
@@ -887,18 +905,15 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
_mm256_store_si256( state_out + 7, H );
|
||||
}
|
||||
|
||||
|
||||
// It's working with a high hit rate but performance is lower
|
||||
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const uint32_t *target )
|
||||
{
|
||||
__m256i A, B, C, D, E, F, G, H, T0, T1, T2;
|
||||
int flip;
|
||||
int t6_mask;
|
||||
__m256i vmask, targ, hash;
|
||||
__m256i W[16]; memcpy_256( W, data, 16 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
uint8_t flip, t6_mask;
|
||||
|
||||
A = _mm256_load_si256( state_in );
|
||||
B = _mm256_load_si256( state_in+1 );
|
||||
@@ -912,14 +927,37 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
const __m256i IV7 = H;
|
||||
const __m256i IV6 = G;
|
||||
|
||||
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
|
||||
#if !defined(__AVX512VL__)
|
||||
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
|
||||
#endif
|
||||
|
||||
for ( int j = 16; j < 48; j += 16 )
|
||||
{
|
||||
SHA256_8WAY_MEXP_16ROUNDS( W );
|
||||
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j );
|
||||
}
|
||||
// rounds 0 to 16, ignore zero padding W[9..14]
|
||||
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
|
||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, 0 );
|
||||
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 0 );
|
||||
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 0 );
|
||||
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 0 );
|
||||
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
|
||||
// rounds 16 ro 31
|
||||
SHA256_8WAY_MEXP_16ROUNDS( W );
|
||||
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
|
||||
|
||||
// rounds 32 to 47
|
||||
SHA256_8WAY_MEXP_16ROUNDS( W );
|
||||
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
|
||||
|
||||
// rounds 48 to 60 mexp
|
||||
W[ 0] = SHA256_8WAY_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
|
||||
W[ 1] = SHA256_8WAY_MEXP( W[15], W[10], W[ 2], W[ 1] );
|
||||
W[ 2] = SHA256_8WAY_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
|
||||
@@ -935,9 +973,10 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );
|
||||
|
||||
#if !defined(__AVX512VL__)
|
||||
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
|
||||
Y_xor_Z = _mm256_xor_si256( B, C );
|
||||
#endif
|
||||
|
||||
// rounds 48 to 57
|
||||
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
|
||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
|
||||
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
|
||||
@@ -950,26 +989,28 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
|
||||
|
||||
// round 58 to 60 part 1
|
||||
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[58] ),
|
||||
T0 = _mm256_add_epi32( v256_32( K256[58] ),
|
||||
mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), W[10], F ) );
|
||||
B = _mm256_add_epi32( B, T0 );
|
||||
|
||||
T1 = _mm256_add_epi32( _mm256_set1_epi32( K256[59] ),
|
||||
T1 = _mm256_add_epi32( v256_32( K256[59] ),
|
||||
mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), W[11], E ) );
|
||||
A = _mm256_add_epi32( A, T1 );
|
||||
|
||||
T2 = _mm256_add_epi32( _mm256_set1_epi32( K256[60] ),
|
||||
T2 = _mm256_add_epi32( v256_32( K256[60] ),
|
||||
mm256_add4_32( BSG2_1x( A ), CHx( A, B, C ), W[12], D ) );
|
||||
H = _mm256_add_epi32( H, T2 );
|
||||
|
||||
// Got H, test it.
|
||||
targ = _mm256_set1_epi32( target[7] );
|
||||
targ = v256_32( target[7] );
|
||||
hash = _mm256_shuffle_epi8( _mm256_add_epi32( H, IV7 ), bswap_shuf );
|
||||
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
|
||||
if ( likely( 0xff == ( flip ^
|
||||
if ( target[7] )
|
||||
{
|
||||
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
|
||||
if ( likely( 0xff == ( flip ^
|
||||
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ))
|
||||
return 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) );
|
||||
|
||||
// round 58 part 2
|
||||
@@ -977,34 +1018,35 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
MAJx( G, H, A ) ) );
|
||||
// round 61 part 1
|
||||
W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] );
|
||||
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[61] ),
|
||||
T0 = _mm256_add_epi32( v256_32( K256[61] ),
|
||||
mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) );
|
||||
G = _mm256_add_epi32( G, T0 );
|
||||
|
||||
if ( t6_mask )
|
||||
{
|
||||
// Testing H inconclusive: hash7 == target7, need to test G
|
||||
targ = _mm256_and_si256( vmask, _mm256_set1_epi32( target[6] ) );
|
||||
// Testing H was inconclusive: hash7 == target7, need to test G
|
||||
targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
|
||||
hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
|
||||
|
||||
if ( unlikely( 0 != ( t6_mask & mm256_movmask_32(
|
||||
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
|
||||
_mm256_cmpeq_epi32( hash, targ ) ) ) ))
|
||||
return 0;
|
||||
else
|
||||
{
|
||||
flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
|
||||
if ( likely( 0 != ( t6_mask & ( flip ^
|
||||
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ) ))
|
||||
return 0;
|
||||
else if ( likely( target[6] == 0x80000000 ))
|
||||
{
|
||||
if ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32(
|
||||
hash, _mm256_xor_si256( hash, hash ) ) ) ) )
|
||||
return 0;
|
||||
}
|
||||
if ( likely( ( target[6] == 0x80000000 )
|
||||
&& ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32(
|
||||
hash, _mm256_xor_si256( hash, hash ) ) ) ) ) ))
|
||||
return 0;
|
||||
}
|
||||
// else inconclusive, testing targ5 isn't practical, fininsh hashing
|
||||
}
|
||||
|
||||
// At this point either the hash will be good or the test was inconclusive.
|
||||
// If the latter it's probably a high target difficulty with a nearly equal
|
||||
// high difficulty hash that has a good chance of being good.
|
||||
|
||||
// rounds 59 to 61 part 2
|
||||
E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x( F ),
|
||||
MAJx( F, G, H ) ) );
|
||||
@@ -1036,14 +1078,14 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
void sha256_8way_init( sha256_8way_context *sc )
|
||||
{
|
||||
sc->count_high = sc->count_low = 0;
|
||||
sc->val[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
sc->val[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
sc->val[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
sc->val[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
sc->val[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
|
||||
sc->val[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
|
||||
sc->val[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
sc->val[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
sc->val[0] = v256_32( sha256_iv[0] );
|
||||
sc->val[1] = v256_32( sha256_iv[1] );
|
||||
sc->val[2] = v256_32( sha256_iv[2] );
|
||||
sc->val[3] = v256_32( sha256_iv[3] );
|
||||
sc->val[4] = v256_32( sha256_iv[4] );
|
||||
sc->val[5] = v256_32( sha256_iv[5] );
|
||||
sc->val[6] = v256_32( sha256_iv[6] );
|
||||
sc->val[7] = v256_32( sha256_iv[7] );
|
||||
}
|
||||
|
||||
// need to handle odd byte length for yespower.
|
||||
@@ -1089,7 +1131,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
|
||||
const int pad = buf_size - 8;
|
||||
|
||||
ptr = (unsigned)sc->count_low & (buf_size - 1U);
|
||||
sc->buf[ ptr>>2 ] = _mm256_set1_epi64x( 0x0000008000000080 );
|
||||
sc->buf[ ptr>>2 ] = v256_64( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
|
||||
if ( ptr > pad )
|
||||
@@ -1105,8 +1147,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] = _mm256_set1_epi32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = _mm256_set1_epi32( bswap_32( low ) );
|
||||
sc->buf[ pad >> 2 ] = v256_32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = v256_32( bswap_32( low ) );
|
||||
|
||||
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
|
||||
|
||||
@@ -1168,7 +1210,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
|
||||
|
||||
#define SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
|
||||
do { \
|
||||
__m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[(j)+(i)] ), W[i] ); \
|
||||
__m512i T0 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \
|
||||
__m512i T1 = BSG2_1x16( E ); \
|
||||
__m512i T2 = BSG2_0x16( A ); \
|
||||
T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \
|
||||
@@ -1179,11 +1221,20 @@ do { \
|
||||
H = _mm512_add_epi32( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
||||
{ \
|
||||
__m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
|
||||
v512_32( K256[(i)+(j)] ) ); \
|
||||
__m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
|
||||
D = _mm512_add_epi32( D, T1 ); \
|
||||
H = _mm512_add_epi32( T1, T2 ); \
|
||||
}
|
||||
|
||||
/*
|
||||
#define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \
|
||||
do { \
|
||||
__m512i T1, T2; \
|
||||
__m512i K = _mm512_set1_epi32( K256[( (j)+(i) )] ); \
|
||||
__m512i K = v512_32( K256[( (j)+(i) )] ); \
|
||||
T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \
|
||||
K, W[i] ) ); \
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
|
||||
@@ -1292,9 +1343,9 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA256_16WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
|
||||
// round 3 part 1, ignore nonces W[3]
|
||||
// round 3 part 1, avoid nonces W[3]
|
||||
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
|
||||
_mm512_set1_epi32( K256[3] ) );
|
||||
v512_32( K256[3] ) );
|
||||
A = _mm512_add_epi32( A, T1 );
|
||||
E = _mm512_add_epi32( T1, _mm512_add_epi32( BSG2_0x16(F),
|
||||
MAJx16(F, G, H) ) );
|
||||
@@ -1312,7 +1363,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in, const __m512i *state_mid, const __m512i *X )
|
||||
{
|
||||
__m512i A, B, C, D, E, F, G, H, T1, T2;
|
||||
__m512i A, B, C, D, E, F, G, H;
|
||||
__m512i W[16];
|
||||
|
||||
memcpy_512( W, data, 16 );
|
||||
@@ -1326,87 +1377,25 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
G = _mm512_load_si512( state_mid + 6 );
|
||||
H = _mm512_load_si512( state_mid + 7 );
|
||||
|
||||
// round 3 part 2, inject nonces
|
||||
// round 3 part 2, add nonces
|
||||
A = _mm512_add_epi32( A, W[3] );
|
||||
E = _mm512_add_epi32( E, W[3] );
|
||||
|
||||
// round 4
|
||||
SHA256_16WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
// rounds 4 to 15, ignore zero padding W[5..14]
|
||||
SHA256_16WAY_ROUND ( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 5, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 6, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( B, C, D, E, F, G, H, A, 7, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
SHA256_16WAY_ROUND ( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
|
||||
// round 5
|
||||
T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B),
|
||||
_mm512_set1_epi32( K256[5] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) );
|
||||
G = _mm512_add_epi32( G, T1 );
|
||||
C = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 6
|
||||
T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A),
|
||||
_mm512_set1_epi32( K256[6] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) );
|
||||
F = _mm512_add_epi32( F, T1 );
|
||||
B = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 7
|
||||
T1 = mm512_add4_32( A, BSG2_1x16(F), CHx16(F, G, H),
|
||||
_mm512_set1_epi32( K256[7] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(B), MAJx16(B, C, D) );
|
||||
E = _mm512_add_epi32( E, T1 );
|
||||
A = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 8
|
||||
T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G),
|
||||
_mm512_set1_epi32( K256[8] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) );
|
||||
D = _mm512_add_epi32( D, T1 );
|
||||
H = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 9
|
||||
T1 = mm512_add4_32( G, BSG2_1x16(D), CHx16(D, E, F),
|
||||
_mm512_set1_epi32( K256[9] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(H), MAJx16(H, A, B) );
|
||||
C = _mm512_add_epi32( C, T1 );
|
||||
G = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 10
|
||||
T1 = mm512_add4_32( F, BSG2_1x16(C), CHx16(C, D, E),
|
||||
_mm512_set1_epi32( K256[10] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(G), MAJx16(G, H, A) );
|
||||
B = _mm512_add_epi32( B, T1 );
|
||||
F = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 11
|
||||
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
|
||||
_mm512_set1_epi32( K256[11] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(F), MAJx16(F, G, H) );
|
||||
A = _mm512_add_epi32( A, T1 );
|
||||
E = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 12
|
||||
T1 = mm512_add4_32( D, BSG2_1x16(A), CHx16(A, B, C),
|
||||
_mm512_set1_epi32( K256[12] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(E), MAJx16(E, F, G) );
|
||||
H = _mm512_add_epi32( H, T1 );
|
||||
D = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 13
|
||||
T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B),
|
||||
_mm512_set1_epi32( K256[13] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) );
|
||||
G = _mm512_add_epi32( G, T1 );
|
||||
C = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 14
|
||||
T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A),
|
||||
_mm512_set1_epi32( K256[14] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) );
|
||||
F = _mm512_add_epi32( F, T1 );
|
||||
B = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 15
|
||||
SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
|
||||
// rounds 16 to 31 mexp part 2, inject nonces.
|
||||
// rounds 16 to 31 mexp part 2, add nonces.
|
||||
W[ 0] = X[ 0];
|
||||
W[ 1] = X[ 1];
|
||||
W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) );
|
||||
@@ -1428,6 +1417,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
|
||||
SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
|
||||
|
||||
// rounds 32 to 63
|
||||
W[ 0] = _mm512_add_epi32( X[ 6], _mm512_add_epi32( SSG2_1x16( W[14] ),
|
||||
W[ 9] ) );
|
||||
W[ 1] = SHA256_16WAY_MEXP( W[15], W[10], W[ 2], W[ 1] );
|
||||
@@ -1505,41 +1495,12 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
|
||||
// rounds 9 to 14, ignore zero padding
|
||||
T1 = mm512_add4_32( G, BSG2_1x16(D), CHx16(D, E, F),
|
||||
_mm512_set1_epi32( K256[9] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(H), MAJx16(H, A, B) );
|
||||
C = _mm512_add_epi32( C, T1 );
|
||||
G = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
T1 = mm512_add4_32( F, BSG2_1x16(C), CHx16(C, D, E),
|
||||
_mm512_set1_epi32( K256[10] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(G), MAJx16(G, H, A) );
|
||||
B = _mm512_add_epi32( B, T1 );
|
||||
F = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
|
||||
_mm512_set1_epi32( K256[11] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(F), MAJx16(F, G, H) );
|
||||
A = _mm512_add_epi32( A, T1 );
|
||||
E = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
T1 = mm512_add4_32( D, BSG2_1x16(A), CHx16(A, B, C),
|
||||
_mm512_set1_epi32( K256[12] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(E), MAJx16(E, F, G) );
|
||||
H = _mm512_add_epi32( H, T1 );
|
||||
D = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B),
|
||||
_mm512_set1_epi32( K256[13] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) );
|
||||
G = _mm512_add_epi32( G, T1 );
|
||||
C = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A),
|
||||
_mm512_set1_epi32( K256[14] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) );
|
||||
F = _mm512_add_epi32( F, T1 );
|
||||
B = _mm512_add_epi32( T1, T2 );
|
||||
SHA256_16WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
|
||||
// round 15
|
||||
SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
@@ -1575,7 +1536,6 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
|
||||
// rounds 32 to 47
|
||||
SHA256_MEXP_16WAY_16ROUNDS( W );
|
||||
|
||||
SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
|
||||
|
||||
// rounds 48 to 60 mexp
|
||||
@@ -1606,21 +1566,22 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
|
||||
|
||||
// rounds 58 to 60 part 1
|
||||
T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[58] ),
|
||||
T0 = _mm512_add_epi32( v512_32( K256[58] ),
|
||||
mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) );
|
||||
B = _mm512_add_epi32( B, T0 );
|
||||
|
||||
T1 = _mm512_add_epi32( _mm512_set1_epi32( K256[59] ),
|
||||
T1 = _mm512_add_epi32( v512_32( K256[59] ),
|
||||
mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) );
|
||||
A = _mm512_add_epi32( A, T1 );
|
||||
|
||||
T2 = _mm512_add_epi32( _mm512_set1_epi32( K256[60] ),
|
||||
T2 = _mm512_add_epi32( v512_32( K256[60] ),
|
||||
mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ) );
|
||||
H = _mm512_add_epi32( H, T2 );
|
||||
|
||||
// got H, test it against target[7]
|
||||
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
|
||||
targ = _mm512_set1_epi32( target[7] );
|
||||
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
|
||||
targ = v512_32( target[7] );
|
||||
if ( target[7] )
|
||||
if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) ))
|
||||
return 0;
|
||||
t6_mask = _mm512_cmpeq_epi32_mask( hash, targ );
|
||||
@@ -1631,17 +1592,16 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
|
||||
// round 61 part 1
|
||||
W[13] = SHA256_16WAY_MEXP( W[11], W[ 6], W[14], W[13] );
|
||||
T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[61] ),
|
||||
T0 = _mm512_add_epi32( v512_32( K256[61] ),
|
||||
mm512_add4_32( BSG2_1x16( H ), CHx16( H, A, B ), W[13], C ) );
|
||||
G = _mm512_add_epi32( G, T0 );
|
||||
|
||||
// got G, test it against target[6] if indicated
|
||||
if ( t6_mask != 0 )
|
||||
if ( (uint16_t)t6_mask )
|
||||
{
|
||||
hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
|
||||
targ = _mm512_set1_epi32( target[6] );
|
||||
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask,
|
||||
hash, targ ) ))
|
||||
targ = v512_32( target[6] );
|
||||
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1678,14 +1638,14 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
void sha256_16way_init( sha256_16way_context *sc )
|
||||
{
|
||||
sc->count_high = sc->count_low = 0;
|
||||
sc->val[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
|
||||
sc->val[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
|
||||
sc->val[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
|
||||
sc->val[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
|
||||
sc->val[4] = _mm512_set1_epi64( 0x510E527F510E527F );
|
||||
sc->val[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
|
||||
sc->val[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
|
||||
sc->val[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
|
||||
sc->val[0] = v512_32( sha256_iv[0] );
|
||||
sc->val[1] = v512_32( sha256_iv[1] );
|
||||
sc->val[2] = v512_32( sha256_iv[2] );
|
||||
sc->val[3] = v512_32( sha256_iv[3] );
|
||||
sc->val[4] = v512_32( sha256_iv[4] );
|
||||
sc->val[5] = v512_32( sha256_iv[5] );
|
||||
sc->val[6] = v512_32( sha256_iv[6] );
|
||||
sc->val[7] = v512_32( sha256_iv[7] );
|
||||
}
|
||||
|
||||
void sha256_16way_update( sha256_16way_context *sc, const void *data,
|
||||
@@ -1729,7 +1689,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
|
||||
const int pad = buf_size - 8;
|
||||
|
||||
ptr = (unsigned)sc->count_low & (buf_size - 1U);
|
||||
sc->buf[ ptr>>2 ] = _mm512_set1_epi64( 0x0000008000000080 );
|
||||
sc->buf[ ptr>>2 ] = v512_64( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
|
||||
if ( ptr > pad )
|
||||
@@ -1745,8 +1705,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
|
||||
sc->buf[ pad >> 2 ] = _mm512_set1_epi32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = _mm512_set1_epi32( bswap_32( low ) );
|
||||
sc->buf[ pad >> 2 ] = v512_32( bswap_32( high ) );
|
||||
sc->buf[ ( pad+4 ) >> 2 ] = v512_32( bswap_32( low ) );
|
||||
|
||||
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
|
||||
|
||||
|
@@ -1,388 +0,0 @@
|
||||
/* Intel SHA extensions using C intrinsics */
|
||||
/* Written and place in public domain by Jeffrey Walton */
|
||||
/* Based on code from Intel, and by Sean Gulley for */
|
||||
/* the miTLS project. */
|
||||
|
||||
// A stripped down version with byte swapping removed.
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
#include "sha256-hash.h"
|
||||
|
||||
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
|
||||
const uint32_t *state_in )
|
||||
{
|
||||
__m128i STATE0, STATE1;
|
||||
__m128i MSG, TMP;
|
||||
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
|
||||
__m128i ABEF_SAVE, CDGH_SAVE;
|
||||
|
||||
// Load initial values
|
||||
TMP = _mm_load_si128((__m128i*) &state_in[0]);
|
||||
STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
|
||||
// MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
||||
|
||||
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
|
||||
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
|
||||
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
|
||||
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
|
||||
|
||||
// Save current hash
|
||||
ABEF_SAVE = STATE0;
|
||||
CDGH_SAVE = STATE1;
|
||||
|
||||
// Rounds 0-3
|
||||
TMSG0 = _mm_load_si128((const __m128i*) (input+0));
|
||||
// TMSG0 = _mm_shuffle_epi8(MSG, MASK);
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
|
||||
// TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||
|
||||
// Rounds 8-11
|
||||
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
|
||||
// TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
|
||||
// TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||
|
||||
// Rounds 16-19
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||
|
||||
// Rounds 20-23
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||
|
||||
// Rounds 24-27
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||
|
||||
// Rounds 28-31
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||
|
||||
// Rounds 32-35
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||
|
||||
// Rounds 36-39
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||
|
||||
// Rounds 40-43
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||
|
||||
// Rounds 44-47
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||
|
||||
// Rounds 48-51
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||
|
||||
// Rounds 52-55
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Rounds 56-59
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Rounds 60-63
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Add values back to state
|
||||
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
|
||||
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
|
||||
|
||||
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
|
||||
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
|
||||
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
|
||||
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
|
||||
|
||||
// Save state
|
||||
_mm_store_si128((__m128i*) &state_out[0], STATE0);
|
||||
_mm_store_si128((__m128i*) &state_out[4], STATE1);
|
||||
}
|
||||
|
||||
|
||||
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
|
||||
const uint32_t *state_in )
|
||||
{
|
||||
__m128i STATE0, STATE1;
|
||||
__m128i MSG, TMP, MASK;
|
||||
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
|
||||
__m128i ABEF_SAVE, CDGH_SAVE;
|
||||
|
||||
// Load initial values
|
||||
TMP = _mm_load_si128((__m128i*) &state_in[0]);
|
||||
STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
|
||||
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
||||
|
||||
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
|
||||
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
|
||||
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
|
||||
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
|
||||
|
||||
// Save current hash
|
||||
ABEF_SAVE = STATE0;
|
||||
CDGH_SAVE = STATE1;
|
||||
|
||||
// Rounds 0-3
|
||||
TMSG0 = _mm_load_si128((const __m128i*) (input+0));
|
||||
TMSG0 = _mm_shuffle_epi8( TMSG0, MASK );
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
|
||||
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||
// Rounds 8-11
|
||||
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
|
||||
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
|
||||
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||
|
||||
// Rounds 16-19
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||
|
||||
// Rounds 20-23
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||
|
||||
// Rounds 24-27
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||
|
||||
// Rounds 28-31
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||
|
||||
// Rounds 32-35
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||
|
||||
// Rounds 36-39
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
|
||||
|
||||
// Rounds 40-43
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
|
||||
|
||||
// Rounds 44-47
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
|
||||
TMSG0 = _mm_add_epi32(TMSG0, TMP);
|
||||
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
|
||||
|
||||
// Rounds 48-51
|
||||
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
|
||||
TMSG1 = _mm_add_epi32(TMSG1, TMP);
|
||||
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
|
||||
|
||||
// Rounds 52-55
|
||||
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
|
||||
TMSG2 = _mm_add_epi32(TMSG2, TMP);
|
||||
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Rounds 56-59
|
||||
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
|
||||
TMSG3 = _mm_add_epi32(TMSG3, TMP);
|
||||
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Rounds 60-63
|
||||
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
// Add values back to state
|
||||
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
|
||||
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
|
||||
|
||||
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
|
||||
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
|
||||
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
|
||||
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
|
||||
|
||||
// Save state
|
||||
_mm_store_si128((__m128i*) &state_out[0], STATE0);
|
||||
_mm_store_si128((__m128i*) &state_out[4], STATE1);
|
||||
}
|
||||
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
@@ -4,17 +4,18 @@
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
#include "cpuminer-config.h"
|
||||
#include "sph_sha2.h"
|
||||
|
||||
|
||||
// generic interface
|
||||
|
||||
typedef struct {
|
||||
typedef struct
|
||||
{
|
||||
unsigned char buf[64]; /* first field, for alignment */
|
||||
uint32_t state[8];
|
||||
uint64_t count;
|
||||
} sha256_context __attribute__((aligned(64)));
|
||||
|
||||
static const uint32_t SHA256_IV[8];
|
||||
|
||||
void sha256_full( void *hash, const void *data, size_t len );
|
||||
void sha256_update( sha256_context *ctx, const void *data, size_t len );
|
||||
void sha256_final( sha256_context *ctx, void *hash );
|
||||
@@ -41,20 +42,113 @@ void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *in_X, const uint32_t *in_Y );
|
||||
|
||||
void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
|
||||
uint32_t *sstate, const uint32_t *istate );
|
||||
|
||||
void sha256_ni2way_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
|
||||
const uint32_t *state_save_X, const uint32_t *state_save_Y );
|
||||
|
||||
// Select target
|
||||
// with SHA...
|
||||
#define sha256_transform_le sha256_opt_transform_le
|
||||
#define sha256_transform_be sha256_opt_transform_be
|
||||
|
||||
#else
|
||||
|
||||
// without SHA...
|
||||
#include "sph_sha2.h"
|
||||
|
||||
#define sha256_transform_le sph_sha256_transform_le
|
||||
#define sha256_transform_be sph_sha256_transform_be
|
||||
|
||||
#endif
|
||||
|
||||
// SHA can't do only 3 rounds
|
||||
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// SHA-256 16 way
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m512i buf[64>>2];
|
||||
__m512i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
} sha256_16way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha256_16way_init( sha256_16way_context *sc );
|
||||
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
|
||||
void sha256_16way_close( sha256_16way_context *sc, void *dst );
|
||||
void sha256_16way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||
const __m512i *W, const __m512i *state_in );
|
||||
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
|
||||
|
||||
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in, const uint32_t *target );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
// SHA-256 8 way
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m256i buf[64>>2];
|
||||
__m256i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
} sha256_8way_context __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256_8way_init( sha256_8way_context *sc );
|
||||
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
|
||||
void sha256_8way_close( sha256_8way_context *sc, void *dst );
|
||||
void sha256_8way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
|
||||
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
||||
const __m256i *W, const __m256i *state_in );
|
||||
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
|
||||
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const uint32_t *target );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
// SHA-256 4 way
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m128i buf[64>>2];
|
||||
__m128i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
} sha256_4way_context __attribute__ ((aligned (32)));
|
||||
|
||||
void sha256_4way_init( sha256_4way_context *sc );
|
||||
void sha256_4way_update( sha256_4way_context *sc, const void *data,
|
||||
size_t len );
|
||||
void sha256_4way_close( sha256_4way_context *sc, void *dst );
|
||||
void sha256_4way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
|
||||
const __m128i *W, const __m128i *state_in );
|
||||
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
|
||||
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in, const uint32_t *target );
|
||||
|
||||
#endif // SSE2
|
||||
|
||||
#endif
|
||||
|
@@ -4,7 +4,6 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sha256-hash.h"
|
||||
#include "sha-hash-4way.h"
|
||||
|
||||
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
@@ -17,11 +16,15 @@ static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
|
||||
int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t block0[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block1[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t block1a[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block1b[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block2a[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block2b[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hasha[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hashb[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstatea[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstateb[8] __attribute__ ((aligned (32)));
|
||||
uint32_t sstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -32,56 +35,60 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
|
||||
const __m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_opt_transform_le( mstate, pdata, sha256_iv );
|
||||
// hash first 64 byte block of data
|
||||
sha256_opt_transform_le( mstatea, pdata, sha256_iv );
|
||||
|
||||
// fill & pad second bock without nonce
|
||||
memcpy( block1a, pdata + 16, 12 );
|
||||
memcpy( block1b, pdata + 16, 12 );
|
||||
block1a[ 3] = 0;
|
||||
block1b[ 3] = 0;
|
||||
block1a[ 4] = block1b[ 4] = 0x80000000;
|
||||
memset( block1a + 5, 0, 40 );
|
||||
memset( block1b + 5, 0, 40 );
|
||||
block1a[15] = block1b[15] = 80*8; // bit count
|
||||
|
||||
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
|
||||
|
||||
// Pad third block
|
||||
block2a[ 8] = block2b[ 8] = 0x80000000;
|
||||
memset( block2a + 9, 0, 24 );
|
||||
memset( block2b + 9, 0, 24 );
|
||||
block2a[15] = block2b[15] = 32*8; // bit count
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy( block0, pdata + 16, 16 );
|
||||
memcpy( block1, pdata + 16, 16 );
|
||||
block0[ 3] = n;
|
||||
block1[ 3] = n+1;
|
||||
block0[ 4] = block1[ 4] = 0x80000000;
|
||||
memset( block0 + 5, 0, 40 );
|
||||
memset( block1 + 5, 0, 40 );
|
||||
block0[15] = block1[15] = 80*8; // bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
|
||||
mstate, mstate );
|
||||
// Insert nonce for second block
|
||||
block1a[3] = n;
|
||||
block1b[3] = n+1;
|
||||
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
|
||||
mstateb, mstateb, sstate, sstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy( block0, hash0, 32 );
|
||||
memcpy( block1, hash1, 32 );
|
||||
block0[ 8] = block1[ 8] = 0x80000000;
|
||||
memset( block0 + 9, 0, 24 );
|
||||
memset( block1 + 9, 0, 24 );
|
||||
block0[15] = block1[15] = 32*8; // bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
|
||||
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
|
||||
sha256_iv, sha256_iv );
|
||||
|
||||
if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )
|
||||
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hash0, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash0, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hash0, ptarget ) && !bench ) )
|
||||
casti_m128i( hasha, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hasha, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash0, mythr );
|
||||
submit_solution( work, hasha, mythr );
|
||||
}
|
||||
}
|
||||
|
||||
if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )
|
||||
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hash1, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash1, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
|
||||
casti_m128i( hashb, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hashb, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_solution( work, hash1, mythr );
|
||||
submit_solution( work, hashb, mythr );
|
||||
}
|
||||
}
|
||||
n += 2;
|
||||
@@ -99,24 +106,22 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
|
||||
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m512i hash32[8] __attribute__ ((aligned (128)));
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i block[16] __attribute__ ((aligned (128)));
|
||||
__m512i buf[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (64)));
|
||||
__m512i mstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i mstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i istate[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
const __m512i last_byte = v512_32( 0x80000000 );
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
const __m512i sixteen = v512_32( 16 );
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
@@ -125,59 +130,58 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_transform_le( phash, pdata, sha256_iv );
|
||||
|
||||
// vectorize block 0 hash for second block
|
||||
mstate1[0] = _mm512_set1_epi32( phash[0] );
|
||||
mstate1[1] = _mm512_set1_epi32( phash[1] );
|
||||
mstate1[2] = _mm512_set1_epi32( phash[2] );
|
||||
mstate1[3] = _mm512_set1_epi32( phash[3] );
|
||||
mstate1[4] = _mm512_set1_epi32( phash[4] );
|
||||
mstate1[5] = _mm512_set1_epi32( phash[5] );
|
||||
mstate1[6] = _mm512_set1_epi32( phash[6] );
|
||||
mstate1[7] = _mm512_set1_epi32( phash[7] );
|
||||
mstate1[0] = v512_32( phash[0] );
|
||||
mstate1[1] = v512_32( phash[1] );
|
||||
mstate1[2] = v512_32( phash[2] );
|
||||
mstate1[3] = v512_32( phash[3] );
|
||||
mstate1[4] = v512_32( phash[4] );
|
||||
mstate1[5] = v512_32( phash[5] );
|
||||
mstate1[6] = v512_32( phash[6] );
|
||||
mstate1[7] = v512_32( phash[7] );
|
||||
|
||||
// second message block data, with nonce & padding
|
||||
buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
// second message block data, with nonce & padding
|
||||
buf[0] = v512_32( pdata[16] );
|
||||
buf[1] = v512_32( pdata[17] );
|
||||
buf[2] = v512_32( pdata[18] );
|
||||
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
buf[4] = last_byte;
|
||||
memset_zero_512( buf+5, 10 );
|
||||
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
|
||||
buf[15] = v512_32( 80*8 ); // bit count
|
||||
|
||||
// partially pre-expand & prehash second message block, avoiding the nonces
|
||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||
|
||||
// vectorize IV for 2nd & 3rd sha256
|
||||
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
|
||||
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
|
||||
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
|
||||
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
|
||||
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
|
||||
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
|
||||
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
|
||||
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
|
||||
// vectorize IV for second hash
|
||||
istate[0] = v512_32( sha256_iv[0] );
|
||||
istate[1] = v512_32( sha256_iv[1] );
|
||||
istate[2] = v512_32( sha256_iv[2] );
|
||||
istate[3] = v512_32( sha256_iv[3] );
|
||||
istate[4] = v512_32( sha256_iv[4] );
|
||||
istate[5] = v512_32( sha256_iv[5] );
|
||||
istate[6] = v512_32( sha256_iv[6] );
|
||||
istate[7] = v512_32( sha256_iv[7] );
|
||||
|
||||
// initialize padding for 2nd sha256
|
||||
// initialize padding for second hash
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
|
||||
memset_zero_512( block+9, 6 );
|
||||
block[15] = v512_32( 32*8 ); // bit count
|
||||
|
||||
do
|
||||
{
|
||||
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
|
||||
|
||||
if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
|
||||
if ( unlikely( sha256_16way_transform_le_short(
|
||||
hash32, block, istate, ptarget ) ) )
|
||||
{
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
|
||||
{
|
||||
extr_lane_16x32( phash, hash32, lane, 256 );
|
||||
casti_m256i( phash, 0 ) =
|
||||
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
|
||||
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
|
||||
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, phash, mythr );
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, phash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -188,92 +192,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m512i vdata[32] __attribute__ ((aligned (128)));
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (64)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm512_set1_epi32( pdata[i] );
|
||||
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_512( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
|
||||
initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
sha256_16way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
sha256_16way_transform_le( hash32, block, initstate );
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
n += 16;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256D_8WAY)
|
||||
@@ -284,75 +203,72 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m256i block[16] __attribute__ ((aligned (32)));
|
||||
__m256i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m256i istate[8] __attribute__ ((aligned (32)));
|
||||
__m256i mstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i mstate2[8] __attribute__ ((aligned (32)));
|
||||
__m256i mexp_pre[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
const __m256i last_byte = v256_32( 0x80000000 );
|
||||
const __m256i eight = v256_32( 8 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm256_set1_epi32( pdata[i] );
|
||||
vdata[i] = v256_32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
|
||||
vdata[16+15] = v256_32( 80*8 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
|
||||
block[15] = v256_32( 32*8 );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
|
||||
initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
// initialize state for second hash
|
||||
istate[0] = v256_32( sha256_iv[0] );
|
||||
istate[1] = v256_32( sha256_iv[1] );
|
||||
istate[2] = v256_32( sha256_iv[2] );
|
||||
istate[3] = v256_32( sha256_iv[3] );
|
||||
istate[4] = v256_32( sha256_iv[4] );
|
||||
istate[5] = v256_32( sha256_iv[5] );
|
||||
istate[6] = v256_32( sha256_iv[6] );
|
||||
istate[7] = v256_32( sha256_iv[7] );
|
||||
|
||||
sha256_8way_transform_le( mstate1, vdata, istate );
|
||||
|
||||
sha256_8way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
|
||||
|
||||
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
sha256_8way_transform_le( hash32, block, initstate );
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
|
||||
if ( unlikely( sha256_8way_transform_le_short( hash32, block,
|
||||
istate, ptarget ) ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
casti_m256i( lane_hash, 0 ) =
|
||||
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||
n += 8;
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||
n += 8;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
@@ -366,12 +282,12 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m128i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m128i block[16] __attribute__ ((aligned (32)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m128i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m128i block[16] __attribute__ ((aligned (32)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i istate[8] __attribute__ ((aligned (32)));
|
||||
__m128i mstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
@@ -382,43 +298,40 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
|
||||
const __m128i four = _mm_set1_epi32( 4 );
|
||||
const __m128i last_byte = v128_32( 0x80000000 );
|
||||
const __m128i four = v128_32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm_set1_epi32( pdata[i] );
|
||||
vdata[i] = v128_32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
|
||||
vdata[16+15] = v128_32( 80*8 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = _mm_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
block[15] = v128_32( 32*8 );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
|
||||
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
istate[0] = v128_32( sha256_iv[0] );
|
||||
istate[1] = v128_32( sha256_iv[1] );
|
||||
istate[2] = v128_32( sha256_iv[2] );
|
||||
istate[3] = v128_32( sha256_iv[3] );
|
||||
istate[4] = v128_32( sha256_iv[4] );
|
||||
istate[5] = v128_32( sha256_iv[5] );
|
||||
istate[6] = v128_32( sha256_iv[6] );
|
||||
istate[7] = v128_32( sha256_iv[7] );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( midstate1, vdata, initstate );
|
||||
sha256_4way_transform_le( mstate, vdata, istate );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
sha256_4way_transform_le( block, vdata+16, initstate );
|
||||
sha256_4way_transform_le( block, vdata+16, mstate );
|
||||
sha256_4way_transform_le( hash32, block, istate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
// byte swap final hash for testing
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
@@ -440,3 +353,5 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
@@ -4,7 +4,6 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sha256-hash.h"
|
||||
#include "sha-hash-4way.h"
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SHA256DT_16WAY 1
|
||||
@@ -22,14 +21,103 @@ static const uint32_t sha256dt_iv[8] __attribute__ ((aligned (32))) =
|
||||
0xaa3ff126, 0x475bbf30, 0x8fd52e5b, 0x9f75c9ad
|
||||
};
|
||||
|
||||
#if defined(SHA256DT_SHA)
|
||||
|
||||
int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t block1a[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block1b[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block2a[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block2b[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hasha[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hashb[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstatea[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstateb[8] __attribute__ ((aligned (32)));
|
||||
uint32_t sstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
// hash first 64 byte block of data
|
||||
sha256_opt_transform_le( mstatea, pdata, sha256dt_iv );
|
||||
|
||||
// fill & pad second bock without nonce
|
||||
memcpy( block1a, pdata + 16, 12 );
|
||||
memcpy( block1b, pdata + 16, 12 );
|
||||
block1a[ 3] = block1b[ 3] = 0;
|
||||
block1a[ 4] = block1b[ 4] = 0x80000000;
|
||||
memset( block1a + 5, 0, 40 );
|
||||
memset( block1b + 5, 0, 40 );
|
||||
block1a[15] = block1b[15] = 0x480; // funky bit count
|
||||
|
||||
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
|
||||
|
||||
// Pad third block
|
||||
block2a[ 8] = block2b[ 8] = 0x80000000;
|
||||
memset( block2a + 9, 0, 24 );
|
||||
memset( block2b + 9, 0, 24 );
|
||||
block2a[15] = block2b[15] = 0x300; // bit count
|
||||
|
||||
do
|
||||
{
|
||||
// Insert nonce for second block
|
||||
block1a[3] = n;
|
||||
block1b[3] = n+1;
|
||||
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
|
||||
mstateb, mstateb, sstate, sstate );
|
||||
|
||||
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
|
||||
sha256dt_iv, sha256dt_iv );
|
||||
|
||||
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hasha, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hasha, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hasha, mythr );
|
||||
}
|
||||
}
|
||||
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hashb, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hashb, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_solution( work, hashb, mythr );
|
||||
}
|
||||
}
|
||||
n += 2;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256DT_16WAY)
|
||||
|
||||
int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m512i hash32[8] __attribute__ ((aligned (128)));
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i block[16] __attribute__ ((aligned (128)));
|
||||
__m512i buf[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (64)));
|
||||
__m512i mstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i mstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i istate[8] __attribute__ ((aligned (64)));
|
||||
@@ -37,14 +125,12 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint32_t phash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
// uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
|
||||
// const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
const __m512i last_byte = v512_32( 0x80000000 );
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
const __m512i sixteen = v512_32( 16 );
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
@@ -53,52 +139,50 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_transform_le( phash, pdata, sha256dt_iv );
|
||||
|
||||
// vectorize block 0 hash for second block
|
||||
mstate1[0] = _mm512_set1_epi32( phash[0] );
|
||||
mstate1[1] = _mm512_set1_epi32( phash[1] );
|
||||
mstate1[2] = _mm512_set1_epi32( phash[2] );
|
||||
mstate1[3] = _mm512_set1_epi32( phash[3] );
|
||||
mstate1[4] = _mm512_set1_epi32( phash[4] );
|
||||
mstate1[5] = _mm512_set1_epi32( phash[5] );
|
||||
mstate1[6] = _mm512_set1_epi32( phash[6] );
|
||||
mstate1[7] = _mm512_set1_epi32( phash[7] );
|
||||
mstate1[0] = v512_32( phash[0] );
|
||||
mstate1[1] = v512_32( phash[1] );
|
||||
mstate1[2] = v512_32( phash[2] );
|
||||
mstate1[3] = v512_32( phash[3] );
|
||||
mstate1[4] = v512_32( phash[4] );
|
||||
mstate1[5] = v512_32( phash[5] );
|
||||
mstate1[6] = v512_32( phash[6] );
|
||||
mstate1[7] = v512_32( phash[7] );
|
||||
|
||||
// second message block data, with nonce & padding
|
||||
buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
buf[0] = v512_32( pdata[16] );
|
||||
buf[1] = v512_32( pdata[17] );
|
||||
buf[2] = v512_32( pdata[18] );
|
||||
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
buf[4] = last_byte;
|
||||
memset_zero_512( buf+5, 10 );
|
||||
buf[15] = _mm512_set1_epi32( 0x480 ); // sha256dt funky bit count
|
||||
buf[15] = v512_32( 0x480 ); // sha256dt funky bit count
|
||||
|
||||
// partially pre-expand & prehash second message block, avoiding the nonces
|
||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||
|
||||
// vectorize IV for 2nd sha256
|
||||
istate[0] = _mm512_set1_epi32( sha256dt_iv[0] );
|
||||
istate[1] = _mm512_set1_epi32( sha256dt_iv[1] );
|
||||
istate[2] = _mm512_set1_epi32( sha256dt_iv[2] );
|
||||
istate[3] = _mm512_set1_epi32( sha256dt_iv[3] );
|
||||
istate[4] = _mm512_set1_epi32( sha256dt_iv[4] );
|
||||
istate[5] = _mm512_set1_epi32( sha256dt_iv[5] );
|
||||
istate[6] = _mm512_set1_epi32( sha256dt_iv[6] );
|
||||
istate[7] = _mm512_set1_epi32( sha256dt_iv[7] );
|
||||
// vectorize IV for second hash
|
||||
istate[0] = v512_32( sha256dt_iv[0] );
|
||||
istate[1] = v512_32( sha256dt_iv[1] );
|
||||
istate[2] = v512_32( sha256dt_iv[2] );
|
||||
istate[3] = v512_32( sha256dt_iv[3] );
|
||||
istate[4] = v512_32( sha256dt_iv[4] );
|
||||
istate[5] = v512_32( sha256dt_iv[5] );
|
||||
istate[6] = v512_32( sha256dt_iv[6] );
|
||||
istate[7] = v512_32( sha256dt_iv[7] );
|
||||
|
||||
// initialize padding for 2nd sha256
|
||||
// initialize padding for second hash
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block+9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 0x300 ); // bit count
|
||||
block[15] = v512_32( 0x300 ); // bit count
|
||||
|
||||
do
|
||||
{
|
||||
// finish second block with nonces
|
||||
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
|
||||
if ( unlikely( sha256_16way_transform_le_short(
|
||||
hash32, block, istate, ptarget ) ) )
|
||||
{
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
// if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
|
||||
{
|
||||
extr_lane_16x32( phash, hash32, lane, 256 );
|
||||
casti_m256i( phash, 0 ) =
|
||||
@@ -118,86 +202,9 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(SHA256DT_SHA)
|
||||
#endif
|
||||
|
||||
int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t block0[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block1[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_opt_transform_le( mstate, pdata, sha256dt_iv );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy( block0, pdata + 16, 16 );
|
||||
memcpy( block1, pdata + 16, 16 );
|
||||
block0[ 3] = n;
|
||||
block1[ 3] = n+1;
|
||||
block0[ 4] = block1[ 4] = 0x80000000;
|
||||
memset( block0 + 5, 0, 40 );
|
||||
memset( block1 + 5, 0, 40 );
|
||||
block0[15] = block1[15] = 0x480; // funky bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
|
||||
mstate, mstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy( block0, hash0, 32 );
|
||||
memcpy( block1, hash1, 32 );
|
||||
block0[ 8] = block1[ 8] = 0x80000000;
|
||||
memset( block0 + 9, 0, 24 );
|
||||
memset( block1 + 9, 0, 24 );
|
||||
block0[15] = block1[15] = 0x300; // bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
|
||||
sha256dt_iv, sha256dt_iv );
|
||||
|
||||
if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hash0, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash0, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hash0, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash0, mythr );
|
||||
}
|
||||
}
|
||||
if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hash1, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash1, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_solution( work, hash1, mythr );
|
||||
}
|
||||
}
|
||||
n += 2;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(SHA256DT_8WAY)
|
||||
#if defined(SHA256DT_8WAY)
|
||||
|
||||
int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
@@ -218,33 +225,33 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
const __m256i last_byte = v256_32( 0x80000000 );
|
||||
const __m256i eight = v256_32( 8 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm256_set1_epi32( pdata[i] );
|
||||
vdata[i] = v256_32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm256_set1_epi32( 0x480 );
|
||||
vdata[16+15] = v256_32( 0x480 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = _mm256_set1_epi32( 0x300 );
|
||||
block[15] = v256_32( 0x300 );
|
||||
|
||||
// initialize state
|
||||
istate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
|
||||
istate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
|
||||
istate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
|
||||
istate[3] = _mm256_set1_epi64x( 0xd338e869d338e869 );
|
||||
istate[4] = _mm256_set1_epi64x( 0xaa3ff126aa3ff126 );
|
||||
istate[5] = _mm256_set1_epi64x( 0x475bbf30475bbf30 );
|
||||
istate[6] = _mm256_set1_epi64x( 0x8fd52e5b8fd52e5b );
|
||||
istate[7] = _mm256_set1_epi64x( 0x9f75c9ad9f75c9ad );
|
||||
// initialize state for second hash
|
||||
istate[0] = v256_32( sha256dt_iv[0] );
|
||||
istate[1] = v256_32( sha256dt_iv[1] );
|
||||
istate[2] = v256_32( sha256dt_iv[2] );
|
||||
istate[3] = v256_32( sha256dt_iv[3] );
|
||||
istate[4] = v256_32( sha256dt_iv[4] );
|
||||
istate[5] = v256_32( sha256dt_iv[5] );
|
||||
istate[6] = v256_32( sha256dt_iv[6] );
|
||||
istate[7] = v256_32( sha256dt_iv[7] );
|
||||
|
||||
sha256_8way_transform_le( mstate1, vdata, istate );
|
||||
|
||||
@@ -253,11 +260,9 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
do
|
||||
{
|
||||
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
|
||||
mexp_pre );
|
||||
|
||||
if ( unlikely( sha256_8way_transform_le_short(
|
||||
hash32, block, istate, ptarget ) ) )
|
||||
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
|
||||
if ( unlikely( sha256_8way_transform_le_short( hash32, block,
|
||||
istate, ptarget ) ) )
|
||||
{
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
{
|
||||
@@ -279,7 +284,9 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(SHA256DT_4WAY)
|
||||
#endif
|
||||
|
||||
#if defined(SHA256DT_4WAY)
|
||||
|
||||
int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
@@ -300,31 +307,31 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
|
||||
const __m128i four = _mm_set1_epi32( 4 );
|
||||
const __m128i last_byte = v128_32( 0x80000000 );
|
||||
const __m128i four = v128_32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm_set1_epi32( pdata[i] );
|
||||
vdata[i] = v128_32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm_set1_epi32( 0x480 );
|
||||
vdata[16+15] = v128_32( 0x480 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = _mm_set1_epi32( 0x300 );
|
||||
block[15] = v128_32( 0x300 );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = _mm_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
|
||||
initstate[1] = _mm_set1_epi64x( 0xb72074d4b72074d4 );
|
||||
initstate[2] = _mm_set1_epi64x( 0x6bb011226bb01122 );
|
||||
initstate[3] = _mm_set1_epi64x( 0xd338e869d338e869 );
|
||||
initstate[4] = _mm_set1_epi64x( 0xaa3ff126aa3ff126 );
|
||||
initstate[5] = _mm_set1_epi64x( 0x475bbf30475bbf30 );
|
||||
initstate[6] = _mm_set1_epi64x( 0x8fd52e5b8fd52e5b );
|
||||
initstate[7] = _mm_set1_epi64x( 0x9f75c9ad9f75c9ad );
|
||||
initstate[0] = v128_32( sha256dt_iv[0] );
|
||||
initstate[1] = v128_32( sha256dt_iv[1] );
|
||||
initstate[2] = v128_32( sha256dt_iv[2] );
|
||||
initstate[3] = v128_32( sha256dt_iv[3] );
|
||||
initstate[4] = v128_32( sha256dt_iv[4] );
|
||||
initstate[5] = v128_32( sha256dt_iv[5] );
|
||||
initstate[6] = v128_32( sha256dt_iv[6] );
|
||||
initstate[7] = v128_32( sha256dt_iv[7] );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( midstate, vdata, initstate );
|
||||
@@ -334,21 +341,18 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_4way_transform_le( block, vdata+16, midstate );
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
|
||||
// if ( sha256_4way_transform_le_short( hash32, block, initstate, ptarget ) )
|
||||
// {
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
// }
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sha-hash-4way.h"
|
||||
#include "sha256-hash.h"
|
||||
|
||||
#if defined(SHA256T_16WAY)
|
||||
|
||||
|
@@ -4,7 +4,12 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sha256-hash.h"
|
||||
#include "sha-hash-4way.h"
|
||||
|
||||
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
|
||||
#if defined(SHA256T_16WAY)
|
||||
|
||||
@@ -19,65 +24,60 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
__m512i istate[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (32)));
|
||||
static const uint32_t IV[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
const __m512i last_byte = v512_32( 0x80000000 );
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
const __m512i sixteen = v512_32( 16 );
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
// prehash first block directly from pdata
|
||||
sha256_transform_le( phash, pdata, IV );
|
||||
sha256_transform_le( phash, pdata, sha256_iv );
|
||||
|
||||
// vectorize block 0 hash for second block
|
||||
mstate1[0] = _mm512_set1_epi32( phash[0] );
|
||||
mstate1[1] = _mm512_set1_epi32( phash[1] );
|
||||
mstate1[2] = _mm512_set1_epi32( phash[2] );
|
||||
mstate1[3] = _mm512_set1_epi32( phash[3] );
|
||||
mstate1[4] = _mm512_set1_epi32( phash[4] );
|
||||
mstate1[5] = _mm512_set1_epi32( phash[5] );
|
||||
mstate1[6] = _mm512_set1_epi32( phash[6] );
|
||||
mstate1[7] = _mm512_set1_epi32( phash[7] );
|
||||
mstate1[0] = v512_32( phash[0] );
|
||||
mstate1[1] = v512_32( phash[1] );
|
||||
mstate1[2] = v512_32( phash[2] );
|
||||
mstate1[3] = v512_32( phash[3] );
|
||||
mstate1[4] = v512_32( phash[4] );
|
||||
mstate1[5] = v512_32( phash[5] );
|
||||
mstate1[6] = v512_32( phash[6] );
|
||||
mstate1[7] = v512_32( phash[7] );
|
||||
|
||||
// second message block data, with nonce & padding
|
||||
buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
buf[0] = v512_32( pdata[16] );
|
||||
buf[1] = v512_32( pdata[17] );
|
||||
buf[2] = v512_32( pdata[18] );
|
||||
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
buf[4] = last_byte;
|
||||
memset_zero_512( buf+5, 10 );
|
||||
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
|
||||
buf[15] = v512_32( 80*8 ); // bit count
|
||||
|
||||
// partially pre-expand & prehash second message block, avoiding the nonces
|
||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||
|
||||
// vectorize IV for 2nd & 3rd sha256
|
||||
istate[0] = _mm512_set1_epi32( IV[0] );
|
||||
istate[1] = _mm512_set1_epi32( IV[1] );
|
||||
istate[2] = _mm512_set1_epi32( IV[2] );
|
||||
istate[3] = _mm512_set1_epi32( IV[3] );
|
||||
istate[4] = _mm512_set1_epi32( IV[4] );
|
||||
istate[5] = _mm512_set1_epi32( IV[5] );
|
||||
istate[6] = _mm512_set1_epi32( IV[6] );
|
||||
istate[7] = _mm512_set1_epi32( IV[7] );
|
||||
istate[0] = v512_32( sha256_iv[0] );
|
||||
istate[1] = v512_32( sha256_iv[1] );
|
||||
istate[2] = v512_32( sha256_iv[2] );
|
||||
istate[3] = v512_32( sha256_iv[3] );
|
||||
istate[4] = v512_32( sha256_iv[4] );
|
||||
istate[5] = v512_32( sha256_iv[5] );
|
||||
istate[6] = v512_32( sha256_iv[6] );
|
||||
istate[7] = v512_32( sha256_iv[7] );
|
||||
|
||||
// initialize padding for 2nd & 3rd sha256
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
|
||||
block[15] = v512_32( 32*8 ); // bit count
|
||||
|
||||
do
|
||||
{
|
||||
@@ -110,6 +110,97 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t block1a[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block1b[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block2a[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block2b[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hasha[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hashb[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstatea[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstateb[8] __attribute__ ((aligned (32)));
|
||||
uint32_t sstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
// hash first 64 byte block of data
|
||||
sha256_opt_transform_le( mstatea, pdata, sha256_iv );
|
||||
|
||||
// fill & pad second bock without nonce
|
||||
memcpy( block1a, pdata + 16, 12 );
|
||||
memcpy( block1b, pdata + 16, 12 );
|
||||
block1a[ 3] = 0;
|
||||
block1b[ 3] = 0;
|
||||
block1a[ 4] = block1b[ 4] = 0x80000000;
|
||||
memset( block1a + 5, 0, 40 );
|
||||
memset( block1b + 5, 0, 40 );
|
||||
block1a[15] = block1b[15] = 0x480; // funky bit count
|
||||
|
||||
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
|
||||
|
||||
// Pad third block
|
||||
block2a[ 8] = block2b[ 8] = 0x80000000;
|
||||
memset( block2a + 9, 0, 24 );
|
||||
memset( block2b + 9, 0, 24 );
|
||||
block2a[15] = block2b[15] = 80*8; // bit count
|
||||
|
||||
do
|
||||
{
|
||||
// Insert nonce for second block
|
||||
block1a[3] = n;
|
||||
block1b[3] = n+1;
|
||||
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
|
||||
mstateb, mstateb, sstate, sstate );
|
||||
sha256_ni2way_transform_le( block2a, block2b, block2a, block2b,
|
||||
sha256_iv, sha256_iv );
|
||||
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
|
||||
sha256_iv, sha256_iv );
|
||||
|
||||
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hasha, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hasha, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hasha, mythr );
|
||||
}
|
||||
}
|
||||
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hashb, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hashb, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_solution( work, hashb, mythr );
|
||||
}
|
||||
}
|
||||
n += 2;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256T_8WAY)
|
||||
|
||||
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
@@ -131,33 +222,33 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
const __m256i last_byte = v256_32( 0x80000000 );
|
||||
const __m256i eight = v256_32( 8 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm256_set1_epi32( pdata[i] );
|
||||
vdata[i] = v256_32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
|
||||
vdata[16+15] = v256_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
|
||||
block[15] = v256_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
istate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
istate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
istate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
istate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
istate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
|
||||
istate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
|
||||
istate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
istate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
istate[0] = v256_32( sha256_iv[0] );
|
||||
istate[1] = v256_32( sha256_iv[1] );
|
||||
istate[2] = v256_32( sha256_iv[2] );
|
||||
istate[3] = v256_32( sha256_iv[3] );
|
||||
istate[4] = v256_32( sha256_iv[4] );
|
||||
istate[5] = v256_32( sha256_iv[5] );
|
||||
istate[6] = v256_32( sha256_iv[6] );
|
||||
istate[7] = v256_32( sha256_iv[7] );
|
||||
|
||||
sha256_8way_transform_le( mstate1, vdata, istate );
|
||||
|
||||
@@ -222,31 +313,31 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
|
||||
const __m128i four = _mm_set1_epi32( 4 );
|
||||
const __m128i last_byte = v128_32( 0x80000000 );
|
||||
const __m128i four = v128_32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm_set1_epi32( pdata[i] );
|
||||
vdata[i] = v128_32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
|
||||
vdata[16+15] = v128_32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = _mm_set1_epi32( 32*8 ); // bit count
|
||||
block[15] = v128_32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
istate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
istate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
istate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
istate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
istate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
|
||||
istate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
|
||||
istate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
istate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
istate[0] = v128_32( sha256_iv[0] );
|
||||
istate[1] = v128_32( sha256_iv[1] );
|
||||
istate[2] = v128_32( sha256_iv[2] );
|
||||
istate[3] = v128_32( sha256_iv[3] );
|
||||
istate[4] = v128_32( sha256_iv[4] );
|
||||
istate[5] = v128_32( sha256_iv[5] );
|
||||
istate[6] = v128_32( sha256_iv[6] );
|
||||
istate[7] = v128_32( sha256_iv[7] );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( mstate, vdata, istate );
|
||||
|
@@ -5,9 +5,9 @@ bool register_sha256t_algo( algo_gate_t* gate )
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined(SHA256T_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256t_16way;
|
||||
#elif defined(__SHA__)
|
||||
#elif defined(SHA256T_SHA)
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256t;
|
||||
gate->scanhash = (void*)&scanhash_sha256t_sha;
|
||||
#elif defined(SHA256T_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256t_8way;
|
||||
#else
|
||||
@@ -22,7 +22,7 @@ bool register_sha256q_algo( algo_gate_t* gate )
|
||||
#if defined(SHA256T_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256q_16way;
|
||||
gate->hash = (void*)&sha256q_16way_hash;
|
||||
#elif defined(__SHA__)
|
||||
#elif defined(SHA256T_SHA)
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256q;
|
||||
gate->hash = (void*)&sha256q_hash;
|
||||
|
@@ -6,6 +6,8 @@
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SHA256T_16WAY 1
|
||||
#elif defined(__SHA__)
|
||||
#define SHA256T_SHA 1
|
||||
#elif defined(__AVX2__)
|
||||
#define SHA256T_8WAY 1
|
||||
#else
|
||||
@@ -42,9 +44,9 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
#if defined(__SHA__)
|
||||
#if defined(SHA256T_SHA)
|
||||
|
||||
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
@@ -1,102 +0,0 @@
|
||||
#include "sha256t-gate.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
//#include "algo/sha/sph_sha2.h"
|
||||
#include "sha256-hash.h"
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
// Only used on CPUs with SHA
|
||||
|
||||
|
||||
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t block0[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block1[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t initstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t midstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
__m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = 0x6A09E667;
|
||||
initstate[1] = 0xBB67AE85;
|
||||
initstate[2] = 0x3C6EF372;
|
||||
initstate[3] = 0xA54FF53A;
|
||||
initstate[4] = 0x510E527F;
|
||||
initstate[5] = 0x9B05688C;
|
||||
initstate[6] = 0x1F83D9AB;
|
||||
initstate[7] = 0x5BE0CD19;
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_opt_transform_le( midstate, pdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy( block0, pdata + 16, 16 );
|
||||
memcpy( block1, pdata + 16, 16 );
|
||||
block0[ 3] = n;
|
||||
block1[ 3] = n+1;
|
||||
block0[ 4] = block1[ 4] = 0x80000000;
|
||||
memset( block0 + 5, 0, 40 );
|
||||
memset( block1 + 5, 0, 40 );
|
||||
block0[15] = block1[15] = 80*8; // bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy( block0, hash0, 32 );
|
||||
memcpy( block1, hash1, 32 );
|
||||
block0[ 8] = block1[ 8] = 0x80000000;
|
||||
memset( block0 + 9, 0, 24 );
|
||||
memset( block1 + 9, 0, 24 );
|
||||
block0[15] = block1[15] = 32*8; // bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy( block0, hash0, 32 );
|
||||
memcpy( block1, hash1, 32 );
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
casti_m128i( hash0, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash0, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
|
||||
casti_m128i( hash1, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash1, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
|
||||
|
||||
if ( unlikely( valid_hash( hash0, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash0, mythr );
|
||||
}
|
||||
if ( unlikely( valid_hash( hash1, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_solution( work, hash1, mythr );
|
||||
}
|
||||
n += 2;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user