mirror of https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits: 1 commit
Commit 3d1b6c87dc
@@ -84,14 +84,10 @@ cpuminer_SOURCES = \
   algo/cubehash/cubehash_sse2.c\
   algo/cubehash/cube-hash-2way.c \
   algo/echo/sph_echo.c \
-  algo/echo/echo-hash-4way.c \
   algo/echo/aes_ni/hash.c\
   algo/gost/sph_gost.c \
-  algo/groestl/groestl-gate.c \
-  algo/groestl/groestl512-hash-4way.c \
   algo/groestl/sph_groestl.c \
   algo/groestl/groestl.c \
-  algo/groestl/groestl-4way.c \
   algo/groestl/myrgr-gate.c \
   algo/groestl/myrgr-4way.c \
   algo/groestl/myr-groestl.c \
@@ -129,7 +125,6 @@ cpuminer_SOURCES = \
   algo/lyra2/lyra2.c \
   algo/lyra2/sponge.c \
   algo/lyra2/sponge-2way.c \
   algo/lyra2/lyra2-hash-2way.c \
   algo/lyra2/lyra2-gate.c \
   algo/lyra2/lyra2rev2.c \
   algo/lyra2/lyra2rev2-4way.c \
@@ -191,7 +186,6 @@ cpuminer_SOURCES = \
   algo/shavite/sph_shavite.c \
   algo/shavite/sph-shavite-aesni.c \
   algo/shavite/shavite-hash-2way.c \
   algo/shavite/shavite-hash-4way.c \
   algo/shavite/shavite.c \
   algo/simd/sph_simd.c \
   algo/simd/nist.c \
@@ -126,11 +126,11 @@ Supported Algorithms
 x16rv2        Ravencoin (RVN)
 x16rt         Gincoin (GIN)
 x16rt-veil    Veil (VEIL)
-x16s          Pigeoncoin (PGN)
+x16s
 x17
-x21s
-x22i
-x25x
+x21s          Pigeoncoin (PGN)
+x22i
+x25x          Sinovative (SIN)
 xevan         Bitsend (BSD)
 yescrypt      Globalboost-Y (BSTY)
 yescryptr8    BitZeny (ZNY)
@@ -1,17 +1,13 @@
 cpuminer-opt is a console program run from the command line using the
 keyboard, not the mouse.

 See also README.md for list of supported algorithms,

 Security warning
 ----------------

 Miner programs are often flagged as malware by antivirus programs. This is
-usually a false positive, they are flagged simply because they are
-cryptocurrency miners. However, some malware has been spread using the
-cover that miners are known to be subject to false positives. Always be on
-alert. The source code of cpuminer-opt is open for anyone to inspect.
-If you don't trust the software don't download it.
+a false positive, they are flagged simply because they are cryptocurrency
+miners. The source code is open for anyone to inspect. If you don't trust
+the software, don't use it.

 The cryptographic hashing code has been taken from trusted sources but has been
 modified for speed at the expense of accepted security practices. This

@@ -21,7 +17,7 @@ required.
 Compile Instructions
 --------------------

-See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
+See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions

 Requirements
 ------------

@@ -35,38 +31,6 @@ not supported. FreeBSD YMMV.
 Change Log
 ----------

 v3.11.0

 Fixed x25x AVX512 lane 4 invalid shares.

 AVX512 for hex, phi2.

 VAES optimization for Intel Icelake CPUs for most algos recently optimized
 with AVX512, source code only.

 v3.10.7

 AVX512 for x25x, lbry, x13bcd (bcd).

 v3.10.6

 Added support for SSL stratum: stratum+tcps://

 Added job id reporting again, but leaner, suppressed with --quiet.

 AVX512 for x21s, x22i, lyra2z, allium.

 Fixed share overflow warnings mining lbry with Ryzen (SHA).

 v3.10.5

 AVX512 for x17, sonoa, xevan, hmq1725, lyra2rev3, lyra2rev2.
 Faster hmq1725 AVX2.

 v3.10.4

 AVX512 for x16r, x16rv2, x16rt, x16s, x16rt-veil (veil).

 v3.10.3

 AVX512 for x12, x13, x14, x15.
@@ -317,7 +317,6 @@ const char* const algo_alias_map[][2] =
 { "argon2d-crds",  "argon2d250"   },
 { "argon2d-dyn",   "argon2d500"   },
 { "argon2d-uis",   "argon2d4096"  },
-{ "bcd",           "x13bcd"       },
 { "bitcore",       "timetravel10" },
 { "bitzeny",       "yescryptr8"   },
 { "blake256r8",    "blakecoin"    },
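For context, entries in this table map user-facing names onto canonical algorithm names. A minimal sketch of how such a table can be resolved (illustrative only; the actual lookup logic in algo-gate-api.c may differ):

#include <string.h>
#include <stddef.h>

// Hypothetical helper: walk an alias table shaped like algo_alias_map and
// return the canonical name, or the input unchanged if it is not an alias.
static const char* resolve_algo_alias( const char *name,
                                       const char* const map[][2], size_t n )
{
   for ( size_t i = 0; i < n; i++ )
      if ( strcmp( name, map[i][0] ) == 0 )
         return map[i][1];   // e.g. "bitzeny" -> "yescryptr8"
   return name;              // not an alias, use as given
}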
@@ -104,7 +104,7 @@ typedef struct {
 typedef blake_8way_small_context blake256_8way_context;
 void blake256_8way_init(void *cc);
 void blake256_8way_update(void *cc, const void *data, size_t len);
-//#define blake256_8way blake256_8way_update
+#define blake256_8way blake256_8way_update
 void blake256_8way_close(void *cc, void *dst);

 // 14 rounds, blake, decred
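The toggled define above is a compatibility alias: with it enabled, legacy call sites that use the short name compile against the renamed _update function. A minimal sketch, assuming the declarations shown in the hunk (the caller below is hypothetical):

void legacy_caller( blake256_8way_context *cc, const void *data, size_t len )
{
   // expands to blake256_8way_update( cc, data, len )
   blake256_8way( cc, data, len );
}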
@@ -842,8 +842,7 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
 }

 static void
-blake32_4way( blake_4way_small_context *ctx, const void *data,
-              size_t len )
+blake32_4way( blake_4way_small_context *ctx, const void *data, size_t len )
 {
    __m128i *buf = (__m128i*)ctx->buf;
    size_t bptr = ctx->ptr<<2;
@@ -1238,7 +1237,7 @@ blake256_4way_init(void *ctx)
 }

 void
-blake256_4way_update(void *ctx, const void *data, size_t len)
+blake256_4way(void *ctx, const void *data, size_t len)
 {
    blake32_4way(ctx, data, len);
 }
@@ -463,38 +463,6 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
   return 0;
}

// Update and final when inlen is a multiple of 64 bytes
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
                              const void *input, uint64_t inlen )
{
   __m256i *in = (__m256i*)input;
   __m256i *buf = (__m256i*)S->buf;

   while( inlen > BLAKE2S_BLOCKBYTES )
   {
      memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
      S->buflen = BLAKE2S_BLOCKBYTES;
      inlen -= BLAKE2S_BLOCKBYTES;
      S->t[0] += BLAKE2S_BLOCKBYTES;
      S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
      blake2s_8way_compress( S, buf );
      S->buflen = 0;
      in += ( BLAKE2S_BLOCKBYTES >> 2 );
   }

   // last block
   memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
   S->buflen = BLAKE2S_BLOCKBYTES;
   S->t[0] += S->buflen;
   S->t[1] += ( S->t[0] < S->buflen );
   if ( S->last_node ) S->f[1] = ~0U;
   S->f[0] = ~0U;
   blake2s_8way_compress( S, buf );

   for ( int i = 0; i < 8; ++i )
      casti_m256i( out, i ) = S->h[ i ];
   return 0;
}

#endif // __AVX2__
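For reference, the removed helper fused the update and final steps for block-aligned input. Under that assumption it is behaviourally equivalent to the generic two-call sequence declared in blake2s-hash-4way.h; a minimal sketch (the wrapper name and the 32-byte output length are illustrative, not from the source):

static inline int blake2s_8way_hash_aligned( blake2s_8way_state *S, void *out,
                                             const void *in, uint64_t inlen )
{
   // inlen assumed to be a multiple of BLAKE2S_BLOCKBYTES (64 bytes)
   blake2s_8way_update( S, in, inlen );      // buffer and compress full blocks
   return blake2s_8way_final( S, out, 32 );  // pad, set final flags, emit digest
}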
@@ -14,6 +14,7 @@
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1

//#if defined(__SSE4_2__)
#if defined(__SSE2__)

#include "simd-utils.h"
@@ -94,8 +95,8 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
 int blake2s_8way_update( blake2s_8way_state *S, const void *in,
                          uint64_t inlen );
 int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
-int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
-                              const void *input, uint64_t inlen );
+//int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
+//                              const void *input, uint64_t inlen );

 #endif
@@ -131,6 +132,6 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
}
#endif

#endif // __SSE2__
#endif // __SSE4_2__

#endif
@@ -874,57 +874,6 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
         mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
         mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

#define DH1L( m, sl, sr, a, b, c ) \
   _mm256_add_epi32( \
      _mm256_xor_si256( M[m], \
         _mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
                           _mm256_srli_epi32( qt[a], sr ) ) ), \
      _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )

#define DH1R( m, sl, sr, a, b, c ) \
   _mm256_add_epi32( \
      _mm256_xor_si256( M[m], \
         _mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \
                           _mm256_slli_epi32( qt[a], sr ) ) ), \
      _mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )

#define DH2L( m, rl, sl, h, a, b, c ) \
   _mm256_add_epi32( _mm256_add_epi32( \
      mm256_rol_32( dH[h], rl ), \
      _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
      _mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
                        _mm256_xor_si256( qt[b], qt[c] ) ) );

#define DH2R( m, rl, sr, h, a, b, c ) \
   _mm256_add_epi32( _mm256_add_epi32( \
      mm256_rol_32( dH[h], rl ), \
      _mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
      _mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
                        _mm256_xor_si256( qt[b], qt[c] ) ) );

   dH[ 0] = DH1L(  0,  5,  5, 16, 24,  0 );
   dH[ 1] = DH1R(  1,  7,  8, 17, 25,  1 );
   dH[ 2] = DH1R(  2,  5,  5, 18, 26,  2 );
   dH[ 3] = DH1R(  3,  1,  5, 19, 27,  3 );
   dH[ 4] = DH1R(  4,  3,  0, 20, 28,  4 );
   dH[ 5] = DH1L(  5,  6,  6, 21, 29,  5 );
   dH[ 6] = DH1R(  6,  4,  6, 22, 30,  6 );
   dH[ 7] = DH1R(  7, 11,  2, 23, 31,  7 );
   dH[ 8] = DH2L(  8,  9,  8,  4, 24, 23,  8 );
   dH[ 9] = DH2R(  9, 10,  6,  5, 25, 16,  9 );
   dH[10] = DH2L( 10, 11,  6,  6, 26, 17, 10 );
   dH[11] = DH2L( 11, 12,  4,  7, 27, 18, 11 );
   dH[12] = DH2R( 12, 13,  3,  0, 28, 19, 12 );
   dH[13] = DH2R( 13, 14,  4,  1, 29, 20, 13 );
   dH[14] = DH2R( 14, 15,  7,  2, 30, 21, 14 );
   dH[15] = DH2R( 15, 16,  2,  3, 31, 22, 15 );

#undef DH1L
#undef DH1R
#undef DH2L
#undef DH2R

/*
   dH[ 0] = _mm256_add_epi32(
      _mm256_xor_si256( M[0],
         _mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
@@ -1005,7 +954,6 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
      _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
      _mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
         _mm256_xor_si256( qt[22], qt[15] ) ) );
*/
}

static const __m256i final_s8[16] =
@@ -41,6 +41,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,

 for ( int lane = 0; lane < 8; lane++ )
 if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
+// if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
 {
    extr_lane_8x64( lane_hash, hash, lane, 256 );
    if ( fulltest( lane_hash, ptarget ) )
@@ -65,7 +66,7 @@ void bmw512hash_4way(void *state, const void *input)
 {
    bmw512_4way_context ctx;
    bmw512_4way_init( &ctx );
-   bmw512_4way_update( &ctx, input, 80 );
+   bmw512_4way( &ctx, input, 80 );
    bmw512_4way_close( &ctx, state );
 }
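The 8-way scanhash hunk above uses a two-stage share check: a cheap comparison of word 7 of each interleaved lane against the target word, then the exact 256-bit fulltest() only for lanes that pass. A condensed restatement of that shape, assuming the helpers shown in the hunk (extr_lane_8x64, fulltest, submit_lane_solution) and the surrounding scanhash locals:

uint32_t lane_hash[8] __attribute__ ((aligned (32)));
for ( int lane = 0; lane < 8; lane++ )
   if ( hash7[ lane<<1 ] < Htarg )                   // cheap screen on word 7 only
   {
      extr_lane_8x64( lane_hash, hash, lane, 256 );  // deinterleave one candidate
      if ( fulltest( lane_hash, ptarget ) )          // exact target comparison
         submit_lane_solution( work, lane_hash, mythr, lane );
   }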
@@ -21,27 +21,7 @@ static void transform( cubehashParam *sp )
 int r;
 const int rounds = sp->rounds;

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-
-   register __m512i x0, x1;
-
-   x0 = _mm512_load_si512( (__m512i*)sp->x );
-   x1 = _mm512_load_si512( (__m512i*)sp->x + 1 );
-
-   for ( r = 0; r < rounds; ++r )
-   {
-      x1 = _mm512_add_epi32( x0, x1 );
-      x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 );
-      x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) );
-      x0 = _mm512_xor_si512( mm512_rol_32(
-                             mm512_swap256_128( x0 ), 11 ), x1 );
-      x1 = mm512_swap64_32( x1 );
-   }
-
-   _mm512_store_si512( (__m512i*)sp->x, x0 );
-   _mm512_store_si512( (__m512i*)sp->x + 1, x1 );
-
-#elif defined(__AVX2__)
+#ifdef __AVX2__

   register __m256i x0, x1, x2, x3, y0, y1;
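The removed AVX512 branch above packs the whole CubeHash round into two 512-bit registers; each rotate/swap pair corresponds to one of the word permutations of the published round function. For orientation, a plain-C reference sketch of one round per the CubeHash specification (not part of this source):

#include <stdint.h>

#define ROTL32(x,n) ( ((x) << (n)) | ((x) >> (32 - (n))) )

static void cubehash_round_ref( uint32_t x[32] )
{
   uint32_t t;
   int i, j;
   for ( i = 0; i < 16; i++ ) x[i+16] += x[i];              // add
   for ( i = 0; i < 16; i++ ) x[i] = ROTL32( x[i], 7 );     // rotate left 7
   for ( i = 0; i <  8; i++ )                               // swap 8-word halves
   { t = x[i]; x[i] = x[i+8]; x[i+8] = t; }
   for ( i = 0; i < 16; i++ ) x[i] ^= x[i+16];              // xor
   for ( i = 16; i < 32; i += 4 )                           // swap word pairs
   {
      t = x[i];   x[i]   = x[i+2]; x[i+2] = t;
      t = x[i+1]; x[i+1] = x[i+3]; x[i+3] = t;
   }
   for ( i = 0; i < 16; i++ ) x[i+16] += x[i];              // add
   for ( i = 0; i < 16; i++ ) x[i] = ROTL32( x[i], 11 );    // rotate left 11
   for ( i = 0; i < 16; i += 8 )                            // swap 4-word groups
      for ( j = i; j < i + 4; j++ )
      { t = x[j]; x[j] = x[j+4]; x[j+4] = t; }
   for ( i = 0; i < 16; i++ ) x[i] ^= x[i+16];              // xor
   for ( i = 16; i < 32; i += 2 )                           // swap adjacent words
   { t = x[i]; x[i] = x[i+1]; x[i+1] = t; }
}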
@@ -186,7 +186,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
 {
    for(i = 0; i < 4; i++)
    {
-      _state[i][j] = _mm_load_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
+      _state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
    }
 }
@@ -390,13 +390,13 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
 }

 // Store the hash value
-_mm_store_si128((__m128i*)hashval + 0, state->state[0][0]);
-_mm_store_si128((__m128i*)hashval + 1, state->state[1][0]);
+_mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]);
+_mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]);

 if(state->uHashSize == 512)
 {
-   _mm_store_si128((__m128i*)hashval + 2, state->state[2][0]);
-   _mm_store_si128((__m128i*)hashval + 3, state->state[3][0]);
+   _mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]);
+   _mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]);
 }

 return SUCCESS;
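The store-to-storeu changes above matter because _mm_store_si128 requires a 16-byte-aligned destination, while _mm_storeu_si128 accepts any address; the caller-provided hashval buffer is not guaranteed to be aligned. An illustrative fragment (names are hypothetical):

#include <immintrin.h>
#include <stdint.h>

// Aligned stores fault (or are undefined) on unaligned destinations, so the
// unaligned form is the safe default for caller-provided output buffers.
static void store_hash_word( unsigned char *hashval, __m128i h )
{
   if ( ( (uintptr_t)hashval & 15 ) == 0 )
      _mm_store_si128( (__m128i*)hashval, h );    // OK only if 16-byte aligned
   else
      _mm_storeu_si128( (__m128i*)hashval, h );   // safe at any alignment
}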
@@ -513,13 +513,13 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
 }

 // Store the hash value
-_mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] );
-_mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] );
+_mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] );
+_mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] );

 if( state->uHashSize == 512 )
 {
-   _mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] );
-   _mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] );
+   _mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] );
+   _mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] );

 }
 return SUCCESS;
@@ -1,620 +0,0 @@
/*
 * file    : echo_vperm.c
 * version : 1.0.208
 * date    : 14.12.2010
 *
 * - vperm and aes_ni implementations of hash function ECHO
 * - implements NIST hash api
 * - assumes that message length is multiple of 8-bits
 * - _ECHO_VPERM_ must be defined if compiling with ../main.c
 * - define NO_AES_NI for aes_ni version
 *
 * Cagdas Calik
 * ccalik@metu.edu.tr
 * Institute of Applied Mathematics, Middle East Technical University, Turkey.
 *
 */
#if defined(__AES__)

#include <memory.h>
#include "miner.h"
#include "hash_api.h"
//#include "vperm.h"
#include <immintrin.h>
/*
#ifndef NO_AES_NI
#include <wmmintrin.h>
#else
#include <tmmintrin.h>
#endif
*/

MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
MYALIGN const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1};
MYALIGN const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C};
MYALIGN const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1};
MYALIGN const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8};
MYALIGN const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09};
MYALIGN const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79};
MYALIGN const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
MYALIGN const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
MYALIGN const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
MYALIGN const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
MYALIGN const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
MYALIGN const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
MYALIGN const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
MYALIGN const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
MYALIGN const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
MYALIGN const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
MYALIGN const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
MYALIGN const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
MYALIGN const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};


MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101};
MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};


#define ECHO_SUBBYTES(state, i, j) \
   state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
   state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\
   k1 = _mm_add_epi32(k1, M128(const1))

#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
   s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
   t1 = _mm_srli_epi16(state1[0][j], 7);\
   t1 = _mm_and_si128(t1, M128(lsbmask));\
   t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
   s2 = _mm_xor_si128(s2, t2);\
   state2[0][j] = s2;\
   state2[1][j] = state1[0][j];\
   state2[2][j] = state1[0][j];\
   state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\
   s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
   t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\
   t1 = _mm_and_si128(t1, M128(lsbmask));\
   t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
   s2 = _mm_xor_si128(s2, t2);\
   state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
   state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
   state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
   state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
   s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
   t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\
   t1 = _mm_and_si128(t1, M128(lsbmask));\
   t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
   s2 = _mm_xor_si128(s2, t2);\
   state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
   state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
   state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
   state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
   s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
   t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\
   t1 = _mm_and_si128(t1, M128(lsbmask));\
   t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
   s2 = _mm_xor_si128(s2, t2);\
   state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
   state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
   state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
   state2[3][j] = _mm_xor_si128(state2[3][j], s2)


#define ECHO_ROUND_UNROLL2 \
   ECHO_SUBBYTES(_state, 0, 0);\
   ECHO_SUBBYTES(_state, 1, 0);\
   ECHO_SUBBYTES(_state, 2, 0);\
   ECHO_SUBBYTES(_state, 3, 0);\
   ECHO_SUBBYTES(_state, 0, 1);\
   ECHO_SUBBYTES(_state, 1, 1);\
   ECHO_SUBBYTES(_state, 2, 1);\
   ECHO_SUBBYTES(_state, 3, 1);\
   ECHO_SUBBYTES(_state, 0, 2);\
   ECHO_SUBBYTES(_state, 1, 2);\
   ECHO_SUBBYTES(_state, 2, 2);\
   ECHO_SUBBYTES(_state, 3, 2);\
   ECHO_SUBBYTES(_state, 0, 3);\
   ECHO_SUBBYTES(_state, 1, 3);\
   ECHO_SUBBYTES(_state, 2, 3);\
   ECHO_SUBBYTES(_state, 3, 3);\
   ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
   ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
   ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
   ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
   ECHO_SUBBYTES(_state2, 0, 0);\
   ECHO_SUBBYTES(_state2, 1, 0);\
   ECHO_SUBBYTES(_state2, 2, 0);\
   ECHO_SUBBYTES(_state2, 3, 0);\
   ECHO_SUBBYTES(_state2, 0, 1);\
   ECHO_SUBBYTES(_state2, 1, 1);\
   ECHO_SUBBYTES(_state2, 2, 1);\
   ECHO_SUBBYTES(_state2, 3, 1);\
   ECHO_SUBBYTES(_state2, 0, 2);\
   ECHO_SUBBYTES(_state2, 1, 2);\
   ECHO_SUBBYTES(_state2, 2, 2);\
   ECHO_SUBBYTES(_state2, 3, 2);\
   ECHO_SUBBYTES(_state2, 0, 3);\
   ECHO_SUBBYTES(_state2, 1, 3);\
   ECHO_SUBBYTES(_state2, 2, 3);\
   ECHO_SUBBYTES(_state2, 3, 3);\
   ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)


#define SAVESTATE(dst, src)\
   dst[0][0] = src[0][0];\
   dst[0][1] = src[0][1];\
   dst[0][2] = src[0][2];\
   dst[0][3] = src[0][3];\
   dst[1][0] = src[1][0];\
   dst[1][1] = src[1][1];\
   dst[1][2] = src[1][2];\
   dst[1][3] = src[1][3];\
   dst[2][0] = src[2][0];\
   dst[2][1] = src[2][1];\
   dst[2][2] = src[2][2];\
   dst[2][3] = src[2][3];\
   dst[3][0] = src[3][0];\
   dst[3][1] = src[3][1];\
   dst[3][2] = src[3][2];\
   dst[3][3] = src[3][3]


void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
{
   unsigned int r, b, i, j;
   __m128i t1, t2, s2, k1;
   __m128i _state[4][4], _state2[4][4], _statebackup[4][4];

   for(i = 0; i < 4; i++)
      for(j = 0; j < ctx->uHashSize / 256; j++)
         _state[i][j] = ctx->state[i][j];

   for(b = 0; b < uBlockCount; b++)
   {
      ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);

      // load message
      for(j = ctx->uHashSize / 256; j < 4; j++)
      {
         for(i = 0; i < 4; i++)
         {
            _state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
         }
      }

      uint64_t *b = (uint64_t*)_state;
      //printf("Ss3: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);

      // save state
      SAVESTATE(_statebackup, _state);

      k1 = ctx->k;

      for(r = 0; r < ctx->uRounds / 2; r++)
      {
         ECHO_ROUND_UNROLL2;
      }

      //printf("Ss4: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);

      if(ctx->uHashSize == 256)
      {
         for(i = 0; i < 4; i++)
         {
            _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
            _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
            _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
            _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
            _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
            _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
            _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
         }
      }
      else
      {
         for(i = 0; i < 4; i++)
         {
            _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
            _state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
            _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
            _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
            _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
            _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
         }
      }
      pmsg += ctx->uBlockLength;
   }
   SAVESTATE(ctx->state, _state);

}


HashReturn init_echo(hashState_echo *ctx, int nHashSize)
{
   int i, j;

   ctx->k = _mm_setzero_si128();
   ctx->processed_bits = 0;
   ctx->uBufferBytes = 0;

   switch(nHashSize)
   {
      case 256:
         ctx->uHashSize = 256;
         ctx->uBlockLength = 192;
         ctx->uRounds = 8;
         ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000100);
         ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000600);
         break;

      case 512:
         ctx->uHashSize = 512;
         ctx->uBlockLength = 128;
         ctx->uRounds = 10;
         ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000200);
         ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000400);
         break;

      default:
         return BAD_HASHBITLEN;
   }

   for(i = 0; i < 4; i++)
      for(j = 0; j < nHashSize / 256; j++)
         ctx->state[i][j] = ctx->hashsize;

   for(i = 0; i < 4; i++)
      for(j = nHashSize / 256; j < 4; j++)
         ctx->state[i][j] = _mm_set_epi32(0, 0, 0, 0);

   return SUCCESS;
}

HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
{
   unsigned int uByteLength, uBlockCount, uRemainingBytes;

   uByteLength = (unsigned int)(databitlen / 8);

   if((state->uBufferBytes + uByteLength) >= state->uBlockLength)
   {
      if(state->uBufferBytes != 0)
      {
         // Fill the buffer
         memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);

         // Process buffer
         Compress(state, state->buffer, 1);
         state->processed_bits += state->uBlockLength * 8;

         data += state->uBlockLength - state->uBufferBytes;
         uByteLength -= state->uBlockLength - state->uBufferBytes;
      }

      // buffer now does not contain any unprocessed bytes

      uBlockCount = uByteLength / state->uBlockLength;
      uRemainingBytes = uByteLength % state->uBlockLength;

      if(uBlockCount > 0)
      {
         Compress(state, data, uBlockCount);

         state->processed_bits += uBlockCount * state->uBlockLength * 8;
         data += uBlockCount * state->uBlockLength;
      }

      if(uRemainingBytes > 0)
      {
         memcpy(state->buffer, (void*)data, uRemainingBytes);
      }

      state->uBufferBytes = uRemainingBytes;
   }
   else
   {
      memcpy(state->buffer + state->uBufferBytes, (void*)data, uByteLength);
      state->uBufferBytes += uByteLength;
   }

   return SUCCESS;
}

HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
{
   __m128i remainingbits;

   // Add remaining bytes in the buffer
   state->processed_bits += state->uBufferBytes * 8;

   remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8);

   // Pad with 0x80
   state->buffer[state->uBufferBytes++] = 0x80;

   // Enough buffer space for padding in this block?
   if((state->uBlockLength - state->uBufferBytes) >= 18)
   {
      // Pad with zeros
      memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18));

      // Hash size
      *((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;

      // Processed bits
      *((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
      *((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;

      // Last block contains message bits?
      if(state->uBufferBytes == 1)
      {
         state->k = _mm_xor_si128(state->k, state->k);
         state->k = _mm_sub_epi64(state->k, state->const1536);
      }
      else
      {
         state->k = _mm_add_epi64(state->k, remainingbits);
         state->k = _mm_sub_epi64(state->k, state->const1536);
      }

      // Compress
      Compress(state, state->buffer, 1);
   }
   else
   {
      // Fill with zero and compress
      memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes);
      state->k = _mm_add_epi64(state->k, remainingbits);
      state->k = _mm_sub_epi64(state->k, state->const1536);
      Compress(state, state->buffer, 1);

      // Last block
      memset(state->buffer, 0, state->uBlockLength - 18);

      // Hash size
      *((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;

      // Processed bits
      *((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
      *((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;

      // Compress the last block
      state->k = _mm_xor_si128(state->k, state->k);
      state->k = _mm_sub_epi64(state->k, state->const1536);
      Compress(state, state->buffer, 1);
   }

   // Store the hash value
   _mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]);
   _mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]);

   if(state->uHashSize == 512)
   {
      _mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]);
      _mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]);
   }

   return SUCCESS;
}

HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
                              const BitSequence *data, DataLength databitlen )
{
   unsigned int uByteLength, uBlockCount, uRemainingBytes;

   uByteLength = (unsigned int)(databitlen / 8);

/*
   if( (state->uBufferBytes + uByteLength) >= state->uBlockLength )
   {
      printf("full block\n");
      if( state->uBufferBytes != 0 )
      {
         // Fill the buffer
         memcpy( state->buffer + state->uBufferBytes,
                 (void*)data, state->uBlockLength - state->uBufferBytes );

         // Process buffer
         Compress( state, state->buffer, 1 );
         state->processed_bits += state->uBlockLength * 8;

         data += state->uBlockLength - state->uBufferBytes;
         uByteLength -= state->uBlockLength - state->uBufferBytes;
      }

      // buffer now does not contain any unprocessed bytes

      uBlockCount = uByteLength / state->uBlockLength;
      uRemainingBytes = uByteLength % state->uBlockLength;

      if( uBlockCount > 0 )
      {
         Compress( state, data, uBlockCount );
         state->processed_bits += uBlockCount * state->uBlockLength * 8;
         data += uBlockCount * state->uBlockLength;
      }

      if( uRemainingBytes > 0 )
         memcpy(state->buffer, (void*)data, uRemainingBytes);

      state->uBufferBytes = uRemainingBytes;
   }
   else
   {
*/
   memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );
   state->uBufferBytes += uByteLength;
// }

   __m128i remainingbits;

   // Add remaining bytes in the buffer
   state->processed_bits += state->uBufferBytes * 8;

   remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );

   // Pad with 0x80
   state->buffer[state->uBufferBytes++] = 0x80;

   // Enough buffer space for padding in this block?

// if( (state->uBlockLength - state->uBufferBytes) >= 18 )
// {
   // Pad with zeros

   memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) );

   // Hash size
   *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize;

   // Processed bits
   *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
                 state->processed_bits;
   *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;


   // Last block contains message bits?
   if( state->uBufferBytes == 1 )
   {
      state->k = _mm_xor_si128( state->k, state->k );
      state->k = _mm_sub_epi64( state->k, state->const1536 );
   }
   else
   {
      state->k = _mm_add_epi64( state->k, remainingbits );
      state->k = _mm_sub_epi64( state->k, state->const1536 );
   }

   uint64_t *b = (uint64_t*)&state->k;
/*
   printf("Sk: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
   b = (uint64_t*)state->buffer;
   printf("Sb: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
   printf("Sb: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]);
   printf("Sb: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]);
   printf("Sb: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]);

   b = (uint64_t*)state->state;
   printf("Ss1: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
   printf("Ss1: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]);
   printf("Ss1: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]);
   printf("Ss1: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]);
*/
   // Compress
   Compress( state, state->buffer, 1 );

   //printf("Ss2: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);

/*
   }
   else
   {
      // Fill with zero and compress
      memset( state->buffer + state->uBufferBytes, 0,
              state->uBlockLength - state->uBufferBytes );
      state->k = _mm_add_epi64( state->k, remainingbits );
      state->k = _mm_sub_epi64( state->k, state->const1536 );
      Compress( state, state->buffer, 1 );

      // Last block
      memset( state->buffer, 0, state->uBlockLength - 18 );

      // Hash size
      *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
                    state->uHashSize;

      // Processed bits
      *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
                    state->processed_bits;
      *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
      // Compress the last block
      state->k = _mm_xor_si128( state->k, state->k );
      state->k = _mm_sub_epi64( state->k, state->const1536 );
      Compress( state, state->buffer, 1) ;
   }
*/

   // Store the hash value
   _mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] );
   _mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] );

   if( state->uHashSize == 512 )
   {
      _mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] );
      _mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] );

   }
   return SUCCESS;
}


HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
{
   HashReturn hRet;
   hashState_echo hs;

/////
/*
   __m128i a, b, c, d, t[4], u[4], v[4];

   a = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);
   b = _mm_set_epi32(0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110);
   c = _mm_set_epi32(0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120);
   d = _mm_set_epi32(0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130);

   t[0] = _mm_unpacklo_epi8(a, b);
   t[1] = _mm_unpackhi_epi8(a, b);
   t[2] = _mm_unpacklo_epi8(c, d);
   t[3] = _mm_unpackhi_epi8(c, d);

   u[0] = _mm_unpacklo_epi16(t[0], t[2]);
   u[1] = _mm_unpackhi_epi16(t[0], t[2]);
   u[2] = _mm_unpacklo_epi16(t[1], t[3]);
   u[3] = _mm_unpackhi_epi16(t[1], t[3]);


   t[0] = _mm_unpacklo_epi16(u[0], u[1]);
   t[1] = _mm_unpackhi_epi16(u[0], u[1]);
   t[2] = _mm_unpacklo_epi16(u[2], u[3]);
   t[3] = _mm_unpackhi_epi16(u[2], u[3]);

   u[0] = _mm_unpacklo_epi8(t[0], t[1]);
   u[1] = _mm_unpackhi_epi8(t[0], t[1]);
   u[2] = _mm_unpacklo_epi8(t[2], t[3]);
   u[3] = _mm_unpackhi_epi8(t[2], t[3]);

   a = _mm_unpacklo_epi8(u[0], u[1]);
   b = _mm_unpackhi_epi8(u[0], u[1]);
   c = _mm_unpacklo_epi8(u[2], u[3]);
   d = _mm_unpackhi_epi8(u[2], u[3]);
*/
/////

   hRet = init_echo(&hs, hashbitlen);
   if(hRet != SUCCESS)
      return hRet;

   hRet = update_echo(&hs, data, databitlen);
   if(hRet != SUCCESS)
      return hRet;

   hRet = final_echo(&hs, hashval);
   if(hRet != SUCCESS)
      return hRet;

   return SUCCESS;
}

#endif
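The 18-byte tail written by the padding logic above encodes, at the end of the final block, a 2-byte hash size followed by a 16-byte processed-bits counter. A worked layout for the 512-bit variant (uBlockLength = 128), matching the offsets the code uses:

// buffer[ uBufferBytes ]        = 0x80           padding marker
// buffer[ ...zero fill... ]     = 0x00           up to offset 128 - 18
// buffer[ 110 .. 111 ]          = uHashSize      unsigned short, 2 bytes
// buffer[ 112 .. 119 ]          = processed_bits low half of 128-bit counter
// buffer[ 120 .. 127 ]          = 0              high half of 128-bit counter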
@@ -1,317 +0,0 @@
//#if 0
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

#include "simd-utils.h"
#include "echo-hash-4way.h"

/*
static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
{
   0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57,
   0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234
};
*/
// do these need to be reversed?

#define mul2mask \
   _mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
// _mm512_set4_epi32( 0x00001b00, 0, 0, 0 )

#define lsbmask m512_const1_32( 0x01010101 )

#define ECHO_SUBBYTES( state, i, j ) \
   state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
   state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); \
   k1 = _mm512_add_epi32( k1, m512_one_128 );

#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \
{ \
   const int j1 = ( (j)+1 ) & 3; \
   const int j2 = ( (j)+2 ) & 3; \
   const int j3 = ( (j)+3 ) & 3; \
   s2 = _mm512_add_epi8( state1[ 0 ][ j ], state1[ 0 ][ j ] ); \
   t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \
   t1 = _mm512_and_si512( t1, lsbmask );\
   t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
   s2 = _mm512_xor_si512( s2, t2 ); \
   state2[ 0 ][ j ] = s2; \
   state2[ 1 ][ j ] = state1[ 0 ][ j ]; \
   state2[ 2 ][ j ] = state1[ 0 ][ j ]; \
   state2[ 3 ][ j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] );\
   s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \
   t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \
   t1 = _mm512_and_si512( t1, lsbmask ); \
   t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
   s2 = _mm512_xor_si512( s2, t2 );\
   state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
                                 _mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
   state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
   state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
   state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
   s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \
   t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \
   t1 = _mm512_and_si512( t1, lsbmask ); \
   t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
   s2 = _mm512_xor_si512( s2, t2 ); \
   state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
   state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
                                 _mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
   state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
   state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 2 ][ j2 ] ); \
   s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
   t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \
   t1 = _mm512_and_si512( t1, lsbmask ); \
   t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
   s2 = _mm512_xor_si512( s2, t2 ); \
   state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
   state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
   state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
                                 _mm512_xor_si512( s2, state1[ 3 ][ j3 ] ) ); \
   state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
} while(0)

#define ECHO_ROUND_UNROLL2 \
   ECHO_SUBBYTES(_state, 0, 0);\
   ECHO_SUBBYTES(_state, 1, 0);\
   ECHO_SUBBYTES(_state, 2, 0);\
   ECHO_SUBBYTES(_state, 3, 0);\
   ECHO_SUBBYTES(_state, 0, 1);\
   ECHO_SUBBYTES(_state, 1, 1);\
   ECHO_SUBBYTES(_state, 2, 1);\
   ECHO_SUBBYTES(_state, 3, 1);\
   ECHO_SUBBYTES(_state, 0, 2);\
   ECHO_SUBBYTES(_state, 1, 2);\
   ECHO_SUBBYTES(_state, 2, 2);\
   ECHO_SUBBYTES(_state, 3, 2);\
   ECHO_SUBBYTES(_state, 0, 3);\
   ECHO_SUBBYTES(_state, 1, 3);\
   ECHO_SUBBYTES(_state, 2, 3);\
   ECHO_SUBBYTES(_state, 3, 3);\
   ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
   ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
   ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
   ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
   ECHO_SUBBYTES(_state2, 0, 0);\
   ECHO_SUBBYTES(_state2, 1, 0);\
   ECHO_SUBBYTES(_state2, 2, 0);\
   ECHO_SUBBYTES(_state2, 3, 0);\
   ECHO_SUBBYTES(_state2, 0, 1);\
   ECHO_SUBBYTES(_state2, 1, 1);\
   ECHO_SUBBYTES(_state2, 2, 1);\
   ECHO_SUBBYTES(_state2, 3, 1);\
   ECHO_SUBBYTES(_state2, 0, 2);\
   ECHO_SUBBYTES(_state2, 1, 2);\
   ECHO_SUBBYTES(_state2, 2, 2);\
   ECHO_SUBBYTES(_state2, 3, 2);\
   ECHO_SUBBYTES(_state2, 0, 3);\
   ECHO_SUBBYTES(_state2, 1, 3);\
   ECHO_SUBBYTES(_state2, 2, 3);\
   ECHO_SUBBYTES(_state2, 3, 3);\
   ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
   ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)

#define SAVESTATE(dst, src)\
   dst[0][0] = src[0][0];\
   dst[0][1] = src[0][1];\
   dst[0][2] = src[0][2];\
   dst[0][3] = src[0][3];\
   dst[1][0] = src[1][0];\
   dst[1][1] = src[1][1];\
   dst[1][2] = src[1][2];\
   dst[1][3] = src[1][3];\
   dst[2][0] = src[2][0];\
   dst[2][1] = src[2][1];\
   dst[2][2] = src[2][2];\
   dst[2][3] = src[2][3];\
   dst[3][0] = src[3][0];\
   dst[3][1] = src[3][1];\
   dst[3][2] = src[3][2];\
   dst[3][3] = src[3][3]

// blockcount always 1
void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg,
                         unsigned int uBlockCount )
{
   unsigned int r, b, i, j;
   __m512i t1, t2, s2, k1;
   __m512i _state[4][4], _state2[4][4], _statebackup[4][4];

   _state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ];
   _state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ];
   _state[ 0 ][ 2 ] = ctx->state[ 0 ][ 2 ];
   _state[ 0 ][ 3 ] = ctx->state[ 0 ][ 3 ];
   _state[ 1 ][ 0 ] = ctx->state[ 1 ][ 0 ];
   _state[ 1 ][ 1 ] = ctx->state[ 1 ][ 1 ];
   _state[ 1 ][ 2 ] = ctx->state[ 1 ][ 2 ];
   _state[ 1 ][ 3 ] = ctx->state[ 1 ][ 3 ];
   _state[ 2 ][ 0 ] = ctx->state[ 2 ][ 0 ];
   _state[ 2 ][ 1 ] = ctx->state[ 2 ][ 1 ];
   _state[ 2 ][ 2 ] = ctx->state[ 2 ][ 2 ];
   _state[ 2 ][ 3 ] = ctx->state[ 2 ][ 3 ];
   _state[ 3 ][ 0 ] = ctx->state[ 3 ][ 0 ];
   _state[ 3 ][ 1 ] = ctx->state[ 3 ][ 1 ];
   _state[ 3 ][ 2 ] = ctx->state[ 3 ][ 2 ];
   _state[ 3 ][ 3 ] = ctx->state[ 3 ][ 3 ];

   for ( b = 0; b < uBlockCount; b++ )
   {
      ctx->k = _mm512_add_epi64( ctx->k, ctx->const1536 );

      for( j = ctx->uHashSize / 256; j < 4; j++ )
      {
         for ( i = 0; i < 4; i++ )
         {
            _state[ i ][ j ] = _mm512_load_si512(
                          pmsg + 4 * (j - (ctx->uHashSize / 256)) + i );
         }
      }

      // save state
      SAVESTATE( _statebackup, _state );

      k1 = ctx->k;

      for ( r = 0; r < ctx->uRounds / 2; r++ )
      {
         ECHO_ROUND_UNROLL2;
      }

      if ( ctx->uHashSize == 256 )
      {
         for ( i = 0; i < 4; i++ )
         {
            _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ], _state[ i ][ 1 ] );
            _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ], _state[ i ][ 2 ] );
            _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ], _state[ i ][ 3 ] );
            _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ], _statebackup[ i ][ 0 ] );
            _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ], _statebackup[ i ][ 1 ] );
            _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ], _statebackup[ i ][ 2 ] );
            _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ], _statebackup[ i ][ 3 ] );
         }
      }
      else
      {
         for ( i = 0; i < 4; i++ )
         {
            _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ], _state[ i ][ 2 ] );
            _state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ], _state[ i ][ 3 ] );
            _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ], _statebackup[ i ][ 0 ] );
            _state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ], _statebackup[ i ][ 2 ] );
            _state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ], _statebackup[ i ][ 1 ] );
            _state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ], _statebackup[ i ][ 3 ] );
         }
      }
      pmsg += ctx->uBlockLength;
   }
   SAVESTATE(ctx->state, _state);

}

int echo_4way_init( echo_4way_context *ctx, int nHashSize )
{
   int i, j;

   ctx->k = m512_zero;
   ctx->processed_bits = 0;
   ctx->uBufferBytes = 0;

   switch( nHashSize )
   {
      case 256:
         ctx->uHashSize = 256;
         ctx->uBlockLength = 192;
         ctx->uRounds = 8;
         ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 );
         ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 );
         break;

      case 512:
         ctx->uHashSize = 512;
         ctx->uBlockLength = 128;
         ctx->uRounds = 10;
         ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 );
         ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400);
         break;

      default:
         return 1;
   }

   for( i = 0; i < 4; i++ )
      for( j = 0; j < nHashSize / 256; j++ )
         ctx->state[ i ][ j ] = ctx->hashsize;

   for( i = 0; i < 4; i++ )
      for( j = nHashSize / 256; j < 4; j++ )
         ctx->state[ i ][ j ] = m512_zero;

   return 0;
}

int echo_4way_update_close( echo_4way_context *state, void *hashval,
                            const void *data, int databitlen )
{
// bytelen is either 32 (maybe), 64 or 80 or 128!
// all are less than full block.

   int vlen = databitlen / 128;                 // * 4 lanes / 128 bits per lane
   const int vblen = state->uBlockLength / 16;  // 16 bytes per lane
   __m512i remainingbits;

   if ( databitlen == 1024 )
   {
      echo_4way_compress( state, data, 1 );
      state->processed_bits = 1024;
      remainingbits = m512_zero;
      vlen = 0;
   }
   else
   {
      vlen = databitlen / 128;   // * 4 lanes / 128 bits per lane
      memcpy_512( state->buffer, data, vlen );

      state->processed_bits += (unsigned int)( databitlen );
      remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen );

   }

   state->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 );
   memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 );
   state->buffer[ vblen-2 ] =
             _mm512_set4_epi32( (uint32_t)state->uHashSize << 16, 0, 0, 0 );
   state->buffer[ vblen-1 ] =
             _mm512_set4_epi64( 0, state->processed_bits,
                                0, state->processed_bits );

   state->k = _mm512_add_epi64( state->k, remainingbits );
   state->k = _mm512_sub_epi64( state->k, state->const1536 );

   echo_4way_compress( state, state->buffer, 1 );

   _mm512_store_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] );
   _mm512_store_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] );

   if ( state->uHashSize == 512 )
   {
      _mm512_store_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] );
      _mm512_store_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] );
   }
   return 0;
}

#endif
@@ -1,36 +0,0 @@
#if !defined(ECHO_HASH_4WAY_H__)
#define ECHO_HASH_4WAY_H__ 1

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

#include "simd-utils.h"

typedef struct
{
   __m512i state[4][4];
   __m512i buffer[ 4 * 192 / 16 ];   // 4x128 interleaved 192 bytes
   __m512i k;
   __m512i hashsize;
   __m512i const1536;

   unsigned int uRounds;
   unsigned int uHashSize;
   unsigned int uBlockLength;
   unsigned int uBufferBytes;
   unsigned int processed_bits;

} echo_4way_context __attribute__ ((aligned (64)));

int echo_4way_init( echo_4way_context *state, int hashbitlen );

int echo_4way_update( echo_4way_context *state, const void *data,
                      unsigned int databitlen);

int echo_close( echo_4way_context *state, void *hashval );

int echo_4way_update_close( echo_4way_context *state, void *hashval,
                            const void *data, int databitlen );

#endif
#endif
@@ -73,7 +73,7 @@ __m128i ALL_FF;
 b5 = a7;\
 a6 = _mm_xor_si128(a6, a7);\
 a7 = _mm_xor_si128(a7, b6);\
 \
 \
 /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
 b0 = _mm_xor_si128(b0, a4);\
 b6 = _mm_xor_si128(b6, a4);\
@@ -195,7 +195,7 @@ __m128i ALL_FF;
 for(round_counter = 0; round_counter < 14; round_counter+=2) {\
 /* AddRoundConstant P1024 */\
 xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\
 /* ShiftBytes P1024 + pre-AESENCLAST */\
 xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\
 xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\
 xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\
@@ -209,6 +209,7 @@ __m128i ALL_FF;
 \
 /* AddRoundConstant P1024 */\
 xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\
+/* ShiftBytes P1024 + pre-AESENCLAST */\
 xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\
 xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\
 xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\
@@ -217,6 +218,7 @@ __m128i ALL_FF;
 xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\
 xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\
 xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\
+/* SubBytes + MixBytes */\
 SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
 }\
 }
@@ -9,7 +9,6 @@

//#ifndef NO_AES_NI

// Not to be confused with AVX512VAES
#define VAES
// #define VAVX
// #define VVPERM
@@ -230,7 +230,6 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,

   // digest final padding block and do output transform
   TF1024( ctx->chaining, ctx->buffer );

   OF1024( ctx->chaining );

   // store hash result in output
@@ -1,64 +0,0 @@
#include "groestl-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

#if defined(GROESTL_4WAY_VAES)

#include "groestl512-hash-4way.h"

void groestl_4way_hash( void *output, const void *input )
{
   uint32_t hash[16*4] __attribute__ ((aligned (128)));
   groestl512_4way_context ctx;

   groestl512_4way_init( &ctx, 64 );
   groestl512_4way_update_close( &ctx, hash, input, 640 );

   groestl512_4way_init( &ctx, 64 );
   groestl512_4way_update_close( &ctx, hash, hash, 512 );

   dintrlv_4x128( output, output+32, output+64, output+96, hash, 256 );
}

int scanhash_groestl_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t hash[8*4] __attribute__ ((aligned (128)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 4;
   uint32_t *noncep = vdata + 64+3;   // 4*16 + 3
   int thr_id = mythr->id;
   const uint32_t Htarg = ptarget[7];

   mm512_bswap32_intrlv80_4x128( vdata, pdata );

   do
   {
      be32enc( noncep,    n   );
      be32enc( noncep+ 4, n+1 );
      be32enc( noncep+ 8, n+2 );
      be32enc( noncep+12, n+3 );

      groestl_4way_hash( hash, vdata );
      pdata[19] = n;

      for ( int lane = 0; lane < 4; lane++ )
      if ( ( hash+(lane<<3) )[7] < Htarg )
      if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark )
      {
         pdata[19] = n + lane;
         submit_lane_solution( work, hash+(lane<<3), mythr, lane );
      }
      n += 4;
   } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
   *hashes_done = n - first_nonce;
   return 0;
}

#endif
@@ -1,23 +0,0 @@
#include "groestl-gate.h"

bool register_dmd_gr_algo( algo_gate_t *gate )
{
#if defined (GROESTL_4WAY_VAES)
   gate->scanhash = (void*)&scanhash_groestl_4way;
   gate->hash = (void*)&groestl_4way_hash;
#else
   init_groestl_ctx();
   gate->scanhash = (void*)&scanhash_groestl;
   gate->hash = (void*)&groestlhash;
#endif
   gate->optimizations = AES_OPT | VAES_OPT;
   return true;
};

bool register_groestl_algo( algo_gate_t* gate )
{
   register_dmd_gr_algo( gate );
   gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
   return true;
};
@@ -1,31 +0,0 @@
#ifndef GROESTL_GATE_H__
#define GROESTL_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define GROESTL_4WAY_VAES 1
#endif

bool register_dmd_gr_algo( algo_gate_t* gate );

bool register_groestl_algo( algo_gate_t* gate );

#if defined(GROESTL_4WAY_VAES)

void groestl_4way_hash( void *state, const void *input );
int scanhash_groestl_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );

#else

void groestlhash( void *state, const void *input );
int scanhash_groestl( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );
void init_groestl_ctx();

#endif

#endif
@@ -1,4 +1,5 @@
-#include "groestl-gate.h"
+#include "algo-gate-api.h"
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
@@ -77,12 +78,15 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce,
 groestlhash(hash, endiandata);

 if (hash[7] <= Htarg )
-if ( fulltest(hash, ptarget) && !opt_benchmark )
 {
+   if ( fulltest(hash, ptarget))
+   {
       pdata[19] = nonce;
       submit_solution( work, hash, mythr );
+   }
    *hashes_done = pdata[19] - first_nonce;
    return 1;
 }

 nonce++;

 } while (nonce < max_nonce && !work_restart[thr_id].restart);

 pdata[19] = nonce;
@@ -90,3 +94,20 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce,

   return 0;
}

bool register_dmd_gr_algo( algo_gate_t* gate )
{
   init_groestl_ctx();
   gate->optimizations = SSE2_OPT | AES_OPT;
   gate->scanhash = (void*)&scanhash_groestl;
   gate->hash = (void*)&groestlhash;
   opt_target_factor = 256.0;
   return true;
};

bool register_groestl_algo( algo_gate_t* gate )
{
   register_dmd_gr_algo( gate );
   gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
   return true;
};

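/* Note (restating the gate pattern above for clarity; not original code):
 * register_*_algo fills a per-algorithm dispatch table once at startup.
 * The core then calls gate->scanhash and gate->hash without knowing which
 * SIMD implementation was registered, gate->optimizations advertises the
 * CPU features each path can use, and opt_target_factor rescales the
 * share target for this algorithm. */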
@@ -1,280 +0,0 @@

/* hash.c     Aug 2011
 *
 * Groestl implementation for different versions.
 * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
 *
 * This code is placed in the public domain
 */

#include <memory.h>
#include "hash-groestl256.h"
#include "miner.h"
#include "simd-utils.h"

#ifndef NO_AES_NI

#include "groestl-version.h"

#ifdef TASM
  #ifdef VAES
    #include "groestl256-asm-aes.h"
  #else
    #ifdef VAVX
      #include "groestl256-asm-avx.h"
    #else
      #ifdef VVPERM
        #include "groestl256-asm-vperm.h"
      #else
        #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
      #endif
    #endif
  #endif
#else
  #ifdef TINTR
    #ifdef VAES
      #include "groestl256-intr-aes.h"
    #else
      #ifdef VAVX
        #include "groestl256-intr-avx.h"
      #else
        #ifdef VVPERM
          #include "groestl256-intr-vperm.h"
        #else
          #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
        #endif
      #endif
    #endif
  #else
    #error NO TYPE SPECIFIED (-DT[ASM/INTR])
  #endif
#endif

/* initialise context */
HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
{
  int i;

  ctx->hashlen = hashlen;
  SET_CONSTANTS();

  if (ctx->chaining == NULL || ctx->buffer == NULL)
    return FAIL_GR;

  for ( i = 0; i < SIZE256; i++ )
  {
     ctx->chaining[i] = _mm_setzero_si128();
     ctx->buffer[i] = _mm_setzero_si128();
  }
  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
  INIT256( ctx->chaining );
  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;

  return SUCCESS_GR;
}

HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
{
  int i;

  if (ctx->chaining == NULL || ctx->buffer == NULL)
    return FAIL_GR;

  for ( i = 0; i < SIZE256; i++ )
  {
     ctx->chaining[i] = _mm_setzero_si128();
     ctx->buffer[i] = _mm_setzero_si128();
  }
  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
  INIT256(ctx->chaining);
  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;

  return SUCCESS_GR;
}

// Use this only for midstate and never for cryptonight
HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input,
                                 DataLength_gr databitlen )
{
  __m128i* in = (__m128i*)input;
  const int len = (int)databitlen / 128;   // bits to __m128i
  const int blocks = len / SIZE256;        // __m128i to blocks
  int rem = ctx->rem_ptr;
  int i;

  ctx->blk_count = blocks;
  ctx->databitlen = databitlen;

  // digest any full blocks
  for ( i = 0; i < blocks; i++ )
     TF512( ctx->chaining, &in[ i * SIZE256 ] );
  // adjust buf_ptr to last block
  ctx->buf_ptr = blocks * SIZE256;

  // Copy any remainder to buffer
  for ( i = 0; i < len % SIZE256; i++ )
     ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
  // adjust rem_ptr for new data
  ctx->rem_ptr += i;

  return SUCCESS_GR;
}

// don't use this at all
HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output )
{
  const int len = (int)ctx->databitlen / 128;       // bits to __m128i
  const int blocks = ctx->blk_count + 1;            // adjust for final block
  const int rem_ptr = ctx->rem_ptr;                 // end of data start of padding
  const int hashlen_m128i = ctx->hashlen / 16;      // bytes to __m128i
  const int hash_offset = SIZE256 - hashlen_m128i;  // where in buffer
  int i;

  // first pad byte = 0x80, last pad byte = block count
  // everything in between is zero

  if ( rem_ptr == len - 1 )
  {
     // all padding at once
     ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
                                                0,0,0,0, 0,0,0,0x80 );
  }
  else
  {
     // add first padding
     ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
                                          0,0,0,0, 0,0,0,0x80 );
     // add zero padding
     for ( i = rem_ptr + 1; i < SIZE256 - 1; i++ )
        ctx->buffer[i] = _mm_setzero_si128();
     // add length padding
     // cheat since we know the block count is trivial, good if block < 256
     ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
                                          0,0,0,0, 0,0,0,0 );
  }

  // digest final padding block and do output transform
  TF512( ctx->chaining, ctx->buffer );
  OF512( ctx->chaining );

  // store hash result in output
  for ( i = 0; i < hashlen_m128i; i++ )
     casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];

  return SUCCESS_GR;
}

HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
                void* output, const void* input, DataLength_gr databitlen )
{
   const int len = (int)databitlen / 128;
   const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
   const int hash_offset = SIZE256 - hashlen_m128i;
   int rem = ctx->rem_ptr;
   int blocks = len / SIZE256;
   __m128i* in = (__m128i*)input;
   int i;

   // --- update ---

   // digest any full blocks, process directly from input
   for ( i = 0; i < blocks; i++ )
      TF512( ctx->chaining, &in[ i * SIZE256 ] );
   ctx->buf_ptr = blocks * SIZE256;

   // cryptonight has 200 byte input, an odd number of __m128i
   // remainder is only 8 bytes, ie u64.
   if ( databitlen % 128 != 0 )
   {
      // must be cryptonight, copy 64 bits of data
      *(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
      i = -1;   // signal for odd length
   }
   else
   {
      // Copy any remaining data to buffer for final transform
      for ( i = 0; i < len % SIZE256; i++ )
         ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
      i += rem;   // use i as rem_ptr in final
   }

   //--- final ---

   // adjust for final block
   blocks++;

   if ( i == len - 1 )
   {
      // all padding at once
      ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
                                          0,        0,0,0, 0,0,0,0x80 );
   }
   else
   {
      if ( i == -1 )
      {
         // cryptonight odd length
         ((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull;
         // finish the block with zero and length padding as normal
         i = 0;
      }
      else
      {
         // add first padding
         ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
                                        0,0,0,0, 0,0,0,0x80 );
      }
      // add zero padding
      for ( i += 1; i < SIZE256 - 1; i++ )
         ctx->buffer[i] = _mm_setzero_si128();
      // add length padding
      // cheat since we know the block count is trivial, good if block < 256
      ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
                                          0,        0,0,0, 0,0,0,0 );
   }

   // digest final padding block and do output transform
   TF512( ctx->chaining, ctx->buffer );
   OF512( ctx->chaining );

   // store hash result in output
   for ( i = 0; i < hashlen_m128i; i++ )
      casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];

   return SUCCESS_GR;
}

/* hash bit sequence */
HashReturn_gr hash_groestl256(int hashbitlen,
                  const BitSequence_gr* data,
                  DataLength_gr databitlen,
                  BitSequence_gr* hashval) {
  HashReturn_gr ret;
  hashState_groestl256 context;

  /* initialise */
  if ((ret = init_groestl256(&context, hashbitlen/8)) != SUCCESS_GR)
    return ret;

  /* process message */
  if ((ret = update_groestl256(&context, data, databitlen)) != SUCCESS_GR)
    return ret;

  /* finalise */
  ret = final_groestl256(&context, hashval);

  return ret;
}

/* eBash API */
//#ifdef crypto_hash_BYTES
//int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen)
//{
//  if (hash_groestl(crypto_hash_BYTES * 8, in, inlen * 8,out) == SUCCESS_GR) return 0;
//  return -1;
//}
//#endif

#endif
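/* Minimal scalar sketch of the final-block padding used above (an
 * illustration, not part of the original file): the message remainder is
 * followed by 0x80, zeros, and the total block count in the last two
 * bytes.  Assumes rem_bytes <= 61 so the 0x80 and the count don't
 * collide, matching the "cheat" noted in the comments. */
#include <stdint.h>
#include <string.h>
static void groestl_pad_sketch( uint8_t blk[64], int rem_bytes,
                                uint16_t blocks )
{
   memset( blk + rem_bytes, 0, 64 - rem_bytes );
   blk[ rem_bytes ] = 0x80;                // first pad byte
   blk[ 62 ] = (uint8_t)( blocks >> 8 );   // block count, high byte first
   blk[ 63 ] = (uint8_t)  blocks;
}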
@@ -1,121 +0,0 @@

/* hash.h     Aug 2011
 *
 * Groestl implementation for different versions.
 * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
 *
 * This code is placed in the public domain
 */

#ifndef __hash_h
#define __hash_h

#include <immintrin.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
#endif
#include <stdlib.h>

/* eBash API begin */
/*
#include "crypto_hash.h"
#ifdef crypto_hash_BYTES

#include <crypto_uint8.h>
#include <crypto_uint32.h>
#include <crypto_uint64.h>
typedef crypto_uint8 u8;
typedef crypto_uint32 u32;
typedef crypto_uint64 u64;
#endif
*/
/* eBash API end */

//#define LENGTH (512)

#include "brg_endian.h"
#define NEED_UINT_64T
#include "algo/sha/brg_types.h"

#ifdef IACA_TRACE
  #include IACA_MARKS
#endif

#define LENGTH (256)

/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
#define COLS512 (8)
//#define COLS1024 (16)
#define SIZE_512 ((ROWS)*(COLS512))
//#define SIZE1024 ((ROWS)*(COLS1024))
#define ROUNDS512 (10)
//#define ROUNDS1024 (14)

//#if LENGTH<=256
#define COLS (COLS512)
//#define SIZE (SIZE512)
#define ROUNDS (ROUNDS512)
//#else
//#define COLS (COLS1024)
//#define SIZE (SIZE1024)
//#define ROUNDS (ROUNDS1024)
//#endif

#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))

#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
#define U64BIG(a) (a)
#endif /* IS_BIG_ENDIAN */

#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
#define U64BIG(a) \
  ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
   (ROTL64(a,24) & li_64(0000FF000000FF00)) | \
   (ROTL64(a,40) & li_64(00FF000000FF0000)) | \
   (ROTL64(a,56) & li_64(FF000000FF000000)))
#endif /* IS_LITTLE_ENDIAN */
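/* Note (an observation, not in the original header): on little-endian
 * targets the rotate-and-mask U64BIG above is exactly a 64-bit byte swap,
 * so on GCC/Clang it computes the same value as the builtin shown here. */
#include <stdint.h>
static inline uint64_t u64big_sketch( uint64_t a )
{
   return __builtin_bswap64( a );   // == U64BIG(a)
}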

typedef unsigned char BitSequence_gr;
typedef unsigned long long DataLength_gr;
typedef enum
{
  SUCCESS_GR = 0,
  FAIL_GR = 1,
  BAD_HASHBITLEN_GR = 2
} HashReturn_gr;

#define SIZE256 (SIZE_512/16)

typedef struct {
  __attribute__ ((aligned (32))) __m128i chaining[SIZE256];
  __attribute__ ((aligned (32))) __m128i buffer[SIZE256];
//  __attribute__ ((aligned (32))) u64 chaining[SIZE/8];        /* actual state */
//  __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
//  u64 block_counter;   /* message block counter */
  int hashlen;   // bytes
  int blk_count;
  int buf_ptr;   /* data buffer pointer */
  int rem_ptr;
  int databitlen;
} hashState_groestl256;

HashReturn_gr init_groestl256( hashState_groestl256*, int );

HashReturn_gr reinit_groestl256( hashState_groestl256* );

HashReturn_gr update_groestl256( hashState_groestl256*, const void*,
                                 DataLength_gr );

HashReturn_gr final_groestl256( hashState_groestl256*, void* );

HashReturn_gr hash_groestl256( int, const BitSequence_gr*, DataLength_gr,
                               BitSequence_gr* );

HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
                                           const void*, DataLength_gr );

#endif /* __hash_h */
@@ -1,492 +0,0 @@

/* groestl-intr-aes.h     Aug 2011
 *
 * Groestl implementation with intrinsics using ssse3, sse4.1, and aes
 * instructions.
 * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
 *
 * This code is placed in the public domain
 */

#include <smmintrin.h>
#include <wmmintrin.h>
#include "hash-groestl256.h"

/* global constants */
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
//__m128i ROUND_CONST_P[ROUNDS1024];
//__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_1B;
__m128i ALL_FF;

#define tos(a)    #a
#define tostr(a)  tos(a)

/* xmm[i] will be multiplied by 2
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
  j = _mm_xor_si128(j, j);\
  j = _mm_cmpgt_epi8(j, i);\
  i = _mm_add_epi8(i, i);\
  j = _mm_and_si128(j, k);\
  i = _mm_xor_si128(i, j);\
}

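/* Scalar reference for MUL2 (a sketch for clarity, not original code):
 * the vector sequence above doubles every byte in GF(2^8) with the AES
 * polynomial, i.e. byte-wise xtime: shift left, then XOR 0x1b where the
 * high bit was set (which is what the signed compare-greater-than mask
 * against zero selects). */
static inline unsigned char xtime_sketch( unsigned char x )
{
   return (unsigned char)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0x00 ) );
}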
/**/

/* Yet another implementation of MixBytes.
   This time we use the formulae (3) from the paper "Byte Slicing Groestl".
   Input: a0, ..., a7
   Output: b0, ..., b7 = MixBytes(a0,...,a7).
   but we use the relations:
   t_i = a_i + a_{i+1}
   x_i = t_i + t_{i+3}
   y_i = t_i + t_{i+2} + a_{i+6}
   z_i = 2*x_i
   w_i = z_i + y_{i+4}
   v_i = 2*w_i
   b_i = v_{i+3} + y_{i+4}
   We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
   and then adding v_i computed in the meantime in registers xmm0..xmm7.
   We almost fit into 16 registers, need only 3 spills to memory.
   This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
   K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* t_i = a_i + a_{i+1} */\
  b6 = a0;\
  b7 = a1;\
  a0 = _mm_xor_si128(a0, a1);\
  b0 = a2;\
  a1 = _mm_xor_si128(a1, a2);\
  b1 = a3;\
  a2 = _mm_xor_si128(a2, a3);\
  b2 = a4;\
  a3 = _mm_xor_si128(a3, a4);\
  b3 = a5;\
  a4 = _mm_xor_si128(a4, a5);\
  b4 = a6;\
  a5 = _mm_xor_si128(a5, a6);\
  b5 = a7;\
  a6 = _mm_xor_si128(a6, a7);\
  a7 = _mm_xor_si128(a7, b6);\
  \
  /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
  b0 = _mm_xor_si128(b0, a4);\
  b6 = _mm_xor_si128(b6, a4);\
  b1 = _mm_xor_si128(b1, a5);\
  b7 = _mm_xor_si128(b7, a5);\
  b2 = _mm_xor_si128(b2, a6);\
  b0 = _mm_xor_si128(b0, a6);\
  /* spill values y_4, y_5 to memory */\
  TEMP0 = b0;\
  b3 = _mm_xor_si128(b3, a7);\
  b1 = _mm_xor_si128(b1, a7);\
  TEMP1 = b1;\
  b4 = _mm_xor_si128(b4, a0);\
  b2 = _mm_xor_si128(b2, a0);\
  /* save values t0, t1, t2 to xmm8, xmm9 and memory */\
  b0 = a0;\
  b5 = _mm_xor_si128(b5, a1);\
  b3 = _mm_xor_si128(b3, a1);\
  b1 = a1;\
  b6 = _mm_xor_si128(b6, a2);\
  b4 = _mm_xor_si128(b4, a2);\
  TEMP2 = a2;\
  b7 = _mm_xor_si128(b7, a3);\
  b5 = _mm_xor_si128(b5, a3);\
  \
  /* compute x_i = t_i + t_{i+3} */\
  a0 = _mm_xor_si128(a0, a3);\
  a1 = _mm_xor_si128(a1, a4);\
  a2 = _mm_xor_si128(a2, a5);\
  a3 = _mm_xor_si128(a3, a6);\
  a4 = _mm_xor_si128(a4, a7);\
  a5 = _mm_xor_si128(a5, b0);\
  a6 = _mm_xor_si128(a6, b1);\
  a7 = _mm_xor_si128(a7, TEMP2);\
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
  b1 = ALL_1B;\
  MUL2(a0, b0, b1);\
  a0 = _mm_xor_si128(a0, TEMP0);\
  MUL2(a1, b0, b1);\
  a1 = _mm_xor_si128(a1, TEMP1);\
  MUL2(a2, b0, b1);\
  a2 = _mm_xor_si128(a2, b2);\
  MUL2(a3, b0, b1);\
  a3 = _mm_xor_si128(a3, b3);\
  MUL2(a4, b0, b1);\
  a4 = _mm_xor_si128(a4, b4);\
  MUL2(a5, b0, b1);\
  a5 = _mm_xor_si128(a5, b5);\
  MUL2(a6, b0, b1);\
  a6 = _mm_xor_si128(a6, b6);\
  MUL2(a7, b0, b1);\
  a7 = _mm_xor_si128(a7, b7);\
  \
  /* compute v_i : double w_i */\
  /* add to y_4 y_5 .. v3, v4, ... */\
  MUL2(a0, b0, b1);\
  b5 = _mm_xor_si128(b5, a0);\
  MUL2(a1, b0, b1);\
  b6 = _mm_xor_si128(b6, a1);\
  MUL2(a2, b0, b1);\
  b7 = _mm_xor_si128(b7, a2);\
  MUL2(a5, b0, b1);\
  b2 = _mm_xor_si128(b2, a5);\
  MUL2(a6, b0, b1);\
  b3 = _mm_xor_si128(b3, a6);\
  MUL2(a7, b0, b1);\
  b4 = _mm_xor_si128(b4, a7);\
  MUL2(a3, b0, b1);\
  MUL2(a4, b0, b1);\
  b0 = TEMP0;\
  b1 = TEMP1;\
  b0 = _mm_xor_si128(b0, a3);\
  b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/

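/* Scalar sketch of the relations above (for clarity; not original code).
 * xtime_sketch is the byte-wise GF(2^8) doubling shown earlier; indices
 * are mod 8.  With t_i = a_i + a_{i+1} these relations reproduce the
 * Groestl MixBytes matrix circ(2,2,3,4,5,3,5,7). */
static void mixbytes_sketch( unsigned char b[8], const unsigned char a[8] )
{
   unsigned char t[8], x[8], y[8], w[8];
   for ( int i = 0; i < 8; i++ ) t[i] = a[i] ^ a[(i+1)%8];
   for ( int i = 0; i < 8; i++ ) x[i] = t[i] ^ t[(i+3)%8];
   for ( int i = 0; i < 8; i++ ) y[i] = t[i] ^ t[(i+2)%8] ^ a[(i+6)%8];
   for ( int i = 0; i < 8; i++ )          // w_i = 2*x_i + y_{i+4}
      w[i] = xtime_sketch( x[i] ) ^ y[(i+4)%8];
   for ( int i = 0; i < 8; i++ )          // b_i = 2*w_{i+3} + y_{i+4}
      b[i] = xtime_sketch( w[(i+3)%8] ) ^ y[(i+4)%8];
}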
#define SET_CONSTANTS(){\
  ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
  TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
  SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
  SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
  SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
  SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
  SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
  SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
  SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
  SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
  for(i = 0; i < ROUNDS512; i++)\
  {\
    ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
    ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
  }\
  ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
}while(0); \

/* one round
 * i = round number
 * a0-a7 = input rows
 * b0-b7 = output rows
 */
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
  b1 = ROUND_CONST_Lx;\
  a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
  a1 = _mm_xor_si128(a1, b1);\
  a2 = _mm_xor_si128(a2, b1);\
  a3 = _mm_xor_si128(a3, b1);\
  a4 = _mm_xor_si128(a4, b1);\
  a5 = _mm_xor_si128(a5, b1);\
  a6 = _mm_xor_si128(a6, b1);\
  a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
  \
  /* ShiftBytes + SubBytes (interleaved) */\
  b0 = _mm_xor_si128(b0, b0);\
  a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
  a0 = _mm_aesenclast_si128(a0, b0);\
  a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
  a1 = _mm_aesenclast_si128(a1, b0);\
  a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
  a2 = _mm_aesenclast_si128(a2, b0);\
  a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
  a3 = _mm_aesenclast_si128(a3, b0);\
  a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
  a4 = _mm_aesenclast_si128(a4, b0);\
  a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
  a5 = _mm_aesenclast_si128(a5, b0);\
  a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
  a6 = _mm_aesenclast_si128(a6, b0);\
  a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
  a7 = _mm_aesenclast_si128(a7, b0);\
  \
  /* MixBytes */\
  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
  \
}

/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
  ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
  ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
  ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
  ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
  ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
  ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
  ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
  ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
  ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
  ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}

/* Matrix Transpose Step 1
 * input is a 512-bit state with two columns in one xmm
 * output is a 512-bit state with two rows in one xmm
 * inputs: i0-i3
 * outputs: i0, o1-o3
 * clobbers: t0
 */
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
  t0 = TRANSP_MASK;\
  \
  i0 = _mm_shuffle_epi8(i0, t0);\
  i1 = _mm_shuffle_epi8(i1, t0);\
  i2 = _mm_shuffle_epi8(i2, t0);\
  i3 = _mm_shuffle_epi8(i3, t0);\
  \
  o1 = i0;\
  t0 = i2;\
  \
  i0 = _mm_unpacklo_epi16(i0, i1);\
  o1 = _mm_unpackhi_epi16(o1, i1);\
  i2 = _mm_unpacklo_epi16(i2, i3);\
  t0 = _mm_unpackhi_epi16(t0, i3);\
  \
  i0 = _mm_shuffle_epi32(i0, 216);\
  o1 = _mm_shuffle_epi32(o1, 216);\
  i2 = _mm_shuffle_epi32(i2, 216);\
  t0 = _mm_shuffle_epi32(t0, 216);\
  \
  o2 = i0;\
  o3 = o1;\
  \
  i0 = _mm_unpacklo_epi32(i0, i2);\
  o1 = _mm_unpacklo_epi32(o1, t0);\
  o2 = _mm_unpackhi_epi32(o2, i2);\
  o3 = _mm_unpackhi_epi32(o3, t0);\
}/**/

/* Matrix Transpose Step 2
 * input are two 512-bit states with two rows in one xmm
 * output are two 512-bit states with one row of each state in one xmm
 * inputs: i0-i3 = P, i4-i7 = Q
 * outputs: (i0, o1-o7) = (P|Q)
 * possible reassignments: (output reg = input reg)
 * * i1 -> o3-7
 * * i2 -> o5-7
 * * i3 -> o7
 * * i4 -> o3-7
 * * i5 -> o6-7
 */
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
  o1 = i0;\
  o2 = i1;\
  i0 = _mm_unpacklo_epi64(i0, i4);\
  o1 = _mm_unpackhi_epi64(o1, i4);\
  o3 = i1;\
  o4 = i2;\
  o2 = _mm_unpacklo_epi64(o2, i5);\
  o3 = _mm_unpackhi_epi64(o3, i5);\
  o5 = i2;\
  o6 = i3;\
  o4 = _mm_unpacklo_epi64(o4, i6);\
  o5 = _mm_unpackhi_epi64(o5, i6);\
  o7 = i3;\
  o6 = _mm_unpacklo_epi64(o6, i7);\
  o7 = _mm_unpackhi_epi64(o7, i7);\
}/**/

/* Matrix Transpose Inverse Step 2
 * input are two 512-bit states with one row of each state in one xmm
 * output are two 512-bit states with two rows in one xmm
 * inputs: i0-i7 = (P|Q)
 * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
 */
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
  o0 = i0;\
  i0 = _mm_unpacklo_epi64(i0, i1);\
  o0 = _mm_unpackhi_epi64(o0, i1);\
  o1 = i2;\
  i2 = _mm_unpacklo_epi64(i2, i3);\
  o1 = _mm_unpackhi_epi64(o1, i3);\
  o2 = i4;\
  i4 = _mm_unpacklo_epi64(i4, i5);\
  o2 = _mm_unpackhi_epi64(o2, i5);\
  o3 = i6;\
  i6 = _mm_unpacklo_epi64(i6, i7);\
  o3 = _mm_unpackhi_epi64(o3, i7);\
}/**/

/* Matrix Transpose Output Step 2
 * input is one 512-bit state with two rows in one xmm
 * output is one 512-bit state with one row in the low 64-bits of one xmm
 * inputs: i0,i2,i4,i6 = S
 * outputs: (i0-7) = (0|S)
 */
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
  t0 = _mm_xor_si128(t0, t0);\
  i1 = i0;\
  i3 = i2;\
  i5 = i4;\
  i7 = i6;\
  i0 = _mm_unpacklo_epi64(i0, t0);\
  i1 = _mm_unpackhi_epi64(i1, t0);\
  i2 = _mm_unpacklo_epi64(i2, t0);\
  i3 = _mm_unpackhi_epi64(i3, t0);\
  i4 = _mm_unpacklo_epi64(i4, t0);\
  i5 = _mm_unpackhi_epi64(i5, t0);\
  i6 = _mm_unpacklo_epi64(i6, t0);\
  i7 = _mm_unpackhi_epi64(i7, t0);\
}/**/

/* Matrix Transpose Output Inverse Step 2
 * input is one 512-bit state with one row in the low 64-bits of one xmm
 * output is one 512-bit state with two rows in one xmm
 * inputs: i0-i7 = (0|S)
 * outputs: (i0, i2, i4, i6) = S
 */
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
  i0 = _mm_unpacklo_epi64(i0, i1);\
  i2 = _mm_unpacklo_epi64(i2, i3);\
  i4 = _mm_unpacklo_epi64(i4, i5);\
  i6 = _mm_unpacklo_epi64(i6, i7);\
}/**/


void INIT256( __m128i* chaining )
{
  static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
  static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;

  /* load IV into registers xmm12 - xmm15 */
  xmm12 = chaining[0];
  xmm13 = chaining[1];
  xmm14 = chaining[2];
  xmm15 = chaining[3];

  /* transform chaining value from column ordering into row ordering */
  /* we put two rows (64 bit) of the IV into one 128-bit XMM register */
  Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);

  /* store transposed IV */
  chaining[0] = xmm12;
  chaining[1] = xmm2;
  chaining[2] = xmm6;
  chaining[3] = xmm7;
}

void TF512( __m128i* chaining, __m128i* message )
{
  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
  static __m128i TEMP0;
  static __m128i TEMP1;
  static __m128i TEMP2;

#ifdef IACA_TRACE
  IACA_START;
#endif

  /* load message into registers xmm12 - xmm15 */
  xmm12 = message[0];
  xmm13 = message[1];
  xmm14 = message[2];
  xmm15 = message[3];

  /* transform message M from column ordering into row ordering */
  /* we first put two rows (64 bit) of the message into one 128-bit xmm register */
  Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);

  /* load previous chaining value */
  /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
  xmm8 = chaining[0];
  xmm0 = chaining[1];
  xmm4 = chaining[2];
  xmm5 = chaining[3];

  /* xor message to CV get input of P */
  /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
  xmm8 = _mm_xor_si128(xmm8, xmm12);
  xmm0 = _mm_xor_si128(xmm0, xmm2);
  xmm4 = _mm_xor_si128(xmm4, xmm6);
  xmm5 = _mm_xor_si128(xmm5, xmm7);

  /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
  /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
  /* result: the 8 rows of P and Q in xmm8 - xmm12 */
  Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);

  /* compute the two permutations P and Q in parallel */
  ROUNDS_P_Q();

  /* unpack again to get two rows of P or two rows of Q in one xmm register */
  Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);

  /* xor output of P and Q */
  /* result: P(CV+M)+Q(M) in xmm0...xmm3 */
  xmm0 = _mm_xor_si128(xmm0, xmm8);
  xmm1 = _mm_xor_si128(xmm1, xmm10);
  xmm2 = _mm_xor_si128(xmm2, xmm12);
  xmm3 = _mm_xor_si128(xmm3, xmm14);

  /* xor CV (feed-forward) */
  /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
  xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
  xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
  xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
  xmm3 = _mm_xor_si128(xmm3, (chaining[3]));

  /* store CV */
  chaining[0] = xmm0;
  chaining[1] = xmm1;
  chaining[2] = xmm2;
  chaining[3] = xmm3;

#ifdef IACA_TRACE
  IACA_END;
#endif
  return;
}

void OF512( __m128i* chaining )
{
  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
  static __m128i TEMP0;
  static __m128i TEMP1;
  static __m128i TEMP2;

  /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
  xmm8 = chaining[0];
  xmm10 = chaining[1];
  xmm12 = chaining[2];
  xmm14 = chaining[3];

  /* there are now 2 rows of the CV in one xmm register */
  /* unpack to get 1 row of P (64 bit) into one half of an xmm register */
  /* result: the 8 input rows of P in xmm8 - xmm15 */
  Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);

  /* compute the permutation P */
  /* result: the output of P(CV) in xmm8 - xmm15 */
  ROUNDS_P_Q();

  /* unpack again to get two rows of P in one xmm register */
  /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
  Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);

  /* xor CV to P output (feed-forward) */
  /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
  xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
  xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
  xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
  xmm14 = _mm_xor_si128(xmm14, (chaining[3]));

  /* transform state back from row ordering into column ordering */
  /* result: final hash value in xmm9, xmm11 */
  Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);

  /* we only need to return the truncated half of the state */
  chaining[2] = xmm9;
  chaining[3] = xmm11;
}

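/* Summary note (restating the comments above; not original code): one
 * TF512 call computes the Groestl-256 compression
 *    h' = P( h XOR m ) XOR Q( m ) XOR h
 * with P and Q evaluated in parallel on the row-interleaved state, and
 * OF512 applies the output transform  h -> P(h) XOR h, truncated to the
 * upper half of the state. */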
@@ -1,114 +0,0 @@

/* hash.c     Aug 2011
 * groestl512-hash-4way  https://github.com/JayDDee/cpuminer-opt  2019-12.
 *
 * Groestl implementation for different versions.
 * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
 *
 * This code is placed in the public domain
 */

// Optimized for hash and data lengths that are integral multiples of __m128i

#include <memory.h>
#include "groestl512-intr-4way.h"
#include "miner.h"
#include "simd-utils.h"

#if defined(__VAES__)

#define ROTL64(a,n) \
   ( ( ( (a)<<(n) ) | ( (a) >> (64-(n)) ) ) & 0xffffffffffffffff )

#define U64BIG(a) \
   ( ( ROTL64(a, 8) & 0x000000FF000000FF ) | \
     ( ROTL64(a,24) & 0x0000FF000000FF00 ) | \
     ( ROTL64(a,40) & 0x00FF000000FF0000 ) | \
     ( ROTL64(a,56) & 0xFF000000FF000000 ) )

int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{
   int i;

   ctx->hashlen = hashlen;
   SET_CONSTANTS();

   if (ctx->chaining == NULL || ctx->buffer == NULL)
      return 1;

   for ( i = 0; i < SIZE512; i++ )
   {
      ctx->chaining[i] = m512_zero;
      ctx->buffer[i] = m512_zero;
   }

   uint64_t len = U64BIG((uint64_t)LENGTH);
   ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 );
   INIT_4way(ctx->chaining);
   ctx->buf_ptr = 0;
   ctx->rem_ptr = 0;

   return 0;
}

int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
                                  const void* input, uint64_t databitlen )
{
   const int len = (int)databitlen / 128;
   const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
   const int hash_offset = SIZE512 - hashlen_m128i;
   int rem = ctx->rem_ptr;
   int blocks = len / SIZE512;
   __m512i* in = (__m512i*)input;
   int i;

   // --- update ---

   // digest any full blocks, process directly from input
   for ( i = 0; i < blocks; i++ )
      TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] );
   ctx->buf_ptr = blocks * SIZE512;

   // copy any remaining data to buffer, it may already contain data
   // from a previous update for a midstate precalc
   for ( i = 0; i < len % SIZE512; i++ )
      ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
   i += rem;   // use i as rem_ptr in final

   //--- final ---

   blocks++;   // adjust for final block

   if ( i == SIZE512 - 1 )
   {
      // only 1 vector left in buffer, all padding at once
      ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
             blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
   }
   else
   {
      // add first padding
      ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
      // add zero padding
      for ( i += 1; i < SIZE512 - 1; i++ )
         ctx->buffer[i] = m512_zero;

      // add length padding, second last byte is zero unless blocks > 255
      ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
             blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
   }

   // digest final padding block and do output transform
   TF1024_4way( ctx->chaining, ctx->buffer );

   OF1024_4way( ctx->chaining );

   // store hash result in output
   for ( i = 0; i < hashlen_m128i; i++ )
      casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];

   return 0;
}

#endif   // VAES

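/* Hypothetical usage sketch (names taken from this file; the 4x128
 * interleaved buffer layout is assumed, not shown here): hash four
 * independent 80-byte inputs in one pass, producing four interleaved
 * 64-byte digests. */
static void groestl512_4way_sketch( void *vhash, const void *vdata )
{
   groestl512_4way_context ctx;
   groestl512_4way_init( &ctx, 64 );                          // digest length in bytes
   groestl512_4way_update_close( &ctx, vhash, vdata, 80*8 );  // data length in bits
}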
@@ -1,94 +0,0 @@

/* hash.h     Aug 2011
 *
 * Groestl implementation for different versions.
 * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
 *
 * This code is placed in the public domain
 */

#if !defined(GROESTL512_HASH_4WAY_H__)
#define GROESTL512_HASH_4WAY_H__ 1

#include "simd-utils.h"
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
#endif
#include <stdlib.h>

#define LENGTH (512)

//#include "brg_endian.h"
//#define NEED_UINT_64T
//#include "algo/sha/brg_types.h"

/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
//#define COLS512 (8)
#define COLS1024 (16)
//#define SIZE512 ((ROWS)*(COLS512))
#define SIZE_1024 ((ROWS)*(COLS1024))
//#define ROUNDS512 (10)
#define ROUNDS1024 (14)

//#if LENGTH<=256
//#define COLS (COLS512)
//#define SIZE (SIZE512)
//#define ROUNDS (ROUNDS512)
//#else
#define COLS (COLS1024)
//#define SIZE (SIZE1024)
#define ROUNDS (ROUNDS1024)
//#endif

/*
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))

#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
#define U64BIG(a) (a)
#endif // IS_BIG_ENDIAN

#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
#define U64BIG(a) \
  ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
   (ROTL64(a,24) & li_64(0000FF000000FF00)) | \
   (ROTL64(a,40) & li_64(00FF000000FF0000)) | \
   (ROTL64(a,56) & li_64(FF000000FF000000)))
#endif // IS_LITTLE_ENDIAN

typedef unsigned char BitSequence_gr;
typedef unsigned long long DataLength_gr;
typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr;
*/

#define SIZE512 (SIZE_1024/16)

typedef struct {
  __attribute__ ((aligned (128))) __m512i chaining[SIZE512];
  __attribute__ ((aligned (64)))  __m512i buffer[SIZE512];
  int hashlen;      // bytes
  int blk_count;    // SIZE_m128i
  int buf_ptr;      // __m128i offset
  int rem_ptr;
  int databitlen;   // bits
} groestl512_4way_context;


int groestl512_4way_init( groestl512_4way_context*, uint64_t );

//int reinit_groestl( hashState_groestl* );

int groestl512_4way_update( groestl512_4way_context*, const void*,
                            uint64_t );

int groestl512_4way_close( groestl512_4way_context*, void* );

int groestl512_4way_update_close( groestl512_4way_context*, void*,
                                  const void*, uint64_t );

#endif /* GROESTL512_HASH_4WAY_H__ */
@@ -1,654 +0,0 @@
|
||||
/* groestl-intr-aes.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
|
||||
* instructions.
|
||||
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
|
||||
#if !defined(GROESTL512_INTR_4WAY_H__)
|
||||
#define GROESTL512_INTR_4WAY_H__ 1
|
||||
|
||||
#include "groestl512-hash-4way.h"
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
/* global constants */
|
||||
__m512i ROUND_CONST_Lx;
|
||||
//__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
//__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
__m512i ROUND_CONST_P[ROUNDS1024];
|
||||
__m512i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m512i TRANSP_MASK;
|
||||
__m512i SUBSH_MASK[8];
|
||||
__m512i ALL_1B;
|
||||
__m512i ALL_FF;
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm512_xor_si512(j, j);\
|
||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
|
||||
i = _mm512_add_epi8(i, i);\
|
||||
j = _mm512_and_si512(j, k);\
|
||||
i = _mm512_xor_si512(i, j);\
|
||||
}
|
||||
|
||||
/**/
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
Output: b0, ..., b7 = MixBytes(a0,...,a7).
|
||||
but we use the relations:
|
||||
t_i = a_i + a_{i+3}
|
||||
x_i = t_i + t_{i+3}
|
||||
y_i = t_i + t+{i+2} + a_{i+6}
|
||||
z_i = 2*x_i
|
||||
w_i = z_i + y_{i+4}
|
||||
v_i = 2*w_i
|
||||
b_i = v_{i+3} + y_{i+4}
|
||||
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
|
||||
and then adding v_i computed in the meantime in registers xmm0..xmm7.
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm512_xor_si512(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm512_xor_si512(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm512_xor_si512(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm512_xor_si512(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm512_xor_si512(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm512_xor_si512(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm512_xor_si512(a6, a7);\
|
||||
a7 = _mm512_xor_si512(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm512_xor_si512(b0, a4);\
|
||||
b6 = _mm512_xor_si512(b6, a4);\
|
||||
b1 = _mm512_xor_si512(b1, a5);\
|
||||
b7 = _mm512_xor_si512(b7, a5);\
|
||||
b2 = _mm512_xor_si512(b2, a6);\
|
||||
b0 = _mm512_xor_si512(b0, a6);\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm512_xor_si512(b3, a7);\
|
||||
b1 = _mm512_xor_si512(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm512_xor_si512(b4, a0);\
|
||||
b2 = _mm512_xor_si512(b2, a0);\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm512_xor_si512(b5, a1);\
|
||||
b3 = _mm512_xor_si512(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm512_xor_si512(b6, a2);\
|
||||
b4 = _mm512_xor_si512(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm512_xor_si512(b7, a3);\
|
||||
b5 = _mm512_xor_si512(b5, a3);\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm512_xor_si512(a0, a3);\
|
||||
a1 = _mm512_xor_si512(a1, a4);\
|
||||
a2 = _mm512_xor_si512(a2, a5);\
|
||||
a3 = _mm512_xor_si512(a3, a6);\
|
||||
a4 = _mm512_xor_si512(a4, a7);\
|
||||
a5 = _mm512_xor_si512(a5, b0);\
|
||||
a6 = _mm512_xor_si512(a6, b1);\
|
||||
a7 = _mm512_xor_si512(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = ALL_1B;\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm512_xor_si512(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm512_xor_si512(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm512_xor_si512(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm512_xor_si512(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm512_xor_si512(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm512_xor_si512(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm512_xor_si512(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm512_xor_si512(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm512_xor_si512(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm512_xor_si512(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm512_xor_si512(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm512_xor_si512(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm512_xor_si512(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm512_xor_si512(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm512_xor_si512(b0, a3);\
|
||||
b1 = _mm512_xor_si512(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
// calculate the round constants seperately and load at startup
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
ALL_FF = _mm512_set1_epi32( 0xffffffff );\
|
||||
ALL_1B = _mm512_set1_epi32( 0x1b1b1b1b );\
|
||||
TRANSP_MASK = _mm512_set_epi32( \
|
||||
0x3f373b33, 0x3e363a32, 0x3d353931, 0x3c343830, \
|
||||
0x2f272b23, 0x2e262a22, 0x2d252921, 0x2c242820, \
|
||||
0x1f171b13, 0x1e161a12, 0x1d151911, 0x1c141810, \
|
||||
0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800 ); \
|
||||
SUBSH_MASK[0] = _mm512_set_epi32( \
|
||||
0x3336393c, 0x3f323538, 0x3b3e3134, 0x373a3d30, \
|
||||
0x2326292c, 0x2f222528, 0x2b2e2124, 0x272a2d20, \
|
||||
0x1316191c, 0x1f121518, 0x1b1e1114, 0x171a1d10, \
|
||||
0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00 ); \
|
||||
SUBSH_MASK[1] = _mm512_set_epi32( \
|
||||
0x34373a3d, 0x30333639, 0x3c3f3235, 0x383b3e31, \
|
||||
0x24272a2d, 0x20232629, 0x2c2f2225, 0x282b2e21, \
|
||||
0x14171a1d, 0x10131619, 0x1c1f1215, 0x181b1e11, \
|
||||
0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01 ); \
|
||||
SUBSH_MASK[2] = _mm512_set_epi32( \
|
||||
0x35383b3e, 0x3134373a, 0x3d303336, 0x393c3f32, \
|
||||
0x25282b2e, 0x2124272a, 0x2d202326, 0x292c2f22, \
|
||||
0x15181b1e, 0x1114171a, 0x1d101316, 0x191c1f12, \
|
||||
0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02 ); \
|
||||
SUBSH_MASK[3] = _mm512_set_epi32( \
|
||||
0x36393c3f, 0x3235383b, 0x3e313437, 0x3a3d3033, \
|
||||
0x26292c2f, 0x2225282b, 0x2e212427, 0x2a2d2023, \
|
||||
0x16191c1f, 0x1215181b, 0x1e111417, 0x1a1d1013, \
|
||||
0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003 ); \
|
||||
SUBSH_MASK[4] = _mm512_set_epi32( \
|
||||
0x373a3d30, 0x3336393c, 0x3f323538, 0x3b3e3134, \
|
||||
0x272a2d20, 0x2326292c, 0x2f222528, 0x2b2e2124, \
|
||||
0x171a1d10, 0x1316191c, 0x1f121518, 0x1b1e1114, \
|
||||
0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104 ); \
|
||||
SUBSH_MASK[5] = _mm512_set_epi32( \
|
||||
0x383b3e31, 0x34373a3d, 0x30333639, 0x3c3f3235, \
|
||||
0x282b2e21, 0x24272a2d, 0x20232629, 0x2c2f2225, \
|
||||
0x181b1e11, 0x14171a1d, 0x10131619, 0x1c1f1215, \
|
||||
0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205 ); \
|
||||
SUBSH_MASK[6] = _mm512_set_epi32( \
|
||||
0x393c3f32, 0x35383b3e, 0x3134373a, 0x3d303336, \
|
||||
0x292c2f22, 0x25282b2e, 0x2124272a, 0x2d202326, \
|
||||
0x191c1f12, 0x15181b1e, 0x1114171a, 0x1d101316, \
|
||||
0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306 ); \
|
||||
SUBSH_MASK[7] = _mm512_set_epi32( \
|
||||
0x3e313437, 0x3a3d3033, 0x36393c3f, 0x3235383b, \
|
||||
0x2e212427, 0x2a2d2023, 0x26292c2f, 0x2225282b, \
|
||||
0x1e111417, 0x1a1d1013, 0x16191c1f, 0x1215181b, \
|
||||
0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b ); \
|
||||
for( i = 0; i < ROUNDS1024; i++ ) \
|
||||
{ \
|
||||
ROUND_CONST_P[i] = _mm512_set4_epi32( 0xf0e0d0c0 ^ (i * 0x01010101), \
|
||||
0xb0a09080 ^ (i * 0x01010101), \
|
||||
0x70605040 ^ (i * 0x01010101), \
|
||||
0x30201000 ^ (i * 0x01010101) ); \
|
||||
ROUND_CONST_Q[i] = _mm512_set4_epi32( 0x0f1f2f3f ^ (i * 0x01010101), \
|
||||
0x4f5f6f7f ^ (i * 0x01010101), \
|
||||
0x8f9fafbf ^ (i * 0x01010101), \
|
||||
0xcfdfefff ^ (i * 0x01010101));\
|
||||
} \
|
||||
}while(0);\
|
||||
|
||||
/* one round
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* SubBytes */\
|
||||
b0 = _mm512_xor_si512( b0, b0 );\
|
||||
a0 = _mm512_aesenclast_epi128( a0, b0 );\
|
||||
a1 = _mm512_aesenclast_epi128( a1, b0 );\
|
||||
a2 = _mm512_aesenclast_epi128( a2, b0 );\
|
||||
a3 = _mm512_aesenclast_epi128( a3, b0 );\
|
||||
a4 = _mm512_aesenclast_epi128( a4, b0 );\
|
||||
a5 = _mm512_aesenclast_epi128( a5, b0 );\
|
||||
a6 = _mm512_aesenclast_epi128( a6, b0 );\
|
||||
a7 = _mm512_aesenclast_epi128( a7, b0 );\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}
|
||||
|
||||
#define ROUNDS_P(){\
|
||||
uint8_t round_counter = 0;\
|
||||
for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
|
||||
{ \
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm8 = _mm512_xor_si512( xmm8, ( ROUND_CONST_P[ round_counter ] ) );\
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[0] ) );\
|
||||
xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[1] ) );\
|
||||
xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[2] ) );\
|
||||
xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[3] ) );\
|
||||
xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[4] ) );\
|
||||
xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[5] ) );\
|
||||
xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[6] ) );\
|
||||
xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[7] ) );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm0 = _mm512_xor_si512( xmm0, ( ROUND_CONST_P[ round_counter+1 ] ) );\
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[0] ) );\
|
||||
xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[1] ) );\
|
||||
xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[2] ) );\
|
||||
xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[3] ) );\
|
||||
xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[4] ) );\
|
||||
xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[5] ) );\
|
||||
xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[6] ) );\
|
||||
xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[7] ) );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}\
|
||||
}
|
||||
|
||||
#define ROUNDS_Q(){\
|
||||
uint8_t round_counter = 0;\
|
||||
for ( round_counter = 0; round_counter < 14; round_counter += 2) \
|
||||
{ \
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm1 = ALL_FF;\
|
||||
xmm8 = _mm512_xor_si512( xmm8, xmm1 );\
|
||||
xmm9 = _mm512_xor_si512( xmm9, xmm1 );\
|
||||
xmm10 = _mm512_xor_si512( xmm10, xmm1 );\
|
||||
xmm11 = _mm512_xor_si512( xmm11, xmm1 );\
|
||||
xmm12 = _mm512_xor_si512( xmm12, xmm1 );\
|
||||
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
|
||||
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
|
||||
xmm15 = _mm512_xor_si512( xmm15, ( ROUND_CONST_Q[ round_counter ] ) );\
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[1] ) );\
|
||||
xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[3] ) );\
|
||||
xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[5] ) );\
|
||||
xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[7] ) );\
|
||||
xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[0] ) );\
|
||||
xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[2] ) );\
|
||||
xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[4] ) );\
|
||||
xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[6] ) );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm9 = ALL_FF;\
|
||||
xmm0 = _mm512_xor_si512( xmm0, xmm9 );\
|
||||
xmm1 = _mm512_xor_si512( xmm1, xmm9 );\
|
||||
xmm2 = _mm512_xor_si512( xmm2, xmm9 );\
|
||||
xmm3 = _mm512_xor_si512( xmm3, xmm9 );\
|
||||
xmm4 = _mm512_xor_si512( xmm4, xmm9 );\
|
||||
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
|
||||
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
|
||||
xmm7 = _mm512_xor_si512( xmm7, ( ROUND_CONST_Q[ round_counter+1 ] ) );\
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[1] ) );\
|
||||
xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[3] ) );\
|
||||
xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[5] ) );\
|
||||
xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[7] ) );\
|
||||
xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[0] ) );\
|
||||
xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[2] ) );\
|
||||
xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[4] ) );\
|
||||
xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[6] ) );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}\
|
||||
}
|
||||
|
||||
/* Matrix Transpose
|
||||
* input is a 1024-bit state with two columns in one xmm
|
||||
* output is a 1024-bit state with two rows in one xmm
|
||||
* inputs: i0-i7
|
||||
* outputs: i0-i7
|
||||
* clobbers: t0-t7
|
||||
*/
|
||||
#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
|
||||
t0 = TRANSP_MASK;\
|
||||
\
|
||||
i6 = _mm512_shuffle_epi8(i6, t0);\
|
||||
i0 = _mm512_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm512_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm512_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm512_shuffle_epi8(i3, t0);\
|
||||
t1 = i2;\
|
||||
i4 = _mm512_shuffle_epi8(i4, t0);\
|
||||
i5 = _mm512_shuffle_epi8(i5, t0);\
|
||||
t2 = i4;\
|
||||
t3 = i6;\
|
||||
i7 = _mm512_shuffle_epi8(i7, t0);\
|
||||
\
|
||||
/* continue with unpack using 4 temp registers */\
|
||||
t0 = i0;\
|
||||
t2 = _mm512_unpackhi_epi16(t2, i5);\
|
||||
i4 = _mm512_unpacklo_epi16(i4, i5);\
|
||||
t3 = _mm512_unpackhi_epi16(t3, i7);\
|
||||
i6 = _mm512_unpacklo_epi16(i6, i7);\
|
||||
t0 = _mm512_unpackhi_epi16(t0, i1);\
|
||||
t1 = _mm512_unpackhi_epi16(t1, i3);\
|
||||
i2 = _mm512_unpacklo_epi16(i2, i3);\
|
||||
i0 = _mm512_unpacklo_epi16(i0, i1);\
|
||||
\
|
||||
/* shuffle with immediate */\
|
||||
t0 = _mm512_shuffle_epi32(t0, 216);\
|
||||
t1 = _mm512_shuffle_epi32(t1, 216);\
|
||||
t2 = _mm512_shuffle_epi32(t2, 216);\
|
||||
t3 = _mm512_shuffle_epi32(t3, 216);\
|
||||
i0 = _mm512_shuffle_epi32(i0, 216);\
|
||||
i2 = _mm512_shuffle_epi32(i2, 216);\
|
||||
i4 = _mm512_shuffle_epi32(i4, 216);\
|
||||
i6 = _mm512_shuffle_epi32(i6, 216);\
|
||||
\
|
||||
  /* continue with unpack */\
  t4 = i0;\
  i0 = _mm512_unpacklo_epi32(i0, i2);\
  t4 = _mm512_unpackhi_epi32(t4, i2);\
  t5 = t0;\
  t0 = _mm512_unpacklo_epi32(t0, t1);\
  t5 = _mm512_unpackhi_epi32(t5, t1);\
  t6 = i4;\
  i4 = _mm512_unpacklo_epi32(i4, i6);\
  t7 = t2;\
  t6 = _mm512_unpackhi_epi32(t6, i6);\
  i2 = t0;\
  t2 = _mm512_unpacklo_epi32(t2, t3);\
  i3 = t0;\
  t7 = _mm512_unpackhi_epi32(t7, t3);\
  \
  /* there are now 2 rows in each xmm */\
  /* unpack to get 1 row of CV in each xmm */\
  i1 = i0;\
  i1 = _mm512_unpackhi_epi64(i1, i4);\
  i0 = _mm512_unpacklo_epi64(i0, i4);\
  i4 = t4;\
  i3 = _mm512_unpackhi_epi64(i3, t2);\
  i5 = t4;\
  i2 = _mm512_unpacklo_epi64(i2, t2);\
  i6 = t5;\
  i5 = _mm512_unpackhi_epi64(i5, t6);\
  i7 = t5;\
  i4 = _mm512_unpacklo_epi64(i4, t6);\
  i7 = _mm512_unpackhi_epi64(i7, t7);\
  i6 = _mm512_unpacklo_epi64(i6, t7);\
  /* transpose done */\
}/**/

/* Matrix Transpose Inverse
 * input is a 1024-bit state with two rows in one xmm
 * output is a 1024-bit state with two columns in one xmm
 * inputs: i0-i7
 * outputs: (i0, o0, i1, i3, o1, o2, i5, i7)
 * clobbers: t0-t4
 */
#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\
  /* transpose matrix to get output format */\
  o1 = i0;\
  i0 = _mm512_unpacklo_epi64(i0, i1);\
  o1 = _mm512_unpackhi_epi64(o1, i1);\
  t0 = i2;\
  i2 = _mm512_unpacklo_epi64(i2, i3);\
  t0 = _mm512_unpackhi_epi64(t0, i3);\
  t1 = i4;\
  i4 = _mm512_unpacklo_epi64(i4, i5);\
  t1 = _mm512_unpackhi_epi64(t1, i5);\
  t2 = i6;\
  o0 = TRANSP_MASK;\
  i6 = _mm512_unpacklo_epi64(i6, i7);\
  t2 = _mm512_unpackhi_epi64(t2, i7);\
  /* load transpose mask into a register, because it will be used 8 times */\
  i0 = _mm512_shuffle_epi8(i0, o0);\
  i2 = _mm512_shuffle_epi8(i2, o0);\
  i4 = _mm512_shuffle_epi8(i4, o0);\
  i6 = _mm512_shuffle_epi8(i6, o0);\
  o1 = _mm512_shuffle_epi8(o1, o0);\
  t0 = _mm512_shuffle_epi8(t0, o0);\
  t1 = _mm512_shuffle_epi8(t1, o0);\
  t2 = _mm512_shuffle_epi8(t2, o0);\
  /* continue with unpack using 4 temp registers */\
  t3 = i4;\
  o2 = o1;\
  o0 = i0;\
  t4 = t1;\
  \
  t3 = _mm512_unpackhi_epi16(t3, i6);\
  i4 = _mm512_unpacklo_epi16(i4, i6);\
  o0 = _mm512_unpackhi_epi16(o0, i2);\
  i0 = _mm512_unpacklo_epi16(i0, i2);\
  o2 = _mm512_unpackhi_epi16(o2, t0);\
  o1 = _mm512_unpacklo_epi16(o1, t0);\
  t4 = _mm512_unpackhi_epi16(t4, t2);\
  t1 = _mm512_unpacklo_epi16(t1, t2);\
  /* shuffle with immediate */\
  i4 = _mm512_shuffle_epi32(i4, 216);\
  t3 = _mm512_shuffle_epi32(t3, 216);\
  o1 = _mm512_shuffle_epi32(o1, 216);\
  o2 = _mm512_shuffle_epi32(o2, 216);\
  i0 = _mm512_shuffle_epi32(i0, 216);\
  o0 = _mm512_shuffle_epi32(o0, 216);\
  t1 = _mm512_shuffle_epi32(t1, 216);\
  t4 = _mm512_shuffle_epi32(t4, 216);\
  /* continue with unpack */\
  i1 = i0;\
  i3 = o0;\
  i5 = o1;\
  i7 = o2;\
  i0 = _mm512_unpacklo_epi32(i0, i4);\
  i1 = _mm512_unpackhi_epi32(i1, i4);\
  o0 = _mm512_unpacklo_epi32(o0, t3);\
  i3 = _mm512_unpackhi_epi32(i3, t3);\
  o1 = _mm512_unpacklo_epi32(o1, t1);\
  i5 = _mm512_unpackhi_epi32(i5, t1);\
  o2 = _mm512_unpacklo_epi32(o2, t4);\
  i7 = _mm512_unpackhi_epi32(i7, t4);\
  /* transpose done */\
}/**/
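
/* Editor's note: a standalone illustration, not part of the source. The
 * immediate 216 (0xD8 = 0b11011000) used with _mm512_shuffle_epi32 above
 * swaps the middle two dwords of each 128-bit lane:
 * { a, b, c, d } -> { a, c, b, d }. Demonstrated with the SSE2 form:
 */
#include <immintrin.h>
#include <stdio.h>

int main()
{
   __m128i v = _mm_set_epi32( 3, 2, 1, 0 );   // d=3 c=2 b=1 a=0
   __m128i r = _mm_shuffle_epi32( v, 216 );   // expect { 0, 2, 1, 3 }
   int out[4];
   _mm_storeu_si128( (__m128i*)out, r );
   printf( "%d %d %d %d\n", out[0], out[1], out[2], out[3] );   // 0 2 1 3
   return 0;
}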


void INIT_4way( __m512i* chaining )
{
   static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
   static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;

   /* load IV into registers xmm8 - xmm15 */
   xmm8  = chaining[0];
   xmm9  = chaining[1];
   xmm10 = chaining[2];
   xmm11 = chaining[3];
   xmm12 = chaining[4];
   xmm13 = chaining[5];
   xmm14 = chaining[6];
   xmm15 = chaining[7];

   /* transform chaining value from column ordering into row ordering */
   Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);

   /* store transposed IV */
   chaining[0] = xmm8;
   chaining[1] = xmm9;
   chaining[2] = xmm10;
   chaining[3] = xmm11;
   chaining[4] = xmm12;
   chaining[5] = xmm13;
   chaining[6] = xmm14;
   chaining[7] = xmm15;
}

void TF1024_4way( __m512i* chaining, const __m512i* message )
{
   static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
   static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
   static __m512i QTEMP[8];
   static __m512i TEMP0;
   static __m512i TEMP1;
   static __m512i TEMP2;

   /* load message into registers xmm8 - xmm15 (Q = message) */
   xmm8  = message[0];
   xmm9  = message[1];
   xmm10 = message[2];
   xmm11 = message[3];
   xmm12 = message[4];
   xmm13 = message[5];
   xmm14 = message[6];
   xmm15 = message[7];

   /* transform message M from column ordering into row ordering */
   Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);

   /* store message M (Q input) for later */
   QTEMP[0] = xmm8;
   QTEMP[1] = xmm9;
   QTEMP[2] = xmm10;
   QTEMP[3] = xmm11;
   QTEMP[4] = xmm12;
   QTEMP[5] = xmm13;
   QTEMP[6] = xmm14;
   QTEMP[7] = xmm15;

   /* xor CV to message to get P input */
   /* result: CV+M in xmm8...xmm15 */
   xmm8  = _mm512_xor_si512( xmm8,  (chaining[0]) );
   xmm9  = _mm512_xor_si512( xmm9,  (chaining[1]) );
   xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
   xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
   xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
   xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
   xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
   xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );

   /* compute permutation P */
   /* result: P(CV+M) in xmm8...xmm15 */
   ROUNDS_P();

   /* xor CV to P output (feed-forward) */
   /* result: P(CV+M)+CV in xmm8...xmm15 */
   xmm8  = _mm512_xor_si512( xmm8,  (chaining[0]) );
   xmm9  = _mm512_xor_si512( xmm9,  (chaining[1]) );
   xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
   xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
   xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
   xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
   xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
   xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );

   /* store P(CV+M)+CV */
   chaining[0] = xmm8;
   chaining[1] = xmm9;
   chaining[2] = xmm10;
   chaining[3] = xmm11;
   chaining[4] = xmm12;
   chaining[5] = xmm13;
   chaining[6] = xmm14;
   chaining[7] = xmm15;

   /* load message M (Q input) into xmm8-15 */
   xmm8  = QTEMP[0];
   xmm9  = QTEMP[1];
   xmm10 = QTEMP[2];
   xmm11 = QTEMP[3];
   xmm12 = QTEMP[4];
   xmm13 = QTEMP[5];
   xmm14 = QTEMP[6];
   xmm15 = QTEMP[7];

   /* compute permutation Q */
   /* result: Q(M) in xmm8...xmm15 */
   ROUNDS_Q();

   /* xor Q output */
   /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */
   xmm8  = _mm512_xor_si512( xmm8,  (chaining[0]) );
   xmm9  = _mm512_xor_si512( xmm9,  (chaining[1]) );
   xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
   xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
   xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
   xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
   xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
   xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );

   /* store CV */
   chaining[0] = xmm8;
   chaining[1] = xmm9;
   chaining[2] = xmm10;
   chaining[3] = xmm11;
   chaining[4] = xmm12;
   chaining[5] = xmm13;
   chaining[6] = xmm14;
   chaining[7] = xmm15;

   return;
}
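
/* Editor's note: an illustrative sketch, not part of the source. Per the
 * Groestl specification, TF1024_4way computes the compression function
 * f(CV, M) = P(CV ^ M) ^ Q(M) ^ CV; word for word that reduces to the loop
 * below, with P and Q supplied by ROUNDS_P() and ROUNDS_Q() above.
 */
static inline void groestl1024_compress_sketch( uint64_t cv[16],
                      const uint64_t p_out[16], const uint64_t q_out[16] )
{
   /* p_out = P(CV ^ M) and q_out = Q(M) are assumed precomputed */
   for ( int i = 0; i < 16; i++ )
      cv[i] ^= p_out[i] ^ q_out[i];
}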

void OF1024_4way( __m512i* chaining )
{
   static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
   static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
   static __m512i TEMP0;
   static __m512i TEMP1;
   static __m512i TEMP2;

   /* load CV into registers xmm8 - xmm15 */
   xmm8  = chaining[0];
   xmm9  = chaining[1];
   xmm10 = chaining[2];
   xmm11 = chaining[3];
   xmm12 = chaining[4];
   xmm13 = chaining[5];
   xmm14 = chaining[6];
   xmm15 = chaining[7];

   /* compute permutation P */
   /* result: P(CV) in xmm8...xmm15 */
   ROUNDS_P();

   /* xor CV to P output (feed-forward) */
   /* result: P(CV)+CV in xmm8...xmm15 */
   xmm8  = _mm512_xor_si512( xmm8,  (chaining[0]) );
   xmm9  = _mm512_xor_si512( xmm9,  (chaining[1]) );
   xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
   xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
   xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
   xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
   xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
   xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );

   /* transpose CV back from row ordering to column ordering */
   /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
   Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7);

   /* we only need to return the truncated half of the state */
   chaining[4] = xmm0;
   chaining[5] = xmm6;
   chaining[6] = xmm13;
   chaining[7] = xmm15;

   return;
}

#endif   // VAES
#endif   // GROESTL512_INTR_4WAY_H__

@@ -1,159 +1,14 @@
#include "myrgr-gate.h"

#if defined(MYRGR_4WAY)

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

#include "aes_ni/hash-groestl.h"
#include "algo/sha/sha-hash-4way.h"
#if defined(__VAES__)
  #include "groestl512-hash-4way.h"
#endif

#if defined(MYRGR_8WAY)

typedef struct {
#if defined(__VAES__)
    groestl512_4way_context groestl;
#else
    hashState_groestl       groestl;
#endif
    sha256_8way_context     sha;
} myrgr_8way_ctx_holder;

myrgr_8way_ctx_holder myrgr_8way_ctx;

void init_myrgr_8way_ctx()
{
#if defined(__VAES__)
   groestl512_4way_init( &myrgr_8way_ctx.groestl, 64 );
#else
   init_groestl( &myrgr_8way_ctx.groestl, 64 );
#endif
   sha256_8way_init( &myrgr_8way_ctx.sha );
}

void myriad_8way_hash( void *output, const void *input )
{
   uint32_t vhash[16*8] __attribute__ ((aligned (128)));
   uint32_t vhashA[20*8] __attribute__ ((aligned (64)));
   uint32_t vhashB[20*8] __attribute__ ((aligned (64)));
   myrgr_8way_ctx_holder ctx;
   memcpy( &ctx, &myrgr_8way_ctx, sizeof(myrgr_8way_ctx) );

#if defined(__VAES__)

   rintrlv_8x64_4x128( vhashA, vhashB, input, 640 );
   groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 );
   groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 );

   uint32_t hash0[20] __attribute__ ((aligned (64)));
   uint32_t hash1[20] __attribute__ ((aligned (64)));
   uint32_t hash2[20] __attribute__ ((aligned (64)));
   uint32_t hash3[20] __attribute__ ((aligned (64)));
   uint32_t hash4[20] __attribute__ ((aligned (64)));
   uint32_t hash5[20] __attribute__ ((aligned (64)));
   uint32_t hash6[20] __attribute__ ((aligned (64)));
   uint32_t hash7[20] __attribute__ ((aligned (64)));

//   rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 );
   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
   intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
                    hash6, hash7 );

#else

   uint32_t hash0[20] __attribute__ ((aligned (64)));
   uint32_t hash1[20] __attribute__ ((aligned (64)));
   uint32_t hash2[20] __attribute__ ((aligned (64)));
   uint32_t hash3[20] __attribute__ ((aligned (64)));
   uint32_t hash4[20] __attribute__ ((aligned (64)));
   uint32_t hash5[20] __attribute__ ((aligned (64)));
   uint32_t hash6[20] __attribute__ ((aligned (64)));
   uint32_t hash7[20] __attribute__ ((aligned (64)));

   dintrlv_8x64( hash0, hash1, hash2, hash3,
                 hash4, hash5, hash6, hash7, input, 640 );

   update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 );
   memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );

   intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
                hash4, hash5, hash6, hash7, 512 );

#endif

   sha256_8way_update( &ctx.sha, vhash, 64 );
   sha256_8way_close( &ctx.sha, output );
}

int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t hash[8*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[7<<3]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
   uint32_t *noncep = vdata + 64+3;   // 4*16 + 3
   int thr_id = mythr->id;  // thr_id arg is deprecated

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

   mm512_bswap32_intrlv80_4x128( vdata, pdata );

   do
   {
      be32enc( noncep,    n   );
      be32enc( noncep+ 8, n+1 );
      be32enc( noncep+16, n+2 );
      be32enc( noncep+24, n+3 );
      be32enc( noncep+32, n+4 );
      be32enc( noncep+40, n+5 );
      be32enc( noncep+48, n+6 );
      be32enc( noncep+56, n+7 );

      myriad_8way_hash( hash, vdata );
      pdata[19] = n;

      for ( int lane = 0; lane < 8; lane++ )
      if ( hash7[ lane ] <= Htarg )
      {
         extr_lane_8x32( lane_hash, hash, lane, 256 );
         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
            pdata[19] = n + lane;
            submit_lane_solution( work, lane_hash, mythr, lane );
         }
      }
      n += 8;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce;
   return 0;
}
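
/* Editor's note: an illustrative sketch, not part of the source. The lane
 * test above relies on 8x32 interleaving, where word j of lane i lives at
 * v[ j*8 + i ] (hence hash7 = &hash[7<<3] indexes word 7 of every lane).
 * extr_lane_8x32 is assumed to pull one lane's hash out of that layout:
 */
static inline void extr_lane_8x32_sketch( uint32_t *d, const uint32_t *v,
                                          const int lane, const int bit_len )
{
   for ( int j = 0; j < bit_len/32; j++ )
      d[j] = v[ j*8 + lane ];
}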

#elif defined(MYRGR_4WAY)

typedef struct {
    hashState_groestl       groestl;
@@ -190,7 +45,7 @@ void myriad_4way_hash( void *output, const void *input )

   intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

   sha256_4way_update( &ctx.sha, vhash, 64 );
   sha256_4way( &ctx.sha, vhash, 64 );
   sha256_4way_close( &ctx.sha, output );
}


@@ -2,22 +2,16 @@

bool register_myriad_algo( algo_gate_t* gate )
{
#if defined (MYRGR_8WAY)
   init_myrgr_8way_ctx();
   gate->scanhash = (void*)&scanhash_myriad_8way;
   gate->hash     = (void*)&myriad_8way_hash;
   gate->optimizations = AES_OPT | AVX2_OPT | VAES_OPT;
#elif defined (MYRGR_4WAY)
#if defined (MYRGR_4WAY)
   init_myrgr_4way_ctx();
   gate->scanhash = (void*)&scanhash_myriad_4way;
   gate->hash     = (void*)&myriad_4way_hash;
   gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | VAES_OPT;
#else
   init_myrgr_ctx();
   gate->scanhash = (void*)&scanhash_myriad;
   gate->hash     = (void*)&myriad_hash;
   gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
#endif
   gate->optimizations = AES_OPT | AVX2_OPT;
   return true;
};


@@ -1,35 +1,30 @@
#ifndef MYRGR_GATE_H__
#define MYRGR_GATE_H__ 1
#define MYRGR_GATE_H__

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define MYRGR_8WAY 1
#elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
  #define MYRGR_4WAY 1
#if defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
  #define MYRGR_4WAY
#endif

#if defined(MYRGR_8WAY)

void myriad_8way_hash( void *state, const void *input );
int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
void init_myrgr_8way_ctx();

#elif defined(MYRGR_4WAY)
#if defined(MYRGR_4WAY)

void myriad_4way_hash( void *state, const void *input );

int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );

void init_myrgr_4way_ctx();

#else
#endif

void myriad_hash( void *state, const void *input );

int scanhash_myriad( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );

void init_myrgr_ctx();

#endif
#endif


@@ -1171,8 +1171,7 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
   sc->h[7] = m256_const1_64( 0x6769756d2042656c );
}

void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
                           size_t len )
void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )
{
   __m256i *vdata = (__m256i*)data;

@@ -62,7 +62,7 @@ typedef hamsi_4way_big_context hamsi512_4way_context;
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
                           size_t len );
//#define hamsi512_4way hamsi512_4way_update
#define hamsi512_4way hamsi512_4way_update
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

@@ -38,7 +38,7 @@
#define SPH_XCAT_(a, b)    a ## b

static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
SPH_XCAT(SPH_XCAT(haval, PASSES), _4way)
( haval_4way_context *sc, const void *data, size_t len )
{
   __m128i *vdata = (__m128i*)data;

@@ -1,115 +0,0 @@
/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
/*
 * Helper code, included (three times !) by HAVAL implementation.
 *
 * TODO: try to merge this with md_helper.c.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2007-2010  Projet RNRT SAPHIR
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
 */

#undef SPH_XCAT
#define SPH_XCAT(a, b)    SPH_XCAT_(a, b)
#undef SPH_XCAT_
#define SPH_XCAT_(a, b)   a ## b

static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_update)
( haval_8way_context *sc, const void *data, size_t len )
{
   __m256i *vdata = (__m256i*)data;
   unsigned current;

   current = (unsigned)sc->count_low & 127U;
   while ( len > 0 )
   {
      unsigned clen;
      uint32_t clow, clow2;

      clen = 128U - current;
      if ( clen > len )
         clen = len;
      memcpy_256( sc->buf + (current>>2), vdata, clen>>2 );
      vdata += clen>>2;
      current += clen;
      len -= clen;
      if ( current == 128U )
      {
         DSTATE_8W;
         IN_PREPARE_8W(sc->buf);
         RSTATE_8W;
         SPH_XCAT(CORE_8W, PASSES)(INW_8W);
         WSTATE_8W;
         current = 0;
      }
      clow = sc->count_low;
      clow2 = clow + clen;
      sc->count_low = clow2;
      if ( clow2 < clow )
         sc->count_high ++;
   }
}

static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
                                                void *dst)
{
   unsigned current;
   DSTATE_8W;

   current = (unsigned)sc->count_low & 127UL;

   sc->buf[ current>>2 ] = m256_one_32;
   current += 4;
   RSTATE_8W;
   if ( current > 116UL )
   {
      memset_zero_256( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
      do
      {
         IN_PREPARE_8W(sc->buf);
         SPH_XCAT(CORE_8W, PASSES)(INW_8W);
      } while (0);
      current = 0;
   }

   uint32_t t1, t2;
   memset_zero_256( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
   t1 = 0x01 | (PASSES << 3);
   t2 = sc->olen << 3;
   sc->buf[ 116>>2 ] = _mm256_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
   sc->buf[ 120>>2 ] = _mm256_set1_epi32( sc->count_low << 3 );
   sc->buf[ 124>>2 ] = _mm256_set1_epi32( (sc->count_high << 3)
                                        | (sc->count_low >> 29) );
   do
   {
      IN_PREPARE_8W(sc->buf);
      SPH_XCAT(CORE_8W, PASSES)(INW_8W);
   } while (0);
   WSTATE_8W;
   haval_8way_out( sc, dst );
}
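
/* Editor's note: a worked example, not part of the source. For HAVAL-256
 * with 5 passes (olen = 256 >> 5 = 8) the padding word written at byte
 * offset 116 above is a fixed constant:
 *    t1 = 0x01 | (5 << 3) = 0x29,   t2 = 8 << 3 = 0x40,
 *    ( t1 << 16 ) | ( t2 << 24 ) = 0x40290000
 */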
@@ -40,7 +40,7 @@
#include <string.h>
#include "haval-hash-4way.h"

// won't compile with sse4.2, not a problem, it's only used with AVX2 4 way.
// won't compile with sse4.2
//#if defined (__SSE4_2__)
#if defined(__AVX__)

@@ -479,9 +479,9 @@ haval ## xxx ## _ ## y ## _4way_init(void *cc) \
} \
\
void \
haval ## xxx ## _ ## y ## _4way_update (void *cc, const void *data, size_t len) \
haval ## xxx ## _ ## y ## _4way (void *cc, const void *data, size_t len) \
{ \
   haval ## y ## _4way_update(cc, data, len); \
   haval ## y ## _4way(cc, data, len); \
} \
\
void \
@@ -518,301 +518,6 @@ do { \

#define INMSG(i) msg[i]

#if defined(__AVX2__)

// Haval-256 8 way 32 bit avx2

#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \
   _mm256_xor_si256( x0, \
      _mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \
                        _mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
                                          _mm256_and_si256( x3, x6 ) ) ) ) \

#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \
   _mm256_xor_si256( \
      _mm256_and_si256( x2, \
         _mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \
            _mm256_xor_si256( _mm256_and_si256( x4, x5 ), \
                              _mm256_xor_si256( x6, x0 ) ) ) ), \
      _mm256_xor_si256( \
         _mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \
         _mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \

#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
   _mm256_xor_si256( \
      _mm256_and_si256( x3, \
         _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
                           _mm256_xor_si256( x6, x0 ) ) ), \
      _mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \
                                         _mm256_and_si256( x2, x5 ) ), x0 ) )

#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
   _mm256_xor_si256( \
      _mm256_xor_si256( \
         _mm256_and_si256( x3, \
            _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
                                                _mm256_or_si256( x4, x6 ) ), x5 ) ), \
         _mm256_and_si256( x4, \
            _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \
                                                _mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \
      _mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) )


#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
   _mm256_xor_si256( \
      _mm256_and_si256( x0, \
         mm256_not( _mm256_xor_si256( \
            _mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \
      _mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
                                          _mm256_and_si256( x2, x5 ) ), \
                        _mm256_and_si256( x3, x6 ) ) )

#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
   F1_8W(x1, x0, x3, x5, x6, x2, x4)
#define FP3_2_8W(x6, x5, x4, x3, x2, x1, x0) \
   F2_8W(x4, x2, x1, x0, x5, x3, x6)
#define FP3_3_8W(x6, x5, x4, x3, x2, x1, x0) \
   F3_8W(x6, x1, x2, x3, x4, x5, x0)

#define FP4_1_8W(x6, x5, x4, x3, x2, x1, x0) \
   F1_8W(x2, x6, x1, x4, x5, x3, x0)
#define FP4_2_8W(x6, x5, x4, x3, x2, x1, x0) \
   F2_8W(x3, x5, x2, x0, x1, x6, x4)
#define FP4_3_8W(x6, x5, x4, x3, x2, x1, x0) \
   F3_8W(x1, x4, x3, x6, x0, x2, x5)
#define FP4_4_8W(x6, x5, x4, x3, x2, x1, x0) \
   F4_8W(x6, x4, x0, x5, x2, x1, x3)

#define FP5_1_8W(x6, x5, x4, x3, x2, x1, x0) \
   F1_8W(x3, x4, x1, x0, x5, x2, x6)
#define FP5_2_8W(x6, x5, x4, x3, x2, x1, x0) \
   F2_8W(x6, x2, x1, x0, x3, x4, x5)
#define FP5_3_8W(x6, x5, x4, x3, x2, x1, x0) \
   F3_8W(x2, x6, x0, x4, x3, x1, x5)
#define FP5_4_8W(x6, x5, x4, x3, x2, x1, x0) \
   F4_8W(x1, x5, x3, x2, x0, x4, x6)
#define FP5_5_8W(x6, x5, x4, x3, x2, x1, x0) \
   F5_8W(x2, x5, x0, x6, x4, x3, x1)

#define STEP_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
do { \
   __m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
   x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
                                            mm256_ror_32( x7, 11 ) ), \
                          _mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
} while (0)
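
/* Editor's note: an illustrative scalar form of STEP_8W, not part of the
 * source. Each HAVAL step rotates the round-function output and the old
 * word, then adds the message word and round constant:
 *    x7' = ror32( F(x6..x0), 7 ) + ror32( x7, 11 ) + w + c
 */
static inline uint32_t haval_step_scalar( uint32_t f, uint32_t x7,
                                          uint32_t w, uint32_t c )
{
   uint32_t rt  = ( f  >> 7  ) | ( f  << 25 );   /* ror32( f, 7 )   */
   uint32_t rx7 = ( x7 >> 11 ) | ( x7 << 21 );   /* ror32( x7, 11 ) */
   return rt + rx7 + w + c;
}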

#define PASS1_8W(n, in) do { \
   unsigned pass_count; \
   for (pass_count = 0; pass_count < 32; pass_count += 8) { \
      STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
              in(pass_count + 0), SPH_C32(0x00000000)); \
      STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
              in(pass_count + 1), SPH_C32(0x00000000)); \
      STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
              in(pass_count + 2), SPH_C32(0x00000000)); \
      STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
              in(pass_count + 3), SPH_C32(0x00000000)); \
      STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
              in(pass_count + 4), SPH_C32(0x00000000)); \
      STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
              in(pass_count + 5), SPH_C32(0x00000000)); \
      STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
              in(pass_count + 6), SPH_C32(0x00000000)); \
      STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
              in(pass_count + 7), SPH_C32(0x00000000)); \
   } \
} while (0)

#define PASSG_8W(p, n, in) do { \
   unsigned pass_count; \
   for (pass_count = 0; pass_count < 32; pass_count += 8) { \
      STEP_8W(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
              in(MP ## p[pass_count + 0]), \
              RK ## p[pass_count + 0]); \
      STEP_8W(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
              in(MP ## p[pass_count + 1]), \
              RK ## p[pass_count + 1]); \
      STEP_8W(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
              in(MP ## p[pass_count + 2]), \
              RK ## p[pass_count + 2]); \
      STEP_8W(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
              in(MP ## p[pass_count + 3]), \
              RK ## p[pass_count + 3]); \
      STEP_8W(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
              in(MP ## p[pass_count + 4]), \
              RK ## p[pass_count + 4]); \
      STEP_8W(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
              in(MP ## p[pass_count + 5]), \
              RK ## p[pass_count + 5]); \
      STEP_8W(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
              in(MP ## p[pass_count + 6]), \
              RK ## p[pass_count + 6]); \
      STEP_8W(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
              in(MP ## p[pass_count + 7]), \
              RK ## p[pass_count + 7]); \
   } \
} while (0)

#define PASS2_8W(n, in)   PASSG_8W(2, n, in)
#define PASS3_8W(n, in)   PASSG_8W(3, n, in)
#define PASS4_8W(n, in)   PASSG_8W(4, n, in)
#define PASS5_8W(n, in)   PASSG_8W(5, n, in)

#define SAVE_STATE_8W \
   __m256i u0, u1, u2, u3, u4, u5, u6, u7; \
do { \
   u0 = s0; \
   u1 = s1; \
   u2 = s2; \
   u3 = s3; \
   u4 = s4; \
   u5 = s5; \
   u6 = s6; \
   u7 = s7; \
} while (0)

#define UPDATE_STATE_8W \
do { \
   s0 = _mm256_add_epi32( s0, u0 ); \
   s1 = _mm256_add_epi32( s1, u1 ); \
   s2 = _mm256_add_epi32( s2, u2 ); \
   s3 = _mm256_add_epi32( s3, u3 ); \
   s4 = _mm256_add_epi32( s4, u4 ); \
   s5 = _mm256_add_epi32( s5, u5 ); \
   s6 = _mm256_add_epi32( s6, u6 ); \
   s7 = _mm256_add_epi32( s7, u7 ); \
} while (0)

#define CORE_8W5(in)  do { \
   SAVE_STATE_8W; \
   PASS1_8W(5, in); \
   PASS2_8W(5, in); \
   PASS3_8W(5, in); \
   PASS4_8W(5, in); \
   PASS5_8W(5, in); \
   UPDATE_STATE_8W; \
} while (0)

#define DSTATE_8W   __m256i s0, s1, s2, s3, s4, s5, s6, s7

#define RSTATE_8W \
do { \
   s0 = sc->s0; \
   s1 = sc->s1; \
   s2 = sc->s2; \
   s3 = sc->s3; \
   s4 = sc->s4; \
   s5 = sc->s5; \
   s6 = sc->s6; \
   s7 = sc->s7; \
} while (0)

#define WSTATE_8W \
do { \
   sc->s0 = s0; \
   sc->s1 = s1; \
   sc->s2 = s2; \
   sc->s3 = s3; \
   sc->s4 = s4; \
   sc->s5 = s5; \
   sc->s6 = s6; \
   sc->s7 = s7; \
} while (0)

static void
haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
{
   sc->s0 = m256_const1_32( 0x243F6A88UL );
   sc->s1 = m256_const1_32( 0x85A308D3UL );
   sc->s2 = m256_const1_32( 0x13198A2EUL );
   sc->s3 = m256_const1_32( 0x03707344UL );
   sc->s4 = m256_const1_32( 0xA4093822UL );
   sc->s5 = m256_const1_32( 0x299F31D0UL );
   sc->s6 = m256_const1_32( 0x082EFA98UL );
   sc->s7 = m256_const1_32( 0xEC4E6C89UL );
   sc->olen = olen;
   sc->passes = passes;
   sc->count_high = 0;
   sc->count_low = 0;

}
#define IN_PREPARE_8W(indata) const __m256i *const load_ptr_8w = (indata)

#define INW_8W(i)   load_ptr_8w[ i ]

static void
haval_8way_out( haval_8way_context *sc, void *dst )
{
   __m256i *buf = (__m256i*)dst;
   DSTATE_8W;
   RSTATE_8W;

   buf[0] = s0;
   buf[1] = s1;
   buf[2] = s2;
   buf[3] = s3;
   buf[4] = s4;
   buf[5] = s5;
   buf[6] = s6;
   buf[7] = s7;
}

#undef PASSES
#define PASSES 5
#include "haval-8way-helper.c"

#define API_8W(xxx, y) \
void \
haval ## xxx ## _ ## y ## _8way_init(void *cc) \
{ \
   haval_8way_init(cc, xxx >> 5, y); \
} \
\
void \
haval ## xxx ## _ ## y ## _8way_update (void *cc, const void *data, size_t len) \
{ \
   haval ## y ## _8way_update(cc, data, len); \
} \
\
void \
haval ## xxx ## _ ## y ## _8way_close(void *cc, void *dst) \
{ \
   haval ## y ## _8way_close(cc, dst); \
} \

API_8W(256, 5)

#define RVAL_8W \
do { \
   s0 = val[0]; \
   s1 = val[1]; \
   s2 = val[2]; \
   s3 = val[3]; \
   s4 = val[4]; \
   s5 = val[5]; \
   s6 = val[6]; \
   s7 = val[7]; \
} while (0)

#define WVAL_8W \
do { \
   val[0] = s0; \
   val[1] = s1; \
   val[2] = s2; \
   val[3] = s3; \
   val[4] = s4; \
   val[5] = s5; \
   val[6] = s6; \
   val[7] = s7; \
} while (0)

#define INMSG_8W(i) msg[i]



#endif // AVX2

#ifdef __cplusplus
}
#endif
@@ -59,7 +59,7 @@
 */

#ifndef HAVAL_HASH_4WAY_H__
#define HAVAL_HASH_4WAY_H__ 1
#define HAVAL_HASH_4WAY_H__

#if defined(__AVX__)

@@ -84,30 +84,10 @@ typedef haval_4way_context haval256_5_4way_context;

void haval256_5_4way_init( void *cc );

void haval256_5_4way_update( void *cc, const void *data, size_t len );
//#define haval256_5_4way haval256_5_4way_update
void haval256_5_4way( void *cc, const void *data, size_t len );

void haval256_5_4way_close( void *cc, void *dst );

#if defined(__AVX2__)

typedef struct {
   __m256i buf[32];
   __m256i s0, s1, s2, s3, s4, s5, s6, s7;
   unsigned olen, passes;
   uint32_t count_high, count_low;
} haval_8way_context __attribute__ ((aligned (64)));

typedef haval_8way_context haval256_5_8way_context;

void haval256_5_8way_init( void *cc );

void haval256_5_8way_update( void *cc, const void *data, size_t len );

void haval256_5_8way_close( void *cc, void *dst );

#endif // AVX2

#ifdef __cplusplus
}
#endif

@@ -103,12 +103,14 @@ typedef jh_4way_context jh512_4way_context;
void jh256_4way_init( jh_4way_context *sc);

void jh256_4way_update(void *cc, const void *data, size_t len);
#define jh256_4way jh256_4way_update

void jh256_4way_close(void *cc, void *dst);

void jh512_4way_init( jh_4way_context *sc );

void jh512_4way_update(void *cc, const void *data, size_t len);
#define jh512_4way jh512_4way_update

void jh512_4way_close(void *cc, void *dst);


@@ -33,7 +33,7 @@ void jha_hash_4way( void *out, const void *input )
   keccak512_4way_context ctx_keccak;

   keccak512_4way_init( &ctx_keccak );
   keccak512_4way_update( &ctx_keccak, input, 80 );
   keccak512_4way( &ctx_keccak, input, 80 );
   keccak512_4way_close( &ctx_keccak, vhash );

   // Heavy & Light Pair Loop
@@ -58,7 +58,7 @@ void jha_hash_4way( void *out, const void *input )
   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );

   skein512_4way_init( &ctx_skein );
   skein512_4way_update( &ctx_skein, vhash, 64 );
   skein512_4way( &ctx_skein, vhash, 64 );
   skein512_4way_close( &ctx_skein, vhashB );

   for ( int i = 0; i < 8; i++ )
@@ -69,7 +69,7 @@ void jha_hash_4way( void *out, const void *input )
   blake512_4way_close( &ctx_blake, vhashA );

   jh512_4way_init( &ctx_jh );
   jh512_4way_update( &ctx_jh, vhash, 64 );
   jh512_4way( &ctx_jh, vhash, 64 );
   jh512_4way_close( &ctx_jh, vhashB );

   for ( int i = 0; i < 8; i++ )

@@ -99,12 +99,14 @@ typedef keccak64_ctx_m256i keccak512_4way_context;
void keccak256_4way_init(void *cc);
void keccak256_4way_update(void *cc, const void *data, size_t len);
void keccak256_4way_close(void *cc, void *dst);
#define keccak256_4way keccak256_4way_update

void keccak512_4way_init(void *cc);
void keccak512_4way_update(void *cc, const void *data, size_t len);
void keccak512_4way_close(void *cc, void *dst);
void keccak512_4way_addbits_and_close(
   void *cc, unsigned ub, unsigned n, void *dst);
#define keccak512_4way keccak512_4way_update

#endif


@@ -1,178 +1,15 @@
#include "lyra2-gate.h"
#include <memory.h>
#include <mm_malloc.h>

#if defined (ALLIUM_4WAY)

#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/groestl/aes_ni/hash-groestl256.h"

#if defined (ALLIUM_8WAY)

typedef struct {
   blake256_8way_context     blake;
   keccak256_8way_context    keccak;
   cube_4way_context         cube;
   skein256_8way_context     skein;
   hashState_groestl256      groestl;
} allium_8way_ctx_holder;

static __thread allium_8way_ctx_holder allium_8way_ctx;

bool init_allium_8way_ctx()
{
   keccak256_8way_init( &allium_8way_ctx.keccak );
   cube_4way_init( &allium_8way_ctx.cube, 256, 16, 32 );
   skein256_8way_init( &allium_8way_ctx.skein );
   init_groestl256( &allium_8way_ctx.groestl, 32 );
   return true;
}

void allium_8way_hash( void *state, const void *input )
{
   uint32_t vhash[8*8] __attribute__ ((aligned (128)));
   uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
   uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
   uint32_t hash0[8] __attribute__ ((aligned (64)));
   uint32_t hash1[8] __attribute__ ((aligned (64)));
   uint32_t hash2[8] __attribute__ ((aligned (64)));
   uint32_t hash3[8] __attribute__ ((aligned (64)));
   uint32_t hash4[8] __attribute__ ((aligned (64)));
   uint32_t hash5[8] __attribute__ ((aligned (64)));
   uint32_t hash6[8] __attribute__ ((aligned (64)));
   uint32_t hash7[8] __attribute__ ((aligned (64)));
   allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));

   memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
   blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
   blake256_8way_close( &ctx.blake, vhash );

   rintrlv_8x32_8x64( vhashA, vhash, 256 );
   keccak256_8way_update( &ctx.keccak, vhashA, 32 );
   keccak256_8way_close( &ctx.keccak, vhash );

   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 vhash, 256 );

   intrlv_2x256( vhash, hash0, hash1, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash0, hash1, vhash, 256 );
   intrlv_2x256( vhash, hash2, hash3, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash2, hash3, vhash, 256 );
   intrlv_2x256( vhash, hash4, hash5, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash4, hash5, vhash, 256 );
   intrlv_2x256( vhash, hash6, hash7, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash6, hash7, vhash, 256 );

   intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
   intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );

   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
   cube_4way_init( &ctx.cube, 256, 16, 32 );
   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );

   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );

   intrlv_2x256( vhash, hash0, hash1, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash0, hash1, vhash, 256 );
   intrlv_2x256( vhash, hash2, hash3, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash2, hash3, vhash, 256 );
   intrlv_2x256( vhash, hash4, hash5, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash4, hash5, vhash, 256 );
   intrlv_2x256( vhash, hash6, hash7, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash6, hash7, vhash, 256 );

   intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                hash7, 256 );

   skein256_8way_update( &ctx.skein, vhash, 32 );
   skein256_8way_close( &ctx.skein, vhash );

   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 vhash, 256 );

   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
}

int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t hash[8*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t last_nonce = max_nonce - 8;
   const uint32_t Htarg = ptarget[7];
   __m256i *noncev = (__m256i*)vdata + 19;   // aligned
   int thr_id = mythr->id;  // thr_id arg is deprecated

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   blake256_8way_init( &allium_8way_ctx.blake );
   blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );

   do {
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
                                                  n+3, n+2, n+1, n ) );

      allium_8way_hash( hash, vdata );
      pdata[19] = n;

      for ( int lane = 0; lane < 8; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
      {
         if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
         {
            pdata[19] = n + lane;
            submit_lane_solution( work, hash+(lane<<3), mythr, lane );
         }
      }
      n += 8;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce;
   return 0;
}


#elif defined (ALLIUM_4WAY)


typedef struct {
   blake256_4way_context     blake;
   keccak256_4way_context    keccak;
@@ -204,11 +41,11 @@ void allium_4way_hash( void *state, const void *input )
   allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));

   memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
   blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
   blake256_4way( &ctx.blake, input + (64<<2), 16 );
   blake256_4way_close( &ctx.blake, vhash32 );

   rintrlv_4x32_4x64( vhash64, vhash32, 256 );
   keccak256_4way_update( &ctx.keccak, vhash64, 32 );
   keccak256_4way( &ctx.keccak, vhash64, 32 );
   keccak256_4way_close( &ctx.keccak, vhash64 );

   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -233,7 +70,7 @@ void allium_4way_hash( void *state, const void *input )

   intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );

   skein256_4way_update( &ctx.skein, vhash64, 32 );
   skein256_4way( &ctx.skein, vhash64, 32 );
   skein256_4way_close( &ctx.skein, vhash64 );

   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -78,7 +78,8 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_lyra2rev3;
   gate->hash     = (void*)&lyra2rev3_hash;
#endif
   gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
//   gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
   gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
   gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
   opt_target_factor = 256.0;
   return true;
@@ -94,14 +95,10 @@ bool lyra2rev2_thread_init()
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

   int size = (int64_t)ROW_LEN_BYTES * 4;   // nRows;
#if defined (LYRA2REV2_8WAY)
   l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 );   // 2 way
   init_lyra2rev2_8way_ctx();
#elif defined (LYRA2REV2_4WAY)
   l2v2_wholeMatrix = _mm_malloc( size, 64 );
#if defined (LYRA2REV2_4WAY)
   init_lyra2rev2_4way_ctx();
#else
   l2v2_wholeMatrix = _mm_malloc( size, 64 );
   init_lyra2rev2_ctx();
#endif
   return l2v2_wholeMatrix;
@@ -109,17 +106,14 @@ bool lyra2rev2_thread_init()

bool register_lyra2rev2_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV2_8WAY)
   gate->scanhash = (void*)&scanhash_lyra2rev2_8way;
   gate->hash     = (void*)&lyra2rev2_8way_hash;
#elif defined (LYRA2REV2_4WAY)
#if defined (LYRA2REV2_4WAY)
   gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
   gate->hash     = (void*)&lyra2rev2_4way_hash;
#else
   gate->scanhash = (void*)&scanhash_lyra2rev2;
   gate->hash     = (void*)&lyra2rev2_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
   gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
   opt_target_factor = 256.0;
   return true;
@@ -129,11 +123,7 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )

bool register_lyra2z_algo( algo_gate_t* gate )
{
#if defined(LYRA2Z_16WAY)
   gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
   gate->scanhash          = (void*)&scanhash_lyra2z_16way;
   gate->hash              = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
#if defined(LYRA2Z_8WAY)
   gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
   gate->scanhash          = (void*)&scanhash_lyra2z_8way;
   gate->hash              = (void*)&lyra2z_8way_hash;
@@ -146,7 +136,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
   gate->scanhash          = (void*)&scanhash_lyra2z;
   gate->hash              = (void*)&lyra2z_hash;
#endif
   gate->optimizations = SSE42_OPT | AVX2_OPT | AVX512_OPT;
   gate->optimizations = SSE42_OPT | AVX2_OPT;
   opt_target_factor = 256.0;
   return true;
};
@@ -174,11 +164,7 @@ bool register_lyra2h_algo( algo_gate_t* gate )

bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_8WAY)
   gate->miner_thread_init = (void*)&init_allium_8way_ctx;
   gate->scanhash          = (void*)&scanhash_allium_8way;
   gate->hash              = (void*)&allium_8way_hash;
#elif defined (ALLIUM_4WAY)
#if defined (ALLIUM_4WAY)
   gate->miner_thread_init = (void*)&init_allium_4way_ctx;
   gate->scanhash          = (void*)&scanhash_allium_4way;
   gate->hash              = (void*)&allium_4way_hash;
@@ -187,7 +173,7 @@ bool register_allium_algo( algo_gate_t* gate )
   gate->scanhash          = (void*)&scanhash_allium;
   gate->hash              = (void*)&allium_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
   opt_target_factor = 256.0;
   return true;
};
@@ -229,7 +215,7 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_phi2_algo( algo_gate_t* gate )
{
//   init_phi2_ctx();
   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
   gate->get_work_data_size = (void*)&phi2_get_work_data_size;
   gate->decode_extra_data  = (void*)&phi2_decode_extra_data;
   gate->build_extraheader  = (void*)&phi2_build_extraheader;

@@ -5,10 +5,10 @@
#include <stdint.h>
#include "lyra2.h"


#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define LYRA2REV3_16WAY 1
#elif defined(__AVX2__)
//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
//  #define LYRA2REV3_16WAY 1
//#elif defined(__AVX2__)
#if defined(__AVX2__)
  #define LYRA2REV3_8WAY 1
#elif defined(__SSE2__)
  #define LYRA2REV3_4WAY 1
@@ -50,24 +50,15 @@ bool init_lyra2rev3_ctx();

//////////////////////////////////

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define LYRA2REV2_8WAY 1
#elif defined(__AVX2__)
  #define LYRA2REV2_4WAY 1
#if defined(__AVX2__)
  #define LYRA2REV2_4WAY
#endif

extern __thread uint64_t* l2v2_wholeMatrix;

bool register_lyra2rev2_algo( algo_gate_t* gate );

#if defined(LYRA2REV2_8WAY)

void lyra2rev2_8way_hash( void *state, const void *input );
int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev2_8way_ctx();

#elif defined(LYRA2REV2_4WAY)
#if defined(LYRA2REV2_4WAY)

void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
@@ -85,25 +76,17 @@ bool init_lyra2rev2_ctx();

/////////////////////////

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define LYRA2Z_16WAY 1
#elif defined(__AVX2__)
  #define LYRA2Z_8WAY 1
#elif defined(__SSE2__)
  #define LYRA2Z_4WAY 1
#if defined(__SSE2__)
  #define LYRA2Z_4WAY
#endif
#if defined(__AVX2__)
  #define LYRA2Z_8WAY
#endif


#define LYRA2Z_MATRIX_SIZE  BLOCK_LEN_INT64 * 8 * 8 * 8

#if defined(LYRA2Z_16WAY)

void lyra2z_16way_hash( void *state, const void *input );
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_16way_thread_init();

#elif defined(LYRA2Z_8WAY)
#if defined(LYRA2Z_8WAY)

void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
@@ -152,22 +135,13 @@ bool lyra2h_thread_init();

//////////////////////////////////

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define ALLIUM_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
  #define ALLIUM_4WAY 1
#if defined(__AVX2__) && defined(__AES__)
  #define ALLIUM_4WAY
#endif

bool register_allium_algo( algo_gate_t* gate );

#if defined(ALLIUM_8WAY)

void allium_8way_hash( void *state, const void *input );
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_8way_ctx();

#elif defined(ALLIUM_4WAY)
#if defined(ALLIUM_4WAY)

void allium_4way_hash( void *state, const void *input );
int scanhash_allium_4way( struct work *work, uint32_t max_nonce,

@@ -1,578 +0,0 @@
/**
 * Implementation of the Lyra2 Password Hashing Scheme (PHS).
 *
 * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
 *
 * This software is hereby placed in the public domain.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <mm_malloc.h>
#include "compat.h"
#include "lyra2.h"
#include "sponge.h"

// LYRA2RE 8 cols 8 rows used by lyra2re, allium, phi2, x22i, x25x,
// dynamic matrix allocation.
//
// LYRA2REV2 4 cols 4 rows used by lyra2rev2 and x21s, static matrix
// allocation.
//
// LYRA2REV3 4 cols 4 rows with an extra twist in calculating
// rowa in the wandering phase. Used by lyra2rev3. Static matrix
// allocation.
//
// LYRA2Z various cols & rows and supports 80 byte input. Used by lyra2z,
// lyra2z330, lyra2h,
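//
// Editor's note: an illustrative footprint sketch, not part of the source.
// Assuming BLOCK_LEN_INT64 == 12 (as in lyra2.h), the per-lane matrix sizes
// implied by the variants above work out as:
//
//    LYRA2REV2 / LYRA2REV3 : 4 rows * 4 cols * 12 * 8 = 1536 bytes
//    LYRA2RE               : 8 rows * 8 cols * 12 * 8 = 6144 bytes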


#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

/**
 * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
 * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
 * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
 * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
 * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
 *
 * @param K The derived key to be output by the algorithm
 * @param kLen Desired key length
 * @param pwd User password
 * @param pwdlen Password length
 * @param salt Salt
 * @param saltlen Salt length
 * @param timeCost Parameter to determine the processing time (T)
 * @param nRows Number of rows of the memory matrix (R)
 * @param nCols Number of columns of the memory matrix (C)
 *
 * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
 */

// For lyra2rev3.
// convert a simple offset to an index into 2x4 u64 interleaved data.
// good for state and 4 row matrix.
// index = ( int( off / 4 ) * 2 ) + ( off mod 4 )

#define offset_to_index( o ) \
   ( ( ( (uint64_t)( (o) & 0xf) / 4 ) * 8 ) + ( (o) % 4 ) )
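
// Editor's note: a worked example, not part of the source. Evaluating the
// macro as written maps offsets 0..7 to 0,1,2,3,8,9,10,11: four consecutive
// u64s of lane 0, then the matching four u64s of lane 1 at a stride of 8.
// (The summary comment above says "* 2" while the macro multiplies by 8;
// the macro body is what the code actually uses.)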


int LYRA2REV2_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
          const void *pwd, const uint64_t pwdlen, const uint64_t timeCost,
          const uint64_t nRows, const uint64_t nCols )
{
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[32];
   int64_t row = 2;
   int64_t prev = 1;
   int64_t rowa0 = 0;
   int64_t rowa1 = 0;
   int64_t tau;
   int64_t step = 1;
   int64_t window = 2;
   int64_t gap = 1;
   //====================================================================/

   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;

   // for Lyra2REv2, nCols = 4, v1 was using 8
   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
   uint64_t *ptrWord = wholeMatrix;

   int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
                            / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
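
   // Editor's note: a worked example, not part of the source. For the
   // 32-byte inputs used by the 2-way callers, and assuming
   // BLOCK_LEN_BLAKE2_SAFE_BYTES == 64 (8 x 64-bit words), this gives
   // nBlocksInput = ((32 + 32 + 48) / 64) + 1 = 2.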
|
||||
|
||||
uint64_t *ptr = wholeMatrix;
|
||||
uint64_t *pw = (uint64_t*)pwd;
|
||||
|
||||
memcpy( ptr, pw, 2*pwdlen ); // password
|
||||
ptr += pwdlen>>2;
|
||||
memcpy( ptr, pw, 2*pwdlen ); // password lane 1
|
||||
ptr += pwdlen>>2;
|
||||
|
||||
// now build the rest interleaving on the fly.
|
||||
|
||||
ptr[0] = ptr[ 4] = kLen;
|
||||
ptr[1] = ptr[ 5] = pwdlen;
|
||||
ptr[2] = ptr[ 6] = pwdlen; // saltlen
|
||||
ptr[3] = ptr[ 7] = timeCost;
|
||||
ptr[8] = ptr[12] = nRows;
|
||||
ptr[9] = ptr[13] = nCols;
|
||||
ptr[10] = ptr[14] = 0x80;
|
||||
ptr[11] = ptr[15] = 0x0100000000000000;

   ptrWord = wholeMatrix;

   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );

   //Initializes M[0] and M[1]
   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );

   reducedDuplexRow1_2way( state, &wholeMatrix[0],
                           &wholeMatrix[ 2 * ROW_LEN_INT64 ], nCols );

   do
   {
      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)

      reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64],
                                  &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64],
                                  &wholeMatrix[ 2* row*ROW_LEN_INT64],
                                  nCols );

      rowa0 = (rowa0 + step) & (window - 1);

      prev = row;
      row++;

      if ( rowa0 == 0 )
      {
         step = window + gap;
         window *= 2;
         gap = -gap;
      }
   } while ( row < nRows );
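   // The revisitation schedule is deterministic. For nRows = 8 the loop
   // above visits rowa0 = 1, 0, 3, 2, 1, 0 while filling rows 2..7: the
   // window doubles (2, 4, 8) each time rowa0 wraps to 0, and step is
   // rebuilt from window + gap with gap alternating between +1 and -1.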

   //===================== Wandering Phase =============================//
   row = 0;
   for ( tau = 1; tau <= timeCost; tau++ )
   {
      step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1;
      do
      {
         rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
         rowa1 = state[ 4 ] & (unsigned int)(nRows-1);

         reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
                                &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
                                &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
                                &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
                                nCols );
         prev = row;

         row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)

      } while (row != 0);
   }

   //===================== Wrap-up Phase ===============================//
   //Absorbs the last block of the memory matrix
   absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64 ],
                     &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64 ] );
   //Squeezes the key
   squeeze_2way( state, K, (unsigned int) kLen );

   return 0;
}

// This version is currently only used by REv3 and has some hard coding
// specific to v3 such as input data size of 32 bytes.
//
// Similarly with REv2. The difference with REv3 isn't clear and maybe
// they can be merged.
//
// RE is used by RE, allium. The main difference between RE and REv2
// is in the matrix size.
//
// Z also needs to support 80 byte input as well as 32 byte, and odd
// matrix sizes like 330 rows. It is used by lyra2z330, lyra2z, lyra2h.

/////////////////////////////////////////////////

// 2 way 256
// drop salt, salt len arguments, hard code some others.
// Data is interleaved 2x256.
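// 2x256 interleaving means the two lanes alternate in 256 bit (4 x u64)
// groups, so the memory order is:
//   lane0 w0..w3, lane1 w0..w3, lane0 w4..w7, lane1 w4..w7, ...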

int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
          const void *pwd, uint64_t pwdlen, uint64_t timeCost,
          uint64_t nRows, uint64_t nCols )

// hard coded for 32 byte input as well as matrix size.
// Other required versions include 80 byte input and different block
// sizes.

{
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[32];
   int64_t row = 2;
   int64_t prev = 1;
   int64_t rowa0 = 0;
   int64_t rowa1 = 0;
   int64_t tau;
   int64_t step = 1;
   int64_t window = 2;
   int64_t gap = 1;
   uint64_t instance0 = 0;
   uint64_t instance1 = 0;
   //====================================================================/

   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
   const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;

   uint64_t *ptrWord = wholeMatrix;

   // 2 way 256 rewrite. Salt always == password, and data is interleaved,
   // need to build in parallel as pw is already interleaved.

   // { password,  (64 or 80 bytes)
   //   salt,      (64 or 80 bytes) = same as password
   //   Klen,      (u64) = 32 bytes
   //   pwdlen,    (u64)
   //   saltlen,   (u64)
   //   timecost,  (u64)
   //   nrows,     (u64)
   //   ncols,     (u64)
   //   0x80,      (byte)
   //   { 0 .. 0 },
   //   1          (byte)
   // }

   // input is usually 32 maybe 64, both are aligned to 256 bit vector.
   // 80 byte input is not aligned, complicating matters for lyra2z.

   int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
                            / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;

   uint64_t *ptr = wholeMatrix;
   uint64_t *pw = (uint64_t*)pwd;

   memcpy( ptr, pw, 2*pwdlen );   // password, both lanes
   ptr += pwdlen>>2;
   memcpy( ptr, pw, 2*pwdlen );   // salt = password
   ptr += pwdlen>>2;

   // now build the rest interleaving on the fly.

   ptr[0] = ptr[ 4] = kLen;
   ptr[1] = ptr[ 5] = pwdlen;
   ptr[2] = ptr[ 6] = pwdlen;    // saltlen
   ptr[3] = ptr[ 7] = timeCost;
   ptr[8] = ptr[12] = nRows;
   ptr[9] = ptr[13] = nCols;
   ptr[10] = ptr[14] = 0x80;
   ptr[11] = ptr[15] = 0x0100000000000000;

   ptrWord = wholeMatrix;

   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );

   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );

   reducedDuplexRow1_2way( state, &wholeMatrix[0],
                           &wholeMatrix[2*ROW_LEN_INT64], nCols );

   do
   {
      reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
                                  &wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
                                  &wholeMatrix[ 2* row*ROW_LEN_INT64 ],
                                  nCols );

      rowa0 = (rowa0 + step) & (window - 1);

      prev = row;
      row++;

      if (rowa0 == 0)
      {
         step = window + gap;   //changes the step: approximately doubles its value
         window *= 2;           //doubles the size of the re-visitation window
         gap = -gap;            //inverts the modifier to the step
      }

   } while (row < nRows);

   row = 0;
   for (tau = 1; tau <= timeCost; tau++)
   {
      step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1;
      do
      {
         instance0 = state[ offset_to_index( instance0 ) ];
         instance1 = (&state[4])[ offset_to_index( instance1 ) ];

         rowa0 = state[ offset_to_index( instance0 ) ]
               & (unsigned int)(nRows-1);
         rowa1 = (state+4)[ offset_to_index( instance1 ) ]
               & (unsigned int)(nRows-1);

         reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
                                &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
                                &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
                                &wholeMatrix[ 2* row*ROW_LEN_INT64 ],
                                nCols );

         prev = row;
         row = (row + step) & (unsigned int)(nRows-1);

      } while ( row != 0 );
   }

   absorbBlock_2way( state, &wholeMatrix[2*rowa0*ROW_LEN_INT64],
                     &wholeMatrix[2*rowa1*ROW_LEN_INT64] );

   squeeze_2way( state, K, (unsigned int) kLen );

   return 0;
}
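// Usage sketch, mirroring the callers later in this file: two 256 bit
// lane hashes are interleaved, processed 2-wide in one pass, then
// de-interleaved again. The helper names are those used by the 8/16-way
// code below.
//
//   uint32_t vhash[16] __attribute__ ((aligned (64)));    // 2 x 256 bits
//   intrlv_2x256( vhash, hash0, hash1, 256 );
//   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
//   dintrlv_2x256( hash0, hash1, vhash, 256 );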

//////////////////////////////////////////////////

int LYRA2Z_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
       const void *pwd, const uint64_t pwdlen, const uint64_t timeCost,
       const uint64_t nRows, const uint64_t nCols )
{
   //========================== Basic variables ============================//
   uint64_t _ALIGN(256) state[32];
   int64_t row = 2;
   int64_t prev = 1;
   int64_t rowa0 = 0;
   int64_t rowa1 = 0;
   int64_t tau;
   int64_t step = 1;
   int64_t window = 2;
   int64_t gap = 1;
   //=======================================================================/

   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;

   //First, we clean enough blocks for the password, salt, basil and padding
   uint64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 *
                  sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;

   uint64_t *ptr = wholeMatrix;
   uint64_t *pw = (uint64_t*)pwd;

   memcpy( ptr, pw, 2*pwdlen );   // password, both lanes
   ptr += pwdlen>>2;
   memcpy( ptr, pw, 2*pwdlen );   // salt = password
   ptr += pwdlen>>2;

   // now build the rest interleaving on the fly.
   ptr[0] = ptr[ 4] = kLen;
   ptr[1] = ptr[ 5] = pwdlen;
   ptr[2] = ptr[ 6] = pwdlen;    // saltlen
   ptr[3] = ptr[ 7] = timeCost;
   ptr[8] = ptr[12] = nRows;
   ptr[9] = ptr[13] = nCols;
   ptr[10] = ptr[14] = 0x80;
   ptr[11] = ptr[15] = 0x0100000000000000;

   uint64_t *ptrWord = wholeMatrix;

   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput,
                               BLOCK_LEN_BLAKE2_SAFE_INT64 );

   //Initializes M[0] and M[1]
   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );

   reducedDuplexRow1_2way( state, &wholeMatrix[0],
                           &wholeMatrix[ 2 * ROW_LEN_INT64 ], nCols );

   do
   {
      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)

      reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64],
                                  &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64],
                                  &wholeMatrix[ 2* row*ROW_LEN_INT64],
                                  nCols );

      rowa0 = (rowa0 + step) & (window - 1);
      prev = row;
      row++;

      if ( rowa0 == 0 )
      {
         step = window + gap;
         window *= 2;
         gap = -gap;
      }
   } while ( row < nRows );

   row = 0;
   for ( tau = 1; tau <= timeCost; tau++ )
   {
      step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
      do
      {
         rowa0 = state[ 0 ] % nRows;
         rowa1 = state[ 4 ] % nRows;

         reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
                                &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
                                &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
                                &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
                                nCols );

         prev = row;
         row = (row + step) % nRows;

      } while (row != 0);
   }

   absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64 ],
                     &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64 ] );

   //Squeezes the key
   squeeze_2way( state, K, (unsigned int) kLen );

   return 0;
}

////////////////////////////////////////////////////

// Lyra2RE doesn't like the new wholeMatrix implementation
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
       const uint64_t pwdlen, const uint64_t timeCost,
       const uint64_t nRows, const uint64_t nCols )
{
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[32];
   int64_t row = 2;       //index of row to be processed
   int64_t prev = 1;      //index of prev (last row ever computed/modified)
   int64_t rowa0 = 0;
   int64_t rowa1 = 0;
   int64_t tau;           //Time Loop iterator
   int64_t step = 1;      //Visitation step (used during Setup and Wandering phases)
   int64_t window = 2;    //Visitation window (used to define which rows can be revisited during Setup)
   int64_t gap = 1;       //Modifier to the step, assuming the values 1 or -1
   int64_t i;             //auxiliary iteration counter
   //====================================================================/

   //=== Initializing the Memory Matrix and pointers to it =============//
   //Tries to allocate enough space for the whole memory matrix

   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
   // for Lyra2REv2, nCols = 4, v1 was using 8
   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;

   i = (int64_t)ROW_LEN_BYTES * nRows;
   uint64_t *wholeMatrix = _mm_malloc( 2*i, 64 );
   if (wholeMatrix == NULL)
      return -1;

   memset_zero_512( (__m512i*)wholeMatrix, i>>5 );
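   // Size sketch, assuming the reference Lyra2 block of 12 u64 (so
   // BLOCK_LEN_INT64 = 12): with nRows = nCols = 4 each row is 384 bytes,
   // i = 1536, and 2*i = 3072 bytes are allocated for the two interleaved
   // lanes. i>>5 is that doubled size expressed in 64 byte __m512i units.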

   uint64_t *ptrWord = wholeMatrix;
   uint64_t *pw = (uint64_t*)pwd;

   //First, we clean enough blocks for the password, salt, basil and padding
   int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
                            / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;

   uint64_t *ptr = wholeMatrix;

   memcpy( ptr, pw, 2*pwdlen );   // password, both lanes
   ptr += pwdlen>>2;
   memcpy( ptr, pw, 2*pwdlen );   // salt = password
   ptr += pwdlen>>2;

   // now build the rest interleaving on the fly.

   ptr[0] = ptr[ 4] = kLen;
   ptr[1] = ptr[ 5] = pwdlen;
   ptr[2] = ptr[ 6] = pwdlen;    // saltlen
   ptr[3] = ptr[ 7] = timeCost;
   ptr[8] = ptr[12] = nRows;
   ptr[9] = ptr[13] = nCols;
   ptr[10] = ptr[14] = 0x80;
   ptr[11] = ptr[15] = 0x0100000000000000;

   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );

   //Initializes M[0] and M[1]
   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );   //The locally copied password is most likely overwritten here

   reducedDuplexRow1_2way( state, &wholeMatrix[0],
                           &wholeMatrix[ 2 * ROW_LEN_INT64], nCols );

   do
   {
      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)

      reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
                                  &wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
                                  &wholeMatrix[ 2* row*ROW_LEN_INT64 ],
                                  nCols );

      //updates the value of row* (deterministically picked during Setup)
      rowa0 = (rowa0 + step) & (window - 1);
      //update prev: it now points to the last row ever computed
      prev = row;
      //updates row: goes to the next row to be computed
      row++;

      //Checks if all rows in the window were visited.
      if (rowa0 == 0)
      {
         step = window + gap;   //changes the step: approximately doubles its value
         window *= 2;           //doubles the size of the re-visitation window
         gap = -gap;            //inverts the modifier to the step
      }

   } while (row < nRows);

   //===================== Wandering Phase =============================//
   row = 0;   //Resets the visitation to the first row of the memory matrix
   for (tau = 1; tau <= timeCost; tau++)
   {
      step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
      do
      {
         rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
         rowa1 = state[ 4 ] & (unsigned int)(nRows-1);

         reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
                                &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
                                &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
                                &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
                                nCols );

         //update prev: it now points to the last row ever computed
         prev = row;

         //updates row: goes to the next row to be computed
         //----------------------------------------------------
         row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
         //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
         //----------------------------------------------------

      } while (row != 0);
   }

   //===================== Wrap-up Phase ===============================//
   //Absorbs the last block of the memory matrix
   absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64],
                     &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64] );
   //Squeezes the key
   squeeze_2way( state, K, (unsigned int) kLen );

   //================== Freeing the memory =============================//
   _mm_free(wholeMatrix);

   return 0;
}

#endif
@@ -327,6 +327,7 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,

   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
                      nCols);

   do
   {
      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)

@@ -62,17 +62,12 @@ int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *sa

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
                  uint64_t timeCost, uint64_t nRows, uint64_t nCols );

int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
                    uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );

int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
                    uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
                    uint64_t pwdlen, const void *salt, uint64_t saltlen,
                    uint64_t timeCost, uint64_t nRows, uint64_t nCols );

int LYRA2Z_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
                 uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
//int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
//                    uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );

#endif

@@ -20,7 +20,7 @@ static __thread blake256_4way_context l2h_4way_blake_mid;
void lyra2h_4way_midstate( const void* input )
{
   blake256_4way_init( &l2h_4way_blake_mid );
   blake256_4way_update( &l2h_4way_blake_mid, input, 64 );
   blake256_4way( &l2h_4way_blake_mid, input, 64 );
}

void lyra2h_4way_hash( void *state, const void *input )

@@ -1,150 +1,13 @@
#include "lyra2-gate.h"
#include <memory.h>

#if defined (LYRA2REV2_4WAY)

#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"

#if defined (LYRA2REV2_8WAY)

typedef struct {
   blake256_8way_context blake;
   keccak256_8way_context keccak;
   cube_4way_context cube;
   skein256_8way_context skein;
   bmw256_8way_context bmw;
} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));

static lyra2v2_8way_ctx_holder l2v2_8way_ctx;

bool init_lyra2rev2_8way_ctx()
{
   keccak256_8way_init( &l2v2_8way_ctx.keccak );
   cube_4way_init( &l2v2_8way_ctx.cube, 256, 16, 32 );
   skein256_8way_init( &l2v2_8way_ctx.skein );
   bmw256_8way_init( &l2v2_8way_ctx.bmw );
   return true;
}

void lyra2rev2_8way_hash( void *state, const void *input )
{
   uint32_t vhash[8*8] __attribute__ ((aligned (128)));
   uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
   uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
   uint32_t hash0[8] __attribute__ ((aligned (64)));
   uint32_t hash1[8] __attribute__ ((aligned (64)));
   uint32_t hash2[8] __attribute__ ((aligned (64)));
   uint32_t hash3[8] __attribute__ ((aligned (64)));
   uint32_t hash4[8] __attribute__ ((aligned (64)));
   uint32_t hash5[8] __attribute__ ((aligned (64)));
   uint32_t hash6[8] __attribute__ ((aligned (64)));
   uint32_t hash7[8] __attribute__ ((aligned (64)));
   lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );

   blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
   blake256_8way_close( &ctx.blake, vhash );

   rintrlv_8x32_8x64( vhashA, vhash, 256 );

   keccak256_8way_update( &ctx.keccak, vhashA, 32 );
   keccak256_8way_close( &ctx.keccak, vhash );

   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );

   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
   cube_4way_init( &ctx.cube, 256, 16, 32 );
   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );

   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );

   intrlv_2x256( vhash, hash0, hash1, 256 );
   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash0, hash1, vhash, 256 );
   intrlv_2x256( vhash, hash2, hash3, 256 );
   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash2, hash3, vhash, 256 );
   intrlv_2x256( vhash, hash4, hash5, 256 );
   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash4, hash5, vhash, 256 );
   intrlv_2x256( vhash, hash6, hash7, 256 );
   LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash6, hash7, vhash, 256 );

   intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                hash7, 256 );

   skein256_8way_update( &ctx.skein, vhash, 32 );
   skein256_8way_close( &ctx.skein, vhash );

   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );

   cube_4way_init( &ctx.cube, 256, 16, 32 );
   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
   cube_4way_init( &ctx.cube, 256, 16, 32 );
   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );

   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );

   intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                hash7, 256 );

   bmw256_8way_update( &ctx.bmw, vhash, 32 );
   bmw256_8way_close( &ctx.bmw, state );
}

int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t hash[8*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[7<<3]);
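   // hash is stored 8x32 interleaved, so word 7 of lane n sits at
   // hash[7*8 + n]; hash7[lane] below reads each lane's high word
   // without de-interleaving the full hash.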
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   __m256i *noncev = (__m256i*)vdata + 19;   // aligned
   int thr_id = mythr->id;

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

   mm256_bswap32_intrlv80_8x32( vdata, pdata );

   blake256_8way_init( &l2v2_8way_ctx.blake );
   blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );

   do
   {
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
                                                  n+3, n+2, n+1, n ) );

      lyra2rev2_8way_hash( hash, vdata );
      pdata[19] = n;

      for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
      {
         extr_lane_8x32( lane_hash, hash, lane, 256 );
         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
            pdata[19] = n + lane;
            submit_lane_solution( work, lane_hash, mythr, lane );
         }
      }
      n += 8;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart);
   *hashes_done = n - first_nonce;
   return 0;
}

#elif defined (LYRA2REV2_4WAY)

typedef struct {
   blake256_4way_context blake;
@@ -176,12 +39,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );

   blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
   blake256_4way( &ctx.blake, input + (64<<2), 16 );
   blake256_4way_close( &ctx.blake, vhash );

   rintrlv_4x32_4x64( vhash64, vhash, 256 );

   keccak256_4way_update( &ctx.keccak, vhash64, 32 );
   keccak256_4way( &ctx.keccak, vhash64, 32 );
   keccak256_4way_close( &ctx.keccak, vhash64 );

   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -201,7 +64,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )

   intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );

   skein256_4way_update( &ctx.skein, vhash64, 32 );
   skein256_4way( &ctx.skein, vhash64, 32 );
   skein256_4way_close( &ctx.skein, vhash64 );

   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -217,7 +80,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )

   intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );

   bmw256_4way_update( &ctx.bmw, vhash, 32 );
   bmw256_4way( &ctx.bmw, vhash, 32 );
   bmw256_4way_close( &ctx.bmw, state );
}

@@ -242,7 +105,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
   mm128_bswap32_intrlv80_4x32( vdata, pdata );

   blake256_4way_init( &l2v2_4way_ctx.blake );
   blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 );
   blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );

   do
   {

@@ -4,180 +4,8 @@
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"

#if defined (LYRA2REV3_16WAY)

typedef struct {
   blake256_16way_context blake;
   cube_4way_context cube;
   bmw256_16way_context bmw;
} lyra2v3_16way_ctx_holder;

static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;

bool init_lyra2rev3_16way_ctx()
{
   blake256_16way_init( &l2v3_16way_ctx.blake );
   cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
   bmw256_16way_init( &l2v3_16way_ctx.bmw );
   return true;
}

void lyra2rev3_16way_hash( void *state, const void *input )
{
   uint32_t vhash[16*8] __attribute__ ((aligned (128)));
   uint32_t hash0[8] __attribute__ ((aligned (64)));
   uint32_t hash1[8] __attribute__ ((aligned (64)));
   uint32_t hash2[8] __attribute__ ((aligned (64)));
   uint32_t hash3[8] __attribute__ ((aligned (64)));
   uint32_t hash4[8] __attribute__ ((aligned (64)));
   uint32_t hash5[8] __attribute__ ((aligned (64)));
   uint32_t hash6[8] __attribute__ ((aligned (64)));
   uint32_t hash7[8] __attribute__ ((aligned (64)));
   uint32_t hash8[8] __attribute__ ((aligned (64)));
   uint32_t hash9[8] __attribute__ ((aligned (64)));
   uint32_t hash10[8] __attribute__ ((aligned (64)));
   uint32_t hash11[8] __attribute__ ((aligned (64)));
   uint32_t hash12[8] __attribute__ ((aligned (64)));
   uint32_t hash13[8] __attribute__ ((aligned (64)));
   uint32_t hash14[8] __attribute__ ((aligned (64)));
   uint32_t hash15[8] __attribute__ ((aligned (64)));
   lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );

   blake256_16way_update( &ctx.blake, input + (64*16), 16 );
   blake256_16way_close( &ctx.blake, vhash );

   dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                  hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
                  vhash, 256 );

   intrlv_2x256( vhash, hash0, hash1, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash0, hash1, vhash, 256 );
   intrlv_2x256( vhash, hash2, hash3, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash2, hash3, vhash, 256 );
   intrlv_2x256( vhash, hash4, hash5, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash4, hash5, vhash, 256 );
   intrlv_2x256( vhash, hash6, hash7, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash6, hash7, vhash, 256 );
   intrlv_2x256( vhash, hash8, hash9, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash8, hash9, vhash, 256 );
   intrlv_2x256( vhash, hash10, hash11, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash10, hash11, vhash, 256 );
   intrlv_2x256( vhash, hash12, hash13, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash12, hash13, vhash, 256 );
   intrlv_2x256( vhash, hash14, hash15, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash14, hash15, vhash, 256 );

   intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
   dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 256 );
   intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
   cube_4way_init( &ctx.cube, 256, 16, 32 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
   dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 256 );
   intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
   cube_4way_init( &ctx.cube, 256, 16, 32 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
   dintrlv_4x128( hash8, hash9, hash10, hash11, vhash, 256 );
   intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
   cube_4way_init( &ctx.cube, 256, 16, 32 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
   dintrlv_4x128( hash12, hash13, hash14, hash15, vhash, 256 );

   intrlv_2x256( vhash, hash0, hash1, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash0, hash1, vhash, 256 );
   intrlv_2x256( vhash, hash2, hash3, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash2, hash3, vhash, 256 );
   intrlv_2x256( vhash, hash4, hash5, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash4, hash5, vhash, 256 );
   intrlv_2x256( vhash, hash6, hash7, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash6, hash7, vhash, 256 );
   intrlv_2x256( vhash, hash8, hash9, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash8, hash9, vhash, 256 );
   intrlv_2x256( vhash, hash10, hash11, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash10, hash11, vhash, 256 );
   intrlv_2x256( vhash, hash12, hash13, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash12, hash13, vhash, 256 );
   intrlv_2x256( vhash, hash14, hash15, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash14, hash15, vhash, 256 );

   intrlv_16x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                 hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
                 hash15, 256 );

   bmw256_16way_update( &ctx.bmw, vhash, 32 );
   bmw256_16way_close( &ctx.bmw, state );
}

int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
                              uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t hash[8*16] __attribute__ ((aligned (128)));
   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &hash[7<<4];
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t last_nonce = max_nonce - 16;
   const uint32_t Htarg = ptarget[7];
   __m512i *noncev = (__m512i*)vdata + 19;   // aligned
   const int thr_id = mythr->id;

   if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

   mm512_bswap32_intrlv80_16x32( vdata, pdata );

   blake256_16way_init( &l2v3_16way_ctx.blake );
   blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );

   do
   {
      *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
                                                  n+11, n+10, n+ 9, n+ 8,
                                                  n+ 7, n+ 6, n+ 5, n+ 4,
                                                  n+ 3, n+ 2, n+ 1, n ) );

      lyra2rev3_16way_hash( hash, vdata );
      pdata[19] = n;

      for ( int lane = 0; lane < 16; lane++ )
      if ( unlikely( hash7[lane] <= Htarg ) )
      {
         extr_lane_16x32( lane_hash, hash, lane, 256 );
         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
         {
            pdata[19] = n + lane;
            submit_lane_solution( work, lane_hash, mythr, lane );
         }
      }
      n += 16;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
   *hashes_done = n - first_nonce;
   return 0;
}

#elif defined (LYRA2REV3_8WAY)
#if defined (LYRA2REV3_8WAY)

typedef struct {
   blake256_8way_context blake;
@@ -209,7 +37,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )
   lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );

   blake256_8way_update( &ctx.blake, input + (64*8), 16 );
   blake256_8way( &ctx.blake, input + (64*8), 16 );
   blake256_8way_close( &ctx.blake, vhash );

   dintrlv_8x32( hash0, hash1, hash2, hash3,
@@ -252,7 +80,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )
   intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
                hash4, hash5, hash6, hash7, 256 );

   bmw256_8way_update( &ctx.bmw, vhash, 32 );
   bmw256_8way( &ctx.bmw, vhash, 32 );
   bmw256_8way_close( &ctx.bmw, state );

}
@@ -277,7 +105,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
   mm256_bswap32_intrlv80_8x32( vdata, pdata );

   blake256_8way_init( &l2v3_8way_ctx.blake );
   blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );
   blake256_8way( &l2v3_8way_ctx.blake, vdata, 64 );

   do
   {
@@ -334,7 +162,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
   lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );

   blake256_4way_update( &ctx.blake, input + (64*4), 16 );
// blake256_4way( &ctx.blake, input, 80 );
   blake256_4way( &ctx.blake, input + (64*4), 16 );
   blake256_4way_close( &ctx.blake, vhash );
   dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

@@ -357,7 +186,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
   LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );

   intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
   bmw256_4way_update( &ctx.bmw, vhash, 32 );
   bmw256_4way( &ctx.bmw, vhash, 32 );
   bmw256_4way_close( &ctx.bmw, state );
}

@@ -382,7 +211,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
   mm128_bswap32_intrlv80_4x32( vdata, pdata );

   blake256_4way_init( &l2v3_4way_ctx.blake );
   blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 );
   blake256_4way( &l2v3_4way_ctx.blake, vdata, 64 );

   do
   {

@@ -1,240 +1,13 @@
#include "lyra2-gate.h"

#ifdef LYRA2Z_4WAY

#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"

#if defined(LYRA2Z_16WAY)

__thread uint64_t* lyra2z_16way_matrix;

bool lyra2z_16way_thread_init()
{
   return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
}

static __thread blake256_16way_context l2z_16way_blake_mid;

void lyra2z_16way_midstate( const void* input )
{
   blake256_16way_init( &l2z_16way_blake_mid );
   blake256_16way_update( &l2z_16way_blake_mid, input, 64 );
}
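// Midstate caching: the first 64 bytes of the 80 byte block header are
// nonce-independent, so they are absorbed once per job here and only the
// final 16 bytes (which contain the nonce) are hashed per attempt below.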

void lyra2z_16way_hash( void *state, const void *input )
{
   uint32_t vhash[8*16] __attribute__ ((aligned (128)));
   uint32_t hash0[8] __attribute__ ((aligned (64)));
   uint32_t hash1[8] __attribute__ ((aligned (64)));
   uint32_t hash2[8] __attribute__ ((aligned (64)));
   uint32_t hash3[8] __attribute__ ((aligned (64)));
   uint32_t hash4[8] __attribute__ ((aligned (64)));
   uint32_t hash5[8] __attribute__ ((aligned (64)));
   uint32_t hash6[8] __attribute__ ((aligned (64)));
   uint32_t hash7[8] __attribute__ ((aligned (64)));
   uint32_t hash8[8] __attribute__ ((aligned (64)));
   uint32_t hash9[8] __attribute__ ((aligned (64)));
   uint32_t hash10[8] __attribute__ ((aligned (64)));
   uint32_t hash11[8] __attribute__ ((aligned (64)));
   uint32_t hash12[8] __attribute__ ((aligned (64)));
   uint32_t hash13[8] __attribute__ ((aligned (64)));
   uint32_t hash14[8] __attribute__ ((aligned (64)));
   uint32_t hash15[8] __attribute__ ((aligned (64)));
   blake256_16way_context ctx_blake __attribute__ ((aligned (64)));

   memcpy( &ctx_blake, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid );
   blake256_16way_update( &ctx_blake, input + (64*16), 16 );
   blake256_16way_close( &ctx_blake, vhash );

   dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                  hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
                  vhash, 256 );

   intrlv_2x256( vhash, hash0, hash1, 256 );
   LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
   dintrlv_2x256( hash0, hash1, vhash, 256 );
   intrlv_2x256( vhash, hash2, hash3, 256 );
   LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
   dintrlv_2x256( hash2, hash3, vhash, 256 );
   intrlv_2x256( vhash, hash4, hash5, 256 );
   LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
   dintrlv_2x256( hash4, hash5, vhash, 256 );
   intrlv_2x256( vhash, hash6, hash7, 256 );
   LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
   dintrlv_2x256( hash6, hash7, vhash, 256 );
   intrlv_2x256( vhash, hash8, hash9, 256 );
   LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
   dintrlv_2x256( hash8, hash9, vhash, 256 );
   intrlv_2x256( vhash, hash10, hash11, 256 );
   LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
   dintrlv_2x256( hash10, hash11, vhash, 256 );
   intrlv_2x256( vhash, hash12, hash13, 256 );
   LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
   dintrlv_2x256( hash12, hash13, vhash, 256 );
   intrlv_2x256( vhash, hash14, hash15, 256 );
   LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
   dintrlv_2x256( hash14, hash15, vhash, 256 );

   memcpy( state,      hash0,  32 );
   memcpy( state+ 32,  hash1,  32 );
   memcpy( state+ 64,  hash2,  32 );
   memcpy( state+ 96,  hash3,  32 );
   memcpy( state+128,  hash4,  32 );
   memcpy( state+160,  hash5,  32 );
   memcpy( state+192,  hash6,  32 );
   memcpy( state+224,  hash7,  32 );
   memcpy( state+256,  hash8,  32 );
   memcpy( state+288,  hash9,  32 );
   memcpy( state+320, hash10,  32 );
   memcpy( state+352, hash11,  32 );
   memcpy( state+384, hash12,  32 );
   memcpy( state+416, hash13,  32 );
   memcpy( state+448, hash14,  32 );
   memcpy( state+480, hash15,  32 );
}

int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t hash[8*16] __attribute__ ((aligned (128)));
   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   __m512i *noncev = (__m512i*)vdata + 19;   // aligned
   int thr_id = mythr->id;   // thr_id arg is deprecated

   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;

   mm512_bswap32_intrlv80_16x32( vdata, pdata );
   lyra2z_16way_midstate( vdata );

   do {
      *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
                                                  n+11, n+10, n+ 9, n+ 8,
                                                  n+ 7, n+ 6, n+ 5, n+ 4,
                                                  n+ 3, n+ 2, n+ 1, n ) );
      lyra2z_16way_hash( hash, vdata );
      pdata[19] = n;

      for ( int i = 0; i < 16; i++ )
      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
           && !opt_benchmark )
      {
         pdata[19] = n+i;
         submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 16;
   } while ( (n < max_nonce-16) && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
   return 0;
}

#elif defined(LYRA2Z_8WAY)

__thread uint64_t* lyra2z_8way_matrix;

bool lyra2z_8way_thread_init()
{
   return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}

static __thread blake256_8way_context l2z_8way_blake_mid;

void lyra2z_8way_midstate( const void* input )
{
   blake256_8way_init( &l2z_8way_blake_mid );
   blake256_8way_update( &l2z_8way_blake_mid, input, 64 );
}

void lyra2z_8way_hash( void *state, const void *input )
{
   uint32_t hash0[8] __attribute__ ((aligned (64)));
   uint32_t hash1[8] __attribute__ ((aligned (64)));
   uint32_t hash2[8] __attribute__ ((aligned (64)));
   uint32_t hash3[8] __attribute__ ((aligned (64)));
   uint32_t hash4[8] __attribute__ ((aligned (64)));
   uint32_t hash5[8] __attribute__ ((aligned (64)));
   uint32_t hash6[8] __attribute__ ((aligned (64)));
   uint32_t hash7[8] __attribute__ ((aligned (64)));
   uint32_t vhash[8*8] __attribute__ ((aligned (64)));
   blake256_8way_context ctx_blake __attribute__ ((aligned (64)));

   memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
   blake256_8way_update( &ctx_blake, input + (64*8), 16 );
   blake256_8way_close( &ctx_blake, vhash );

   dintrlv_8x32( hash0, hash1, hash2, hash3,
                 hash4, hash5, hash6, hash7, vhash, 256 );

   LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
   LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
   LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
   LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
   LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
   LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
   LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
   LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );

   memcpy( state,      hash0, 32 );
   memcpy( state+ 32,  hash1, 32 );
   memcpy( state+ 64,  hash2, 32 );
   memcpy( state+ 96,  hash3, 32 );
   memcpy( state+128,  hash4, 32 );
   memcpy( state+160,  hash5, 32 );
   memcpy( state+192,  hash6, 32 );
   memcpy( state+224,  hash7, 32 );
}

int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   __m256i *noncev = (__m256i*)vdata + 19;   // aligned
   int thr_id = mythr->id;   // thr_id arg is deprecated

   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   lyra2z_8way_midstate( vdata );

   do {
      *noncev = mm256_bswap_32(
                _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
      lyra2z_8way_hash( hash, vdata );
      pdata[19] = n;

      for ( int i = 0; i < 8; i++ )
      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
           && !opt_benchmark )
      {
         pdata[19] = n+i;
         submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 8;
   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
   return 0;
}


#elif defined(LYRA2Z_4WAY)

__thread uint64_t* lyra2z_4way_matrix;

bool lyra2z_4way_thread_init()
@@ -247,7 +20,7 @@ static __thread blake256_4way_context l2z_4way_blake_mid;
void lyra2z_4way_midstate( const void* input )
{
   blake256_4way_init( &l2z_4way_blake_mid );
   blake256_4way_update( &l2z_4way_blake_mid, input, 64 );
   blake256_4way( &l2z_4way_blake_mid, input, 64 );
}

void lyra2z_4way_hash( void *state, const void *input )
@@ -260,7 +33,7 @@ void lyra2z_4way_hash( void *state, const void *input )
   blake256_4way_context ctx_blake __attribute__ ((aligned (64)));

   memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
   blake256_4way_update( &ctx_blake, input + (64*4), 16 );
   blake256_4way( &ctx_blake, input + (64*4), 16 );
   blake256_4way_close( &ctx_blake, vhash );

   dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
@@ -312,3 +85,100 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,

#endif

#if defined(LYRA2Z_8WAY)

__thread uint64_t* lyra2z_8way_matrix;

bool lyra2z_8way_thread_init()
{
   return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}

static __thread blake256_8way_context l2z_8way_blake_mid;

void lyra2z_8way_midstate( const void* input )
{
   blake256_8way_init( &l2z_8way_blake_mid );
   blake256_8way( &l2z_8way_blake_mid, input, 64 );
}

void lyra2z_8way_hash( void *state, const void *input )
{
   uint32_t hash0[8] __attribute__ ((aligned (64)));
   uint32_t hash1[8] __attribute__ ((aligned (64)));
   uint32_t hash2[8] __attribute__ ((aligned (64)));
   uint32_t hash3[8] __attribute__ ((aligned (64)));
   uint32_t hash4[8] __attribute__ ((aligned (64)));
   uint32_t hash5[8] __attribute__ ((aligned (64)));
   uint32_t hash6[8] __attribute__ ((aligned (64)));
   uint32_t hash7[8] __attribute__ ((aligned (64)));
   uint32_t vhash[8*8] __attribute__ ((aligned (64)));
   blake256_8way_context ctx_blake __attribute__ ((aligned (64)));

   memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
   blake256_8way( &ctx_blake, input + (64*8), 16 );
   blake256_8way_close( &ctx_blake, vhash );

   dintrlv_8x32( hash0, hash1, hash2, hash3,
                 hash4, hash5, hash6, hash7, vhash, 256 );

   LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
   LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
   LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
   LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
   LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
   LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
   LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
   LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );

   memcpy( state,      hash0, 32 );
   memcpy( state+ 32,  hash1, 32 );
   memcpy( state+ 64,  hash2, 32 );
   memcpy( state+ 96,  hash3, 32 );
   memcpy( state+128,  hash4, 32 );
   memcpy( state+160,  hash5, 32 );
   memcpy( state+192,  hash6, 32 );
   memcpy( state+224,  hash7, 32 );
}

int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   __m256i *noncev = (__m256i*)vdata + 19;   // aligned
   int thr_id = mythr->id;   // thr_id arg is deprecated

   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   lyra2z_8way_midstate( vdata );

   do {
      *noncev = mm256_bswap_32(
                _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
      lyra2z_8way_hash( hash, vdata );
      pdata[19] = n;

      for ( int i = 0; i < 8; i++ )
      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
           && !opt_benchmark )
      {
         pdata[19] = n+i;
         submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 8;
   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
   return 0;
}

#endif

@@ -19,7 +19,7 @@
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

//#include "algo-gate.h"
#include "algo-gate-api.h"
#include <string.h>
#include <stdio.h>
#include <time.h>
@@ -27,7 +27,8 @@
#include "sponge.h"
#include "lyra2.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if 0
//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
{
@@ -40,26 +41,19 @@ inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
   //Squeezes full blocks
   for ( i = 0; i < fullBlocks; i++ )
   {
      memcpy_512( out, state, BLOCK_LEN_M256I );
      LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] );
      out += BLOCK_LEN_M256I;
      memcpy_512( out, state, BLOCK_LEN_M256I*2 );
      LYRA_ROUND_2WAY_AVX2( state[0], state[1], state[2], state[3] );
      out += BLOCK_LEN_M256I*2;
   }
   //Squeezes remaining bytes
   memcpy_512( out, state, len_m256i % BLOCK_LEN_M256I );
   memcpy_512( out, state, ( (len_m256i % BLOCK_LEN_M256I) * 2 ) );
}

inline void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
                              const uint64_t *In1 )
inline void absorbBlock_2way( uint64_t *State, const uint64_t *In )
{
   register __m512i state0, state1, state2, state3;
   __m512i in[3];
   casti_m256i( in, 0 ) = casti_m256i( In0, 0 );
   casti_m256i( in, 1 ) = casti_m256i( In1, 1 );
   casti_m256i( in, 2 ) = casti_m256i( In0, 2 );
   casti_m256i( in, 3 ) = casti_m256i( In1, 3 );
   casti_m256i( in, 4 ) = casti_m256i( In0, 4 );
   casti_m256i( in, 5 ) = casti_m256i( In1, 5 );

   __m512i *in = (__m512i*)In;

   state0 = _mm512_load_si512( (__m512i*)State );
   state1 = _mm512_load_si512( (__m512i*)State + 1 );
   state2 = _mm512_load_si512( (__m512i*)State + 2 );
@@ -97,7 +91,7 @@ inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
      state1 = _mm512_xor_si512( state1, in[1] );

      LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
      In += block_len*2;
      In += block_len * 2;
   }

   _mm512_store_si512( (__m512i*)State, state0 );
@@ -116,7 +110,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,

   register __m512i state0, state1, state2, state3;
   __m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
   __m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );

   state0 = _mm512_load_si512( (__m512i*)State );
   state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -133,13 +127,13 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
   {
      _mm_prefetch( out - 9, _MM_HINT_T0 );
      _mm_prefetch( out - 11, _MM_HINT_T0 );

      out[0] = state0;
      out[1] = state1;
      out[2] = state2;

      //Goes to next block (column) that will receive the squeezed data
      out -= BLOCK_LEN_M256I;
      out -= BLOCK_LEN_M256I * 2;

      LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
   }
@@ -150,14 +144,15 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
   _mm512_store_si512( (__m512i*)State + 3, state3 );
}

// This function has to deal with gathering 2 256 bit rowin vectors from
// non-contiguous memory. Extra work and performance penalty.

inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
                                    uint64_t *rowOut, uint64_t nCols )
{
   int i;
   register __m512i state0, state1, state2, state3;
   __m512i *in = (__m512i*)rowIn;
   __m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
   __m512i *in = (__m256i*)rowIn;

   state0 = _mm512_load_si512( (__m512i*)State );
   state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -177,25 +172,28 @@ inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
      out[2] = _mm512_xor_si512( state2, in[2] );

      //Input: next column (i.e., next block in sequence)
      in += BLOCK_LEN_M256I;
      in0 += BLOCK_LEN_M256I;
      in1 += BLOCK_LEN_M256I;
      //Output: goes to previous column
      out -= BLOCK_LEN_M256I;
      out -= BLOCK_LEN_M256I * 2;
   }

   _mm512_store_si512( (__m512i*)State, state0 );
   _mm512_store_si512( (__m512i*)State + 1, state1 );
   _mm512_store_si512( (__m512i*)State + 2, state2 );
   _mm512_store_si512( (__m512i*)State + 3, state3 );
   _mm512_store_si256( (__m512i*)State, state0 );
   _mm512_store_si256( (__m512i*)State + 1, state1 );
   _mm512_store_si256( (__m512i*)State + 2, state2 );
   _mm512_store_si256( (__m512i*)State + 3, state3 );
}
}

inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
                                        uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols )
{
   int i;

   register __m512i state0, state1, state2, state3;
   __m512i* in = (__m512i*)rowIn;
   __m512i* inout = (__m512i*)rowInOut;
   __m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
   __m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
   __m512i t0, t1, t2;

   state0 = _mm512_load_si512( (__m512i*)State );
@@ -212,7 +210,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
      state2 = _mm512_xor_si512( state2,
                                 _mm512_add_epi64( in[2], inout[2] ) );

      LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
      LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );

      out[0] = _mm512_xor_si512( state0, in[0] );
      out[1] = _mm512_xor_si512( state1, in[1] );
@@ -224,18 +222,17 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
      t2 = _mm512_permutex_epi64( state2, 0x93 );

      inout[0] = _mm512_xor_si512( inout[0],
                                   _mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
                                   _mm512_mask_blend_epi32( t0, t2, 0x03 ) );
      inout[1] = _mm512_xor_si512( inout[1],
                                   _mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
                                   _mm512_mask_blend_epi32( t1, t0, 0x03 ) );
      inout[2] = _mm512_xor_si512( inout[2],
                                   _mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
                                   _mm512_mask_blend_epi32( t2, t1, 0x03 ) );

      //Inputs: next column (i.e., next block in sequence)
      in += BLOCK_LEN_M256I;
      inout += BLOCK_LEN_M256I;
      in += BLOCK_LEN_M256I * 2;
      inout += BLOCK_LEN_M256I * 2;
      //Output: goes to previous column
      out -= BLOCK_LEN_M256I;
      out -= BLOCK_LEN_M256I * 2;
   }

   _mm512_store_si512( (__m512i*)State, state0 );
@@ -244,61 +241,49 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
   _mm512_store_si512( (__m512i*)State + 3, state3 );
}

// big ugly workaround for pointer aliasing, use a union of pointers.
// Access matrix using m512i for in and out, m256i for inout
|
||||
|
||||
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
|
||||
uint64_t *rowInOut0, uint64_t *rowInOut1,
|
||||
uint64_t *rowOut, uint64_t nCols)
|
||||
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
|
||||
uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut,
|
||||
uint64_t nCols )
|
||||
{
|
||||
int i;
|
||||
register __m512i state0, state1, state2, state3;
|
||||
__m512i *in = (__m512i*)rowIn;
|
||||
__m256i *inout0 = (__m256i*)rowInOut0;
|
||||
__m256i *inout1 = (__m256i*)rowInOut1;
|
||||
__m512i *out = (__m512i*)rowOut;
|
||||
__m512i io[3];
|
||||
povly inout;
|
||||
inout.v512 = &io[0];
|
||||
__m512i t0, t1, t2;
|
||||
|
||||
register __m512i state0, state1, state2, state3;
|
||||
__m256i *in0 = (__m256i*)rowIn0;
|
||||
__m256i *in0 = (__m256i*)rowIn0;
|
||||
__m2512* in = (__m512i*)rowIn;
|
||||
__m2512* inout = (__m512i*)rowInOut;
|
||||
__m512i* out = (__m512i*)rowOut;
|
||||
__m512i t0, t1, t2;
|
||||
|
||||
_mm_prefetch( in0, _MM_HINT_T0 );
|
||||
_mm_prefetch( in1, _MM_HINT_T0 );
|
||||
_mm_prefetch( in0 + 2, _MM_HINT_T0 );
|
||||
_mm_prefetch( in1 + 2, _MM_HINT_T0 );
|
||||
_mm_prefetch( in0 + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( in1 + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( in0 + 6, _MM_HINT_T0 );
|
||||
_mm_prefetch( in1 + 6, _MM_HINT_T0 );
|
||||
|
||||
state0 = _mm512_load_si512( (__m512i*)State );
|
||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||
|
||||
_mm_prefetch( in, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout0, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout1, _MM_HINT_T0 );
|
||||
_mm_prefetch( in + 2, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout0 + 2, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout1 + 2, _MM_HINT_T0 );
|
||||
_mm_prefetch( in + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout0 + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout1 + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( in + 6, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout0 + 6, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout1 + 6, _MM_HINT_T0 );
|
||||
|
||||
|
||||
for ( i = 0; i < nCols; i++ )
|
||||
{
|
||||
|
||||
//Absorbing "M[prev] [+] M[row*]"
|
||||
inout.v256[0] = inout0[0];
|
||||
inout.v256[1] = inout1[1];
|
||||
inout.v256[2] = inout0[2];
|
||||
inout.v256[3] = inout1[3];
|
||||
inout.v256[4] = inout0[4];
|
||||
inout.v256[5] = inout1[5];
|
||||
|
||||
// state0 = _mm512_xor_si512( state0, mm512_concat_256( in1[0], in0[0] );
|
||||
// state1 = _mm512_xor_si512( state1, mm512_concat_256( in1[1], in0[1] );
|
||||
// state2 = _mm512_xor_si512( state2, mm512_concat_256( in1[2], in0[2] );
|
||||
t0 = mm512_concat_256( in1[0], in0[0] );
|
||||
t1 = mm512_concat_256( in1[1], in0[1] );
|
||||
t2 = mm512_concat_256( in1[2], in0[2] );
|
||||
|
||||
state0 = _mm512_xor_si512( state0,
|
||||
_mm512_add_epi64( in[0], inout.v512[0] ) );
|
||||
_mm512_add_epi64( t0, inout[0] ) );
|
||||
state1 = _mm512_xor_si512( state1,
|
||||
_mm512_add_epi64( in[1], inout.v512[1] ) );
|
||||
_mm512_add_epi64( t1, inout[1] ) );
|
||||
state2 = _mm512_xor_si512( state2,
|
||||
_mm512_add_epi64( in[2], inout.v512[2] ) );
|
||||
|
||||
_mm512_add_epi64( t2, inout[2] ) );
|
||||
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||
@@ -308,44 +293,22 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
|
||||
out[1] = _mm512_xor_si512( out[1], state1 );
|
||||
out[2] = _mm512_xor_si512( out[2], state2 );
|
||||
|
||||
// if inout is the same row as out it was just overwritten, reload.
|
||||
if ( rowOut == rowInOut0 )
|
||||
{
|
||||
inout.v256[0] = inout0[0];
|
||||
inout.v256[2] = inout0[2];
|
||||
inout.v256[4] = inout0[4];
|
||||
}
|
||||
if ( rowOut == rowInOut1 )
|
||||
{
|
||||
inout.v256[1] = inout1[1];
|
||||
inout.v256[3] = inout1[3];
|
||||
inout.v256[5] = inout1[5];
|
||||
}
|
||||
|
||||
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
|
||||
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
||||
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
||||
t2 = _mm512_permutex_epi64( state2, 0x93 );
|
||||
|
||||
inout.v512[0] = _mm512_xor_si512( inout.v512[0],
|
||||
_mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
|
||||
inout.v512[1] = _mm512_xor_si512( inout.v512[1],
|
||||
_mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
|
||||
inout.v512[2] = _mm512_xor_si512( inout.v512[2],
|
||||
_mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
|
||||
|
||||
inout0[0] = inout.v256[0];
|
||||
inout1[1] = inout.v256[1];
|
||||
inout0[2] = inout.v256[2];
|
||||
inout1[3] = inout.v256[3];
|
||||
inout0[4] = inout.v256[4];
|
||||
inout1[5] = inout.v256[5];
|
||||
inout[0] = _mm512_xor_si512( inout[0],
|
||||
_mm512_mask_blend_epi32( t0, t2, 0x03 ) );
|
||||
inout[1] = _mm512_xor_si512( inout[1],
|
||||
_mm512_mask_blend_epi32( t1, t0, 0x03 ) );
|
||||
inout[2] = _mm512_xor_si512( inout[2],
|
||||
_mm512_mask_blend_epi32( t2, t1, 0x03 ) );
|
||||
|
||||
//Goes to next block
|
||||
in += BLOCK_LEN_M256I;
|
||||
inout0 += BLOCK_LEN_M256I * 2;
|
||||
inout1 += BLOCK_LEN_M256I * 2;
|
||||
out += BLOCK_LEN_M256I;
|
||||
in += BLOCK_LEN_M256I * 2;
|
||||
out += BLOCK_LEN_M256I * 2;
|
||||
inout += BLOCK_LEN_M256I * 2;
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)State, state0 );
|
||||
|
@@ -375,10 +375,7 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
   {
      _mm_prefetch( out - 9, _MM_HINT_T0 );
      _mm_prefetch( out - 11, _MM_HINT_T0 );

      //printf("S RSR0 col= %d, out= %x\n",i,out);

      out[0] = state0;
      out[1] = state1;
      out[2] = state2;
@@ -709,34 +706,11 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
      out[1] = _mm256_xor_si256( state1, in[1] );
      out[2] = _mm256_xor_si256( state2, in[2] );

/*
printf("s duplexsetup col= %d\n",i);
uint64_t * o = (uint64_t*)out;
printf("S out %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
printf("S out %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
printf("S out %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
printf("S out %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
printf("S out %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
printf("S out %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
*/

      //M[row*][col] = M[row*][col] XOR rotW(rand)
      t0 = _mm256_permute4x64_epi64( state0, 0x93 );
      t1 = _mm256_permute4x64_epi64( state1, 0x93 );
      t2 = _mm256_permute4x64_epi64( state2, 0x93 );

/*
uint64_t *t = (uint64_t*)&t0;
printf("S t0 %016lx %016lx %016lx %016lx\n",t[0],t[1],t[2],t[3]);

o = (uint64_t*)inout;
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
*/
      inout[0] = _mm256_xor_si256( inout[0],
                                   _mm256_blend_epi32( t0, t2, 0x03 ) );
      inout[1] = _mm256_xor_si256( inout[1],
@@ -744,17 +718,7 @@ printf("S inout0 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
      inout[2] = _mm256_xor_si256( inout[2],
                                   _mm256_blend_epi32( t2, t1, 0x03 ) );

/*
o = (uint64_t*)inout;
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
*/

      //Inputs: next column (i.e., next block in sequence)
      //Inputs: next column (i.e., next block in sequence)
      in += BLOCK_LEN_M256I;
      inout += BLOCK_LEN_M256I;
      //Output: goes to previous column
@@ -985,22 +949,6 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
      _mm_prefetch( inout + 9, _MM_HINT_T0 );
      _mm_prefetch( inout + 11, _MM_HINT_T0 );

/*
uint64_t *io = (uint64_t*)inout;
uint64_t *ii = (uint64_t*)in;

printf("RDRS1 col= %d\n", i);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[15]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[0],ii[1],ii[2],ii[3]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[4],ii[5],ii[6],ii[7]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[8],ii[9],ii[10],ii[11]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[12],ii[13],ii[14],ii[15]);
*/

      //Absorbing "M[prev] [+] M[row*]"
      state0 = _mm256_xor_si256( state0,
                                 _mm256_add_epi64( in[0], inout[0] ) );
@@ -203,36 +203,24 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

union _povly
{
   __m512i *v512;
   __m256i *v256;
   uint64_t *u64;
};
typedef union _povly povly;
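// Editor's note: a minimal usage sketch, not part of this commit, of the
// povly overlay above. The same matrix row can be addressed at 512-bit,
// 256-bit, or 64-bit granularity without casting through incompatible
// pointer types at each access site; the names below are illustrative only.
//
//    __m512i io[3];
//    povly inout;
//    inout.v512 = io;                 // view the buffer as 3 x 512 bits
//    inout.v256[1] = some_m256_lane;  // or patch a 256-bit half in place
//    inout.u64[0] ^= rand_word;       // or touch individual 64-bit words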
//---- Housekeeping
void initState_2way( uint64_t State[/*16*/] );
void initState_2way( uint64_t state[/*16*/] );

//---- Squeezes
void squeeze_2way( uint64_t *State, unsigned char *out, unsigned int len );
void squeeze_2way( uint64_t *state, unsigned char *out, unsigned int len );
void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row, uint64_t nCols );

//---- Absorbs
void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
                       const uint64_t *In1 );
void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
void absorbBlock_2way( uint64_t *state, const uint64_t *in );
void absorbBlockBlake2Safe_2way( uint64_t *state, const uint64_t *in,
                       const uint64_t nBlocks, const uint64_t block_len );

//---- Duplexes
void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
void reducedDuplexRow1_2way( uint64_t *state, uint64_t *rowIn,
                             uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn,
                 uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );

void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
                            uint64_t *rowInOut0, uint64_t *rowInOut1,
                            uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRow_2way(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);

#endif

@@ -133,7 +133,7 @@ void nist5hash_4way( void *out, const void *input )
   keccak512_4way_context ctx_keccak;

   blake512_4way_init( &ctx_blake );
   blake512_4way_update( &ctx_blake, input, 80 );
   blake512_4way( &ctx_blake, input, 80 );
   blake512_4way_close( &ctx_blake, vhash );

   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -154,15 +154,15 @@ void nist5hash_4way( void *out, const void *input )
   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

   jh512_4way_init( &ctx_jh );
   jh512_4way_update( &ctx_jh, vhash, 64 );
   jh512_4way( &ctx_jh, vhash, 64 );
   jh512_4way_close( &ctx_jh, vhash );

   keccak512_4way_init( &ctx_keccak );
   keccak512_4way_update( &ctx_keccak, vhash, 64 );
   keccak512_4way( &ctx_keccak, vhash, 64 );
   keccak512_4way_close( &ctx_keccak, vhash );

   skein512_4way_init( &ctx_skein );
   skein512_4way_update( &ctx_skein, vhash, 64 );
   skein512_4way( &ctx_skein, vhash, 64 );
   skein512_4way_close( &ctx_skein, out );
}

@@ -54,10 +54,10 @@ void anime_4way_hash( void *state, const void *input )
   anime_4way_ctx_holder ctx;
   memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );

   bmw512_4way_update( &ctx.bmw, input, 80 );
   bmw512_4way( &ctx.bmw, input, 80 );
   bmw512_4way_close( &ctx.bmw, vhash );

   blake512_4way_update( &ctx.blake, vhash, 64 );
   blake512_4way( &ctx.blake, vhash, 64 );
   blake512_4way_close( &ctx.blake, vhash );

   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -92,7 +92,7 @@ void anime_4way_hash( void *state, const void *input )

   if ( mm256_anybits0( vh_mask ) )
   {
      skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhashB );
   }

@@ -111,7 +111,7 @@ void anime_4way_hash( void *state, const void *input )

   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

   jh512_4way_update( &ctx.jh, vhash, 64 );
   jh512_4way( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );

   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -119,23 +119,23 @@ void anime_4way_hash( void *state, const void *input )
   if ( mm256_anybits1( vh_mask ) )
   {
      blake512_4way_init( &ctx.blake );
      blake512_4way_update( &ctx.blake, vhash, 64 );
      blake512_4way( &ctx.blake, vhash, 64 );
      blake512_4way_close( &ctx.blake, vhashA );
   }
   if ( mm256_anybits0( vh_mask ) )
   {
      bmw512_4way_init( &ctx.bmw );
      bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhashB );
   }

   mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

   keccak512_4way_update( &ctx.keccak, vhash, 64 );
   keccak512_4way( &ctx.keccak, vhash, 64 );
   keccak512_4way_close( &ctx.keccak, vhash );

   skein512_4way_init( &ctx.skein );
   skein512_4way_update( &ctx.skein, vhash, 64 );
   skein512_4way( &ctx.skein, vhash, 64 );
   skein512_4way_close( &ctx.skein, vhash );

   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -143,13 +143,13 @@ void anime_4way_hash( void *state, const void *input )
   if ( mm256_anybits1( vh_mask ) )
   {
      keccak512_4way_init( &ctx.keccak );
      keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhashA );
   }
   if ( mm256_anybits0( vh_mask ) )
   {
      jh512_4way_init( &ctx.jh );
      jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhashB );
   }

File diff suppressed because it is too large
@@ -2,10 +2,7 @@

bool register_hmq1725_algo( algo_gate_t* gate )
{
#if defined(HMQ1725_8WAY)
   gate->scanhash = (void*)&scanhash_hmq1725_8way;
   gate->hash = (void*)&hmq1725_8way_hash;
#elif defined(HMQ1725_4WAY)
#if defined(HMQ1725_4WAY)
   gate->scanhash = (void*)&scanhash_hmq1725_4way;
   gate->hash = (void*)&hmq1725_4way_hash;
#else
@@ -13,7 +10,7 @@ bool register_hmq1725_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_hmq1725;
   gate->hash = (void*)&hmq1725hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
   opt_target_factor = 65536.0;
   return true;
};

@@ -4,21 +4,13 @@
#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define HMQ1725_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define HMQ1725_4WAY 1
#if defined(__AVX2__) && defined(__AES__)
// #define HMQ1725_4WAY 1
#endif

bool register_hmq1725_algo( algo_gate_t* gate );

#if defined(HMQ1725_8WAY)

void hmq1725_8way_hash( void *state, const void *input );
int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );

#elif defined(HMQ1725_4WAY)
#if defined(HMQ1725_4WAY)

void hmq1725_4way_hash( void *state, const void *input );
int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
@@ -333,7 +333,6 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
      if (((hash64[7]&0xFFFFFFFF)==0) &&
            fulltest(hash64, ptarget)) {
         *hashes_done = n - first_nonce + 1;
         work_set_target_ratio( work, hash64 );
         return true;
      }
   } while (n < max_nonce && !work_restart[thr_id].restart);
@@ -347,7 +346,6 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
      if (((hash64[7]&0xFFFFFFF0)==0) &&
            fulltest(hash64, ptarget)) {
         *hashes_done = n - first_nonce + 1;
         work_set_target_ratio( work, hash64 );
         return true;
      }
   } while (n < max_nonce && !work_restart[thr_id].restart);
@@ -361,7 +359,6 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
      if (((hash64[7]&0xFFFFFF00)==0) &&
            fulltest(hash64, ptarget)) {
         *hashes_done = n - first_nonce + 1;
         work_set_target_ratio( work, hash64 );
         return true;
      }
   } while (n < max_nonce && !work_restart[thr_id].restart);
@@ -375,7 +372,6 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
      if (((hash64[7]&0xFFFFF000)==0) &&
            fulltest(hash64, ptarget)) {
         *hashes_done = n - first_nonce + 1;
         work_set_target_ratio( work, hash64 );
         return true;
      }
   } while (n < max_nonce && !work_restart[thr_id].restart);
@@ -390,7 +386,6 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
      if (((hash64[7]&0xFFFF0000)==0) &&
            fulltest(hash64, ptarget)) {
         *hashes_done = n - first_nonce + 1;
         work_set_target_ratio( work, hash64 );
         return true;
      }
   } while (n < max_nonce && !work_restart[thr_id].restart);
@@ -404,7 +399,6 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
      hmq1725hash(hash64, endiandata);
      if (fulltest(hash64, ptarget)) {
         *hashes_done = n - first_nonce + 1;
         work_set_target_ratio( work, hash64 );
         return true;
      }
   } while (n < max_nonce && !work_restart[thr_id].restart);

@@ -9,23 +9,16 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#endif

#if defined (QUARK_8WAY)

typedef struct {
   blake512_8way_context blake;
   bmw512_8way_context bmw;
   hashState_groestl groestl;
   jh512_8way_context jh;
   skein512_8way_context skein;
   keccak512_8way_context keccak;
#if defined(__VAES__)
   groestl512_4way_context groestl;
#else
   hashState_groestl groestl;
#endif
} quark_8way_ctx_holder;

quark_8way_ctx_holder quark_8way_ctx __attribute__ ((aligned (128)));
@@ -34,14 +27,10 @@ void init_quark_8way_ctx()
{
   blake512_8way_init( &quark_8way_ctx.blake );
   bmw512_8way_init( &quark_8way_ctx.bmw );
   init_groestl( &quark_8way_ctx.groestl, 64 );
   skein512_8way_init( &quark_8way_ctx.skein );
   jh512_8way_init( &quark_8way_ctx.jh );
   keccak512_8way_init( &quark_8way_ctx.keccak );
#if defined(__VAES__)
   groestl512_4way_init( &quark_8way_ctx.groestl, 64 );
#else
   init_groestl( &quark_8way_ctx.groestl, 64 );
#endif
}

void quark_8way_hash( void *state, const void *input )
@@ -49,7 +38,6 @@ void quark_8way_hash( void *state, const void *input )
   uint64_t vhash[8*8] __attribute__ ((aligned (128)));
   uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
   uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
   uint64_t vhashC[8*8] __attribute__ ((aligned (64)));
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -61,7 +49,6 @@ void quark_8way_hash( void *state, const void *input )
   __m512i* vh = (__m512i*)vhash;
   __m512i* vhA = (__m512i*)vhashA;
   __m512i* vhB = (__m512i*)vhashB;
   __m512i* vhC = (__m512i*)vhashC;
   __mmask8 vh_mask;
   quark_8way_ctx_holder ctx;
   const uint32_t mask = 8;
@@ -76,28 +63,23 @@ void quark_8way_hash( void *state, const void *input )
   bmw512_8way_update( &ctx.bmw, vhash, 64 );
   bmw512_8way_close( &ctx.bmw, vhash );

   // AVX 512 cmpeq returns a bit mask instead of a vector mask.
   // This should simplify things but the logic doesn't seem to be working.
   // The problem appears to be related to the test to skip a hash if it isn't
   // to be used. Skipping the test for all 8 way hashes seems to have
   // fixed it. The hash selection blending works if the hash is produced
   // but the hash wasn't being produced when it should.
   // Both decisions are based on the same data, the __mmask8. It works
   // as a blend mask but not in a logical comparison, maybe the type is the
   // problem. Maybe a cast to int or movm is needed to make it work.
   // It's now moot because the hash can only be skipped 1 in 256 iterations
   // when hashing parallel 8 ways.
   // The performance impact of the workaround should be negligible.
   // It's a problem for another day.

   vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
                                      zero );

#if defined(__VAES__)

   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

   if ( ( vh_mask & 0x0f ) != 0x0f )
   {
      groestl512_4way_init( &ctx.groestl, 64 );
      groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
   }
   if ( ( vh_mask & 0xf0 ) != 0xf0 )
   {
      groestl512_4way_init( &ctx.groestl, 64 );
      groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
   }
   rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 );

#else

   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 vhash, 512 );

@@ -149,31 +131,16 @@ void quark_8way_hash( void *state, const void *input )
                      (char*)hash7, 512 );
   }

   intrlv_8x64( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
   intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                hash7, 512 );

#endif

   if ( vh_mask & 0xff )
   {
      skein512_8way_update( &ctx.skein, vhash, 64 );
      skein512_8way_close( &ctx.skein, vhashB );
   }

   mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );

#if defined(__VAES__)

   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

   groestl512_4way_init( &ctx.groestl, 64 );
   groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
   groestl512_4way_init( &ctx.groestl, 64 );
   groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );

   rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );

#else
   mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );

   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 vhash, 512 );
@@ -198,8 +165,6 @@ void quark_8way_hash( void *state, const void *input )
   intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                512 );

#endif

   jh512_8way_update( &ctx.jh, vhash, 64 );
   jh512_8way_close( &ctx.jh, vhash );

@@ -338,10 +303,10 @@ void quark_4way_hash( void *state, const void *input )

   memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );

   blake512_4way_update( &ctx.blake, input, 80 );
   blake512_4way( &ctx.blake, input, 80 );
   blake512_4way_close( &ctx.blake, vhash );

   bmw512_4way_update( &ctx.bmw, vhash, 64 );
   bmw512_4way( &ctx.bmw, vhash, 64 );
   bmw512_4way_close( &ctx.bmw, vhash );

   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -376,7 +341,7 @@ void quark_4way_hash( void *state, const void *input )

   if ( mm256_anybits1( vh_mask ) )
   {
      skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhashB );
   }

@@ -395,7 +360,7 @@ void quark_4way_hash( void *state, const void *input )

   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

   jh512_4way_update( &ctx.jh, vhash, 64 );
   jh512_4way( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );

   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -403,24 +368,24 @@ void quark_4way_hash( void *state, const void *input )
   if ( mm256_anybits0( vh_mask ) )
   {
      blake512_4way_init( &ctx.blake );
      blake512_4way_update( &ctx.blake, vhash, 64 );
      blake512_4way( &ctx.blake, vhash, 64 );
      blake512_4way_close( &ctx.blake, vhashA );
   }

   if ( mm256_anybits1( vh_mask ) )
   {
      bmw512_4way_init( &ctx.bmw );
      bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhashB );
   }

   mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

   keccak512_4way_update( &ctx.keccak, vhash, 64 );
   keccak512_4way( &ctx.keccak, vhash, 64 );
   keccak512_4way_close( &ctx.keccak, vhash );

   skein512_4way_init( &ctx.skein );
   skein512_4way_update( &ctx.skein, vhash, 64 );
   skein512_4way( &ctx.skein, vhash, 64 );
   skein512_4way_close( &ctx.skein, vhash );

   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -428,14 +393,14 @@ void quark_4way_hash( void *state, const void *input )
   if ( mm256_anybits0( vh_mask ) )
   {
      keccak512_4way_init( &ctx.keccak );
      keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhashA );
   }

   if ( mm256_anybits1( vh_mask ) )
   {
      jh512_4way_init( &ctx.jh );
      jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhashB );
   }

@@ -15,7 +15,7 @@ bool register_quark_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_quark;
   gate->hash = (void*)&quark_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
   return true;
};

@@ -9,10 +9,6 @@
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"
#if defined(__VAES__)
#include "algo/shavite/shavite-hash-4way.h"
#include "algo/echo/echo-hash-4way.h"
#endif

#if defined(QUBIT_4WAY)

@@ -20,14 +16,10 @@ typedef struct
{
   luffa_4way_context luffa;
   cube_4way_context cube;
   simd_4way_context simd;
#if defined(__VAES__)
   shavite512_4way_context shavite;
   echo_4way_context echo;
#else
   sph_shavite512_context shavite;
   simd_4way_context simd;
   simd_2way_context simd2;
   hashState_echo echo;
#endif
} qubit_4way_ctx_holder;

qubit_4way_ctx_holder qubit_4way_ctx;
@@ -35,14 +27,10 @@ qubit_4way_ctx_holder qubit_4way_ctx;
void init_qubit_4way_ctx()
{
   cube_4way_init( &qubit_4way_ctx.cube, 512, 16, 32 );
   sph_shavite512_init(&qubit_4way_ctx.shavite);
   simd_4way_init( &qubit_4way_ctx.simd, 512 );
#if defined(__VAES__)
   shavite512_4way_init( &qubit_4way_ctx.shavite );
   echo_4way_init( &qubit_4way_ctx.echo, 512 );
#else
   sph_shavite512_init( &qubit_4way_ctx.shavite );
   init_echo( &qubit_4way_ctx.echo, 512 );
#endif
   simd_2way_init( &qubit_4way_ctx.simd2, 512 );
   init_echo(&qubit_4way_ctx.echo, 512);
};

void qubit_4way_hash( void *output, const void *input )
@@ -60,13 +48,6 @@ void qubit_4way_hash( void *output, const void *input )
   luffa_4way_close( &ctx.luffa, vhash );

   cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );

#if defined(__VAES__)

   shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 );

#else

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );

   sph_shavite512( &ctx.shavite, hash0, 64 );
@@ -85,44 +66,31 @@ void qubit_4way_hash( void *output, const void *input )
   sph_shavite512_close( &ctx.shavite, hash3 );

   intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );

#endif

   simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );

#if defined(__VAES__)

   echo_4way_update_close( &ctx.echo, vhash, vhash, 512 );

   dintrlv_4x128( output, output+32, output+64, output+96, vhash, 256 );

#else

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );

   update_final_echo( &ctx.echo, (BitSequence*)hash0,
                      (const BitSequence*)hash0, 512 );
   update_final_echo( &ctx.echo, (BitSequence *)hash0,
                      (const BitSequence *) hash0, 512 );
   memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
   update_final_echo( &ctx.echo, (BitSequence*)hash1,
                      (const BitSequence*)hash1, 512 );
   update_final_echo( &ctx.echo, (BitSequence *)hash1,
                      (const BitSequence *) hash1, 512 );
   memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
   update_final_echo( &ctx.echo, (BitSequence*)hash2,
                      (const BitSequence*)hash2, 512 );
   update_final_echo( &ctx.echo, (BitSequence *)hash2,
                      (const BitSequence *) hash2, 512 );
   memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
   update_final_echo( &ctx.echo, (BitSequence*)hash3,
                      (const BitSequence*)hash3, 512 );
   update_final_echo( &ctx.echo, (BitSequence *)hash3,
                      (const BitSequence *) hash3, 512 );

   memcpy( output, hash0, 32 );
   memcpy( output+32, hash1, 32 );
   memcpy( output+64, hash2, 32 );
   memcpy( output+96, hash3, 32 );
#endif
}

int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t hash[8*4] __attribute__ ((aligned (128)));
   uint32_t hash[4*8] __attribute__ ((aligned (128)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;

@@ -16,7 +16,7 @@ bool register_qubit_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_qubit;
   gate->hash = (void*)&qubit_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
   return true;
};

@@ -7,147 +7,16 @@
#include "ripemd-hash-4way.h"

#define LBRY_INPUT_SIZE 112
#define LBRY_MIDSTATE 96
#define LBRY_MIDSTATE 64
#define LBRY_TAIL (LBRY_INPUT_SIZE) - (LBRY_MIDSTATE)

#if defined(LBRY_16WAY)

static __thread sha256_16way_context sha256_16w_mid;

void lbry_16way_hash( void* output, const void* input )
{
   uint32_t _ALIGN(128) vhashA[16<<4];
   uint32_t _ALIGN(64) vhashB[16<<4];
   uint32_t _ALIGN(64) vhashC[16<<4];
   uint32_t _ALIGN(64) h0[32];
   uint32_t _ALIGN(64) h1[32];
   uint32_t _ALIGN(64) h2[32];
   uint32_t _ALIGN(64) h3[32];
   uint32_t _ALIGN(64) h4[32];
   uint32_t _ALIGN(64) h5[32];
   uint32_t _ALIGN(64) h6[32];
   uint32_t _ALIGN(64) h7[32];
   uint32_t _ALIGN(64) h8[32];
   uint32_t _ALIGN(64) h9[32];
   uint32_t _ALIGN(64) h10[32];
   uint32_t _ALIGN(64) h11[32];
   uint32_t _ALIGN(64) h12[32];
   uint32_t _ALIGN(64) h13[32];
   uint32_t _ALIGN(64) h14[32];
   uint32_t _ALIGN(64) h15[32];
   sha256_16way_context ctx_sha256 __attribute__ ((aligned (64)));
   sha512_8way_context ctx_sha512;
   ripemd160_16way_context ctx_ripemd;

   memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) );
   sha256_16way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
   sha256_16way_close( &ctx_sha256, vhashA );

   sha256_16way_init( &ctx_sha256 );
   sha256_16way_update( &ctx_sha256, vhashA, 32 );
   sha256_16way_close( &ctx_sha256, vhashA );

   // reinterleave to do sha512 4-way 64 bit twice.
   dintrlv_16x32( h0, h1, h2, h3, h4, h5, h6, h7,
                  h8, h9, h10, h11, h12, h13, h14, h15, vhashA, 256 );
   intrlv_8x64( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 256 );
   intrlv_8x64( vhashB, h8, h9, h10, h11, h12, h13, h14, h15, 256 );

   sha512_8way_init( &ctx_sha512 );
   sha512_8way_update( &ctx_sha512, vhashA, 32 );
   sha512_8way_close( &ctx_sha512, vhashA );

   sha512_8way_init( &ctx_sha512 );
   sha512_8way_update( &ctx_sha512, vhashB, 32 );
   sha512_8way_close( &ctx_sha512, vhashB );

   // back to 8-way 32 bit
   dintrlv_8x64( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 512 );
   dintrlv_8x64( h8, h9, h10, h11, h12, h13, h14, h15, vhashB, 512 );
   intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7,
                 h8, h9, h10, h11, h12, h13, h14, h15, 512 );

   ripemd160_16way_init( &ctx_ripemd );
   ripemd160_16way_update( &ctx_ripemd, vhashA, 32 );
   ripemd160_16way_close( &ctx_ripemd, vhashB );

   ripemd160_16way_init( &ctx_ripemd );
   ripemd160_16way_update( &ctx_ripemd, vhashA+(8<<4), 32 );
   ripemd160_16way_close( &ctx_ripemd, vhashC );

   sha256_16way_init( &ctx_sha256 );
   sha256_16way_update( &ctx_sha256, vhashB, 20 );
   sha256_16way_update( &ctx_sha256, vhashC, 20 );
   sha256_16way_close( &ctx_sha256, vhashA );

   sha256_16way_init( &ctx_sha256 );
   sha256_16way_update( &ctx_sha256, vhashA, 32 );
   sha256_16way_close( &ctx_sha256, output );
}
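// Editor's note: an illustrative scalar model, not from this commit, of the
// lane reinterleaving used above. The intrlv/dintrlv helpers store word i of
// lane l at interleaved index i*LANES+l, so moving between 16x32 and 8x64
// layouts is just a change of word size and lane count; names are
// illustrative only.
//
//    for ( int i = 0; i < words_per_lane; i++ )
//       for ( int l = 0; l < LANES; l++ )
//          dst[ i*LANES + l ] = lane[l][i];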

int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t hash[8*16] __attribute__ ((aligned (128)));
   uint32_t vdata[32*16] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t edata[32] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[7<<4]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[27];
   const uint32_t first_nonce = pdata[27];
   const uint32_t last_nonce = max_nonce - 16;
   const uint32_t Htarg = ptarget[7];
   __m512i *noncev = (__m512i*)vdata + 27;   // aligned
   int thr_id = mythr->id;  // thr_id arg is deprecated

   // we need bigendian data...
   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
   casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
   casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
   casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
   intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
      edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );

   sha256_16way_init( &sha256_16w_mid );
   sha256_16way_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE );

   do
   {
      *noncev = mm512_bswap_32( _mm512_set_epi32(
                          n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
                          n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
      lbry_16way_hash( hash, vdata );

      for ( int i = 0; i < 16; i++ )
      if ( unlikely( hash7[ i ] <= Htarg ) )
      {
         // deinterleave hash for lane
         extr_lane_16x32( lane_hash, hash, i, 256 );
         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
         {
            pdata[27] = n + i;
            submit_lane_solution( work, lane_hash, mythr, i );
         }
      }
      n += 16;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
   *hashes_done = n - first_nonce;
   return 0;
}

#elif defined(LBRY_8WAY)
#if defined(LBRY_8WAY)

static __thread sha256_8way_context sha256_8w_mid;

void lbry_8way_hash( void* output, const void* input )
{
   uint32_t _ALIGN(128) vhashA[16<<3];
   uint32_t _ALIGN(64) vhashA[16<<3];
   uint32_t _ALIGN(64) vhashB[16<<3];
   uint32_t _ALIGN(64) vhashC[16<<3];
   uint32_t _ALIGN(32) h0[32];
@@ -163,11 +32,11 @@ void lbry_8way_hash( void* output, const void* input )
   ripemd160_8way_context ctx_ripemd;

   memcpy( &ctx_sha256, &sha256_8w_mid, sizeof(ctx_sha256) );
   sha256_8way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
   sha256_8way( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
   sha256_8way_close( &ctx_sha256, vhashA );

   sha256_8way_init( &ctx_sha256 );
   sha256_8way_update( &ctx_sha256, vhashA, 32 );
   sha256_8way( &ctx_sha256, vhashA, 32 );
   sha256_8way_close( &ctx_sha256, vhashA );

   // reinterleave to do sha512 4-way 64 bit twice.
@@ -176,11 +45,11 @@ void lbry_8way_hash( void* output, const void* input )
   intrlv_4x64( vhashB, h4, h5, h6, h7, 256 );

   sha512_4way_init( &ctx_sha512 );
   sha512_4way_update( &ctx_sha512, vhashA, 32 );
   sha512_4way( &ctx_sha512, vhashA, 32 );
   sha512_4way_close( &ctx_sha512, vhashA );

   sha512_4way_init( &ctx_sha512 );
   sha512_4way_update( &ctx_sha512, vhashB, 32 );
   sha512_4way( &ctx_sha512, vhashB, 32 );
   sha512_4way_close( &ctx_sha512, vhashB );

   // back to 8-way 32 bit
@@ -189,20 +58,20 @@ void lbry_8way_hash( void* output, const void* input )
   intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );

   ripemd160_8way_init( &ctx_ripemd );
   ripemd160_8way_update( &ctx_ripemd, vhashA, 32 );
   ripemd160_8way( &ctx_ripemd, vhashA, 32 );
   ripemd160_8way_close( &ctx_ripemd, vhashB );

   ripemd160_8way_init( &ctx_ripemd );
   ripemd160_8way_update( &ctx_ripemd, vhashA+(8<<3), 32 );
   ripemd160_8way( &ctx_ripemd, vhashA+(8<<3), 32 );
   ripemd160_8way_close( &ctx_ripemd, vhashC );

   sha256_8way_init( &ctx_sha256 );
   sha256_8way_update( &ctx_sha256, vhashB, 20 );
   sha256_8way_update( &ctx_sha256, vhashC, 20 );
   sha256_8way( &ctx_sha256, vhashB, 20 );
   sha256_8way( &ctx_sha256, vhashC, 20 );
   sha256_8way_close( &ctx_sha256, vhashA );

   sha256_8way_init( &ctx_sha256 );
   sha256_8way_update( &ctx_sha256, vhashA, 32 );
   sha256_8way( &ctx_sha256, vhashA, 32 );
   sha256_8way_close( &ctx_sha256, output );
}

@@ -212,16 +81,21 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[32*8] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t edata[32] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[7<<3]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[27];
   const uint32_t first_nonce = pdata[27];
   const uint32_t Htarg = ptarget[7];
   uint32_t edata[32] __attribute__ ((aligned (64)));
   __m256i *noncev = (__m256i*)vdata + 27;   // aligned
   int thr_id = mythr->id;  // thr_id arg is deprecated

   uint64_t htmax[] = {          0,        0xF,       0xFF,
                             0xFFF,     0xFFFF, 0x10000000 };
   uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                        0xFFFFF000, 0xFFFF0000,          0 };

   // we need bigendian data...
   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
@@ -232,30 +106,33 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
   casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
   casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
   intrlv_8x32( vdata, edata, edata, edata, edata,
                edata, edata, edata, edata, 1024 );

                edata, edata, edata, edata, 1024 );
   sha256_8way_init( &sha256_8w_mid );
   sha256_8way_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
   sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE );

   do
   for ( int m = 0; m < sizeof(masks); m++ ) if ( Htarg <= htmax[m] )
   {
      *noncev = mm256_bswap_32( _mm256_set_epi32(
                                 n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
      lbry_8way_hash( hash, vdata );

      for ( int i = 0; i < 8; i++ )
      if ( unlikely( hash7[ i ] <= Htarg ) )
      uint32_t mask = masks[m];
      do
      {
         // deinterleave hash for lane
         extr_lane_8x32( lane_hash, hash, i, 256 );
         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         *noncev = mm256_bswap_32( _mm256_set_epi32(
                                    n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
         lbry_8way_hash( hash, vdata );

         for ( int i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
         {
            pdata[27] = n + i;
            submit_lane_solution( work, lane_hash, mythr, i );
            // deinterleave hash for lane
            extr_lane_8x32( lane_hash, hash, i, 256 );
            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
            {
               pdata[27] = n + i;
               submit_lane_solution( work, lane_hash, mythr, i );
            }
         }
      }
      n += 8;
   } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
         n += 8;
      } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
      break;
   }
   *hashes_done = n - first_nonce + 1;
   return 0;
}

@@ -98,23 +98,16 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }

bool register_lbry_algo( algo_gate_t* gate )
{
// gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#if defined (LBRY_16WAY)
   gate->scanhash = (void*)&scanhash_lbry_16way;
   gate->hash = (void*)&lbry_16way_hash;
   gate->optimizations = AVX2_OPT | AVX512_OPT;
#elif defined (LBRY_8WAY)
   gate->optimizations = AVX2_OPT | SHA_OPT;
#if defined (LBRY_8WAY)
   gate->scanhash = (void*)&scanhash_lbry_8way;
   gate->hash = (void*)&lbry_8way_hash;
   gate->optimizations = AVX2_OPT | AVX512_OPT;
#elif defined (LBRY_4WAY)
   gate->scanhash = (void*)&scanhash_lbry_4way;
   gate->hash = (void*)&lbry_4way_hash;
   gate->optimizations = AVX2_OPT | AVX512_OPT;
#else
   gate->scanhash = (void*)&scanhash_lbry;
   gate->hash = (void*)&lbry_hash;
   gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#endif
   gate->calc_network_diff = (void*)&lbry_calc_network_diff;
   gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;

@@ -4,19 +4,11 @@
#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LBRY_16WAY 1
#elif defined(__AVX2__)
#define LBRY_8WAY 1
#endif
/*
#if !defined(__SHA__)
#if defined(__AVX2__)
#define LBRY_8WAY
#endif
#endif
*/

#define LBRY_NTIME_INDEX 25
#define LBRY_NBITS_INDEX 26
@@ -26,23 +18,18 @@

bool register_lbry_algo( algo_gate_t* gate );

#if defined(LBRY_16WAY)

void lbry_16way_hash( void *state, const void *input );
int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(LBRY_8WAY)
#if defined(LBRY_8WAY)

void lbry_8way_hash( void *state, const void *input );
int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

/*
#elif defined(LBRY_4WAY)

void lbry_4way_hash( void *state, const void *input );
int scanhash_lbry_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done );

*/
#else

void lbry_hash( void *state, const void *input );

@@ -80,6 +80,9 @@ int scanhash_lbry( struct work *work, uint32_t max_nonce,
   // we need bigendian data...
   swab32_array( endiandata, pdata, 32 );

#ifdef DEBUG_ALGO
   printf("[%d] Htarg=%X\n", thr_id, Htarg);
#endif
   for (int m=0; m < sizeof(masks); m++) {
      if (Htarg <= htmax[m]) {
         uint32_t mask = masks[m];
@@ -87,11 +90,23 @@ int scanhash_lbry( struct work *work, uint32_t max_nonce,
            pdata[27] = ++n;
            be32enc(&endiandata[27], n);
            lbry_hash(hash64, &endiandata);
#ifndef DEBUG_ALGO
            if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
               pdata[27] = n;
               submit_solution( work, hash64, mythr );
               *hashes_done = n - first_nonce + 1;
               return true;
            }
         } while ( (n < max_nonce -8) && !work_restart[thr_id].restart);
#else
            if (!(n % 0x1000) && !thr_id) printf(".");
            if (!(hash64[7] & mask)) {
               printf("[%d]",thr_id);
               if (fulltest(hash64, ptarget)) {
                  *hashes_done = n - first_nonce + 1;
                  return true;
               }
            }
#endif
         } while (n < max_nonce && !work_restart[thr_id].restart);
         // see blake.c if else to understand the loop on htmax => mask
         break;
      }
   }
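// Editor's note: a hedged sketch of the htmax => mask idea referenced above
// (see blake.c). The high word of the target selects the cheapest bit mask
// that can reject candidate hashes before the full 256-bit fulltest;
// hash() and submit() below are illustrative placeholders.
//
//    for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
//    {
//       uint32_t mask = masks[m];   // e.g. 0xFFFFFF00 when Htarg <= 0xFF
//       do {
//          hash( hash64, data );
//          if ( !( hash64[7] & mask ) && fulltest( hash64, ptarget ) )
//             submit( ... );
//       } while ( more nonces );
//       break;
//    }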
@@ -259,8 +259,7 @@ void ripemd160_4way_init( ripemd160_4way_context *sc )
   sc->count_high = sc->count_low = 0;
}

void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
                            size_t len )
void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len )
{
   __m128i *vdata = (__m128i*)data;
   size_t ptr;
@@ -560,8 +559,7 @@ void ripemd160_8way_init( ripemd160_8way_context *sc )
   sc->count_high = sc->count_low = 0;
}

void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
                            size_t len )
void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len )
{
   __m256i *vdata = (__m256i*)data;
   size_t ptr;
@@ -625,303 +623,3 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )

#endif // __AVX2__

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

// RIPEMD-160 16 way

#define F16W_1(x, y, z) \
   _mm512_xor_si512( _mm512_xor_si512( x, y ), z )

#define F16W_2(x, y, z) \
   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( y, z ), x ), z )

#define F16W_3(x, y, z) \
   _mm512_xor_si512( _mm512_or_si512( x, mm512_not( y ) ), z )

#define F16W_4(x, y, z) \
   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( x, y ), z ), y )

#define F16W_5(x, y, z) \
   _mm512_xor_si512( x, _mm512_or_si512( y, mm512_not( z ) ) )

#define RR_16W(a, b, c, d, e, f, s, r, k) \
do{ \
   a = _mm512_add_epi32( mm512_rol_32( _mm512_add_epi32( _mm512_add_epi32( \
                   _mm512_add_epi32( a, f( b ,c, d ) ), r ), \
                   m512_const1_64( k ) ), s ), e ); \
   c = mm512_rol_32( c, 10 );\
} while (0)
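// Editor's note: RR_16W above is the standard scalar RIPEMD-160 round
// operation applied to 16 independent 32-bit lanes per __m512i register.
// A scalar model for one lane, assuming rol() rotates a 32-bit word left:
//
//    a = rol( a + f( b, c, d ) + x + K, s ) + e;
//    c = rol( c, 10 );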

#define ROUND1_16W(a, b, c, d, e, f, s, r, k) \
   RR_16W(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)

#define ROUND2_16W(a, b, c, d, e, f, s, r, k) \
   RR_16W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)

static void ripemd160_16way_round( ripemd160_16way_context *sc )
{
   const __m512i *in = (__m512i*)sc->buf;
   __m512i *h = (__m512i*)sc->val;
   register __m512i A1, B1, C1, D1, E1;
   register __m512i A2, B2, C2, D2, E2;
   __m512i tmp;

   A1 = A2 = h[0];
   B1 = B2 = h[1];
   C1 = C2 = h[2];
   D1 = D2 = h[3];
   E1 = E2 = h[4];

   ROUND1_16W( A, B, C, D, E, F16W_1, 11, in[ 0], 1 );
   ROUND1_16W( E, A, B, C, D, F16W_1, 14, in[ 1], 1 );
   ROUND1_16W( D, E, A, B, C, F16W_1, 15, in[ 2], 1 );
   ROUND1_16W( C, D, E, A, B, F16W_1, 12, in[ 3], 1 );
   ROUND1_16W( B, C, D, E, A, F16W_1, 5, in[ 4], 1 );
   ROUND1_16W( A, B, C, D, E, F16W_1, 8, in[ 5], 1 );
   ROUND1_16W( E, A, B, C, D, F16W_1, 7, in[ 6], 1 );
   ROUND1_16W( D, E, A, B, C, F16W_1, 9, in[ 7], 1 );
   ROUND1_16W( C, D, E, A, B, F16W_1, 11, in[ 8], 1 );
   ROUND1_16W( B, C, D, E, A, F16W_1, 13, in[ 9], 1 );
   ROUND1_16W( A, B, C, D, E, F16W_1, 14, in[10], 1 );
   ROUND1_16W( E, A, B, C, D, F16W_1, 15, in[11], 1 );
   ROUND1_16W( D, E, A, B, C, F16W_1, 6, in[12], 1 );
   ROUND1_16W( C, D, E, A, B, F16W_1, 7, in[13], 1 );
   ROUND1_16W( B, C, D, E, A, F16W_1, 9, in[14], 1 );
   ROUND1_16W( A, B, C, D, E, F16W_1, 8, in[15], 1 );

   ROUND1_16W( E, A, B, C, D, F16W_2, 7, in[ 7], 2 );
   ROUND1_16W( D, E, A, B, C, F16W_2, 6, in[ 4], 2 );
   ROUND1_16W( C, D, E, A, B, F16W_2, 8, in[13], 2 );
   ROUND1_16W( B, C, D, E, A, F16W_2, 13, in[ 1], 2 );
   ROUND1_16W( A, B, C, D, E, F16W_2, 11, in[10], 2 );
   ROUND1_16W( E, A, B, C, D, F16W_2, 9, in[ 6], 2 );
   ROUND1_16W( D, E, A, B, C, F16W_2, 7, in[15], 2 );
   ROUND1_16W( C, D, E, A, B, F16W_2, 15, in[ 3], 2 );
   ROUND1_16W( B, C, D, E, A, F16W_2, 7, in[12], 2 );
   ROUND1_16W( A, B, C, D, E, F16W_2, 12, in[ 0], 2 );
   ROUND1_16W( E, A, B, C, D, F16W_2, 15, in[ 9], 2 );
   ROUND1_16W( D, E, A, B, C, F16W_2, 9, in[ 5], 2 );
   ROUND1_16W( C, D, E, A, B, F16W_2, 11, in[ 2], 2 );
   ROUND1_16W( B, C, D, E, A, F16W_2, 7, in[14], 2 );
   ROUND1_16W( A, B, C, D, E, F16W_2, 13, in[11], 2 );
   ROUND1_16W( E, A, B, C, D, F16W_2, 12, in[ 8], 2 );

   ROUND1_16W( D, E, A, B, C, F16W_3, 11, in[ 3], 3 );
   ROUND1_16W( C, D, E, A, B, F16W_3, 13, in[10], 3 );
   ROUND1_16W( B, C, D, E, A, F16W_3, 6, in[14], 3 );
   ROUND1_16W( A, B, C, D, E, F16W_3, 7, in[ 4], 3 );
   ROUND1_16W( E, A, B, C, D, F16W_3, 14, in[ 9], 3 );
   ROUND1_16W( D, E, A, B, C, F16W_3, 9, in[15], 3 );
   ROUND1_16W( C, D, E, A, B, F16W_3, 13, in[ 8], 3 );
   ROUND1_16W( B, C, D, E, A, F16W_3, 15, in[ 1], 3 );
   ROUND1_16W( A, B, C, D, E, F16W_3, 14, in[ 2], 3 );
   ROUND1_16W( E, A, B, C, D, F16W_3, 8, in[ 7], 3 );
   ROUND1_16W( D, E, A, B, C, F16W_3, 13, in[ 0], 3 );
   ROUND1_16W( C, D, E, A, B, F16W_3, 6, in[ 6], 3 );
   ROUND1_16W( B, C, D, E, A, F16W_3, 5, in[13], 3 );
   ROUND1_16W( A, B, C, D, E, F16W_3, 12, in[11], 3 );
   ROUND1_16W( E, A, B, C, D, F16W_3, 7, in[ 5], 3 );
   ROUND1_16W( D, E, A, B, C, F16W_3, 5, in[12], 3 );

   ROUND1_16W( C, D, E, A, B, F16W_4, 11, in[ 1], 4 );
   ROUND1_16W( B, C, D, E, A, F16W_4, 12, in[ 9], 4 );
   ROUND1_16W( A, B, C, D, E, F16W_4, 14, in[11], 4 );
   ROUND1_16W( E, A, B, C, D, F16W_4, 15, in[10], 4 );
   ROUND1_16W( D, E, A, B, C, F16W_4, 14, in[ 0], 4 );
   ROUND1_16W( C, D, E, A, B, F16W_4, 15, in[ 8], 4 );
   ROUND1_16W( B, C, D, E, A, F16W_4, 9, in[12], 4 );
   ROUND1_16W( A, B, C, D, E, F16W_4, 8, in[ 4], 4 );
   ROUND1_16W( E, A, B, C, D, F16W_4, 9, in[13], 4 );
   ROUND1_16W( D, E, A, B, C, F16W_4, 14, in[ 3], 4 );
   ROUND1_16W( C, D, E, A, B, F16W_4, 5, in[ 7], 4 );
   ROUND1_16W( B, C, D, E, A, F16W_4, 6, in[15], 4 );
   ROUND1_16W( A, B, C, D, E, F16W_4, 8, in[14], 4 );
   ROUND1_16W( E, A, B, C, D, F16W_4, 6, in[ 5], 4 );
   ROUND1_16W( D, E, A, B, C, F16W_4, 5, in[ 6], 4 );
   ROUND1_16W( C, D, E, A, B, F16W_4, 12, in[ 2], 4 );

   ROUND1_16W( B, C, D, E, A, F16W_5, 9, in[ 4], 5 );
   ROUND1_16W( A, B, C, D, E, F16W_5, 15, in[ 0], 5 );
   ROUND1_16W( E, A, B, C, D, F16W_5, 5, in[ 5], 5 );
   ROUND1_16W( D, E, A, B, C, F16W_5, 11, in[ 9], 5 );
   ROUND1_16W( C, D, E, A, B, F16W_5, 6, in[ 7], 5 );
   ROUND1_16W( B, C, D, E, A, F16W_5, 8, in[12], 5 );
   ROUND1_16W( A, B, C, D, E, F16W_5, 13, in[ 2], 5 );
   ROUND1_16W( E, A, B, C, D, F16W_5, 12, in[10], 5 );
   ROUND1_16W( D, E, A, B, C, F16W_5, 5, in[14], 5 );
   ROUND1_16W( C, D, E, A, B, F16W_5, 12, in[ 1], 5 );
   ROUND1_16W( B, C, D, E, A, F16W_5, 13, in[ 3], 5 );
   ROUND1_16W( A, B, C, D, E, F16W_5, 14, in[ 8], 5 );
   ROUND1_16W( E, A, B, C, D, F16W_5, 11, in[11], 5 );
   ROUND1_16W( D, E, A, B, C, F16W_5, 8, in[ 6], 5 );
   ROUND1_16W( C, D, E, A, B, F16W_5, 5, in[15], 5 );
   ROUND1_16W( B, C, D, E, A, F16W_5, 6, in[13], 5 );

   ROUND2_16W( A, B, C, D, E, F16W_5, 8, in[ 5], 1 );
   ROUND2_16W( E, A, B, C, D, F16W_5, 9, in[14], 1 );
   ROUND2_16W( D, E, A, B, C, F16W_5, 9, in[ 7], 1 );
   ROUND2_16W( C, D, E, A, B, F16W_5, 11, in[ 0], 1 );
   ROUND2_16W( B, C, D, E, A, F16W_5, 13, in[ 9], 1 );
   ROUND2_16W( A, B, C, D, E, F16W_5, 15, in[ 2], 1 );
   ROUND2_16W( E, A, B, C, D, F16W_5, 15, in[11], 1 );
   ROUND2_16W( D, E, A, B, C, F16W_5, 5, in[ 4], 1 );
   ROUND2_16W( C, D, E, A, B, F16W_5, 7, in[13], 1 );
   ROUND2_16W( B, C, D, E, A, F16W_5, 7, in[ 6], 1 );
   ROUND2_16W( A, B, C, D, E, F16W_5, 8, in[15], 1 );
   ROUND2_16W( E, A, B, C, D, F16W_5, 11, in[ 8], 1 );
   ROUND2_16W( D, E, A, B, C, F16W_5, 14, in[ 1], 1 );
   ROUND2_16W( C, D, E, A, B, F16W_5, 14, in[10], 1 );
   ROUND2_16W( B, C, D, E, A, F16W_5, 12, in[ 3], 1 );
   ROUND2_16W( A, B, C, D, E, F16W_5, 6, in[12], 1 );

   ROUND2_16W( E, A, B, C, D, F16W_4, 9, in[ 6], 2 );
   ROUND2_16W( D, E, A, B, C, F16W_4, 13, in[11], 2 );
   ROUND2_16W( C, D, E, A, B, F16W_4, 15, in[ 3], 2 );
   ROUND2_16W( B, C, D, E, A, F16W_4, 7, in[ 7], 2 );
   ROUND2_16W( A, B, C, D, E, F16W_4, 12, in[ 0], 2 );
   ROUND2_16W( E, A, B, C, D, F16W_4, 8, in[13], 2 );
   ROUND2_16W( D, E, A, B, C, F16W_4, 9, in[ 5], 2 );
   ROUND2_16W( C, D, E, A, B, F16W_4, 11, in[10], 2 );
   ROUND2_16W( B, C, D, E, A, F16W_4, 7, in[14], 2 );
   ROUND2_16W( A, B, C, D, E, F16W_4, 7, in[15], 2 );
   ROUND2_16W( E, A, B, C, D, F16W_4, 12, in[ 8], 2 );
   ROUND2_16W( D, E, A, B, C, F16W_4, 7, in[12], 2 );
   ROUND2_16W( C, D, E, A, B, F16W_4, 6, in[ 4], 2 );
   ROUND2_16W( B, C, D, E, A, F16W_4, 15, in[ 9], 2 );
   ROUND2_16W( A, B, C, D, E, F16W_4, 13, in[ 1], 2 );
   ROUND2_16W( E, A, B, C, D, F16W_4, 11, in[ 2], 2 );

   ROUND2_16W( D, E, A, B, C, F16W_3, 9, in[15], 3 );
   ROUND2_16W( C, D, E, A, B, F16W_3, 7, in[ 5], 3 );
   ROUND2_16W( B, C, D, E, A, F16W_3, 15, in[ 1], 3 );
   ROUND2_16W( A, B, C, D, E, F16W_3, 11, in[ 3], 3 );
   ROUND2_16W( E, A, B, C, D, F16W_3, 8, in[ 7], 3 );
   ROUND2_16W( D, E, A, B, C, F16W_3, 6, in[14], 3 );
   ROUND2_16W( C, D, E, A, B, F16W_3, 6, in[ 6], 3 );
   ROUND2_16W( B, C, D, E, A, F16W_3, 14, in[ 9], 3 );
   ROUND2_16W( A, B, C, D, E, F16W_3, 12, in[11], 3 );
   ROUND2_16W( E, A, B, C, D, F16W_3, 13, in[ 8], 3 );
   ROUND2_16W( D, E, A, B, C, F16W_3, 5, in[12], 3 );
   ROUND2_16W( C, D, E, A, B, F16W_3, 14, in[ 2], 3 );
   ROUND2_16W( B, C, D, E, A, F16W_3, 13, in[10], 3 );
   ROUND2_16W( A, B, C, D, E, F16W_3, 13, in[ 0], 3 );
   ROUND2_16W( E, A, B, C, D, F16W_3, 7, in[ 4], 3 );
   ROUND2_16W( D, E, A, B, C, F16W_3, 5, in[13], 3 );

   ROUND2_16W( C, D, E, A, B, F16W_2, 15, in[ 8], 4 );
   ROUND2_16W( B, C, D, E, A, F16W_2, 5, in[ 6], 4 );
   ROUND2_16W( A, B, C, D, E, F16W_2, 8, in[ 4], 4 );
   ROUND2_16W( E, A, B, C, D, F16W_2, 11, in[ 1], 4 );
   ROUND2_16W( D, E, A, B, C, F16W_2, 14, in[ 3], 4 );
   ROUND2_16W( C, D, E, A, B, F16W_2, 14, in[11], 4 );
   ROUND2_16W( B, C, D, E, A, F16W_2, 6, in[15], 4 );
   ROUND2_16W( A, B, C, D, E, F16W_2, 14, in[ 0], 4 );
   ROUND2_16W( E, A, B, C, D, F16W_2, 6, in[ 5], 4 );
   ROUND2_16W( D, E, A, B, C, F16W_2, 9, in[12], 4 );
   ROUND2_16W( C, D, E, A, B, F16W_2, 12, in[ 2], 4 );
   ROUND2_16W( B, C, D, E, A, F16W_2, 9, in[13], 4 );
   ROUND2_16W( A, B, C, D, E, F16W_2, 12, in[ 9], 4 );
   ROUND2_16W( E, A, B, C, D, F16W_2, 5, in[ 7], 4 );
   ROUND2_16W( D, E, A, B, C, F16W_2, 15, in[10], 4 );
   ROUND2_16W( C, D, E, A, B, F16W_2, 8, in[14], 4 );

   ROUND2_16W( B, C, D, E, A, F16W_1, 8, in[12], 5 );
   ROUND2_16W( A, B, C, D, E, F16W_1, 5, in[15], 5 );
   ROUND2_16W( E, A, B, C, D, F16W_1, 12, in[10], 5 );
   ROUND2_16W( D, E, A, B, C, F16W_1, 9, in[ 4], 5 );
   ROUND2_16W( C, D, E, A, B, F16W_1, 12, in[ 1], 5 );
   ROUND2_16W( B, C, D, E, A, F16W_1, 5, in[ 5], 5 );
   ROUND2_16W( A, B, C, D, E, F16W_1, 14, in[ 8], 5 );
   ROUND2_16W( E, A, B, C, D, F16W_1, 6, in[ 7], 5 );
   ROUND2_16W( D, E, A, B, C, F16W_1, 8, in[ 6], 5 );
   ROUND2_16W( C, D, E, A, B, F16W_1, 13, in[ 2], 5 );
   ROUND2_16W( B, C, D, E, A, F16W_1, 6, in[13], 5 );
   ROUND2_16W( A, B, C, D, E, F16W_1, 5, in[14], 5 );
   ROUND2_16W( E, A, B, C, D, F16W_1, 15, in[ 0], 5 );
|
||||
ROUND2_16W( D, E, A, B, C, F16W_1, 13, in[ 3], 5 );
|
||||
ROUND2_16W( C, D, E, A, B, F16W_1, 11, in[ 9], 5 );
|
||||
ROUND2_16W( B, C, D, E, A, F16W_1, 11, in[11], 5 );
|
||||
|
||||
tmp = _mm512_add_epi32( _mm512_add_epi32( h[1], C1 ), D2 );
|
||||
h[1] = _mm512_add_epi32( _mm512_add_epi32( h[2], D1 ), E2 );
|
||||
h[2] = _mm512_add_epi32( _mm512_add_epi32( h[3], E1 ), A2 );
|
||||
h[3] = _mm512_add_epi32( _mm512_add_epi32( h[4], A1 ), B2 );
|
||||
h[4] = _mm512_add_epi32( _mm512_add_epi32( h[0], B1 ), C2 );
|
||||
h[0] = tmp;
|
||||
}
|
||||
|
||||
void ripemd160_16way_init( ripemd160_16way_context *sc )
|
||||
{
|
||||
sc->val[0] = m512_const1_64( 0x6745230167452301 );
|
||||
sc->val[1] = m512_const1_64( 0xEFCDAB89EFCDAB89 );
|
||||
sc->val[2] = m512_const1_64( 0x98BADCFE98BADCFE );
|
||||
sc->val[3] = m512_const1_64( 0x1032547610325476 );
|
||||
sc->val[4] = m512_const1_64( 0xC3D2E1F0C3D2E1F0 );
|
||||
sc->count_high = sc->count_low = 0;
|
||||
}
|
||||
|
||||
void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
size_t ptr;
|
||||
const int block_size = 64;
|
||||
|
||||
ptr = (unsigned)sc->count_low & (block_size - 1U);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
uint32_t clow, clow2;
|
||||
|
||||
clen = block_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_512( sc->buf + (ptr>>2), vdata, clen>>2 );
|
||||
vdata = vdata + (clen>>2);
|
||||
ptr += clen;
|
||||
len -= clen;
|
||||
if ( ptr == block_size )
|
||||
{
|
||||
ripemd160_16way_round( sc );
|
||||
ptr = 0;
|
||||
}
|
||||
clow = sc->count_low;
|
||||
clow2 = clow + clen;
|
||||
sc->count_low = clow2;
|
||||
if ( clow2 < clow )
|
||||
sc->count_high++;
|
||||
}
|
||||
}
|
||||
|
||||
void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
|
||||
{
|
||||
unsigned ptr, u;
|
||||
uint32_t low, high;
|
||||
const int block_size = 64;
|
||||
const int pad = block_size - 8;
|
||||
|
||||
ptr = (unsigned)sc->count_low & ( block_size - 1U);
|
||||
sc->buf[ ptr>>2 ] = m512_const1_32( 0x80 );
|
||||
ptr += 4;
|
||||
|
||||
if ( ptr > pad )
|
||||
{
|
||||
memset_zero_512( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
|
||||
ripemd160_16way_round( sc );
|
||||
memset_zero_512( sc->buf, pad>>2 );
|
||||
}
|
||||
else
|
||||
memset_zero_512( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
|
||||
|
||||
low = sc->count_low;
|
||||
high = (sc->count_high << 3) | (low >> 29);
|
||||
low = low << 3;
|
||||
sc->buf[ pad>>2 ] = _mm512_set1_epi32( low );
|
||||
sc->buf[ (pad>>2) + 1 ] = _mm512_set1_epi32( high );
|
||||
ripemd160_16way_round( sc );
|
||||
for (u = 0; u < 5; u ++)
|
||||
casti_m512i( dst, u ) = sc->val[u];
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
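For reference, a minimal sketch of how the 16-way RIPEMD-160 interface above might be driven. Only the init/update/close signatures come from this file; the wrapper name and the assumption that the caller has already interleaved 16 messages 32 bits at a time into one vector stream are illustrative.

   // Hypothetical wrapper: 'in' holds 16 lanes of input interleaved
   // 32-bit-wise, 'len' is bytes per lane (as in update above).
   void ripemd160_16x_hash( void *out, const void *in, size_t len )
   {
      ripemd160_16way_context ctx;
      ripemd160_16way_init( &ctx );            // load the five RIPEMD IVs
      ripemd160_16way_update( &ctx, in, len );
      ripemd160_16way_close( &ctx, out );      // pad, finalize, write 5 __m512i
   }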
@@ -16,8 +16,7 @@ typedef struct
} __attribute__ ((aligned (64))) ripemd160_4way_context;

void ripemd160_4way_init( ripemd160_4way_context *sc );
void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
                            size_t len );
void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len );
void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst );

#if defined (__AVX2__)
@@ -27,28 +26,13 @@ typedef struct
   __m256i buf[64>>2];
   __m256i val[5];
   uint32_t count_high, count_low;
} __attribute__ ((aligned (128))) ripemd160_8way_context;
} __attribute__ ((aligned (64))) ripemd160_8way_context;

void ripemd160_8way_init( ripemd160_8way_context *sc );
void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
                            size_t len );
void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len );
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

typedef struct
{
   __m512i buf[64>>2];
   __m512i val[5];
   uint32_t count_high, count_low;
} __attribute__ ((aligned (128))) ripemd160_16way_context;

void ripemd160_16way_init( ripemd160_16way_context *sc );
void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
                             size_t len );
void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst );

#endif // AVX512
#endif // __AVX2__
#endif // __SSE4_2__
#endif // RIPEMD_HASH_4WAY_H__
@@ -41,9 +41,13 @@
#define SHA2_HASH_4WAY_H__ 1

#include <stddef.h>
#include "sph_types.h"
#include "simd-utils.h"

#if defined(__SSE2__)
//#if defined(__SSE4_2__)

//#define SPH_SIZE_sha256 256

// SHA-256 4 way
@@ -52,15 +56,12 @@ typedef struct {
   __m128i val[8];
   uint32_t count_high, count_low;
   bool initialized;
} sha256_4way_context __attribute__ ((aligned (64)));
} sha256_4way_context;

void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way_update( sha256_4way_context *sc, const void *data,
                         size_t len );
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );

#endif // SSE2

#if defined (__AVX2__)

// SHA-256 8 way
@@ -70,32 +71,13 @@ typedef struct {
   __m256i val[8];
   uint32_t count_high, count_low;
   bool initialized;
} sha256_8way_context __attribute__ ((aligned (128)));
} sha256_8way_context;

void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way_close( sha256_8way_context *sc, void *dst );

#endif // AVX2

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

// SHA-256 16 way

typedef struct {
   __m512i buf[64>>2];
   __m512i val[8];
   uint32_t count_high, count_low;
   bool initialized;
} sha256_16way_context __attribute__ ((aligned (128)));

void sha256_16way_init( sha256_16way_context *sc );
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
void sha256_16way_close( sha256_16way_context *sc, void *dst );

#endif // AVX512

#if defined (__AVX2__)
//#define SPH_SIZE_sha512 512

// SHA-512 4 way
@@ -104,31 +86,30 @@ typedef struct {
   __m256i val[8];
   uint64_t count;
   bool initialized;
} sha512_4way_context __attribute__ ((aligned (128)));
} sha512_4way_context;

void sha512_4way_init( sha512_4way_context *sc );
void sha512_4way_update( sha512_4way_context *sc, const void *data,
                         size_t len );
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
void sha512_4way_close( sha512_4way_context *sc, void *dst );

#endif // AVX2

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

// SHA-512 8 way

// SHA-256 11 way hybrid
// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel.
typedef struct {
   __m512i buf[128>>3];
   __m512i val[8];
   uint64_t count;
   bool initialized;
} sha512_8way_context __attribute__ ((aligned (128)));
   __m256i bufx[64>>2];
   __m256i valx[8];
   __m64 bufy[64>>2];
   __m64 valy[8];
   uint32_t bufz[64>>2];
   uint32_t valz[8];
   uint32_t count_high, count_low;
} sha256_11way_context;

void sha512_8way_init( sha512_8way_context *sc );
void sha512_8way_update( sha512_8way_context *sc, const void *data,
                         size_t len );
void sha512_8way_close( sha512_8way_context *sc, void *dst );

#endif // AVX512
void sha256_11way_init( sha256_11way_context *ctx );
void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
                          const void *datay, const void *dataz, size_t len );
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
                          void *dstz );

#endif // __AVX2__
#endif // __SSE2__
#endif // SHA256_4WAY_H__
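A note on the 11-way hybrid declared above: it carries three independent SHA-256 states side by side, so one pass over the data hashes 8 AVX2 lanes, 2 MMX lanes and 1 scalar lane (8 + 2 + 1 = 11). A hedged sketch of a call site, with the input and output names invented for illustration only:

   // datax: 8 streams interleaved for the __m256i lanes, datay: 2 streams
   // interleaved for the __m64 lanes, dataz: 1 plain stream; all three
   // advance by the same number of bytes per lane in lockstep.
   sha256_11way_context ctx;
   sha256_11way_init( &ctx );
   sha256_11way_update( &ctx, datax, datay, dataz, 80 );
   sha256_11way_close( &ctx, hashx, hashy, hashz );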
@@ -39,31 +39,47 @@
// SHA-256 32 bit

/*
static const uint32_t H256[8] =
{
   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
static const sph_u32 H256[8] = {
   SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
   SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
   SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
   SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
};
*/

static const uint32_t K256[64] =
{
   0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
   0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
   0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
   0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
   0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
   0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
   0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
   0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
   0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
   0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
   0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
   0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
   0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
   0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
   0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
   0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
static const sph_u32 K256[64] = {
   SPH_C32(0x428A2F98), SPH_C32(0x71374491),
   SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5),
   SPH_C32(0x3956C25B), SPH_C32(0x59F111F1),
   SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5),
   SPH_C32(0xD807AA98), SPH_C32(0x12835B01),
   SPH_C32(0x243185BE), SPH_C32(0x550C7DC3),
   SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE),
   SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174),
   SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786),
   SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC),
   SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA),
   SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA),
   SPH_C32(0x983E5152), SPH_C32(0xA831C66D),
   SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7),
   SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147),
   SPH_C32(0x06CA6351), SPH_C32(0x14292967),
   SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138),
   SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13),
   SPH_C32(0x650A7354), SPH_C32(0x766A0ABB),
   SPH_C32(0x81C2C92E), SPH_C32(0x92722C85),
   SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B),
   SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3),
   SPH_C32(0xD192E819), SPH_C32(0xD6990624),
   SPH_C32(0xF40E3585), SPH_C32(0x106AA070),
   SPH_C32(0x19A4C116), SPH_C32(0x1E376C08),
   SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5),
   SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A),
   SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3),
   SPH_C32(0x748F82EE), SPH_C32(0x78A5636F),
   SPH_C32(0x84C87814), SPH_C32(0x8CC70208),
   SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB),
   SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2)
};

// SHA-256 4 way
@@ -232,7 +248,7 @@ void sha256_4way_init( sha256_4way_context *sc )
*/
}

void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )
{
   __m128i *vdata = (__m128i*)data;
   size_t ptr;
@@ -257,7 +273,7 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
      ptr = 0;
   }
   clow = sc->count_low;
   clow2 = clow + clen;
   clow2 = SPH_T32( clow + clen );
   sc->count_low = clow2;
   if ( clow2 < clow )
      sc->count_high++;
@@ -290,8 +306,10 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )

   sc->buf[ pad >> 2 ] =
         mm128_bswap_32( m128_const1_32( high ) );
//       mm128_bswap_32( _mm_set1_epi32( high ) );
   sc->buf[ ( pad+4 ) >> 2 ] =
         mm128_bswap_32( m128_const1_32( low ) );
//       mm128_bswap_32( _mm_set1_epi32( low ) );
   sha256_4way_round( sc, sc->buf, sc->val );

   mm128_block_bswap_32( dst, sc->val );
@@ -465,7 +483,7 @@ void sha256_8way_init( sha256_8way_context *sc )
*/
}

void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )
{
   __m256i *vdata = (__m256i*)data;
   size_t ptr;
@@ -490,7 +508,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
      ptr = 0;
   }
   clow = sc->count_low;
   clow2 = clow + clen;
   clow2 = SPH_T32( clow + clen );
   sc->count_low = clow2;
   if ( clow2 < clow )
      sc->count_high++;
@@ -531,233 +549,5 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
   mm256_block_bswap_32( dst, sc->val );
}


#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

// SHA-256 16 way

#define CHx16(X, Y, Z) \
   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z )

#define MAJx16(X, Y, Z) \
   _mm512_or_si512( _mm512_and_si512( X, Y ), \
                    _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )

#define BSG2_0x16(x) \
   _mm512_xor_si512( _mm512_xor_si512( \
        mm512_ror_32(x, 2), mm512_ror_32(x, 13) ), mm512_ror_32( x, 22) )

#define BSG2_1x16(x) \
   _mm512_xor_si512( _mm512_xor_si512( \
        mm512_ror_32(x, 6), mm512_ror_32(x, 11) ), mm512_ror_32( x, 25) )

#define SSG2_0x16(x) \
   _mm512_xor_si512( _mm512_xor_si512( \
        mm512_ror_32(x, 7), mm512_ror_32(x, 18) ), _mm512_srli_epi32(x, 3) )

#define SSG2_1x16(x) \
   _mm512_xor_si512( _mm512_xor_si512( \
        mm512_ror_32(x, 17), mm512_ror_32(x, 19) ), _mm512_srli_epi32(x, 10) )

#define SHA2x16_MEXP( a, b, c, d ) \
   mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] );

#define SHA2s_16WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
   __m512i T1, T2; \
   __m512i K = _mm512_set1_epi32( K256[( (j)+(i) )] ); \
   T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \
                                            K, W[i] ) ); \
   T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
   D  = _mm512_add_epi32( D, T1 ); \
   H  = _mm512_add_epi32( T1, T2 ); \
} while (0)

static void
sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] )
{
   register __m512i A, B, C, D, E, F, G, H;
   __m512i W[16];

   mm512_block_bswap_32( W  , in );
   mm512_block_bswap_32( W+8, in+8 );

   if ( ctx->initialized )
   {
      A = r[0];
      B = r[1];
      C = r[2];
      D = r[3];
      E = r[4];
      F = r[5];
      G = r[6];
      H = r[7];
   }
   else
   {
      A = m512_const1_64( 0x6A09E6676A09E667 );
      B = m512_const1_64( 0xBB67AE85BB67AE85 );
      C = m512_const1_64( 0x3C6EF3723C6EF372 );
      D = m512_const1_64( 0xA54FF53AA54FF53A );
      E = m512_const1_64( 0x510E527F510E527F );
      F = m512_const1_64( 0x9B05688C9B05688C );
      G = m512_const1_64( 0x1F83D9AB1F83D9AB );
      H = m512_const1_64( 0x5BE0CD195BE0CD19 );
   }

   SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  0, 0 );
   SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  1, 0 );
   SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F,  2, 0 );
   SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E,  3, 0 );
   SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D,  4, 0 );
   SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C,  5, 0 );
   SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B,  6, 0 );
   SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A,  7, 0 );
   SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  8, 0 );
   SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  9, 0 );
   SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
   SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
   SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
   SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
   SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
   SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );

   for ( int j = 16; j < 64; j += 16 )
   {
      W[ 0] = SHA2x16_MEXP( 14,  9,  1,  0 );
      W[ 1] = SHA2x16_MEXP( 15, 10,  2,  1 );
      W[ 2] = SHA2x16_MEXP(  0, 11,  3,  2 );
      W[ 3] = SHA2x16_MEXP(  1, 12,  4,  3 );
      W[ 4] = SHA2x16_MEXP(  2, 13,  5,  4 );
      W[ 5] = SHA2x16_MEXP(  3, 14,  6,  5 );
      W[ 6] = SHA2x16_MEXP(  4, 15,  7,  6 );
      W[ 7] = SHA2x16_MEXP(  5,  0,  8,  7 );
      W[ 8] = SHA2x16_MEXP(  6,  1,  9,  8 );
      W[ 9] = SHA2x16_MEXP(  7,  2, 10,  9 );
      W[10] = SHA2x16_MEXP(  8,  3, 11, 10 );
      W[11] = SHA2x16_MEXP(  9,  4, 12, 11 );
      W[12] = SHA2x16_MEXP( 10,  5, 13, 12 );
      W[13] = SHA2x16_MEXP( 11,  6, 14, 13 );
      W[14] = SHA2x16_MEXP( 12,  7, 15, 14 );
      W[15] = SHA2x16_MEXP( 13,  8,  0, 15 );

      SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  0, j );
      SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  1, j );
      SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F,  2, j );
      SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E,  3, j );
      SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D,  4, j );
      SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C,  5, j );
      SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B,  6, j );
      SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A,  7, j );
      SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  8, j );
      SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  9, j );
      SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
      SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
      SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
      SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
      SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
      SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
   }

   if ( ctx->initialized )
   {
      r[0] = _mm512_add_epi32( r[0], A );
      r[1] = _mm512_add_epi32( r[1], B );
      r[2] = _mm512_add_epi32( r[2], C );
      r[3] = _mm512_add_epi32( r[3], D );
      r[4] = _mm512_add_epi32( r[4], E );
      r[5] = _mm512_add_epi32( r[5], F );
      r[6] = _mm512_add_epi32( r[6], G );
      r[7] = _mm512_add_epi32( r[7], H );
   }
   else
   {
      ctx->initialized = true;
      r[0] = _mm512_add_epi32( A, m512_const1_64( 0x6A09E6676A09E667 ) );
      r[1] = _mm512_add_epi32( B, m512_const1_64( 0xBB67AE85BB67AE85 ) );
      r[2] = _mm512_add_epi32( C, m512_const1_64( 0x3C6EF3723C6EF372 ) );
      r[3] = _mm512_add_epi32( D, m512_const1_64( 0xA54FF53AA54FF53A ) );
      r[4] = _mm512_add_epi32( E, m512_const1_64( 0x510E527F510E527F ) );
      r[5] = _mm512_add_epi32( F, m512_const1_64( 0x9B05688C9B05688C ) );
      r[6] = _mm512_add_epi32( G, m512_const1_64( 0x1F83D9AB1F83D9AB ) );
      r[7] = _mm512_add_epi32( H, m512_const1_64( 0x5BE0CD195BE0CD19 ) );
   }
}

void sha256_16way_init( sha256_16way_context *sc )
{
   sc->initialized = false;
   sc->count_high = sc->count_low = 0;
}


void sha256_16way_update( sha256_16way_context *sc, const void *data,
                          size_t len )
{
   __m512i *vdata = (__m512i*)data;
   size_t ptr;
   const int buf_size = 64;

   ptr = (unsigned)sc->count_low & (buf_size - 1U);
   while ( len > 0 )
   {
      size_t clen;
      uint32_t clow, clow2;

      clen = buf_size - ptr;
      if ( clen > len )
         clen = len;
      memcpy_512( sc->buf + (ptr>>2), vdata, clen>>2 );
      vdata = vdata + (clen>>2);
      ptr += clen;
      len -= clen;
      if ( ptr == buf_size )
      {
         sha256_16way_round( sc, sc->buf, sc->val );
         ptr = 0;
      }
      clow = sc->count_low;
      clow2 = clow + clen;
      sc->count_low = clow2;
      if ( clow2 < clow )
         sc->count_high++;
   }
}

void sha256_16way_close( sha256_16way_context *sc, void *dst )
{
   unsigned ptr;
   uint32_t low, high;
   const int buf_size = 64;
   const int pad = buf_size - 8;

   ptr = (unsigned)sc->count_low & (buf_size - 1U);
   sc->buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
   ptr += 4;

   if ( ptr > pad )
   {
      memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
      sha256_16way_round( sc, sc->buf, sc->val );
      memset_zero_512( sc->buf, pad >> 2 );
   }
   else
      memset_zero_512( sc->buf + (ptr>>2), (pad - ptr) >> 2 );

   low = sc->count_low;
   high = (sc->count_high << 3) | (low >> 29);
   low = low << 3;

   sc->buf[ pad >> 2 ] =
         mm512_bswap_32( m512_const1_32( high ) );
   sc->buf[ ( pad+4 ) >> 2 ] =
         mm512_bswap_32( m512_const1_32( low ) );

   sha256_16way_round( sc, sc->buf, sc->val );

   mm512_block_bswap_32( dst, sc->val );
}

#endif // AVX512
#endif // __AVX2__
#endif // __SSE2__
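The count handling in the close routines above is the usual Merkle-Damgård length encoding. In scalar terms (shown only as a worked illustration, not code from this file), the 64-bit byte counter held in count_high:count_low becomes a 64-bit bit count, i.e. a multiply by 8 carried across the two 32-bit words:

   // Shifting the word pair left by 3 bits turns the byte count into
   // the bit count that SHA-256 padding stores in the final block.
   uint32_t low  = count_low;
   uint32_t high = ( count_high << 3 ) | ( low >> 29 );
   low <<= 3;   // low word of the bit count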
@@ -15,19 +15,19 @@ void sha256q_8way_hash( void* output, const void* input )
   sha256_8way_context ctx;
   memcpy( &ctx, &sha256_ctx8, sizeof ctx );

   sha256_8way_update( &ctx, input + (64<<3), 16 );
   sha256_8way( &ctx, input + (64<<3), 16 );
   sha256_8way_close( &ctx, vhash );

   sha256_8way_init( &ctx );
   sha256_8way_update( &ctx, vhash, 32 );
   sha256_8way( &ctx, vhash, 32 );
   sha256_8way_close( &ctx, vhash );

   sha256_8way_init( &ctx );
   sha256_8way_update( &ctx, vhash, 32 );
   sha256_8way( &ctx, vhash, 32 );
   sha256_8way_close( &ctx, vhash );

   sha256_8way_init( &ctx );
   sha256_8way_update( &ctx, vhash, 32 );
   sha256_8way( &ctx, vhash, 32 );
   sha256_8way_close( &ctx, output );
}
@@ -61,7 +61,7 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
   // Need big endian data
   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   sha256_8way_init( &sha256_ctx8 );
   sha256_8way_update( &sha256_ctx8, vdata, 64 );
   sha256_8way( &sha256_ctx8, vdata, 64 );

   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
   {
@@ -108,19 +108,19 @@ void sha256q_4way_hash( void* output, const void* input )
   sha256_4way_context ctx;
   memcpy( &ctx, &sha256_ctx4, sizeof ctx );

   sha256_4way_update( &ctx, input + (64<<2), 16 );
   sha256_4way( &ctx, input + (64<<2), 16 );
   sha256_4way_close( &ctx, vhash );

   sha256_4way_init( &ctx );
   sha256_4way_update( &ctx, vhash, 32 );
   sha256_4way( &ctx, vhash, 32 );
   sha256_4way_close( &ctx, vhash );

   sha256_4way_init( &ctx );
   sha256_4way_update( &ctx, vhash, 32 );
   sha256_4way( &ctx, vhash, 32 );
   sha256_4way_close( &ctx, vhash );

   sha256_4way_init( &ctx );
   sha256_4way_update( &ctx, vhash, 32 );
   sha256_4way( &ctx, vhash, 32 );
   sha256_4way_close( &ctx, output );
}
@@ -154,7 +154,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,

   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   sha256_4way_init( &sha256_ctx4 );
   sha256_4way_update( &sha256_ctx4, vdata, 64 );
   sha256_4way( &sha256_ctx4, vdata, 64 );

   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
   {
@@ -15,15 +15,15 @@ void sha256t_8way_hash( void* output, const void* input )
   sha256_8way_context ctx;
   memcpy( &ctx, &sha256_ctx8, sizeof ctx );

   sha256_8way_update( &ctx, input + (64<<3), 16 );
   sha256_8way( &ctx, input + (64<<3), 16 );
   sha256_8way_close( &ctx, vhash );

   sha256_8way_init( &ctx );
   sha256_8way_update( &ctx, vhash, 32 );
   sha256_8way( &ctx, vhash, 32 );
   sha256_8way_close( &ctx, vhash );

   sha256_8way_init( &ctx );
   sha256_8way_update( &ctx, vhash, 32 );
   sha256_8way( &ctx, vhash, 32 );
   sha256_8way_close( &ctx, output );
}
@@ -59,7 +59,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
   // Need big endian data
   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   sha256_8way_init( &sha256_ctx8 );
   sha256_8way_update( &sha256_ctx8, vdata, 64 );
   sha256_8way( &sha256_ctx8, vdata, 64 );

   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
   {
@@ -101,15 +101,15 @@ void sha256t_4way_hash( void* output, const void* input )
   sha256_4way_context ctx;
   memcpy( &ctx, &sha256_ctx4, sizeof ctx );

   sha256_4way_update( &ctx, input + (64<<2), 16 );
   sha256_4way( &ctx, input + (64<<2), 16 );
   sha256_4way_close( &ctx, vhash );

   sha256_4way_init( &ctx );
   sha256_4way_update( &ctx, vhash, 32 );
   sha256_4way( &ctx, vhash, 32 );
   sha256_4way_close( &ctx, vhash );

   sha256_4way_init( &ctx );
   sha256_4way_update( &ctx, vhash, 32 );
   sha256_4way( &ctx, vhash, 32 );
   sha256_4way_close( &ctx, output );
}
@@ -143,7 +143,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,

   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   sha256_4way_init( &sha256_ctx4 );
   sha256_4way_update( &sha256_ctx4, vdata, 64 );
   sha256_4way( &sha256_ctx4, vdata, 64 );

   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
   {
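All four scanhash routines above use the same midstate trick: the constant first 64 bytes of the 80-byte block header are hashed once into a static context, and each attempt then clones that state and absorbs only the final 16 bytes containing the nonce. In scalar pseudocode (the function and variable names here are illustrative, not from this file):

   sha256_init( &midstate );
   sha256_update( &midstate, header, 64 );      // once per work unit
   for ( ; nonce <= max_nonce; nonce++ )
   {
      memcpy( &ctx, &midstate, sizeof ctx );    // cheap state clone
      sha256_update( &ctx, header + 64, 16 );   // block with the nonce
      sha256_close( &ctx, hash );
   }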
@@ -36,290 +36,60 @@
#include <string.h>
#include "sha-hash-4way.h"

// SHA-512 4 way 64 bit

/*
static const uint64_t H512[8] =
{
   0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
   0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
   0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
   0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
static const sph_u64 H512[8] = {
   SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
   SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
   SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
   SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
*/

static const uint64_t K512[80] =
{
   0x428A2F98D728AE22, 0x7137449123EF65CD,
   0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
   0x3956C25BF348B538, 0x59F111F1B605D019,
   0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
   0xD807AA98A3030242, 0x12835B0145706FBE,
   0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
   0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1,
   0x9BDC06A725C71235, 0xC19BF174CF692694,
   0xE49B69C19EF14AD2, 0xEFBE4786384F25E3,
   0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
   0x2DE92C6F592B0275, 0x4A7484AA6EA6E483,
   0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
   0x983E5152EE66DFAB, 0xA831C66D2DB43210,
   0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
   0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
   0x06CA6351E003826F, 0x142929670A0E6E70,
   0x27B70A8546D22FFC, 0x2E1B21385C26C926,
   0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
   0x650A73548BAF63DE, 0x766A0ABB3C77B2A8,
   0x81C2C92E47EDAEE6, 0x92722C851482353B,
   0xA2BFE8A14CF10364, 0xA81A664BBC423001,
   0xC24B8B70D0F89791, 0xC76C51A30654BE30,
   0xD192E819D6EF5218, 0xD69906245565A910,
   0xF40E35855771202A, 0x106AA07032BBD1B8,
   0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
   0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
   0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB,
   0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
   0x748F82EE5DEFB2FC, 0x78A5636F43172F60,
   0x84C87814A1F0AB72, 0x8CC702081A6439EC,
   0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9,
   0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
   0xCA273ECEEA26619C, 0xD186B8C721C0C207,
   0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
   0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
   0x113F9804BEF90DAE, 0x1B710B35131C471B,
   0x28DB77F523047D84, 0x32CAAB7B40C72493,
   0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
   0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A,
   0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
static const sph_u64 K512[80] = {
   SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
   SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
   SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
   SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
   SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
   SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
   SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
   SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
   SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
   SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
   SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
   SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
   SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
   SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
   SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
   SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
   SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
   SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
   SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
   SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
   SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
   SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
   SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
   SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
   SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
   SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
   SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
   SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
   SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
   SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
   SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
   SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
   SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
   SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
   SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
   SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
   SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
   SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
   SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
   SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
};


#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

// SHA-512 8 way 64 bit

#define CH8W(X, Y, Z) \
   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z )

#define MAJ8W(X, Y, Z) \
   _mm512_or_si512( _mm512_and_si512( X, Y ), \
                    _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )

#define BSG8W_5_0(x) \
   _mm512_xor_si512( _mm512_xor_si512( \
        mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) )

#define BSG8W_5_1(x) \
   _mm512_xor_si512( _mm512_xor_si512( \
        mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) )

#define SSG8W_5_0(x) \
   _mm512_xor_si512( _mm512_xor_si512( \
        mm512_ror_64(x, 1), mm512_ror_64(x, 8) ), _mm512_srli_epi64(x, 7) )

#define SSG8W_5_1(x) \
   _mm512_xor_si512( _mm512_xor_si512( \
        mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) )

static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 )
{
   __m512i w0a, w1a, w0b, w1b;
   w0a = mm512_ror_64( w0, 1 );
   w1a = mm512_ror_64( w1,19 );
   w0b = mm512_ror_64( w0, 8 );
   w1b = mm512_ror_64( w1,61 );
   w0a = _mm512_xor_si512( w0a, w0b );
   w1a = _mm512_xor_si512( w1a, w1b );
   w0b = _mm512_srli_epi64( w0, 7 );
   w1b = _mm512_srli_epi64( w1, 6 );
   w0a = _mm512_xor_si512( w0a, w0b );
   w1a = _mm512_xor_si512( w1a, w1b );
   return _mm512_add_epi64( w0a, w1a );
}

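ssg8w_512_add above fuses the two SHA-512 small-sigma functions into a single routine so both message words share the same rotate/shift sequence. Its per-lane scalar equivalent, sketched here only for reference with an assumed rotate helper, is:

   static inline uint64_t ror64( uint64_t x, int c )
   {  return ( x >> c ) | ( x << ( 64 - c ) );  }

   // sigma0(w0) + sigma1(w1) from the SHA-512 message schedule,
   // matching W[i-15] and W[i-2] in the expansion loop below.
   static inline uint64_t ssg_add( uint64_t w0, uint64_t w1 )
   {
      uint64_t s0 = ror64( w0,  1 ) ^ ror64( w0,  8 ) ^ ( w0 >> 7 );
      uint64_t s1 = ror64( w1, 19 ) ^ ror64( w1, 61 ) ^ ( w1 >> 6 );
      return s0 + s1;
   }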
#define SSG8W_512x2_0( w0, w1, i ) do \
{ \
   __m512i X0a, X1a, X0b, X1b; \
   X0a = mm512_ror_64( W[i-15], 1 ); \
   X1a = mm512_ror_64( W[i-14], 1 ); \
   X0b = mm512_ror_64( W[i-15], 8 ); \
   X1b = mm512_ror_64( W[i-14], 8 ); \
   X0a = _mm512_xor_si512( X0a, X0b ); \
   X1a = _mm512_xor_si512( X1a, X1b ); \
   X0b = _mm512_srli_epi64( W[i-15], 7 ); \
   X1b = _mm512_srli_epi64( W[i-14], 7 ); \
   w0 = _mm512_xor_si512( X0a, X0b ); \
   w1 = _mm512_xor_si512( X1a, X1b ); \
} while(0)

#define SSG8W_512x2_1( w0, w1, i ) do \
{ \
   __m512i X0a, X1a, X0b, X1b; \
   X0a = mm512_ror_64( W[i-2],19 ); \
   X1a = mm512_ror_64( W[i-1],19 ); \
   X0b = mm512_ror_64( W[i-2],61 ); \
   X1b = mm512_ror_64( W[i-1],61 ); \
   X0a = _mm512_xor_si512( X0a, X0b ); \
   X1a = _mm512_xor_si512( X1a, X1b ); \
   X0b = _mm512_srli_epi64( W[i-2], 6 ); \
   X1b = _mm512_srli_epi64( W[i-1], 6 ); \
   w0 = _mm512_xor_si512( X0a, X0b ); \
   w1 = _mm512_xor_si512( X1a, X1b ); \
} while(0)

#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
   __m512i T1, T2; \
   __m512i K = _mm512_set1_epi64( K512[ i ] ); \
   T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \
                                            K, W[i] ) ); \
   T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \
   D  = _mm512_add_epi64( D, T1 ); \
   H  = _mm512_add_epi64( T1, T2 ); \
} while (0)

static void
sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
{
   int i;
   register __m512i A, B, C, D, E, F, G, H;
   __m512i W[80];

   mm512_block_bswap_64( W  , in );
   mm512_block_bswap_64( W+8, in+8 );

   for ( i = 16; i < 80; i++ )
      W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ),
                               _mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) );

   if ( ctx->initialized )
   {
      A = r[0];
      B = r[1];
      C = r[2];
      D = r[3];
      E = r[4];
      F = r[5];
      G = r[6];
      H = r[7];
   }
   else
   {
      A = m512_const1_64( 0x6A09E667F3BCC908 );
      B = m512_const1_64( 0xBB67AE8584CAA73B );
      C = m512_const1_64( 0x3C6EF372FE94F82B );
      D = m512_const1_64( 0xA54FF53A5F1D36F1 );
      E = m512_const1_64( 0x510E527FADE682D1 );
      F = m512_const1_64( 0x9B05688C2B3E6C1F );
      G = m512_const1_64( 0x1F83D9ABFB41BD6B );
      H = m512_const1_64( 0x5BE0CD19137E2179 );
   }

   for ( i = 0; i < 80; i += 8 )
   {
      SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
      SHA3_8WAY_STEP( H, A, B, C, D, E, F, G, i + 1 );
      SHA3_8WAY_STEP( G, H, A, B, C, D, E, F, i + 2 );
      SHA3_8WAY_STEP( F, G, H, A, B, C, D, E, i + 3 );
      SHA3_8WAY_STEP( E, F, G, H, A, B, C, D, i + 4 );
      SHA3_8WAY_STEP( D, E, F, G, H, A, B, C, i + 5 );
      SHA3_8WAY_STEP( C, D, E, F, G, H, A, B, i + 6 );
      SHA3_8WAY_STEP( B, C, D, E, F, G, H, A, i + 7 );
   }

   if ( ctx->initialized )
   {
      r[0] = _mm512_add_epi64( r[0], A );
      r[1] = _mm512_add_epi64( r[1], B );
      r[2] = _mm512_add_epi64( r[2], C );
      r[3] = _mm512_add_epi64( r[3], D );
      r[4] = _mm512_add_epi64( r[4], E );
      r[5] = _mm512_add_epi64( r[5], F );
      r[6] = _mm512_add_epi64( r[6], G );
      r[7] = _mm512_add_epi64( r[7], H );
   }
   else
   {
      ctx->initialized = true;
      r[0] = _mm512_add_epi64( A, m512_const1_64( 0x6A09E667F3BCC908 ) );
      r[1] = _mm512_add_epi64( B, m512_const1_64( 0xBB67AE8584CAA73B ) );
      r[2] = _mm512_add_epi64( C, m512_const1_64( 0x3C6EF372FE94F82B ) );
      r[3] = _mm512_add_epi64( D, m512_const1_64( 0xA54FF53A5F1D36F1 ) );
      r[4] = _mm512_add_epi64( E, m512_const1_64( 0x510E527FADE682D1 ) );
      r[5] = _mm512_add_epi64( F, m512_const1_64( 0x9B05688C2B3E6C1F ) );
      r[6] = _mm512_add_epi64( G, m512_const1_64( 0x1F83D9ABFB41BD6B ) );
      r[7] = _mm512_add_epi64( H, m512_const1_64( 0x5BE0CD19137E2179 ) );
   }
}

void sha512_8way_init( sha512_8way_context *sc )
{
   sc->initialized = false;
   sc->count = 0;
}

void sha512_8way_update( sha512_8way_context *sc, const void *data, size_t len )
{
   __m512i *vdata = (__m512i*)data;
   size_t ptr;
   const int buf_size = 128;

   ptr = (unsigned)sc->count & (buf_size - 1U);
   while ( len > 0 )
   {
      size_t clen;
      clen = buf_size - ptr;
      if ( clen > len )
         clen = len;
      memcpy_512( sc->buf + (ptr>>3), vdata, clen>>3 );
      vdata = vdata + (clen>>3);
      ptr += clen;
      len -= clen;
      if ( ptr == buf_size )
      {
         sha512_8way_round( sc, sc->buf, sc->val );
         ptr = 0;
      }
      sc->count += clen;
   }
}

void sha512_8way_close( sha512_8way_context *sc, void *dst )
{
   unsigned ptr;
   const int buf_size = 128;
   const int pad = buf_size - 16;
   const __m512i shuff_bswap64 = m512_const_64(
                    0x38393a3b3c3d3e3f, 0x3031323334353637,
                    0x28292a2b2c2d2e2f, 0x2021222324252627,
                    0x18191a1b1c1d1e1f, 0x1011121314151617,
                    0x08090a0b0c0d0e0f, 0x0001020304050607 );

   ptr = (unsigned)sc->count & (buf_size - 1U);
   sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 );
   ptr += 8;
   if ( ptr > pad )
   {
      memset_zero_512( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );
      sha512_8way_round( sc, sc->buf, sc->val );
      memset_zero_512( sc->buf, pad >> 3 );
   }
   else
      memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );

   sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
                 _mm512_set1_epi64( sc->count >> 61 ), shuff_bswap64 );
   sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
                 _mm512_set1_epi64( sc->count << 3 ), shuff_bswap64 );
   sha512_8way_round( sc, sc->buf, sc->val );

   mm512_block_bswap_64( dst, sc->val );
}


#endif // AVX512

// SHA-512 4 way 64 bit

#define CH(X, Y, Z) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
@@ -484,7 +254,7 @@ void sha512_4way_init( sha512_4way_context *sc )
   sc->count = 0;
}

void sha512_4way_update( sha512_4way_context *sc, const void *data, size_t len )
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
{
   __m256i *vdata = (__m256i*)data;
   size_t ptr;
@@ -97,7 +97,7 @@ void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,

void shabal512_4way_init( void *cc );
void shabal512_4way_update( void *cc, const void *data, size_t len );
//#define shabal512_4way shabal512_4way_update
#define shabal512_4way shabal512_4way_update
void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                       void *dst );
@@ -3,12 +3,6 @@

#include <stdio.h>

// This implementation is deprecated, superseded by VAES in Icelake
// which provides HW based 4 way aes.
// It was created for AVX2 to eliminate interleaving between the
// preceding and following function.
// This code can be removed when current users have reverted to one way.

#if defined(__AVX2__)

@@ -1,399 +0,0 @@
|
||||
#include "shavite-hash-4way.h"
|
||||
#include <stdint.h>
|
||||
|
||||
static const uint32_t IV512[] =
|
||||
{
|
||||
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
|
||||
0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
|
||||
0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
|
||||
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
|
||||
};
|
||||
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define mm512_ror2x512hi_1x32( a, b ) \
|
||||
_mm512_mask_blend_epi32( 0x8888, mm512_ror128_32( a ), \
|
||||
mm512_ror128_32( b ) )
|
||||
|
||||
static void
|
||||
c512_4way( shavite512_4way_context *ctx, const void *msg )
|
||||
{
|
||||
register __m512i X;
|
||||
register __m512i P0, P1, P2, P3;
|
||||
register __m512i K0, K1, K2, K3, K4, K5, K6, K7;
|
||||
__m512i *M = (__m512i*)msg;
|
||||
__m512i *H = (__m512i*)ctx->h;
|
||||
int r;
|
||||
|
||||
P0 = H[0];
|
||||
P1 = H[1];
|
||||
P2 = H[2];
|
||||
P3 = H[3];
|
||||
|
||||
K0 = M[0];
|
||||
K1 = M[1];
|
||||
K2 = M[2];
|
||||
K3 = M[3];
|
||||
K4 = M[4];
|
||||
K5 = M[5];
|
||||
K6 = M[6];
|
||||
K7 = M[7];
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
|
||||
|
||||
P0 = _mm512_xor_si512( P0, X );
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
|
||||
|
||||
P2 = _mm512_xor_si512( P2, X );
|
||||
|
||||
// round
|
||||
for ( r = 0; r < 3; r ++ )
|
||||
{
|
||||
// round 1, 5, 9
|
||||
|
||||
K0 = _mm512_xor_si512( K7, mm512_ror128_32(
|
||||
_mm512_aesenc_epi128( K0, m512_zero ) ) );
|
||||
|
||||
if ( r == 0 )
|
||||
K0 = _mm512_xor_si512( K0, _mm512_set4_epi32(
|
||||
~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
|
||||
K1 = _mm512_xor_si512( K0,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
|
||||
|
||||
if ( r == 1 )
|
||||
K1 = _mm512_xor_si512( K1, _mm512_set4_epi32(
|
||||
~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
K2 = _mm512_xor_si512( K1,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
|
||||
K3 = _mm512_xor_si512( K2,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
|
||||
|
||||
P3 = _mm512_xor_si512( P3, X );
|
||||
|
||||
K4 = _mm512_xor_si512( K3,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
|
||||
K5 = _mm512_xor_si512( K4,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
|
||||
K6 = _mm512_xor_si512( K5,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
|
||||
K7 = _mm512_xor_si512( K6,
|
||||
mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
|
||||
|
||||
if ( r == 2 )
|
||||
K7 = _mm512_xor_si512( K7, _mm512_set4_epi32(
|
||||
~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );
|
||||
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
|
||||
P1 = _mm512_xor_si512( P1, X );
|
||||
|
||||
// round 2, 6, 10
|
||||
|
||||
K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero );
|
||||
K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
|
||||
K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
|
||||
|
||||
P2 = _mm512_xor_si512( P2, X );
|
||||
|
||||
K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero );
|
||||
K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
|
||||
K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
|
||||
K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
|
||||
|
||||
P0 = _mm512_xor_si512( P0, X );
|
||||
|
||||
// round 3, 7, 11
|
||||
|
||||
K0 = _mm512_xor_si512( mm512_ror128_32(
|
||||
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero );
|
||||
K1 = _mm512_xor_si512( mm512_ror128_32(
|
||||
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
K2 = _mm512_xor_si512( mm512_ror128_32(
|
||||
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
|
||||
K3 = _mm512_xor_si512( mm512_ror128_32(
|
||||
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
|
||||
|
||||
P1 = _mm512_xor_si512( P1, X );
|
||||
|
||||
K4 = _mm512_xor_si512( mm512_ror128_32(
|
||||
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero );
|
||||
K5 = _mm512_xor_si512( mm512_ror128_32(
|
||||
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
|
||||
K6 = _mm512_xor_si512( mm512_ror128_32(
|
||||
_mm512_aesenc_epi128( K6, m512_zero ) ), K5 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
|
||||
K7 = _mm512_xor_si512( mm512_ror128_32(
|
||||
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
|
||||
|
||||
P3 = _mm512_xor_si512( P3, X );
|
||||
|
||||
// round 4, 8, 12
|
||||
|
||||
K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
|
||||
K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
|
||||
K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
|
||||
|
||||
P0 = _mm512_xor_si512( P0, X );
|
||||
|
||||
K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
|
||||
K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
|
||||
K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
|
||||
K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
|
||||
|
||||
P2 = _mm512_xor_si512( P2, X );
|
||||
}
|
||||
|
||||
// round 13
|
||||
|
||||
K0 = _mm512_xor_si512( mm512_ror128_32(
|
||||
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
|
||||
K1 = _mm512_xor_si512( mm512_ror128_32(
|
||||
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
|
||||
K2 = _mm512_xor_si512( mm512_ror128_32(
|
||||
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
|
||||
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
|
||||
K3 = _mm512_xor_si512( mm512_ror128_32(
|
||||
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
|
||||
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
   P3 = _mm512_xor_si512( P3, X );

   K4 = _mm512_xor_si512( mm512_ror128_32(
                  _mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
   K5 = _mm512_xor_si512( mm512_ror128_32(
                  _mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );

   K6 = mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
   K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32(
                  ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );

   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
   K7 = _mm512_xor_si512( mm512_ror128_32(
                  _mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );

   P1 = _mm512_xor_si512( P1, X );

   H[0] = _mm512_xor_si512( H[0], P2 );
   H[1] = _mm512_xor_si512( H[1], P3 );
   H[2] = _mm512_xor_si512( H[2], P0 );
   H[3] = _mm512_xor_si512( H[3], P1 );
}

void shavite512_4way_init( shavite512_4way_context *ctx )
{
   __m512i *h = (__m512i*)ctx->h;
   __m128i *iv = (__m128i*)IV512;

   h[0] = m512_const1_128( iv[0] );
   h[1] = m512_const1_128( iv[1] );
   h[2] = m512_const1_128( iv[2] );
   h[3] = m512_const1_128( iv[3] );

   ctx->ptr = 0;
   ctx->count0 = 0;
   ctx->count1 = 0;
   ctx->count2 = 0;
   ctx->count3 = 0;
}

// not tested, use update_close
void shavite512_4way_update( shavite512_4way_context *ctx, const void *data,
                             size_t len )
{
   unsigned char *buf = ctx->buf;
   size_t ptr = ctx->ptr;

   while ( len > 0 )
   {
      size_t clen;

      clen = (sizeof ctx->buf) - ptr;
      if ( clen > len << 2 )
         clen = len << 2;
      memcpy( buf + ptr, data, clen );
      data = (const unsigned char *)data + clen;
      ptr += clen;
      len -= clen >> 2;
      if ( ptr == sizeof ctx->buf )
      {
         if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
         {
            ctx->count1 = ctx->count1 + 1;
            if ( ctx->count1 == 0 )
            {
               ctx->count2 = ctx->count2 + 1;
               if ( ctx->count2 == 0 )
                  ctx->count3 = ctx->count3 + 1;
            }
         }
         c512_4way( ctx, buf );
         ptr = 0;
      }
   }
   ctx->ptr = ptr;
}

// not tested
void shavite512_4way_close( shavite512_4way_context *ctx, void *dst )
{
   unsigned char *buf;
   union
   {
      uint32_t u32[4];
      uint16_t u16[8];
   } count;

   buf = ctx->buf;
   uint32_t vp = ctx->ptr>>6;

   // Terminating byte then zero pad
   casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 );

   // Zero pad full vectors up to count
   for ( ; vp < 6; vp++ )
      casti_m512i( buf, vp ) = m512_zero;

   // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
   // Count is misaligned to 16 bits and straddles a vector.
   // Use u32 overlay to stage then u16 to load buf.
   count.u32[0] = ctx->count0 += (ctx->ptr << 1); // ptr/4 * 8
   count.u32[1] = ctx->count1;
   count.u32[2] = ctx->count2;
   count.u32[3] = ctx->count3;

   casti_m512i( buf, 6 ) = m512_const1_128(
                 _mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
   casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16(
                 0x0200, count.u16[7], count.u16[6], count.u16[5],
                 count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

   c512_4way( ctx, buf);

   casti_m512i( dst, 0 ) = casti_m512i( ctx->h, 0 );
   casti_m512i( dst, 1 ) = casti_m512i( ctx->h, 1 );
   casti_m512i( dst, 2 ) = casti_m512i( ctx->h, 2 );
   casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 );
}

void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
                                   const void *data, size_t len )
{
   unsigned char *buf = ctx->buf;
   size_t ptr = ctx->ptr;

   // process full blocks and load buf with remainder.
   while ( len > 0 )
   {
      size_t clen;

      clen = (sizeof ctx->buf) - ptr;
      if ( clen > len << 2 )
         clen = len << 2;
      memcpy( buf + ptr, data, clen );
      data = (const unsigned char *)data + clen;
      ptr += clen;
      len -= (clen >> 2);
      if ( ptr == sizeof ctx->buf )
      {
         if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
         {
            ctx->count1 = ctx->count1 + 1;
            if ( ctx->count1 == 0 )
            {
               ctx->count2 = ctx->count2 + 1;
               if ( ctx->count2 == 0 )
                  ctx->count3 = ctx->count3 + 1;
            }
         }
         c512_4way( ctx, buf );
         ptr = 0;
      }
   }

   uint32_t vp = ptr>>6;
   // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
   // Count is misaligned to 16 bits and straddles 2 vectors.
   // Use u32 overlay to stage then u16 to load buf.
   union
   {
      uint32_t u32[4];
      uint16_t u16[8];
   } count;

   count.u32[0] = ctx->count0 += (ptr << 1); // ptr/4 * 8
   count.u32[1] = ctx->count1;
   count.u32[2] = ctx->count2;
   count.u32[3] = ctx->count3;

   if ( vp == 0 )    // empty buf, xevan.
   {
      casti_m512i( buf, 0 ) = m512_const2_64( 0, 0x0000000000000080 );
      memset_zero_512( (__m512i*)buf + 1, 5 );
      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
   }
   else    // half full buf, everyone else.
   {
      casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 );
      memset_zero_512( (__m512i*)buf + vp, 6 - vp );
   }

   casti_m512i( buf, 6 ) = m512_const1_128(
                 _mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
   casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16(
                 0x0200, count.u16[7], count.u16[6], count.u16[5],
                 count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

   c512_4way( ctx, buf);

   casti_m512i( dst, 0 ) = casti_m512i( ctx->h, 0 );
   casti_m512i( dst, 1 ) = casti_m512i( ctx->h, 1 );
   casti_m512i( dst, 2 ) = casti_m512i( ctx->h, 2 );
   casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 );
}

#endif // VAES
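The close routines above stage the 128-bit bit counter through a u32/u16 union because the counter lands 16 bits into a 128-bit lane of the final block. A minimal scalar sketch of the same overlay trick, assuming a little-endian target as the vector code does (standalone illustration, not code from this commit):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   // Build the counter as four 32-bit words, then read it back as eight
   // 16-bit halves so it can be emitted at a 16-bit misaligned position.
   union { uint32_t u32[4]; uint16_t u16[8]; } count;

   count.u32[0] = 0x00000280;   // e.g. 640 bits hashed
   count.u32[1] = 0;
   count.u32[2] = 0;
   count.u32[3] = 0;

   // On little-endian machines u16[0] is the low half of u32[0], u16[1]
   // the high half, and so on, so u16[0..7] walks the counter in order.
   for ( int i = 0; i < 8; i++ )
      printf( "u16[%d] = 0x%04x\n", i, count.u16[i] );
   return 0;
}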
@@ -1,25 +0,0 @@
#ifndef SHAVITE_HASH_4WAY_H__
#define SHAVITE_HASH_4WAY_H__ 1

#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

#include "simd-utils.h"

typedef struct {
   unsigned char buf[128<<2];
   uint32_t h[16<<2];
   size_t ptr;
   uint32_t count0, count1, count2, count3;
} shavite512_4way_context __attribute__ ((aligned (64)));

void shavite512_4way_init( shavite512_4way_context *ctx );
void shavite512_4way_update( shavite512_4way_context *ctx, const void *data,
                             size_t len );
void shavite512_4way_close( shavite512_4way_context *ctx, void *dst );
void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
                                   const void *data, size_t len );

#endif // VAES

#endif // SHAVITE_HASH_4WAY_H__
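For orientation, a hedged usage sketch of this header's API. The len argument appears to be bytes per lane (the update loop scales it by 4 for the four interleaved lanes), matching the update_close( ..., 64 ) calls in the algo files below; vdata and vhash are hypothetical 64-byte-aligned buffers holding four messages interleaved 128 bits at a time:

#if defined(__VAES__) && defined(__AVX512F__)
// Hash four independent 64-byte messages, one per 128-bit lane.
void hash_4_lanes( void *vhash, const void *vdata )
{
   shavite512_4way_context ctx;
   shavite512_4way_init( &ctx );
   shavite512_4way_update_close( &ctx, vhash, vdata, 64 );
}
#endif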
@@ -100,20 +100,9 @@ c512( sph_shavite_big_context *sc, const void *msg )
   p3 = h[3];

   // round

   // working proof of concept
/*
   __m512i K = m512_const1_128( m[0] );
   __m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K );
   X = _mm512_aesenc_epi128( X, m512_zero );
   k00 = _mm512_castsi512_si128( K );
   x = _mm512_castsi512_si128( X );
*/

   k00 = m[0];
   x = _mm_xor_si128( p1, k00 );
   x = _mm_aesenc_si128( x, zero );

   k01 = m[1];
   x = _mm_xor_si128( x, k01 );
   x = _mm_aesenc_si128( x, zero );
@@ -18,18 +18,76 @@ void skeinhash_8way( void *state, const void *input )
   uint64_t vhash64[8*8] __attribute__ ((aligned (128)));
   skein512_8way_context ctx_skein;

//#if defined(__SHA__)
//   uint32_t hash0[16] __attribute__ ((aligned (64)));
//   uint32_t hash1[16] __attribute__ ((aligned (64)));
//   uint32_t hash2[16] __attribute__ ((aligned (64)));
//   uint32_t hash3[16] __attribute__ ((aligned (64)));
//   uint32_t hash4[16] __attribute__ ((aligned (64)));
//   uint32_t hash5[16] __attribute__ ((aligned (64)));
//   uint32_t hash6[16] __attribute__ ((aligned (64)));
//   uint32_t hash7[16] __attribute__ ((aligned (64)));
//   SHA256_CTX ctx_sha256;
//#else
   uint32_t vhash32[16*8] __attribute__ ((aligned (128)));
   sha256_8way_context ctx_sha256;
//#endif

   skein512_8way_init( &ctx_skein );
   skein512_8way_update( &ctx_skein, input, 80 );
   skein512_8way_close( &ctx_skein, vhash64 );
/*
#if defined(__SHA__)
   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 vhash64, 512 );

   SHA256_Init( &ctx_sha256 );
   SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
   SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );

   SHA256_Init( &ctx_sha256 );
   SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
   SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );

   SHA256_Init( &ctx_sha256 );
   SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
   SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );

   SHA256_Init( &ctx_sha256 );
   SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
   SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );

   SHA256_Init( &ctx_sha256 );
   SHA256_Update( &ctx_sha256, (unsigned char*)hash4, 64 );
   SHA256_Final( (unsigned char*)hash4, &ctx_sha256 );

   SHA256_Init( &ctx_sha256 );
   SHA256_Update( &ctx_sha256, (unsigned char*)hash5, 64 );
   SHA256_Final( (unsigned char*)hash5, &ctx_sha256 );

   SHA256_Init( &ctx_sha256 );
   SHA256_Update( &ctx_sha256, (unsigned char*)hash6, 64 );
   SHA256_Final( (unsigned char*)hash6, &ctx_sha256 );

   SHA256_Init( &ctx_sha256 );
   SHA256_Update( &ctx_sha256, (unsigned char*)hash7, 64 );
   SHA256_Final( (unsigned char*)hash7, &ctx_sha256 );

   intrlv_8x32( state, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                hash7, 256 );
#else
*/

   rintrlv_8x64_8x32( vhash32, vhash64, 512 );
//   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
//                 vhash64, 512 );
//   intrlv_8x32( vhash32, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
//                hash7, 512 );

   sha256_8way_init( &ctx_sha256 );
   sha256_8way_update( &ctx_sha256, vhash32, 64 );
   sha256_8way( &ctx_sha256, vhash32, 64 );
   sha256_8way_close( &ctx_sha256, state );
//#endif
}

int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
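skeinhash_8way converts the 8x64-bit interleaved Skein output directly to the 8x32-bit layout SHA256 expects, avoiding a full deinterleave/reinterleave round trip. A scalar model of what such a re-interleave does, assuming the usual lane-major layouts (illustrative sketch, not the library routine):

#include <stdint.h>

// src holds 8 streams interleaved as 64-bit lanes (64-bit word w of
// stream s at src[w*8 + s]); dst re-expresses the same bytes as 32-bit
// lanes (32-bit word w of stream s at dst[w*8 + s]).
static void rintrlv_8x64_8x32_model( uint32_t *dst, const uint64_t *src,
                                     int bitlen )
{
   const int words64 = bitlen / 64;
   for ( int w = 0; w < words64; w++ )
      for ( int s = 0; s < 8; s++ )
      {
         const uint64_t v = src[ w*8 + s ];
         dst[ (2*w)*8   + s ] = (uint32_t)v;          // low half first,
         dst[ (2*w+1)*8 + s ] = (uint32_t)(v >> 32);  // little-endian order
      }
}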
@@ -118,7 +176,7 @@ void skeinhash_4way( void *state, const void *input )
   rintrlv_4x64_4x32( vhash32, vhash64, 512 );

   sha256_4way_init( &ctx_sha256 );
   sha256_4way_update( &ctx_sha256, vhash32, 64 );
   sha256_4way( &ctx_sha256, vhash32, 64 );
   sha256_4way_close( &ctx_sha256, state );
#endif
}
@@ -93,12 +93,12 @@ typedef sph_skein_4way_big_context skein256_4way_context;
void skein512_4way_init( skein512_4way_context *sc );
void skein512_4way_update( void *cc, const void *data, size_t len );
void skein512_4way_close( void *cc, void *dst );
//#define skein512_4way skein512_4way_update
#define skein512_4way skein512_4way_update

void skein256_4way_init( skein256_4way_context *sc );
void skein256_4way_update( void *cc, const void *data, size_t len );
void skein256_4way_close( void *cc, void *dst );
//#define skein256_4way skein256_4way_update
#define skein256_4way skein256_4way_update

#ifdef __cplusplus
}
@@ -68,11 +68,11 @@ void skein2hash_4way( void *output, const void *input )
   uint64_t hash[16*4] __attribute__ ((aligned (64)));

   skein512_4way_init( &ctx );
   skein512_4way_update( &ctx, input, 80 );
   skein512_4way( &ctx, input, 80 );
   skein512_4way_close( &ctx, hash );

   skein512_4way_init( &ctx );
   skein512_4way_update( &ctx, hash, 64 );
   skein512_4way( &ctx, hash, 64 );
   skein512_4way_close( &ctx, output );
}
@@ -50,138 +50,41 @@
#include <string.h>
#include "sm3-hash-4way.h"

#ifdef __AVX2__
#ifdef __SSE4_2__

#define P0_8W(x) \
   _mm256_xor_si256( x, _mm256_xor_si256( mm256_rol_32( x, 9 ), \
                                          mm256_rol_32( x, 17 ) ) )

#define P1_8W(x) \
   _mm256_xor_si256( x, _mm256_xor_si256( mm256_rol_32( x, 15 ), \
                                          mm256_rol_32( x, 23 ) ) )

#define FF0_8W(x,y,z) \
   _mm256_xor_si256( x, _mm256_xor_si256( y, z ) )

#define FF1_8W(x,y,z) \
   _mm256_or_si256( _mm256_or_si256( _mm256_and_si256( x, y ), \
                                     _mm256_and_si256( x, z ) ), \
                    _mm256_and_si256( y, z ) )

#define GG0_8W(x,y,z)  FF0_8W(x,y,z)

#define GG1_8W(x,y,z) \
   _mm256_or_si256( _mm256_and_si256( x, y ), \
                    _mm256_andnot_si256( x, z ) )

void sm3_8way_compress( __m256i *digest, __m256i *block )
void sm3_4way_init( sm3_4way_ctx_t *ctx )
{
   __m256i W[68], W1[64];
   __m256i A = digest[ 0 ];
   __m256i B = digest[ 1 ];
   __m256i C = digest[ 2 ];
   __m256i D = digest[ 3 ];
   __m256i E = digest[ 4 ];
   __m256i F = digest[ 5 ];
   __m256i G = digest[ 6 ];
   __m256i H = digest[ 7 ];
   __m256i SS1, SS2, TT1, TT2, T;
   int j;

   for ( j = 0; j < 16; j++ )
      W[j] = mm256_bswap_32( block[j] );

   for ( j = 16; j < 68; j++ )
      W[j] = _mm256_xor_si256( P1_8W( _mm256_xor_si256(
                  _mm256_xor_si256( W[ j-16 ], W[ j-9 ] ),
                  mm256_rol_32( W[ j-3 ], 15 ) ) ),
             _mm256_xor_si256( mm256_rol_32( W[ j-13 ], 7 ), W[ j-6 ] ) );

   for( j = 0; j < 64; j++ )
      W1[j] = _mm256_xor_si256( W[j], W[j+4] );

   T = _mm256_set1_epi32( 0x79CC4519UL );
   for( j =0; j < 16; j++ )
   {
      SS1 = mm256_rol_32( _mm256_add_epi32( E, _mm256_add_epi32(
            mm256_rol_32( A, 12 ), mm256_rol_var_32( T, j ) ) ), 7 );
      SS2 = _mm256_xor_si256( SS1, mm256_rol_32( A, 12 ) );
      TT1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
            FF0_8W( A, B, C ), D ), SS2 ), W1[j] );
      TT2 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
            GG0_8W( E, F, G ), H ), SS1 ), W[j] );
      D = C;
      C = mm256_rol_32( B, 9 );
      B = A;
      A = TT1;
      H = G;
      G = mm256_rol_32( F, 19 );
      F = E;
      E = P0_8W( TT2 );
   }

   T = _mm256_set1_epi32( 0x7A879D8AUL );
   for( j =16; j < 64; j++ )
   {
      SS1 = mm256_rol_32( _mm256_add_epi32( _mm256_add_epi32(
            mm256_rol_32(A,12), E ), mm256_rol_var_32( T, j&31 ) ), 7 );
      SS2 = _mm256_xor_si256( SS1, mm256_rol_32( A, 12 ) );
      TT1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
            FF1_8W( A, B, C ), D ), SS2 ), W1[j] );
      TT2 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
            GG1_8W( E, F, G ), H ), SS1 ), W[j] );
      D = C;
      C = mm256_rol_32( B, 9 );
      B = A;
      A = TT1;
      H = G;
      G = mm256_rol_32( F, 19 );
      F = E;
      E = P0_8W( TT2 );
   }

   digest[0] = _mm256_xor_si256( digest[0], A );
   digest[1] = _mm256_xor_si256( digest[1], B );
   digest[2] = _mm256_xor_si256( digest[2], C );
   digest[3] = _mm256_xor_si256( digest[3], D );
   digest[4] = _mm256_xor_si256( digest[4], E );
   digest[5] = _mm256_xor_si256( digest[5], F );
   digest[6] = _mm256_xor_si256( digest[6], G );
   digest[7] = _mm256_xor_si256( digest[7], H );
   ctx->digest[0] = _mm_set1_epi32( 0x7380166F );
   ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 );
   ctx->digest[2] = _mm_set1_epi32( 0x172442D7 );
   ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 );
   ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC );
   ctx->digest[5] = _mm_set1_epi32( 0x163138AA );
   ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D );
   ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E );
   ctx->nblocks = 0;
   ctx->num = 0;
}

void sm3_8way_init( sm3_8way_ctx_t *ctx )
void sm3_4way( void *cc, const void *data, size_t len )
{
   ctx->digest[0] = _mm256_set1_epi32( 0x7380166F );
   ctx->digest[1] = _mm256_set1_epi32( 0x4914B2B9 );
   ctx->digest[2] = _mm256_set1_epi32( 0x172442D7 );
   ctx->digest[3] = _mm256_set1_epi32( 0xDA8A0600 );
   ctx->digest[4] = _mm256_set1_epi32( 0xA96F30BC );
   ctx->digest[5] = _mm256_set1_epi32( 0x163138AA );
   ctx->digest[6] = _mm256_set1_epi32( 0xE38DEE4D );
   ctx->digest[7] = _mm256_set1_epi32( 0xB0FB0E4E );
   ctx->nblocks = 0;
   ctx->num = 0;
}
   sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
   __m128i *block = (__m128i*)ctx->block;
   __m128i *vdata = (__m128i*)data;

void sm3_8way_update( void *cc, const void *data, size_t len )
{
   sm3_8way_ctx_t *ctx = (sm3_8way_ctx_t*)cc;
   __m256i *block = (__m256i*)ctx->block;
   __m256i *vdata = (__m256i*)data;
   if ( ctx->num )
   {
      unsigned int left = SM3_BLOCK_SIZE - ctx->num;
      if ( len < left )
      {
         memcpy_256( block + (ctx->num >> 2), vdata , len>>2 );
         memcpy_128( block + (ctx->num >> 2), vdata , len>>2 );
         ctx->num += len;
         return;
      }
      else
      {
         memcpy_256( block + (ctx->num >> 2), vdata , left>>2 );
         sm3_8way_compress( ctx->digest, block );
         memcpy_128( block + (ctx->num >> 2), vdata , left>>2 );
         sm3_4way_compress( ctx->digest, block );
         ctx->nblocks++;
         vdata += left>>2;
         len -= left;
@@ -189,53 +92,49 @@ void sm3_8way_update( void *cc, const void *data, size_t len )
   }
   while ( len >= SM3_BLOCK_SIZE )
   {
      sm3_8way_compress( ctx->digest, vdata );
      sm3_4way_compress( ctx->digest, vdata );
      ctx->nblocks++;
      vdata += SM3_BLOCK_SIZE>>2;
      len -= SM3_BLOCK_SIZE;
   }
   ctx->num = len;
   if ( len )
      memcpy_256( block, vdata, len>>2 );
      memcpy_128( block, vdata, len>>2 );
}

void sm3_8way_close( void *cc, void *dst )
void sm3_4way_close( void *cc, void *dst )
{
   sm3_8way_ctx_t *ctx = (sm3_8way_ctx_t*)cc;
   __m256i *hash = (__m256i*)dst;
   __m256i *count = (__m256i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
   __m256i *block = (__m256i*)ctx->block;
   sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
   __m128i *hash = (__m128i*)dst;
   __m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
   __m128i *block = (__m128i*)ctx->block;
   int i;

   block[ctx->num] = _mm256_set1_epi32( 0x80 );
   block[ctx->num] = _mm_set1_epi32( 0x80 );

   if ( ctx->num + 8 <= SM3_BLOCK_SIZE )
   {
      memset_zero_256( block + (ctx->num >> 2) + 1,
                       ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
      memset_zero_128( block + (ctx->num >> 2) + 1,
                       ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
   }
   else
   {
      memset_zero_256( block + (ctx->num >> 2) + 1,
      memset_zero_128( block + (ctx->num >> 2) + 1,
                       ( SM3_BLOCK_SIZE - (ctx->num >> 2) - 1 ) );
      sm3_8way_compress( ctx->digest, block );
      memset_zero_256( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
      sm3_4way_compress( ctx->digest, block );
      memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
   }

   count[0] = mm256_bswap_32(
               _mm256_set1_epi32( ctx->nblocks >> 23 ) );
   count[1] = mm256_bswap_32( _mm256_set1_epi32( ( ctx->nblocks << 9 ) +
   count[0] = mm128_bswap_32(
               _mm_set1_epi32( ctx->nblocks >> 23 ) );
   count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
                              ( ctx->num << 3 ) ) );
   sm3_8way_compress( ctx->digest, block );
   sm3_4way_compress( ctx->digest, block );

   for ( i = 0; i < 8 ; i++ )
      hash[i] = mm256_bswap_32( ctx->digest[i] );
      hash[i] = mm128_bswap_32( ctx->digest[i] );
}

#endif

#if defined(__SSE2__)

#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 9 ), \
                                               mm128_rol_32( x, 17 ) ) )
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 15 ), \
@@ -328,88 +227,5 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
   digest[7] = _mm_xor_si128( digest[7], H );
}

void sm3_4way_init( sm3_4way_ctx_t *ctx )
{
   ctx->digest[0] = _mm_set1_epi32( 0x7380166F );
   ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 );
   ctx->digest[2] = _mm_set1_epi32( 0x172442D7 );
   ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 );
   ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC );
   ctx->digest[5] = _mm_set1_epi32( 0x163138AA );
   ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D );
   ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E );
   ctx->nblocks = 0;
   ctx->num = 0;
}

void sm3_4way_update( void *cc, const void *data, size_t len )
{
   sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
   __m128i *block = (__m128i*)ctx->block;
   __m128i *vdata = (__m128i*)data;

   if ( ctx->num )
   {
      unsigned int left = SM3_BLOCK_SIZE - ctx->num;
      if ( len < left )
      {
         memcpy_128( block + (ctx->num >> 2), vdata , len>>2 );
         ctx->num += len;
         return;
      }
      else
      {
         memcpy_128( block + (ctx->num >> 2), vdata , left>>2 );
         sm3_4way_compress( ctx->digest, block );
         ctx->nblocks++;
         vdata += left>>2;
         len -= left;
      }
   }
   while ( len >= SM3_BLOCK_SIZE )
   {
      sm3_4way_compress( ctx->digest, vdata );
      ctx->nblocks++;
      vdata += SM3_BLOCK_SIZE>>2;
      len -= SM3_BLOCK_SIZE;
   }
   ctx->num = len;
   if ( len )
      memcpy_128( block, vdata, len>>2 );
}

void sm3_4way_close( void *cc, void *dst )
{
   sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
   __m128i *hash = (__m128i*)dst;
   __m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
   __m128i *block = (__m128i*)ctx->block;
   int i;

   block[ctx->num] = _mm_set1_epi32( 0x80 );

   if ( ctx->num + 8 <= SM3_BLOCK_SIZE )
   {
      memset_zero_128( block + (ctx->num >> 2) + 1,
                       ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
   }
   else
   {
      memset_zero_128( block + (ctx->num >> 2) + 1,
                       ( SM3_BLOCK_SIZE - (ctx->num >> 2) - 1 ) );
      sm3_4way_compress( ctx->digest, block );
      memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
   }

   count[0] = mm128_bswap_32(
               _mm_set1_epi32( ctx->nblocks >> 23 ) );
   count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
                              ( ctx->num << 3 ) ) );
   sm3_4way_compress( ctx->digest, block );

   for ( i = 0; i < 8 ; i++ )
      hash[i] = mm128_bswap_32( ctx->digest[i] );
}

#endif
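The P0/P1 and FF/GG macros in this file are lane-parallel versions of SM3's scalar permutations and boolean functions. For reference, the scalar forms they vectorize (rotl32 is a hypothetical rotate helper, not from this source):

#include <stdint.h>

static inline uint32_t rotl32( uint32_t x, int n )
{ return ( x << n ) | ( x >> ( 32 - n ) ); }

// SM3 permutations: P0 is used in the state update, P1 in message expansion.
static inline uint32_t sm3_p0( uint32_t x )
{ return x ^ rotl32( x, 9 ) ^ rotl32( x, 17 ); }
static inline uint32_t sm3_p1( uint32_t x )
{ return x ^ rotl32( x, 15 ) ^ rotl32( x, 23 ); }

// Boolean functions: FF0/GG0 (plain xor) for rounds 0-15,
// FF1 (majority) and GG1 (choose) for rounds 16-63.
static inline uint32_t sm3_ff0( uint32_t x, uint32_t y, uint32_t z )
{ return x ^ y ^ z; }
static inline uint32_t sm3_ff1( uint32_t x, uint32_t y, uint32_t z )
{ return ( x & y ) | ( x & z ) | ( y & z ); }
static inline uint32_t sm3_gg1( uint32_t x, uint32_t y, uint32_t z )
{ return ( x & y ) | ( ~x & z ); }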
@@ -48,13 +48,14 @@
*/

#ifndef SPH_SM3_HASH_4WAY_H
#define SPH_SM3_HASH_4WAY_H 1
#define SPH_SM3_HASH_4WAY_H

#define SM3_DIGEST_LENGTH 32
#define SM3_BLOCK_SIZE 64
#define SM3_CBLOCK (SM3_BLOCK_SIZE)
#define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH)

#include <sys/types.h>
#include <stdint.h>
#include <string.h>
@@ -64,6 +65,7 @@
extern "C" {
#endif

typedef struct {
   __m128i block[16] __attribute__ ((aligned (64)));
   __m128i digest[8];
@@ -72,24 +74,15 @@ typedef struct {
} sm3_4way_ctx_t;

void sm3_4way_init( sm3_4way_ctx_t *ctx );
void sm3_4way_update(void *cc, const void *data, size_t len);
//void sm3_4way_update( sm3_4way_ctx_t *ctx, const unsigned char* data,
//                      size_t data_len );
//void sm3_4way_final( sm3_4way_ctx_t *ctx,
//                     unsigned char digest[SM3_DIGEST_LENGTH] );
void sm3_4way_compress( __m128i *digest, __m128i *block );

void sm3_4way(void *cc, const void *data, size_t len);
void sm3_4way_close(void *cc, void *dst);

#if defined(__AVX2__)

typedef struct {
   __m256i block[16] __attribute__ ((aligned (64)));
   __m256i digest[8];
   uint32_t nblocks;
   uint32_t num;
} sm3_8way_ctx_t;

void sm3_8way_init( sm3_8way_ctx_t *ctx );
void sm3_8way_update(void *cc, const void *data, size_t len);
void sm3_8way_close(void *cc, void *dst);

#endif

#ifdef __cplusplus
}
#endif
@@ -14,32 +14,21 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#if defined(__VAES__)
  #include "algo/groestl/groestl512-hash-4way.h"
  #include "algo/shavite/shavite-hash-4way.h"
  #include "algo/echo/echo-hash-4way.h"
#endif

#if defined (C11_8WAY)

typedef struct {
   blake512_8way_context blake;
   bmw512_8way_context bmw;
   hashState_groestl groestl;
   skein512_8way_context skein;
   jh512_8way_context jh;
   keccak512_8way_context keccak;
   luffa_4way_context luffa;
   cube_4way_context cube;
   simd_4way_context simd;
#if defined(__VAES__)
   groestl512_4way_context groestl;
   shavite512_4way_context shavite;
   echo_4way_context echo;
#else
   hashState_groestl groestl;
   sph_shavite512_context shavite;
   simd_4way_context simd;
   hashState_echo echo;
#endif
} c11_8way_ctx_holder;

c11_8way_ctx_holder c11_8way_ctx;
@@ -48,28 +37,20 @@ void init_c11_8way_ctx()
{
   blake512_8way_init( &c11_8way_ctx.blake );
   bmw512_8way_init( &c11_8way_ctx.bmw );
   init_groestl( &c11_8way_ctx.groestl, 64 );
   skein512_8way_init( &c11_8way_ctx.skein );
   jh512_8way_init( &c11_8way_ctx.jh );
   keccak512_8way_init( &c11_8way_ctx.keccak );
   luffa_4way_init( &c11_8way_ctx.luffa, 512 );
   cube_4way_init( &c11_8way_ctx.cube, 512, 16, 32 );
   simd_4way_init( &c11_8way_ctx.simd, 512 );
#if defined(__VAES__)
   groestl512_4way_init( &c11_8way_ctx.groestl, 64 );
   shavite512_4way_init( &c11_8way_ctx.shavite );
   echo_4way_init( &c11_8way_ctx.echo, 512 );
#else
   init_groestl( &c11_8way_ctx.groestl, 64 );
   sph_shavite512_init( &c11_8way_ctx.shavite );
   simd_4way_init( &c11_8way_ctx.simd, 512 );
   init_echo( &c11_8way_ctx.echo, 512 );
#endif
}

void c11_8way_hash( void *state, const void *input )
{
   uint64_t vhash[8*8] __attribute__ ((aligned (128)));
   uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
   uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -89,21 +70,11 @@ void c11_8way_hash( void *state, const void *input )
   bmw512_8way_update( &ctx.bmw, vhash, 64 );
   bmw512_8way_close( &ctx.bmw, vhash );

#if defined(__VAES__)

   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

   groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
   groestl512_4way_init( &ctx.groestl, 64 );
   groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );

   rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );

#else

   // Serial
   dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                     vhash );

   // 3 Groestl
   update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
   memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
@@ -120,11 +91,10 @@ void c11_8way_hash( void *state, const void *input )
   memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );

   // 4way
   intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                    hash7 );

#endif

   // 4 JH
   jh512_8way_update( &ctx.jh, vhash, 64 );
   jh512_8way_close( &ctx.jh, vhash );
@@ -137,27 +107,23 @@ void c11_8way_hash( void *state, const void *input )
   skein512_8way_update( &ctx.skein, vhash, 64 );
   skein512_8way_close( &ctx.skein, vhash );

   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
   // Serial
   dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                     vhash );

   luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
   // 7 Luffa + 8 cube
   intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
   luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
   intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
   luffa_4way_init( &ctx.luffa, 512 );
   luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );

   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
   cube_4way_init( &ctx.cube, 512, 16, 32 );
   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );

#if defined(__VAES__)

   shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
   shavite512_4way_init( &ctx.shavite );
   shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );

#else

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
   luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

   // 9 Shavite
   sph_shavite512( &ctx.shavite, hash0, 64 );
   sph_shavite512_close( &ctx.shavite, hash0 );
   memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
@@ -189,29 +155,16 @@ void c11_8way_hash( void *state, const void *input )
   sph_shavite512( &ctx.shavite, hash7, 64 );
   sph_shavite512_close( &ctx.shavite, hash7 );

   intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
   intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );

#endif

   simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
   // 10 Simd
   intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 );
   simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
   dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
   intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 512 );
   simd_4way_init( &ctx.simd, 512 );
   simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
   simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
   dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 512 );

#if defined(__VAES__)

   echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
   echo_4way_init( &ctx.echo, 512 );
   echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );

#else

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );

   // 11 Echo
   update_final_echo( &ctx.echo, (BitSequence *)hash0,
                      (const BitSequence *) hash0, 512 );
   memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
@@ -236,8 +189,6 @@ void c11_8way_hash( void *state, const void *input )
   update_final_echo( &ctx.echo, (BitSequence *)hash7,
                      (const BitSequence *) hash7, 512 );

#endif

   memcpy( state, hash0, 32 );
   memcpy( state+ 32, hash1, 32 );
   memcpy( state+ 64, hash2, 32 );
@@ -332,11 +283,11 @@ void c11_4way_hash( void *state, const void *input )
   memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );

   // 1 Blake 4way
   blake512_4way_update( &ctx.blake, input, 80 );
   blake512_4way( &ctx.blake, input, 80 );
   blake512_4way_close( &ctx.blake, vhash );

   // 2 Bmw
   bmw512_4way_update( &ctx.bmw, vhash, 64 );
   bmw512_4way( &ctx.bmw, vhash, 64 );
   bmw512_4way_close( &ctx.bmw, vhash );

   // Serial
@@ -355,15 +306,15 @@ void c11_4way_hash( void *state, const void *input )
   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

   // 4 JH
   jh512_4way_update( &ctx.jh, vhash, 64 );
   jh512_4way( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );

   // 5 Keccak
   keccak512_4way_update( &ctx.keccak, vhash, 64 );
   keccak512_4way( &ctx.keccak, vhash, 64 );
   keccak512_4way_close( &ctx.keccak, vhash );

   // 6 Skein
   skein512_4way_update( &ctx.skein, vhash, 64 );
   skein512_4way( &ctx.skein, vhash, 64 );
   skein512_4way_close( &ctx.skein, vhash );

   // Serial
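The 8-way VAES paths above repeatedly split one 8x64-interleaved state into two 4x128-interleaved halves (vhashA/vhashB) so the 128-bit-lane ciphers can process four streams per vector, then recombine. A scalar model of the split, under the same layout assumptions as the earlier sketch (illustrative only, not the library routine):

#include <stdint.h>

// src is 8 streams interleaved as 64-bit lanes; dstA gets streams 0-3 and
// dstB streams 4-7, re-interleaved as 128-bit lanes (128-bit word w of
// stream s occupies dst[w*8 + s*2] and dst[w*8 + s*2 + 1]).
static void rintrlv_8x64_4x128_model( uint64_t *dstA, uint64_t *dstB,
                                      const uint64_t *src, int bitlen )
{
   const int words128 = bitlen / 128;
   for ( int w = 0; w < words128; w++ )
      for ( int s = 0; s < 8; s++ )
      {
         uint64_t *dst = ( s < 4 ) ? dstA : dstB;
         const int lane = s & 3;
         dst[ w*8 + lane*2     ] = src[ (2*w)*8   + s ];  // low 64 bits
         dst[ w*8 + lane*2 + 1 ] = src[ (2*w+1)*8 + s ];  // high 64 bits
      }
}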
@@ -15,7 +15,7 @@ bool register_c11_algo( algo_gate_t* gate )
  gate->scanhash = (void*)&scanhash_c11;
  gate->hash = (void*)&c11_hash;
#endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
};
@@ -84,13 +84,13 @@ void timetravel_4way_hash(void *output, const void *input)
   switch ( permutation[i] )
   {
      case 0:
         blake512_4way_update( &ctx.blake, vhashA, dataLen );
         blake512_4way( &ctx.blake, vhashA, dataLen );
         blake512_4way_close( &ctx.blake, vhashB );
         if ( i == 7 )
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
         break;
      case 1:
         bmw512_4way_update( &ctx.bmw, vhashA, dataLen );
         bmw512_4way( &ctx.bmw, vhashA, dataLen );
         bmw512_4way_close( &ctx.bmw, vhashB );
         if ( i == 7 )
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
@@ -112,19 +112,19 @@ void timetravel_4way_hash(void *output, const void *input)
         intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
         break;
      case 3:
         skein512_4way_update( &ctx.skein, vhashA, dataLen );
         skein512_4way( &ctx.skein, vhashA, dataLen );
         skein512_4way_close( &ctx.skein, vhashB );
         if ( i == 7 )
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
         break;
      case 4:
         jh512_4way_update( &ctx.jh, vhashA, dataLen );
         jh512_4way( &ctx.jh, vhashA, dataLen );
         jh512_4way_close( &ctx.jh, vhashB );
         if ( i == 7 )
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
         break;
      case 5:
         keccak512_4way_update( &ctx.keccak, vhashA, dataLen );
         keccak512_4way( &ctx.keccak, vhashA, dataLen );
         keccak512_4way_close( &ctx.keccak, vhashB );
         if ( i == 7 )
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
@@ -90,13 +90,13 @@ void timetravel10_4way_hash(void *output, const void *input)
   switch ( permutation[i] )
   {
      case 0:
         blake512_4way_update( &ctx.blake, vhashA, dataLen );
         blake512_4way( &ctx.blake, vhashA, dataLen );
         blake512_4way_close( &ctx.blake, vhashB );
         if ( i == 9 )
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
         break;
      case 1:
         bmw512_4way_update( &ctx.bmw, vhashA, dataLen );
         bmw512_4way( &ctx.bmw, vhashA, dataLen );
         bmw512_4way_close( &ctx.bmw, vhashB );
         if ( i == 9 )
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
@@ -118,19 +118,19 @@ void timetravel10_4way_hash(void *output, const void *input)
         intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
         break;
      case 3:
         skein512_4way_update( &ctx.skein, vhashA, dataLen );
         skein512_4way( &ctx.skein, vhashA, dataLen );
         skein512_4way_close( &ctx.skein, vhashB );
         if ( i == 9 )
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
         break;
      case 4:
         jh512_4way_update( &ctx.jh, vhashA, dataLen );
         jh512_4way( &ctx.jh, vhashA, dataLen );
         jh512_4way_close( &ctx.jh, vhashB );
         if ( i == 9 )
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
         break;
      case 5:
         keccak512_4way_update( &ctx.keccak, vhashA, dataLen );
         keccak512_4way( &ctx.keccak, vhashA, dataLen );
         keccak512_4way_close( &ctx.keccak, vhashB );
         if ( i == 9 )
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
@@ -6,9 +6,6 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/echo/aes_ni/hash_api.h"
#if defined(__VAES__)
  #include "algo/echo/echo-hash-4way.h"
#endif

#if defined(TRIBUS_8WAY)

@@ -17,8 +14,6 @@ static __thread jh512_8way_context ctx_mid;
void tribus_hash_8way( void *state, const void *input )
{
   uint64_t vhash[8*8] __attribute__ ((aligned (128)));
   uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
   uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -29,11 +24,7 @@ void tribus_hash_8way( void *state, const void *input )
   uint64_t hash7[8] __attribute__ ((aligned (64)));
   jh512_8way_context ctx_jh;
   keccak512_8way_context ctx_keccak;
#if defined(__VAES__)
   echo_4way_context ctx_echo;
#else
   hashState_echo ctx_echo;
#endif

   memcpy( &ctx_jh, &ctx_mid, sizeof(ctx_mid) );
   jh512_8way_update( &ctx_jh, input + (64<<3), 16 );
@@ -43,23 +34,10 @@ void tribus_hash_8way( void *state, const void *input )
   keccak512_8way_update( &ctx_keccak, vhash, 64 );
   keccak512_8way_close( &ctx_keccak, vhash );

#if defined(__VAES__)

   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

   echo_4way_init( &ctx_echo, 512 );
   echo_4way_update_close( &ctx_echo, vhashA, vhashA, 512 );
   echo_4way_init( &ctx_echo, 512 );
   echo_4way_update_close( &ctx_echo, vhashB, vhashB, 512 );

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );

#else

   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 vhash, 512 );

   // hash echo serially
   init_echo( &ctx_echo, 512 );
   update_final_echo( &ctx_echo, (BitSequence *) hash0,
                      (const BitSequence *) hash0, 512 );
@@ -85,8 +63,6 @@ void tribus_hash_8way( void *state, const void *input )
   update_final_echo( &ctx_echo, (BitSequence *) hash7,
                      (const BitSequence *) hash7, 512 );

#endif

   memcpy( state, hash0, 32 );
   memcpy( state+32, hash1, 32 );
   memcpy( state+64, hash2, 32 );
@@ -2,7 +2,7 @@

bool register_tribus_algo( algo_gate_t* gate )
{
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
#if defined (TRIBUS_8WAY)
  gate->scanhash = (void*)&scanhash_tribus_8way;
  gate->hash = (void*)&tribus_hash_8way;
@@ -14,32 +14,21 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#if defined(__VAES__)
  #include "algo/groestl/groestl512-hash-4way.h"
  #include "algo/shavite/shavite-hash-4way.h"
  #include "algo/echo/echo-hash-4way.h"
#endif

#if defined (X11_8WAY)

typedef struct {
   blake512_8way_context blake;
   bmw512_8way_context bmw;
   hashState_groestl groestl;
   skein512_8way_context skein;
   jh512_8way_context jh;
   keccak512_8way_context keccak;
   luffa_4way_context luffa;
   cube_4way_context cube;
   simd_4way_context simd;
#if defined(__VAES__)
   groestl512_4way_context groestl;
   shavite512_4way_context shavite;
   echo_4way_context echo;
#else
   hashState_groestl groestl;
   sph_shavite512_context shavite;
   simd_4way_context simd;
   hashState_echo echo;
#endif
} x11_8way_ctx_holder;

x11_8way_ctx_holder x11_8way_ctx;
@@ -48,28 +37,20 @@ void init_x11_8way_ctx()
{
   blake512_8way_init( &x11_8way_ctx.blake );
   bmw512_8way_init( &x11_8way_ctx.bmw );
   init_groestl( &x11_8way_ctx.groestl, 64 );
   skein512_8way_init( &x11_8way_ctx.skein );
   jh512_8way_init( &x11_8way_ctx.jh );
   keccak512_8way_init( &x11_8way_ctx.keccak );
   luffa_4way_init( &x11_8way_ctx.luffa, 512 );
   cube_4way_init( &x11_8way_ctx.cube, 512, 16, 32 );
   simd_4way_init( &x11_8way_ctx.simd, 512 );
#if defined(__VAES__)
   groestl512_4way_init( &x11_8way_ctx.groestl, 64 );
   shavite512_4way_init( &x11_8way_ctx.shavite );
   echo_4way_init( &x11_8way_ctx.echo, 512 );
#else
   init_groestl( &x11_8way_ctx.groestl, 64 );
   sph_shavite512_init( &x11_8way_ctx.shavite );
   simd_4way_init( &x11_8way_ctx.simd, 512 );
   init_echo( &x11_8way_ctx.echo, 512 );
#endif
}

void x11_8way_hash( void *state, const void *input )
{
   uint64_t vhash[8*8] __attribute__ ((aligned (128)));
   uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
   uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -78,6 +59,7 @@ void x11_8way_hash( void *state, const void *input )
   uint64_t hash5[8] __attribute__ ((aligned (64)));
   uint64_t hash6[8] __attribute__ ((aligned (64)));
   uint64_t hash7[8] __attribute__ ((aligned (64)));

   x11_8way_ctx_holder ctx;
   memcpy( &ctx, &x11_8way_ctx, sizeof(x11_8way_ctx) );
   blake512_8way_update( &ctx.blake, input, 80 );
@@ -86,18 +68,7 @@ void x11_8way_hash( void *state, const void *input )
   bmw512_8way_update( &ctx.bmw, vhash, 64 );
   bmw512_8way_close( &ctx.bmw, vhash );

#if defined(__VAES__)

   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

   groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
   groestl512_4way_init( &ctx.groestl, 64 );
   groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );

   rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );

#else

   // Serial
   dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                     vhash );

@@ -124,11 +95,10 @@ void x11_8way_hash( void *state, const void *input )
                     sizeof(hashState_groestl) );
   update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );

   // 4way
   intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                    hash7 );

#endif

   skein512_8way_update( &ctx.skein, vhash, 64 );
   skein512_8way_close( &ctx.skein, vhash );

@@ -138,26 +108,20 @@ void x11_8way_hash( void *state, const void *input )
   keccak512_8way_update( &ctx.keccak, vhash, 64 );
   keccak512_8way_close( &ctx.keccak, vhash );

   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
   dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                     vhash );

   luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
   // Luffa + Cube
   intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
   luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
   intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
   luffa_4way_init( &ctx.luffa, 512 );
   luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );

   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
   cube_4way_init( &ctx.cube, 512, 16, 32 );
   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );

#if defined(__VAES__)

   shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
   shavite512_4way_init( &ctx.shavite );
   shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );

#else

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
   luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

   sph_shavite512( &ctx.shavite, hash0, 64 );
   sph_shavite512_close( &ctx.shavite, hash0 );
@@ -190,28 +154,13 @@ void x11_8way_hash( void *state, const void *input )
   sph_shavite512( &ctx.shavite, hash7, 64 );
   sph_shavite512_close( &ctx.shavite, hash7 );

   intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
   intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );

#endif

   simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
   intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
   simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
   intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
   simd_4way_init( &ctx.simd, 512 );
   simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );

#if defined(__VAES__)

   echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
   echo_4way_init( &ctx.echo, 512 );
   echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );

#else

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
   simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

   update_final_echo( &ctx.echo, (BitSequence *)hash0,
                      (const BitSequence *) hash0, 512 );
@@ -237,8 +186,6 @@ void x11_8way_hash( void *state, const void *input )
   update_final_echo( &ctx.echo, (BitSequence *)hash7,
                      (const BitSequence *) hash7, 512 );

#endif

   memcpy( state, hash0, 32 );
   memcpy( state+ 32, hash1, 32 );
   memcpy( state+ 64, hash2, 32 );
@@ -335,11 +282,11 @@ void x11_4way_hash( void *state, const void *input )
   memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) );

   // 1 Blake 4way
   blake512_4way_update( &ctx.blake, input, 80 );
   blake512_4way( &ctx.blake, input, 80 );
   blake512_4way_close( &ctx.blake, vhash );

   // 2 Bmw
   bmw512_4way_update( &ctx.bmw, vhash, 64 );
   bmw512_4way( &ctx.bmw, vhash, 64 );
   bmw512_4way_close( &ctx.bmw, vhash );

   // Serial
@@ -358,15 +305,15 @@ void x11_4way_hash( void *state, const void *input )
   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

   // 4 Skein
   skein512_4way_update( &ctx.skein, vhash, 64 );
   skein512_4way( &ctx.skein, vhash, 64 );
   skein512_4way_close( &ctx.skein, vhash );

   // 5 JH
   jh512_4way_update( &ctx.jh, vhash, 64 );
   jh512_4way( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );

   // 6 Keccak
   keccak512_4way_update( &ctx.keccak, vhash, 64 );
   keccak512_4way( &ctx.keccak, vhash, 64 );
   keccak512_4way_close( &ctx.keccak, vhash );

   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -15,7 +15,7 @@ bool register_x11_algo( algo_gate_t *gate )
  gate->scanhash = (void*)&scanhash_x11;
  gate->hash = (void*)&x11_hash;
#endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT ;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
};
@@ -85,12 +85,12 @@ void x11evo_4way_hash( void *state, const void *input )
   switch ( idx )
   {
      case 0:
         blake512_4way_update( &ctx.blake, input, 80 );
         blake512_4way( &ctx.blake, input, 80 );
         blake512_4way_close( &ctx.blake, vhash );
         dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
         break;
      case 1:
         bmw512_4way_update( &ctx.bmw, vhash, 64 );
         bmw512_4way( &ctx.bmw, vhash, 64 );
         bmw512_4way_close( &ctx.bmw, vhash );
         if ( i >= len-1 )
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
@@ -112,19 +112,19 @@ void x11evo_4way_hash( void *state, const void *input )
         intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 );
         break;
      case 3:
         skein512_4way_update( &ctx.skein, vhash, 64 );
         skein512_4way( &ctx.skein, vhash, 64 );
         skein512_4way_close( &ctx.skein, vhash );
         if ( i >= len-1 )
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
         break;
      case 4:
         jh512_4way_update( &ctx.jh, vhash, 64 );
         jh512_4way( &ctx.jh, vhash, 64 );
         jh512_4way_close( &ctx.jh, vhash );
         if ( i >= len-1 )
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
         break;
      case 5:
         keccak512_4way_update( &ctx.keccak, vhash, 64 );
         keccak512_4way( &ctx.keccak, vhash, 64 );
         keccak512_4way_close( &ctx.keccak, vhash );
         if ( i >= len-1 )
            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
@@ -15,33 +15,22 @@
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#if defined(__VAES__)
|
||||
#include "algo/groestl/groestl512-hash-4way.h"
|
||||
#include "algo/shavite/shavite-hash-4way.h"
|
||||
#include "algo/echo/echo-hash-4way.h"
|
||||
#endif
|
||||
|
||||
#if defined (X11GOST_8WAY)
|
||||
|
||||
typedef struct {
|
||||
blake512_8way_context blake;
|
||||
bmw512_8way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_8way_context skein;
|
||||
jh512_8way_context jh;
|
||||
keccak512_8way_context keccak;
|
||||
sph_gost512_context gost;
|
||||
luffa_4way_context luffa;
|
||||
cube_4way_context cube;
|
||||
simd_4way_context simd;
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_context groestl;
|
||||
shavite512_4way_context shavite;
|
||||
echo_4way_context echo;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
sph_shavite512_context shavite;
|
||||
simd_4way_context simd;
|
||||
hashState_echo echo;
|
||||
#endif
|
||||
} x11gost_8way_ctx_holder;
|
||||
|
||||
x11gost_8way_ctx_holder x11gost_8way_ctx;
|
||||
@@ -50,29 +39,21 @@ void init_x11gost_8way_ctx()
|
||||
{
|
||||
blake512_8way_init( &x11gost_8way_ctx.blake );
|
||||
bmw512_8way_init( &x11gost_8way_ctx.bmw );
|
||||
init_groestl( &x11gost_8way_ctx.groestl, 64 );
|
||||
skein512_8way_init( &x11gost_8way_ctx.skein );
|
||||
jh512_8way_init( &x11gost_8way_ctx.jh );
|
||||
keccak512_8way_init( &x11gost_8way_ctx.keccak );
|
||||
sph_gost512_init( &x11gost_8way_ctx.gost );
|
||||
luffa_4way_init( &x11gost_8way_ctx.luffa, 512 );
|
||||
cube_4way_init( &x11gost_8way_ctx.cube, 512, 16, 32 );
|
||||
simd_4way_init( &x11gost_8way_ctx.simd, 512 );
|
||||
#if defined(__VAES__)
|
||||
groestl512_4way_init( &x11gost_8way_ctx.groestl, 64 );
|
||||
shavite512_4way_init( &x11gost_8way_ctx.shavite );
|
||||
echo_4way_init( &x11gost_8way_ctx.echo, 512 );
|
||||
#else
|
||||
init_groestl( &x11gost_8way_ctx.groestl, 64 );
|
||||
sph_shavite512_init( &x11gost_8way_ctx.shavite );
|
||||
simd_4way_init( &x11gost_8way_ctx.simd, 512 );
|
||||
init_echo( &x11gost_8way_ctx.echo, 512 );
|
||||
#endif
|
||||
}
|
||||
|
||||
void x11gost_8way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
@@ -91,18 +72,7 @@ void x11gost_8way_hash( void *state, const void *input )
|
||||
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_8way_close( &ctx.bmw, vhash );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
|
||||
groestl512_4way_init( &ctx.groestl, 64 );
|
||||
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
|
||||
|
||||
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
// Serial
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash );
|
||||
|
||||
@@ -129,11 +99,10 @@ void x11gost_8way_hash( void *state, const void *input )
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||
|
||||
// 4way
|
||||
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
hash7 );
|
||||
|
||||
#endif
|
||||
|
||||
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||
skein512_8way_close( &ctx.skein, vhash );
|
||||
|
||||
@@ -171,28 +140,20 @@ void x11gost_8way_hash( void *state, const void *input )
|
||||
sph_gost512( &ctx.gost, hash7, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash7 );
|
||||
|
||||
intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
|
||||
intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );
|
||||
|
||||
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
// Luffa + Cube
|
||||
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||
luffa_4way_init( &ctx.luffa, 512 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
|
||||
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
|
||||
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
   shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
   shavite512_4way_init( &ctx.shavite );
   shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );

#else

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );

   sph_shavite512( &ctx.shavite, hash0, 64 );
   sph_shavite512_close( &ctx.shavite, hash0 );
   memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
           sizeof(sph_shavite512_context) );
@@ -223,29 +184,14 @@ void x11gost_8way_hash( void *state, const void *input )
   sph_shavite512( &ctx.shavite, hash7, 64 );
   sph_shavite512_close( &ctx.shavite, hash7 );

   intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
   intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );

#endif

   simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
   intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
   simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
   intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
   simd_4way_init( &ctx.simd, 512 );
   simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
   simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

#if defined(__VAES__)

   echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
   echo_4way_init( &ctx.echo, 512 );
   echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );

#else

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );

   update_final_echo( &ctx.echo, (BitSequence *)hash0,
                      (const BitSequence *) hash0, 512 );
   memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
@@ -270,8 +216,6 @@ void x11gost_8way_hash( void *state, const void *input )
   update_final_echo( &ctx.echo, (BitSequence *)hash7,
                      (const BitSequence *) hash7, 512 );

#endif

   memcpy( state, hash0, 32 );
   memcpy( state+ 32, hash1, 32 );
   memcpy( state+ 64, hash2, 32 );
@@ -366,10 +310,10 @@ void x11gost_4way_hash( void *state, const void *input )
   x11gost_4way_ctx_holder ctx;
   memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );

   blake512_4way_update( &ctx.blake, input, 80 );
   blake512_4way( &ctx.blake, input, 80 );
   blake512_4way_close( &ctx.blake, vhash );

   bmw512_4way_update( &ctx.bmw, vhash, 64 );
   bmw512_4way( &ctx.bmw, vhash, 64 );
   bmw512_4way_close( &ctx.bmw, vhash );

   // Serial
@@ -389,13 +333,13 @@ void x11gost_4way_hash( void *state, const void *input )
   // 4way
   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

   skein512_4way_update( &ctx.skein, vhash, 64 );
   skein512_4way( &ctx.skein, vhash, 64 );
   skein512_4way_close( &ctx.skein, vhash );

   jh512_4way_update( &ctx.jh, vhash, 64 );
   jh512_4way( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );

   keccak512_4way_update( &ctx.keccak, vhash, 64 );
   keccak512_4way( &ctx.keccak, vhash, 64 );
   keccak512_4way_close( &ctx.keccak, vhash );

   // Serial
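
The serial fallback above re-arms the SHAvite and Echo contexts for each of the eight lanes by memcpy'ing a prototype context that was initialised once at startup, which is cheaper than re-running the full init per lane. A minimal, self-contained sketch of that clone-a-prototype pattern; the toy_* names are stand-ins, not cpuminer-opt's real contexts:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

typedef struct { uint64_t state; } toy_ctx;    /* stand-in for sph_shavite512_context */

static void toy_init( toy_ctx *c ) { c->state = 0x6a09e667f3bcc908ULL; }

static void toy_update( toy_ctx *c, const void *data, size_t len )
{
   const uint8_t *p = data;
   for ( size_t i = 0; i < len; i++ ) c->state = c->state * 31 + p[i];
}

static toy_ctx proto;   /* initialised once, like x11gost_8way_ctx */

int main(void)
{
   uint8_t lane[8][64] = {{0}};
   toy_init( &proto );
   for ( int i = 0; i < 8; i++ )
   {
      toy_ctx c;
      memcpy( &c, &proto, sizeof c );   /* cheaper than a full init per lane */
      toy_update( &c, lane[i], 64 );
      printf( "lane %d: %016llx\n", i, (unsigned long long)c.state );
   }
   return 0;
}
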
@@ -16,11 +16,6 @@
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-4way.h"
#include "algo/echo/echo-hash-4way.h"
#endif

#if defined(X12_8WAY)

@@ -28,22 +23,16 @@
typedef struct {
   blake512_8way_context blake;
   bmw512_8way_context bmw;
   hashState_groestl groestl;
   skein512_8way_context skein;
   jh512_8way_context jh;
   keccak512_8way_context keccak;
   luffa_4way_context luffa;
   cube_4way_context cube;
   simd_4way_context simd;
   hamsi512_8way_context hamsi;
#if defined(__VAES__)
   groestl512_4way_context groestl;
   shavite512_4way_context shavite;
   echo_4way_context echo;
#else
   hashState_groestl groestl;
   sph_shavite512_context shavite;
   simd_4way_context simd;
   hashState_echo echo;
#endif
   hamsi512_8way_context hamsi;
} x12_8way_ctx_holder;

x12_8way_ctx_holder x12_8way_ctx __attribute__ ((aligned (64)));
@@ -52,29 +41,29 @@ void init_x12_8way_ctx()
{
   blake512_8way_init( &x12_8way_ctx.blake );
   bmw512_8way_init( &x12_8way_ctx.bmw );
   init_groestl( &x12_8way_ctx.groestl, 64 );
   skein512_8way_init( &x12_8way_ctx.skein );
   jh512_8way_init( &x12_8way_ctx.jh );
   keccak512_8way_init( &x12_8way_ctx.keccak );
   luffa_4way_init( &x12_8way_ctx.luffa, 512 );
   cube_4way_init( &x12_8way_ctx.cube, 512, 16, 32 );
   simd_4way_init( &x12_8way_ctx.simd, 512 );
   hamsi512_8way_init( &x12_8way_ctx.hamsi );
#if defined(__VAES__)
   groestl512_4way_init( &x12_8way_ctx.groestl, 64 );
   shavite512_4way_init( &x12_8way_ctx.shavite );
   echo_4way_init( &x12_8way_ctx.echo, 512 );
#else
   init_groestl( &x12_8way_ctx.groestl, 64 );
   sph_shavite512_init( &x12_8way_ctx.shavite );
   simd_4way_init( &x12_8way_ctx.simd, 512 );
   init_echo( &x12_8way_ctx.echo, 512 );
#endif
   hamsi512_8way_init( &x12_8way_ctx.hamsi );
};

void x12_8way_hash( void *state, const void *input )
{
   uint64_t vhash[8*8] __attribute__ ((aligned (128)));
   uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
   uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t hash2[8] __attribute__ ((aligned (64)));
   uint64_t hash3[8] __attribute__ ((aligned (64)));
   uint64_t hash4[8] __attribute__ ((aligned (64)));
   uint64_t hash5[8] __attribute__ ((aligned (64)));
   uint64_t hash6[8] __attribute__ ((aligned (64)));
   uint64_t hash7[8] __attribute__ ((aligned (64)));

   x12_8way_ctx_holder ctx;
   memcpy( &ctx, &x12_8way_ctx, sizeof(x12_8way_ctx) );
@@ -84,36 +73,20 @@ void x12_8way_hash( void *state, const void *input )
   bmw512_8way_update( &ctx.bmw, vhash, 64 );
   bmw512_8way_close( &ctx.bmw, vhash );

   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

   luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
   luffa_4way_init( &ctx.luffa, 512 );
   luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );

   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
   cube_4way_init( &ctx.cube, 512, 16, 32 );
   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );

#if defined(__VAES__)

   shavite512_4way_init( &ctx.shavite );
   shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
   shavite512_4way_init( &ctx.shavite );
   shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );

#else

   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t hash2[8] __attribute__ ((aligned (64)));
   uint64_t hash3[8] __attribute__ ((aligned (64)));
   uint64_t hash4[8] __attribute__ ((aligned (64)));
   uint64_t hash5[8] __attribute__ ((aligned (64)));
   uint64_t hash6[8] __attribute__ ((aligned (64)));
   uint64_t hash7[8] __attribute__ ((aligned (64)));
   dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                     hash7, vhash );

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
   // Luffa + Cube
   intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
   luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
   intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
   luffa_4way_init( &ctx.luffa, 512 );
   cube_4way_init( &ctx.cube, 512, 16, 32 );
   luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

   sph_shavite512( &ctx.shavite, hash0, 64 );
   sph_shavite512_close( &ctx.shavite, hash0 );
@@ -146,35 +119,14 @@ void x12_8way_hash( void *state, const void *input )
   sph_shavite512( &ctx.shavite, hash7, 64 );
   sph_shavite512_close( &ctx.shavite, hash7 );

   intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
   intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );

#endif

   intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
   simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
   intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
   simd_4way_init( &ctx.simd, 512 );
   simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
   simd_4way_init( &ctx.simd, 512 );
   simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
   simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

#if defined(__VAES__)

   echo_4way_init( &ctx.echo, 512 );
   echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
   echo_4way_init( &ctx.echo, 512 );
   echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );

   groestl512_4way_init( &ctx.groestl, 64 );
   groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
   groestl512_4way_init( &ctx.groestl, 64 );
   groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );

   rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );

#else

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );

   update_final_echo( &ctx.echo, (BitSequence *)hash1,
                      (const BitSequence *) hash1, 512 );
   memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
@@ -222,8 +174,6 @@ void x12_8way_hash( void *state, const void *input )
   intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                    hash7 );

#endif

   skein512_8way_update( &ctx.skein, vhash, 64 );
   skein512_8way_close( &ctx.skein, vhash );

@@ -322,10 +272,10 @@ void x12_4way_hash( void *state, const void *input )
   x12_4way_ctx_holder ctx;
   memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) );

   blake512_4way_update( &ctx.blake, input, 80 );
   blake512_4way( &ctx.blake, input, 80 );
   blake512_4way_close( &ctx.blake, vhash );

   bmw512_4way_update( &ctx.bmw, vhash, 64 );
   bmw512_4way( &ctx.bmw, vhash, 64 );
   bmw512_4way_close( &ctx.bmw, vhash );
   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

@@ -378,16 +328,16 @@ void x12_4way_hash( void *state, const void *input )

   // Parallel 4way 64 bit
   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
   skein512_4way_update( &ctx.skein, vhash, 64 );
   skein512_4way( &ctx.skein, vhash, 64 );
   skein512_4way_close( &ctx.skein, vhash );

   jh512_4way_update( &ctx.jh, vhash, 64 );
   jh512_4way( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );

   keccak512_4way_update( &ctx.keccak, vhash, 64 );
   keccak512_4way( &ctx.keccak, vhash, 64 );
   keccak512_4way_close( &ctx.keccak, vhash );

   hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
   hamsi512_4way( &ctx.hamsi, vhash, 64 );
   hamsi512_4way_close( &ctx.hamsi, vhash );

   dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 );
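
x12_8way_hash constantly converts between lane-interleaved buffers (vhash, vhashA, vhashB) and per-lane buffers (hash0..hash7) so each stage can run at its preferred SIMD width. A plain-C sketch of what the 4x64 interleave/deinterleave helpers do; the real versions use SSE/AVX loads and stores, this only shows the index mapping:

#include <stdint.h>
#include <stddef.h>

/* Gather word i of each of four lanes into adjacent slots of v, so a single
 * 256-bit vector op can process all four lanes at once. */
static void intrlv_4x64_sketch( uint64_t *v, const uint64_t *h0,
        const uint64_t *h1, const uint64_t *h2, const uint64_t *h3, size_t bits )
{
   size_t words = bits / 64;
   for ( size_t i = 0; i < words; i++ )
   {
      v[ 4*i + 0 ] = h0[i];
      v[ 4*i + 1 ] = h1[i];
      v[ 4*i + 2 ] = h2[i];
      v[ 4*i + 3 ] = h3[i];
   }
}

/* Inverse: scatter the interleaved buffer back into per-lane hashes. */
static void dintrlv_4x64_sketch( uint64_t *h0, uint64_t *h1, uint64_t *h2,
        uint64_t *h3, const uint64_t *v, size_t bits )
{
   size_t words = bits / 64;
   for ( size_t i = 0; i < words; i++ )
   {
      h0[i] = v[ 4*i + 0 ];
      h1[i] = v[ 4*i + 1 ];
      h2[i] = v[ 4*i + 2 ];
      h3[i] = v[ 4*i + 3 ];
   }
}
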
@@ -15,7 +15,7 @@ bool register_x12_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_x12;
   gate->hash = (void*)&x12hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
   return true;
};
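
The gate's `optimizations` field is an OR'd bitmask of the CPU features the algorithm can exploit; the change above simply drops the VAES bit. A hedged sketch of that flag pattern, where the *_OPT_ values are illustrative and not the miner's actual constants:

#include <stdio.h>

#define SSE2_OPT_   (1u << 0)
#define AES_OPT_    (1u << 1)
#define AVX2_OPT_   (1u << 2)
#define AVX512_OPT_ (1u << 3)
#define VAES_OPT_   (1u << 4)

int main(void)
{
   /* what the algo supports vs what the host CPU offers */
   unsigned algo_opts = SSE2_OPT_ | AES_OPT_ | AVX2_OPT_ | AVX512_OPT_;
   unsigned cpu_has   = SSE2_OPT_ | AES_OPT_ | AVX2_OPT_;   /* e.g. Haswell */

   if ( !( algo_opts & VAES_OPT_ ) )
      puts( "VAES path disabled for this algo" );
   if ( algo_opts & cpu_has & AVX2_OPT_ )
      puts( "AVX2 code path available" );
   return 0;
}
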
@@ -10,9 +10,6 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/gost/sph_gost.h"
#include "algo/echo/aes_ni/hash_api.h"
#if defined(__VAES__)
#include "algo/echo/echo-hash-4way.h"
#endif

#if defined(PHI1612_8WAY)

@@ -22,11 +19,7 @@ typedef struct {
   cube_4way_context cube;
   sph_fugue512_context fugue;
   sph_gost512_context gost;
#if defined(__VAES__)
   echo_4way_context echo;
#else
   hashState_echo echo;
#endif
} phi1612_8way_ctx_holder;

phi1612_8way_ctx_holder phi1612_8way_ctx __attribute__ ((aligned (64)));
@@ -38,11 +31,7 @@ void init_phi1612_8way_ctx()
   cube_4way_init( &phi1612_8way_ctx.cube, 512, 16, 32 );
   sph_fugue512_init( &phi1612_8way_ctx.fugue );
   sph_gost512_init( &phi1612_8way_ctx.gost );
#if defined(__VAES__)
   echo_4way_init( &phi1612_8way_ctx.echo, 512 );
#else
   init_echo( &phi1612_8way_ctx.echo, 512 );
#endif
};

void phi1612_8way_hash( void *state, const void *input )
@@ -129,19 +118,6 @@ void phi1612_8way_hash( void *state, const void *input )
   sph_gost512_close( &ctx.gost, hash7 );

   // Echo

#if defined(__VAES__)

   intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
   echo_4way_update_close( &ctx.echo, vhash, vhash, 512 );
   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
   intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
   echo_4way_init( &ctx.echo, 512 );
   echo_4way_update_close( &ctx.echo, vhash, vhash, 512 );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

#else

   update_final_echo( &ctx.echo, (BitSequence *)hash0,
                      (const BitSequence *) hash0, 512 );
   init_echo( &ctx.echo, 512 );
@@ -166,8 +142,6 @@ void phi1612_8way_hash( void *state, const void *input )
   update_final_echo( &ctx.echo, (BitSequence *)hash7,
                      (const BitSequence *) hash7, 512 );

#endif

   memcpy( state, hash0, 32 );
   memcpy( state+ 32, hash1, 32 );
   memcpy( state+ 64, hash2, 32 );
@@ -251,11 +225,11 @@ void phi1612_4way_hash( void *state, const void *input )
   memcpy( &ctx, &phi1612_4way_ctx, sizeof(phi1612_4way_ctx) );

   // Skein parallel 4way
   skein512_4way_update( &ctx.skein, input, 80 );
   skein512_4way( &ctx.skein, input, 80 );
   skein512_4way_close( &ctx.skein, vhash );

   // JH
   jh512_4way_update( &ctx.jh, vhash, 64 );
   jh512_4way( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );

   // Serial to the end
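
The Echo stage above picks its implementation at compile time: a 4-lane 128-bit interleaved path when the compiler targets VAES, otherwise one AES-NI call per lane. A reduced sketch of that #if defined(__VAES__) dispatch, with hash_lane/hash_4lanes as hypothetical stand-ins for the echo routines:

#include <stdint.h>

static void hash_lane( uint8_t *h ) { h[0] ^= 0x5a; }     /* stand-in transform */

static void hash_4lanes( uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d )
{  hash_lane(a); hash_lane(b); hash_lane(c); hash_lane(d); }   /* stand-in */

static void echo_stage_sketch( uint8_t hash[8][64] )
{
#if defined(__VAES__)
   /* two interleaved 4-lane batches, like echo_4way_update_close */
   hash_4lanes( hash[0], hash[1], hash[2], hash[3] );
   hash_4lanes( hash[4], hash[5], hash[6], hash[7] );
#else
   /* one lane at a time, like the update_final_echo loop */
   for ( int i = 0; i < 8; i++ )
      hash_lane( hash[i] );
#endif
}

int main(void)
{
   uint8_t hash[8][64] = {{0}};
   echo_stage_sketch( hash );
   return (int)hash[0][0] == 0x5a ? 0 : 1;
}
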
@@ -15,7 +15,7 @@ bool register_phi1612_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_phi1612;
   gate->hash = (void*)&phi1612_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
   return true;
};
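
Each register_*_algo call fills a per-algorithm dispatch table of function pointers that the core miner invokes without knowing which algo is active. A cut-down sketch of the gate idea; the types and field names here are illustrative, not algo-gate.h's:

#include <stdbool.h>

typedef struct {
   int  (*scanhash)( void *work, unsigned long max_nonce );
   void (*hash)( void *out, const void *in );
   unsigned optimizations;    /* OR'd feature bits in the real miner */
} algo_gate_sketch;

static int  scanhash_stub( void *w, unsigned long n ) { (void)w; (void)n; return 0; }
static void hash_stub( void *o, const void *i )       { (void)o; (void)i; }

static bool register_stub_algo( algo_gate_sketch *gate )
{
   gate->scanhash      = scanhash_stub;
   gate->hash          = hash_stub;
   gate->optimizations = 0;
   return true;
}

int main(void)
{
   algo_gate_sketch gate;
   return register_stub_algo( &gate ) ? 0 : 1;
}
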
@@ -168,7 +168,7 @@ void skunk_4way_hash( void *output, const void *input )
   skunk_4way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &skunk_4way_ctx, sizeof(skunk_4way_ctx) );

   skein512_4way_update( &ctx.skein, input, 80 );
   skein512_4way( &ctx.skein, input, 80 );
   skein512_4way_close( &ctx.skein, vhash );
   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
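
Every first-stage update above absorbs exactly 80 bytes because that is the standard block-header size (version, previous hash, merkle root, time, bits, nonce). A sketch of the scan loop that would sit around such a hash chain, assuming a little-endian nonce in the last four header bytes and with do_hash() as a stand-in for the real chain:

#include <stdint.h>
#include <string.h>

static void do_hash( uint8_t out[32], const uint8_t hdr[80] )
{  memset( out, 0, 32 ); out[31] = hdr[79]; }   /* stand-in, not a real hash */

static uint32_t scan_sketch( uint8_t hdr[80], uint32_t first, uint32_t last,
                             const uint8_t target[32] )
{
   uint8_t h[32];
   for ( uint32_t n = first; n < last; n++ )
   {
      memcpy( hdr + 76, &n, 4 );          /* nonce occupies bytes 76..79 */
      do_hash( h, hdr );
      if ( memcmp( h, target, 32 ) <= 0 ) /* crude big-endian-style compare */
         return n;                        /* share found */
   }
   return 0;
}

int main(void)
{
   uint8_t hdr[80] = {0}, target[32];
   memset( target, 0xff, 32 );            /* easiest possible target */
   return (int)scan_sketch( hdr, 0, 10, target );
}
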
@@ -17,34 +17,23 @@
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-4way.h"
#include "algo/echo/echo-hash-4way.h"
#endif

#if defined(X13_8WAY)

typedef struct {
   blake512_8way_context blake;
   bmw512_8way_context bmw;
   hashState_groestl groestl;
   skein512_8way_context skein;
   jh512_8way_context jh;
   keccak512_8way_context keccak;
   luffa_4way_context luffa;
   cube_4way_context cube;
   sph_shavite512_context shavite;
   simd_4way_context simd;
   hashState_echo echo;
   hamsi512_8way_context hamsi;
   sph_fugue512_context fugue;
#if defined(__VAES__)
   groestl512_4way_context groestl;
   shavite512_4way_context shavite;
   echo_4way_context echo;
#else
   hashState_groestl groestl;
   sph_shavite512_context shavite;
   hashState_echo echo;
#endif
} x13_8way_ctx_holder;

x13_8way_ctx_holder x13_8way_ctx;
@@ -53,30 +42,22 @@ void init_x13_8way_ctx()
{
   blake512_8way_init( &x13_8way_ctx.blake );
   bmw512_8way_init( &x13_8way_ctx.bmw );
   init_groestl( &x13_8way_ctx.groestl, 64 );
   skein512_8way_init( &x13_8way_ctx.skein );
   jh512_8way_init( &x13_8way_ctx.jh );
   keccak512_8way_init( &x13_8way_ctx.keccak );
   luffa_4way_init( &x13_8way_ctx.luffa, 512 );
   cube_4way_init( &x13_8way_ctx.cube, 512, 16, 32 );
   sph_shavite512_init( &x13_8way_ctx.shavite );
   simd_4way_init( &x13_8way_ctx.simd, 512 );
   init_echo( &x13_8way_ctx.echo, 512 );
   hamsi512_8way_init( &x13_8way_ctx.hamsi );
   sph_fugue512_init( &x13_8way_ctx.fugue );
#if defined(__VAES__)
   groestl512_4way_init( &x13_8way_ctx.groestl, 64 );
   shavite512_4way_init( &x13_8way_ctx.shavite );
   echo_4way_init( &x13_8way_ctx.echo, 512 );
#else
   init_groestl( &x13_8way_ctx.groestl, 64 );
   sph_shavite512_init( &x13_8way_ctx.shavite );
   init_echo( &x13_8way_ctx.echo, 512 );
#endif
}

void x13_8way_hash( void *state, const void *input )
{
   uint64_t vhash[8*8] __attribute__ ((aligned (128)));
   uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
   uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -93,19 +74,6 @@ void x13_8way_hash( void *state, const void *input )

   bmw512_8way_update( &ctx.bmw, vhash, 64 );
   bmw512_8way_close( &ctx.bmw, vhash );

#if defined(__VAES__)

   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

   groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
   groestl512_4way_init( &ctx.groestl, 64 );
   groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );

   rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );

#else

   dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                     vhash );

@@ -134,9 +102,6 @@ void x13_8way_hash( void *state, const void *input )

   intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                    hash7 );

#endif

   skein512_8way_update( &ctx.skein, vhash, 64 );
   skein512_8way_close( &ctx.skein, vhash );

@@ -145,27 +110,20 @@ void x13_8way_hash( void *state, const void *input )

   keccak512_8way_update( &ctx.keccak, vhash, 64 );
   keccak512_8way_close( &ctx.keccak, vhash );
   dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                     vhash );

   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );

   luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
   // Luffa + Cube
   intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
   luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
   intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
   luffa_4way_init( &ctx.luffa, 512 );
   luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );

   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
   cube_4way_init( &ctx.cube, 512, 16, 32 );
   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );

#if defined(__VAES__)

   shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
   shavite512_4way_init( &ctx.shavite );
   shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );

#else

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
   luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

   sph_shavite512( &ctx.shavite, hash0, 64 );
   sph_shavite512_close( &ctx.shavite, hash0 );
@@ -198,27 +156,13 @@ void x13_8way_hash( void *state, const void *input )
   sph_shavite512( &ctx.shavite, hash7, 64 );
   sph_shavite512_close( &ctx.shavite, hash7 );

   intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
   intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );

#endif

   simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
   intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
   simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
   intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
   simd_4way_init( &ctx.simd, 512 );
   simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );

#if defined(__VAES__)

   echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
   echo_4way_init( &ctx.echo, 512 );
   echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );

   rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );

#else

   dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
   simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
   dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

   update_final_echo( &ctx.echo, (BitSequence *)hash0,
                      (const BitSequence *) hash0, 512 );
@@ -246,9 +190,6 @@ void x13_8way_hash( void *state, const void *input )

   intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                    hash7 );

#endif

   hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
   hamsi512_8way_close( &ctx.hamsi, vhash );
   dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
@@ -377,11 +318,11 @@ void x13_4way_hash( void *state, const void *input )
   memcpy( &ctx, &x13_4way_ctx, sizeof(x13_4way_ctx) );

   // 1 Blake
   blake512_4way_update( &ctx.blake, input, 80 );
   blake512_4way( &ctx.blake, input, 80 );
   blake512_4way_close( &ctx.blake, vhash );

   // 2 Bmw
   bmw512_4way_update( &ctx.bmw, vhash, 64 );
   bmw512_4way( &ctx.bmw, vhash, 64 );
   bmw512_4way_close( &ctx.bmw, vhash );

   // Serial
@@ -400,15 +341,15 @@ void x13_4way_hash( void *state, const void *input )
   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

   // 4 Skein
   skein512_4way_update( &ctx.skein, vhash, 64 );
   skein512_4way( &ctx.skein, vhash, 64 );
   skein512_4way_close( &ctx.skein, vhash );

   // 5 JH
   jh512_4way_update( &ctx.jh, vhash, 64 );
   jh512_4way( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );

   // 6 Keccak
   keccak512_4way_update( &ctx.keccak, vhash, 64 );
   keccak512_4way( &ctx.keccak, vhash, 64 );
   keccak512_4way_close( &ctx.keccak, vhash );

   // Serial
@@ -472,7 +413,7 @@ void x13_4way_hash( void *state, const void *input )

   // 12 Hamsi parallel 4way 32 bit
   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
   hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
   hamsi512_4way( &ctx.hamsi, vhash, 64 );
   hamsi512_4way_close( &ctx.hamsi, vhash );
   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
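
rintrlv_8x64_4x128, used throughout the 8-way functions above, converts directly from the 8-lane 64-bit layout that Skein/JH/Keccak consume to the two 4-lane 128-bit layouts that Luffa/Cube/SIMD want, avoiding a full deinterleave/reinterleave round trip. A scalar sketch of one plausible index mapping; the project's actual word ordering and vectorized implementation may differ:

#include <stdint.h>
#include <stddef.h>

/* v8: 8 lanes interleaved at 64-bit granularity (word i of lane l at 8*i+l).
 * vA/vB: lanes 0-3 and 4-7 interleaved at 128-bit granularity, i.e. each
 * lane contributes two consecutive 64-bit words per slot. */
static void rintrlv_8x64_4x128_sketch( uint64_t *vA, uint64_t *vB,
                                       const uint64_t *v8, size_t bits )
{
   size_t words = bits / 64;            /* 64-bit words per lane */
   for ( size_t i = 0; i < words; i += 2 )
   {
      for ( int l = 0; l < 4; l++ )     /* lanes 0-3 go to vA */
      {
         vA[ 4*i + 2*l     ] = v8[ 8*i       + l ];
         vA[ 4*i + 2*l + 1 ] = v8[ 8*(i+1)   + l ];
      }
      for ( int l = 0; l < 4; l++ )     /* lanes 4-7 go to vB */
      {
         vB[ 4*i + 2*l     ] = v8[ 8*i     + 4 + l ];
         vB[ 4*i + 2*l + 1 ] = v8[ 8*(i+1) + 4 + l ];
      }
   }
}
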
Some files were not shown because too many files have changed in this diff.