Compare commits


3 Commits
v25.1 ... v25.4

Author      SHA1        Message  Date
Jay D Dee   66191db93c  v25.4    2025-06-20 20:31:41 -04:00
Jay D Dee   dd99580a4c  v25.3    2025-01-16 12:31:53 -05:00
Jay D Dee   1ed18bf22e  v25.2    2025-01-12 18:58:21 -05:00
94 changed files with 2865 additions and 5034 deletions

View File

@@ -1,31 +1,35 @@
if HAVE_APPLE
# MacOS uses Homebrew to install needed packages but they aren't linked for
# the jansson test in configure. Ignore the failed test & link them now,
# using a different path for each CPU arch.
if ARCH_ARM64
EXTRA_INCLUDES = -I/opt/homebrew/include
EXTRA_LIBS = -L/opt/homebrew/lib
else
EXTRA_INCLUDES = -I/usr/local/include
EXTRA_LIBS = -L/usr/local/lib
endif
else
if WANT_JANSSON
JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson
# Can't find jansson libraries, compile the included source code.
EXTRA_INCLUDES = -I$(top_srcdir)/compat/jansson
EXTRA_LIBS = -L$(top_srcdir)/compat/jansson
else
JANSSON_INCLUDES=
EXTRA_INCLUDES =
EXTRA_LIBS =
endif
# Hook for GMP on MacOS which is provided by Homebrew.
# Homebrew has different linkage on x86_64 & ARM64.
# This would need complex expressions, nesting or elseif; none seem to work.
if !HAVE_APPLE
GMP_INCLUDES =
GMP_LIB = -lgmp
endif
if ARM64_APPLE
GMP_INCLUDES = -I/opt/homebrew/include
GMP_LIB = /opt/homebrew/lib/libgmp.a
endif
if X86_64_APPLE
GMP_INCLUDES = -I/usr/local/include
GMP_LIB = /usr/local/lib/libgmp.a
endif
EXTRA_DIST = example-cfg.json nomacro.pl
SUBDIRS = compat
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) $(GMP_INCLUDES) -I.
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(EXTRA_INCLUDES) -I.
bin_PROGRAMS = cpuminer
@@ -39,6 +43,7 @@ cpuminer_SOURCES = \
sysinfos.c \
algo-gate-api.c\
malloc-huge.c \
simd-utils/simd-constants.c \
algo/argon2d/argon2d-gate.c \
algo/argon2d/blake2/blake2b.c \
algo/argon2d/argon2d/argon2.c \
@@ -288,21 +293,20 @@ cpuminer_SOURCES = \
algo/yespower/yespower-opt.c \
algo/yespower/yespower-ref.c \
algo/yespower/yespower-blake2b-ref.c
disable_flags =
if USE_ASM
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags += -DNOASM
endif
if HAVE_WINDOWS
cpuminer_SOURCES += compat/winansi.c
endif
if USE_ASM
disable_flags =
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags = -DNOASM
endif
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ $(GMP_LIB)
cpuminer_LDADD = $(EXTRA_LIBS) @LIBCURL@ -ljansson @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
@@ -312,9 +316,6 @@ endif
if HAVE_WINDOWS
# Add -U_WIN32_WINNT to command line CFLAGS to undefine
cpuminer_CFLAGS += -D_WIN32_WINNT=0x0601
# use to profile an object
# gprof_cflags = -pg -g3
# cpuminer_LDFLAGS += -pg

View File

@@ -36,34 +36,18 @@ for compile instructions.
Requirements
------------
1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. Further optimizations are available
on some algorithms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
32 bit CPUs are not supported.
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
are not supported.
1. A 64 bit CPU supporting x86_64 (Intel or AMD) or aarch64 (ARM).
x86_64 requires SSE2, aarch64 requires armv8 & NEON.
Mobile CPUs, such as those in laptop computers, are not recommended because they
aren't designed for the extreme heat of operating at full load for extended
periods of time.
Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.
2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
including Mint and Centos, are known to work and have all dependencies
in their repositories. Others may work but may require more effort. Older
versions such as Centos 6 don't work due to missing features.
Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
binaries. WindowsXP 64 bit is YMMV.
FreeBSD is not actively tested but should work, YMMV.
MacOS, OSx and Android are not supported.
2. 64 bit operating system including Linux, Windows, MacOS, or BSD.
Android, IOS and alt OSs like Haiku & ReactOS are not supported.
3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
RPC getwork using http:// or https://.
GBT is YMMV.
RPC getblocktemplate using http:// or https://.
Supported Algorithms
--------------------
@@ -71,9 +55,9 @@ Supported Algorithms
allium Garlicoin
anime Animecoin
argon2 Argon2 coin (AR2)
argon2d250 argon2d-crds, Credits (CRDS)
argon2d500 argon2d-dyn, Dynamic (DYN)
argon2d4096 argon2d-uis, Unitus, (UIS)
argon2d250
argon2d500
argon2d4096
blake Blake-256
blake2b Blake2-512
blake2s Blake2-256

View File

@@ -75,6 +75,27 @@ If not what makes it happen or not happen?
Change Log
----------
v25.4
x86_64: improved handling of vector constants used for byte permutations.
x86_64: removed hooks for cancelled AVX10-256.
Minor bug fixes & improvements.
More code cleanup.
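The first v25.4 item, improved handling of vector constants used for byte permutations, shows up in the diffs below as per-call-site _mm256_shuffle_epi8 / _mm512_shuffle_epi8 constants being replaced by shared helpers such as mm256_bswap_32 and mm512_bswap_64. A minimal sketch of what such a 32-bit lane byte swap presumably reduces to on AVX2, using only the shuffle control bytes visible in the removed lines; the function name here is illustrative, not the project's:

    #include <immintrin.h>

    // Sketch only: byte-swap each 32-bit lane of a 256-bit vector.
    // The control bytes are the same ones written inline at every call
    // site before v25.4 (0x0c0d0e0f08090a0b, 0x0405060700010203).
    static inline __m256i bswap32_avx2_sketch( __m256i v )
    {
        const __m256i ctl = _mm256_set_epi64x( 0x0c0d0e0f08090a0bULL,
                                               0x0405060700010203ULL,
                                               0x0c0d0e0f08090a0bULL,
                                               0x0405060700010203ULL );
        return _mm256_shuffle_epi8( v, ctl );
    }

Centralizing the constant in one helper presumably lets the compiler share it instead of materializing it separately in every compress routine, which is what the hunks below remove.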
v25.3
#442, #443: Fixed a regression in Makefile.am.
Removed algo features log display.
Some code cleanup.
v25.2
ARM: Fixed a regression from v25.1 that could cause the build to fail.
BSD: FreeBSD is now supported. Other BSDs may also work.
MacOS: build with the installed jansson library instead of compiling the included source code.
Windows: removed "_WIN32_WINNT=0x0601", which was a downgrade on Win11.
Changed build.sh shell from bash to sh.
v25.1
MacOS ARM64: m7m algo is now working.

View File

@@ -295,8 +295,8 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
{
case ALGO_ALLIUM: rc = register_allium_algo ( gate ); break;
case ALGO_ANIME: rc = register_anime_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d_crds_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d_dyn_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d250_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d500_algo ( gate ); break;
case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break;
case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break;
case ALGO_BLAKE: rc = register_blake_algo ( gate ); break;
@@ -416,8 +416,6 @@ void exec_hash_function( int algo, void *output, const void *pdata )
const char* const algo_alias_map[][2] =
{
// alias proper
{ "argon2d-dyn", "argon2d500" },
{ "argon2d-uis", "argon2d4096" },
{ "bcd", "x13bcd" },
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },

View File

@@ -6,9 +6,7 @@ static const size_t INPUT_BYTES = 80; // Length of a block header in bytes. Input ...
static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS
// Credits
void argon2d_crds_hash( void *output, const void *input )
void argon2d250_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
@@ -34,7 +32,7 @@ void argon2d_crds_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
@@ -50,7 +48,7 @@ int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
do {
be32enc(&edata[19], nonce);
argon2d_crds_hash( hash, edata );
argon2d250_hash( hash, edata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
@@ -64,18 +62,16 @@ int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
return 0;
}
bool register_argon2d_crds_algo( algo_gate_t* gate )
bool register_argon2d250_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d_crds;
gate->hash = (void*)&argon2d_crds_hash;
gate->scanhash = (void*)&scanhash_argon2d250;
gate->hash = (void*)&argon2d250_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 65536.0;
return true;
}
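A minimal usage sketch of the renamed entry point, assuming only the signature shown above and the INPUT_BYTES/OUTPUT_BYTES constants (80-byte header in, 32-byte hash out); the header contents here are placeholders:

    #include <stdint.h>
    #include <string.h>

    void argon2d250_hash( void *output, const void *input );  // declared in argon2d-gate.h

    void argon2d250_example( void )
    {
        uint32_t header[20];   // 80-byte block header (INPUT_BYTES)
        uint32_t hash[8];      // 256-bit result (OUTPUT_BYTES)
        memset( header, 0, sizeof header );
        argon2d250_hash( hash, header );
    }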
// Dynamic
void argon2d_dyn_hash( void *output, const void *input )
void argon2d500_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
@@ -101,7 +97,7 @@ void argon2d_dyn_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
@@ -118,7 +114,7 @@ int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
do
{
edata[19] = nonce;
argon2d_dyn_hash( hash, edata );
argon2d500_hash( hash, edata );
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
&& !bench ) )
{
@@ -133,17 +129,15 @@ int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
return 0;
}
bool register_argon2d_dyn_algo( algo_gate_t* gate )
bool register_argon2d500_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d_dyn;
gate->hash = (void*)&argon2d_dyn_hash;
gate->scanhash = (void*)&scanhash_argon2d500;
gate->hash = (void*)&argon2d500_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 65536.0;
return true;
}
// Unitus
int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{

View File

@@ -5,19 +5,19 @@
#include <stdint.h>
// Credits: version = 0x10, m_cost = 250.
bool register_argon2d_crds_algo( algo_gate_t* gate );
bool register_argon2d250_algo( algo_gate_t* gate );
void argon2d_crds_hash( void *state, const void *input );
void argon2d250_hash( void *state, const void *input );
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Dynamic: version = 0x10, m_cost = 500.
bool register_argon2d_dyn_algo( algo_gate_t* gate );
bool register_argon2d500_algo( algo_gate_t* gate );
void argon2d_dyn_hash( void *state, const void *input );
void argon2d500_hash( void *state, const void *input );
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -6,15 +6,15 @@
#if defined (BLAKE_4WAY)
blake256r14_4way_context blake_4w_ctx;
blake256r14_4x32_context blake_4w_ctx;
void blakehash_4way(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r14_4way_context ctx;
blake256r14_4x32_context ctx;
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
blake256r14_4way_update( &ctx, input + (64<<2), 16 );
blake256r14_4way_close( &ctx, vhash );
blake256r14_4x32_update( &ctx, input + (64<<2), 16 );
blake256r14_4x32_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
@@ -35,8 +35,8 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
HTarget = 0x7f;
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r14_4way_init( &blake_4w_ctx );
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
blake256r14_4x32_init( &blake_4w_ctx );
blake256r14_4x32_update( &blake_4w_ctx, vdata, 64 );
do {
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -61,15 +61,15 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
#if defined(BLAKE_8WAY)
blake256r14_8way_context blake_8w_ctx;
blake256r14_8x32_context blake_8w_ctx;
void blakehash_8way( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256r14_8way_context ctx;
blake256r14_8x32_context ctx;
memcpy( &ctx, &blake_8w_ctx, sizeof ctx );
blake256r14_8way( &ctx, input + (64<<3), 16 );
blake256r14_8way_close( &ctx, vhash );
blake256r14_8x32( &ctx, input + (64<<3), 16 );
blake256r14_8x32_close( &ctx, vhash );
_dintrlv_8x32( state, state+ 32, state+ 64, state+ 96,
state+128, state+160, state+192, state+224,
vhash, 256 );
@@ -93,8 +93,8 @@ int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256r14_8way_init( &blake_8w_ctx );
blake256r14_8way( &blake_8w_ctx, vdata, 64 );
blake256r14_8x32_init( &blake_8w_ctx );
blake256r14_8x32( &blake_8w_ctx, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,

View File

@@ -423,33 +423,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
(state)->T1 = T1; \
} while (0)
#if defined(__SSSE3__)
#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ); \
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
M3 = _mm_shuffle_epi8( buf[ 3], shuf_bswap32 ); \
M4 = _mm_shuffle_epi8( buf[ 4], shuf_bswap32 ); \
M5 = _mm_shuffle_epi8( buf[ 5], shuf_bswap32 ); \
M6 = _mm_shuffle_epi8( buf[ 6], shuf_bswap32 ); \
M7 = _mm_shuffle_epi8( buf[ 7], shuf_bswap32 ); \
M8 = _mm_shuffle_epi8( buf[ 8], shuf_bswap32 ); \
M9 = _mm_shuffle_epi8( buf[ 9], shuf_bswap32 ); \
MA = _mm_shuffle_epi8( buf[10], shuf_bswap32 ); \
MB = _mm_shuffle_epi8( buf[11], shuf_bswap32 ); \
MC = _mm_shuffle_epi8( buf[12], shuf_bswap32 ); \
MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
}
#else // SSE2
#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
M0 = v128_bswap32( buf[0] ); \
@@ -470,8 +443,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
MF = v128_bswap32( buf[15] ); \
}
#endif // SSSE3 else SSE2
#define COMPRESS32_4X32( rounds ) \
{ \
v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
@@ -926,22 +897,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
ROUND_S_4X32_3;
}
#if defined(__SSSE3__)
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );
#else
H[0] = v128_bswap32( v128_xor3( V8, V0, h[0] ) );
H[1] = v128_bswap32( v128_xor3( V9, V1, h[1] ) );
H[2] = v128_bswap32( v128_xor3( VA, V2, h[2] ) );
@@ -950,8 +905,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
H[5] = v128_bswap32( v128_xor3( VD, V5, h[5] ) );
H[6] = v128_bswap32( v128_xor3( VE, V6, h[6] ) );
H[7] = v128_bswap32( v128_xor3( VF, V7, h[7] ) );
#endif
}
#if defined (__AVX2__)
@@ -1291,24 +1244,22 @@ do { \
VD = v256_32( T0 ^ 0x299F31D0 ); \
VE = v256_32( T1 ^ 0x082EFA98 ); \
VF = v256_32( T1 ^ 0xEC4E6C89 ); \
const __m256i shuf_bswap32 = mm256_set2_64( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
M0 = mm256_bswap_32( * buf ); \
M1 = mm256_bswap_32( *(buf+ 1) ); \
M2 = mm256_bswap_32( *(buf+ 2) ); \
M3 = mm256_bswap_32( *(buf+ 3) ); \
M4 = mm256_bswap_32( *(buf+ 4) ); \
M5 = mm256_bswap_32( *(buf+ 5) ); \
M6 = mm256_bswap_32( *(buf+ 6) ); \
M7 = mm256_bswap_32( *(buf+ 7) ); \
M8 = mm256_bswap_32( *(buf+ 8) ); \
M9 = mm256_bswap_32( *(buf+ 9) ); \
MA = mm256_bswap_32( *(buf+10) ); \
MB = mm256_bswap_32( *(buf+11) ); \
MC = mm256_bswap_32( *(buf+12) ); \
MD = mm256_bswap_32( *(buf+13) ); \
ME = mm256_bswap_32( *(buf+14) ); \
MF = mm256_bswap_32( *(buf+15) ); \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
@@ -1401,7 +1352,7 @@ do { \
H7 = mm256_xor3( VF, V7, H7 ); \
}
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
void *data )
{
__m256i *M = (__m256i*)data;
@@ -1491,7 +1442,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
_mm256_xor_si256( v256_32( CSE ), M[15] ) );
}
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds )
{
__m256i *H = (__m256i*)final_hash;
@@ -1596,17 +1547,14 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
ROUND256_8WAY_3;
}
const __m256i shuf_bswap32 =
mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 );
H[0] = mm256_bswap_32( mm256_xor3( V8, V0, h[0] ) );
H[1] = mm256_bswap_32( mm256_xor3( V9, V1, h[1] ) );
H[2] = mm256_bswap_32( mm256_xor3( VA, V2, h[2] ) );
H[3] = mm256_bswap_32( mm256_xor3( VB, V3, h[3] ) );
H[4] = mm256_bswap_32( mm256_xor3( VC, V4, h[4] ) );
H[5] = mm256_bswap_32( mm256_xor3( VD, V5, h[5] ) );
H[6] = mm256_bswap_32( mm256_xor3( VE, V6, h[6] ) );
H[7] = mm256_bswap_32( mm256_xor3( VF, V7, h[7] ) );
}
#endif
@@ -1933,8 +1881,6 @@ do { \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -1951,22 +1897,22 @@ do { \
VD = v512_32( T0 ^ 0x299F31D0 ); \
VE = v512_32( T1 ^ 0x082EFA98 ); \
VF = v512_32( T1 ^ 0xEC4E6C89 ); \
M0 = _mm512_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
M0 = mm512_bswap_32( * buf ); \
M1 = mm512_bswap_32( *(buf+ 1) ); \
M2 = mm512_bswap_32( *(buf+ 2) ); \
M3 = mm512_bswap_32( *(buf+ 3) ); \
M4 = mm512_bswap_32( *(buf+ 4) ); \
M5 = mm512_bswap_32( *(buf+ 5) ); \
M6 = mm512_bswap_32( *(buf+ 6) ); \
M7 = mm512_bswap_32( *(buf+ 7) ); \
M8 = mm512_bswap_32( *(buf+ 8) ); \
M9 = mm512_bswap_32( *(buf+ 9) ); \
MA = mm512_bswap_32( *(buf+10) ); \
MB = mm512_bswap_32( *(buf+11) ); \
MC = mm512_bswap_32( *(buf+12) ); \
MD = mm512_bswap_32( *(buf+13) ); \
ME = mm512_bswap_32( *(buf+14) ); \
MF = mm512_bswap_32( *(buf+15) ); \
ROUND_S_16WAY(0); \
ROUND_S_16WAY(1); \
ROUND_S_16WAY(2); \
@@ -2063,7 +2009,7 @@ do { \
// is constant for every nonce and only needs to be run once per job. The
// second part is run for each nonce using the precalculated midstate and the
// hash from the first block.
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
void *data )
{
__m512i *M = (__m512i*)data;
@@ -2157,7 +2103,7 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
}
// Default is 14 rounds, blakecoin & vanilla are 8.
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds )
{
__m512i *H = (__m512i*)final_hash;
@@ -2274,27 +2220,23 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
}
// Byte swap final hash
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 );
H[0] = mm512_bswap_32( mm512_xor3( V8, V0, h[0] ) );
H[1] = mm512_bswap_32( mm512_xor3( V9, V1, h[1] ) );
H[2] = mm512_bswap_32( mm512_xor3( VA, V2, h[2] ) );
H[3] = mm512_bswap_32( mm512_xor3( VB, V3, h[3] ) );
H[4] = mm512_bswap_32( mm512_xor3( VC, V4, h[4] ) );
H[5] = mm512_bswap_32( mm512_xor3( VD, V5, h[5] ) );
H[6] = mm512_bswap_32( mm512_xor3( VE, V6, h[6] ) );
H[7] = mm512_bswap_32( mm512_xor3( VF, V7, h[7] ) );
}
#endif
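A hedged sketch of the two-phase pattern the comment above describes for the 16-way code: the round-0 prehash runs once per job, the final rounds run once per batch of nonces. Array sizes and the use of word 3 for the nonces are assumptions based on the blakecoin scanhash diff later in this compare; the prototypes are copied from the blake256 header diffed below:

    #include <immintrin.h>

    // Declarations as they appear in the blake256 header.
    void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
                                           void *data );
    void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
                                         const void *midhash, const void *data,
                                         const int rounds );

    // Sketch only.
    void blakecoin_batch_sketch( __m512i *midstate,         // 16 midstate vectors
                                 __m512i *block,            // 16-word 2nd block, LE
                                 const __m512i *first_hash, // hash of 1st block
                                 __m512i *hash,             // 8-vector output
                                 __m512i nonces )
    {
        // Once per job: everything that does not depend on the nonce.
        blake256_16x32_round0_prehash_le( midstate, first_hash, block );
        // Per batch of 16 nonces: only the nonce word changes, midstate is reused.
        block[3] = nonces;
        blake256_16x32_final_rounds_le( hash, midstate, first_hash, block, 14 );
    }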
// Blake-256 4 way
static const uint32_t salt_zero_4x32_small[4] = { 0, 0, 0, 0 };
static void
blake32_4x32_init( blake_4x32_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
int rounds )
{
casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
@@ -2404,11 +2346,10 @@ blake32_4x32_close( blake_4x32_small_context *ctx, unsigned ub, unsigned n,
// Blake-256 8 way
static const uint32_t salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
static void
blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
const uint32_t *salt, int rounds )
blake32_8way_init( blake256_8x32_context *sc, const uint32_t *iv,
int rounds )
{
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E6676A09E667 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE85BB67AE85 );
@@ -2424,7 +2365,7 @@ blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
}
static void
blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
blake32_8way( blake256_8x32_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
@@ -2466,7 +2407,7 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
}
static void
blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
blake32_8way_close( blake256_8x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m256i buf[16];
@@ -2520,7 +2461,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
}
static void
blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
blake32_8way_le( blake256_8x32_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
@@ -2562,7 +2503,7 @@ blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
}
static void
blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
blake32_8way_close_le( blake256_8x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m256i buf[16];
@@ -2622,8 +2563,8 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
//Blake-256 16 way AVX512
static void
blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
const uint32_t *salt, int rounds )
blake32_16way_init( blake256_16x32_context *sc, const uint32_t *iv,
int rounds )
{
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E6676A09E667 );
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE85BB67AE85 );
@@ -2639,7 +2580,7 @@ blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
}
static void
blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
blake32_16way( blake256_16x32_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
@@ -2679,7 +2620,7 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
sc->ptr = ptr;
}
static void
blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
blake32_16way_close( blake256_16x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m512i buf[16];
@@ -2733,7 +2674,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
}
static void
blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
blake32_16way_le( blake256_16x32_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
@@ -2776,7 +2717,7 @@ blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
}
static void
blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
blake32_16way_close_le( blake256_16x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m512i buf[16];
@@ -2827,65 +2768,65 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
}
void
blake256_16way_init(void *cc)
blake256_16x32_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_16way_init( cc, IV256, 14 );
}
void
blake256_16way_update(void *cc, const void *data, size_t len)
blake256_16x32_update(void *cc, const void *data, size_t len)
{
blake32_16way(cc, data, len);
}
void
blake256_16way_close(void *cc, void *dst)
blake256_16x32_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
void
blake256_16way_update_le(void *cc, const void *data, size_t len)
blake256_16x32_update_le(void *cc, const void *data, size_t len)
{
blake32_16way_le(cc, data, len);
}
void
blake256_16way_close_le(void *cc, void *dst)
blake256_16x32_close_le(void *cc, void *dst)
{
blake32_16way_close_le(cc, 0, 0, dst, 8);
}
void blake256r14_16way_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_16way_init( cc, IV256, 14 );
}
void
blake256r14_16way_update(void *cc, const void *data, size_t len)
blake256r14_16x32_update(void *cc, const void *data, size_t len)
{
blake32_16way(cc, data, len);
}
void
blake256r14_16way_close(void *cc, void *dst)
blake256r14_16x32_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
void blake256r8_16way_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 );
blake32_16way_init( cc, IV256, 8 );
}
void
blake256r8_16way_update(void *cc, const void *data, size_t len)
blake256r8_16x32_update(void *cc, const void *data, size_t len)
{
blake32_16way(cc, data, len);
}
void
blake256r8_16way_close(void *cc, void *dst)
blake256r8_16x32_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
@@ -2898,7 +2839,7 @@ blake256r8_16way_close(void *cc, void *dst)
void
blake256_4x32_init(void *ctx)
{
blake32_4x32_init( ctx, IV256, salt_zero_4x32_small, 14 );
blake32_4x32_init( ctx, IV256, 14 );
}
void
@@ -2918,31 +2859,31 @@ blake256_4x32_close(void *ctx, void *dst)
// Blake-256 8 way
void
blake256_8way_init(void *cc)
blake256_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_8way_init( cc, IV256, 14 );
}
void
blake256_8way_update(void *cc, const void *data, size_t len)
blake256_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
void
blake256_8way_close(void *cc, void *dst)
blake256_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}
void
blake256_8way_update_le(void *cc, const void *data, size_t len)
blake256_8x32_update_le(void *cc, const void *data, size_t len)
{
blake32_8way_le(cc, data, len);
}
void
blake256_8way_close_le(void *cc, void *dst)
blake256_8x32_close_le(void *cc, void *dst)
{
blake32_8way_close_le(cc, 0, 0, dst, 8);
}
@@ -2952,7 +2893,7 @@ blake256_8way_close_le(void *cc, void *dst)
// 14 rounds Blake, Decred
void blake256r14_4x32_init(void *cc)
{
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 14 );
blake32_4x32_init( cc, IV256, 14 );
}
void
@@ -2969,19 +2910,19 @@ blake256r14_4x32_close(void *cc, void *dst)
#if defined(__AVX2__)
void blake256r14_8way_init(void *cc)
void blake256r14_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_8way_init( cc, IV256, 14 );
}
void
blake256r14_8way_update(void *cc, const void *data, size_t len)
blake256r14_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
void
blake256r14_8way_close(void *cc, void *dst)
blake256r14_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}
@@ -2991,7 +2932,7 @@ blake256r14_8way_close(void *cc, void *dst)
// 8 rounds Blakecoin, Vanilla
void blake256r8_4x32_init(void *cc)
{
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 8 );
blake32_4x32_init( cc, IV256, 8 );
}
void
@@ -3008,19 +2949,19 @@ blake256r8_4x32_close(void *cc, void *dst)
#if defined (__AVX2__)
void blake256r8_8way_init(void *cc)
void blake256r8_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 8 );
blake32_8way_init( cc, IV256, 8 );
}
void
blake256r8_8way_update(void *cc, const void *data, size_t len)
blake256r8_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
void
blake256r8_8way_close(void *cc, void *dst)
blake256r8_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}

View File

@@ -29,13 +29,6 @@ typedef struct
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1, int rounds );
/*
void blake256_init( blake256_context *sc );
void blake256_update( blake256_context *sc, const void *data, size_t len );
void blake256_close( blake256_context *sc, void *dst );
void blake256_full( blake256_context *sc, void *dst, const void *data,
size_t len );
*/
//////////////////////////////////
//
@@ -55,6 +48,10 @@ typedef blake_4x32_small_context blake256_4x32_context;
void blake256_4x32_init(void *ctx);
void blake256_4x32_update(void *ctx, const void *data, size_t len);
void blake256_4x32_close(void *ctx, void *dst);
void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
// 14 rounds
typedef blake_4x32_small_context blake256r14_4x32_context;
@@ -68,29 +65,6 @@ void blake256r8_4x32_init(void *cc);
void blake256r8_4x32_update(void *cc, const void *data, size_t len);
void blake256r8_4x32_close(void *cc, void *dst);
void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
#define blake_4way_small_context blake256_4x32_context
#define blake256_4way_context blake256_4x32_context
#define blake256_4way_init blake256_4x32_init
#define blake256_4way_update blake256_4x32_update
#define blake256_4way_close blake256_4x32_close
#define blake256_4way_update_le blake256_4x32_update_le
#define blake256_4way_close_le blake256_4x32_close_le
#define blake256_4way_round0_prehash_le blake256_4x32_round0_prehash_le
#define blake256_4way_final_rounds_le blake256_4x32_final_rounds_le
#define blake256r14_4way_context blake256r14_4x32_context
#define blake256r14_4way_init blake256r14_4x32_init
#define blake256r14_4way_update blake256r14_4x32_update
#define blake256r14_4way_close blake256r14_4x32_close
#define blake256r8_4way_context blake256r14_4x32_context
#define blake256r8_4way_init blake256r14_4x32_init
#define blake256r8_4way_update blake256r14_4x32_update
#define blake256r8_4way_close blake256r14_4x32_close
#ifdef __AVX2__
//////////////////////////////
@@ -107,45 +81,28 @@ typedef struct
} blake_8way_small_context;
// Default 14 rounds
typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way_update(void *cc, const void *data, size_t len);
void blake256_8way_close(void *cc, void *dst);
void blake256_8way_update_le(void *cc, const void *data, size_t len);
void blake256_8way_close_le(void *cc, void *dst);
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
typedef blake_8way_small_context blake256_8x32_context;
void blake256_8x32_init(void *cc);
void blake256_8x32_update(void *cc, const void *data, size_t len);
void blake256_8x32_close(void *cc, void *dst);
void blake256_8x32_update_le(void *cc, const void *data, size_t len);
void blake256_8x32_close_le(void *cc, void *dst);
void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
// 14 rounds, blake, decred
typedef blake_8way_small_context blake256r14_8way_context;
void blake256r14_8way_init(void *cc);
void blake256r14_8way_update(void *cc, const void *data, size_t len);
void blake256r14_8way_close(void *cc, void *dst);
typedef blake_8way_small_context blake256r14_8x32_context;
void blake256r14_8x32_init(void *cc);
void blake256r14_8x32_update(void *cc, const void *data, size_t len);
void blake256r14_8x32_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_8way_small_context blake256r8_8way_context;
void blake256r8_8way_init(void *cc);
void blake256r8_8way_update(void *cc, const void *data, size_t len);
void blake256r8_8way_close(void *cc, void *dst);
#define blake_8x32_small_context blake256_8way_context
#define blake_8x32_init blake256_8way_init
#define blake_8x32_update blake256_8way_update
#define blake_8x32_close blake256_8way_close
#define blake_8x32_update_le blake256_8way_update_le
#define blake_8x32_close_le blake256_8way_close_le
#define blake_8x32_round0_prehash_le blake256_8way_round0_prehash
#define blake_8x32_final_rounds_le blake256_8way_final_rounds_le
#define blake256r14_8x32_context blake256r14_8way_context
#define blake256r14_8x32_init blake256r14_8way_init
#define blake256r14_8x32_update blake256r14_8way_update
#define blake256r14_8x32_close blake256r14_8way_close
#define blake256r8_8x32_context blake256r14_8way_context
#define blake256r8_8x32_init blake256r14_8way_init
#define blake256r8_8x32_update blake256r14_8way_update
#define blake256r8_8x32_close blake256r14_8way_close
typedef blake_8way_small_context blake256r8_8x32_context;
void blake256r8_8x32_init(void *cc);
void blake256r8_8x32_update(void *cc, const void *data, size_t len);
void blake256r8_8x32_close(void *cc, void *dst);
#if defined(SIMD512)
@@ -163,46 +120,29 @@ typedef struct
} blake_16way_small_context __attribute__ ((aligned (128)));
// Default 14 rounds
typedef blake_16way_small_context blake256_16way_context;
void blake256_16way_init(void *cc);
void blake256_16way_update(void *cc, const void *data, size_t len);
void blake256_16way_close(void *cc, void *dst);
typedef blake_16way_small_context blake256_16x32_context;
void blake256_16x32_init(void *cc);
void blake256_16x32_update(void *cc, const void *data, size_t len);
void blake256_16x32_close(void *cc, void *dst);
// Expects data in little endian order, no byte swap needed
void blake256_16way_update_le(void *cc, const void *data, size_t len);
void blake256_16way_close_le(void *cc, void *dst);
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_16x32_update_le(void *cc, const void *data, size_t len);
void blake256_16x32_close_le(void *cc, void *dst);
void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
// 14 rounds, blake, decred
typedef blake_16way_small_context blake256r14_16way_context;
void blake256r14_16way_init(void *cc);
void blake256r14_16way_update(void *cc, const void *data, size_t len);
void blake256r14_16way_close(void *cc, void *dst);
typedef blake_16way_small_context blake256r14_16x32_context;
void blake256r14_16x32_init(void *cc);
void blake256r14_16x32_update(void *cc, const void *data, size_t len);
void blake256r14_16x32_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_16way_small_context blake256r8_16way_context;
void blake256r8_16way_init(void *cc);
void blake256r8_16way_update(void *cc, const void *data, size_t len);
void blake256r8_16way_close(void *cc, void *dst);
#define blake_16x32_small_context blake256_16way_context
#define blake_16x32_init blake256_16way_init
#define blake_16x32_update blake256_16way_update
#define blake_16x32_close blake256_16way_close
#define blake_16x32_update_le blake256_16way_update_le
#define blake_16x32_close_le blake256_16way_close_le
#define blake_16x32_round0_prehash_le blake256_16way_round0_prehash
#define blake_16x32_final_rounds_le blake256_16way_final_rounds_le
#define blake256r14_16x32_context blake256r14_16way_context
#define blake256r14_16x32_init blake256r14_16way_init
#define blake256r14_16x32_update blake256r14_16way_update
#define blake256r14_16x32_close blake256r14_16way_close
#define blake256r8_16x32_context blake256r8_16way_context
#define blake256r8_16x32_init blake256r8_16way_init
#define blake256r8_16x32_update blake256r8_16way_update
#define blake256r8_16x32_close blake256r8_16way_close
typedef blake_16way_small_context blake256r8_16x32_context;
void blake256r8_16x32_init(void *cc);
void blake256r8_16x32_update(void *cc, const void *data, size_t len);
void blake256r8_16x32_close(void *cc, void *dst);
#endif // AVX512
#endif // AVX2

View File

@@ -14,7 +14,6 @@
#define ALIGN(x) __attribute__((aligned(x)))
#endif
#if defined(SIMD512)
typedef struct ALIGN( 64 ) {
@@ -30,11 +29,6 @@ void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
#define blake2b_8way_ctx blake2b_8x64_ctx
#define blake2b_8way_init blake2b_8x64_init
#define blake2b_8way_update blake2b_8x64_update
#define blake2b_8way_final blake2b_8x64_final
#endif
#if defined(__AVX2__)
@@ -53,11 +47,6 @@ void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
#define blake2b_4way_ctx blake2b_4x64_ctx
#define blake2b_4way_init blake2b_4x64_init
#define blake2b_4way_update blake2b_4x64_update
#define blake2b_4way_final blake2b_4x64_final
#endif
#endif
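With the blake2b_8way_* aliases removed, callers now use the 8x64 names directly, as the scanhash diff below shows. A minimal sketch of the sequence, assuming the context type and declarations from the blake2b header diffed above; the 80-byte length matches the scanhash usage:

    // Sketch only; blake2b_8x64_ctx and the function declarations come
    // from the blake2b header diffed above.
    void blake2b_8x64_hash_sketch( void *hash_out, const void *vdata )
    {
        blake2b_8x64_ctx ctx;
        blake2b_8x64_init( &ctx );
        blake2b_8x64_update( &ctx, vdata, 80 );   // 80-byte interleaved headers
        blake2b_8x64_final( &ctx, hash_out );
    }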

View File

@@ -17,7 +17,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
blake2b_8x64_ctx ctx __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]); // 3*16+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -35,9 +35,9 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
blake2b_8way_init( &ctx );
blake2b_8way_update( &ctx, vdata, 80 );
blake2b_8way_final( &ctx, hash );
blake2b_8x64_init( &ctx );
blake2b_8x64_update( &ctx, vdata, 80 );
blake2b_8x64_final( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
@@ -61,10 +61,10 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
// Function not used, code inlined.
void blake2b_4way_hash(void *output, const void *input)
{
blake2b_4way_ctx ctx;
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, input, 80 );
blake2b_4way_final( &ctx, output );
blake2b_4x64_ctx ctx;
blake2b_4x64_init( &ctx );
blake2b_4x64_update( &ctx, input, 80 );
blake2b_4x64_final( &ctx, output );
}
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
@@ -73,7 +73,7 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
blake2b_4x64_ctx ctx __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -90,9 +90,9 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, vdata, 80 );
blake2b_4way_final( &ctx, hash );
blake2b_4x64_init( &ctx );
blake2b_4x64_update( &ctx, vdata, 80 );
blake2b_4x64_final( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )

View File

@@ -61,6 +61,11 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
const void *input, uint64_t inlen );
#define blake2s_4x32_state blake2s_4way_state
#define blake2s_4x32_init blake2s_4way_init
#define blake2s_4x32_update blake2s_4way_update
#define blake2s_4x32_final blake2s_4way_final
#define blake2s_4x32_full_blocks blake2s_4way_full_blocks
#if defined(__AVX2__)
@@ -81,6 +86,12 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
const void *input, uint64_t inlen );
#define blake2s_8x32_state blake2s_8way_state
#define blake2s_8x32_init blake2s_8way_init
#define blake2s_8x32_update blake2s_8way_update
#define blake2s_8x32_final blake2s_8way_final
#define blake2s_8x32_full_blocks blake2s_8way_full_blocks
#endif
#if defined(SIMD512)
@@ -100,6 +111,11 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
uint64_t inlen );
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
#define blake2s_16x32_state blake2s_16way_state
#define blake2s_16x32_init blake2s_16way_init
#define blake2s_16x32_update blake2s_16way_update
#define blake2s_16x32_final blake2s_16way_final
#endif
#if 0

View File

@@ -617,24 +617,22 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
VD = v512_64( CB5 ^ T0 ); \
VE = v512_64( CB6 ^ T1 ); \
VF = v512_64( CB7 ^ T1 ); \
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
M0 = mm512_bswap_64( *(buf+ 0) ); \
M1 = mm512_bswap_64( *(buf+ 1) ); \
M2 = mm512_bswap_64( *(buf+ 2) ); \
M3 = mm512_bswap_64( *(buf+ 3) ); \
M4 = mm512_bswap_64( *(buf+ 4) ); \
M5 = mm512_bswap_64( *(buf+ 5) ); \
M6 = mm512_bswap_64( *(buf+ 6) ); \
M7 = mm512_bswap_64( *(buf+ 7) ); \
M8 = mm512_bswap_64( *(buf+ 8) ); \
M9 = mm512_bswap_64( *(buf+ 9) ); \
MA = mm512_bswap_64( *(buf+10) ); \
MB = mm512_bswap_64( *(buf+11) ); \
MC = mm512_bswap_64( *(buf+12) ); \
MD = mm512_bswap_64( *(buf+13) ); \
ME = mm512_bswap_64( *(buf+14) ); \
MF = mm512_bswap_64( *(buf+15) ); \
ROUND_B_8WAY(0); \
ROUND_B_8WAY(1); \
ROUND_B_8WAY(2); \
@@ -661,7 +659,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
H7 = mm512_xor3( VF, V7, H7 ); \
}
void blake512_8way_compress( blake_8way_big_context *sc )
void blake512_8x64_compress( blake_8x64_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -685,25 +683,22 @@ void blake512_8way_compress( blake_8way_big_context *sc )
VE = v512_64( CB6 ^ sc->T1 );
VF = v512_64( CB7 ^ sc->T1 );
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 );
MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 );
MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 );
MD = _mm512_shuffle_epi8( sc->buf[13], shuf_bswap64 );
ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 );
MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 );
M0 = mm512_bswap_64( sc->buf[ 0] );
M1 = mm512_bswap_64( sc->buf[ 1] );
M2 = mm512_bswap_64( sc->buf[ 2] );
M3 = mm512_bswap_64( sc->buf[ 3] );
M4 = mm512_bswap_64( sc->buf[ 4] );
M5 = mm512_bswap_64( sc->buf[ 5] );
M6 = mm512_bswap_64( sc->buf[ 6] );
M7 = mm512_bswap_64( sc->buf[ 7] );
M8 = mm512_bswap_64( sc->buf[ 8] );
M9 = mm512_bswap_64( sc->buf[ 9] );
MA = mm512_bswap_64( sc->buf[10] );
MB = mm512_bswap_64( sc->buf[11] );
MC = mm512_bswap_64( sc->buf[12] );
MD = mm512_bswap_64( sc->buf[13] );
ME = mm512_bswap_64( sc->buf[14] );
MF = mm512_bswap_64( sc->buf[15] );
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
@@ -733,7 +728,7 @@ void blake512_8way_compress( blake_8way_big_context *sc )
}
// won't be used after prehash implemented
void blake512_8way_compress_le( blake_8x64_big_context *sc )
void blake512_8x64_compress_le( blake_8x64_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1177,7 +1172,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress( sc );
blake512_8x64_compress( sc );
sc->ptr = 0;
}
@@ -1213,7 +1208,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress( sc );
blake512_8x64_compress( sc );
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
@@ -1244,7 +1239,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
blake512_8x64_compress_le( sc );
sc->ptr = 0;
}
@@ -1280,7 +1275,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
blake512_8x64_compress_le( sc );
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
@@ -1355,24 +1350,22 @@ blake512_8x64_close(void *cc, void *dst)
VD = v256_64( CB5 ^ T0 ); \
VE = v256_64( CB6 ^ T1 ); \
VF = v256_64( CB7 ^ T1 ); \
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
M0 = mm256_bswap_64( *(buf+ 0) ); \
M1 = mm256_bswap_64( *(buf+ 1) ); \
M2 = mm256_bswap_64( *(buf+ 2) ); \
M3 = mm256_bswap_64( *(buf+ 3) ); \
M4 = mm256_bswap_64( *(buf+ 4) ); \
M5 = mm256_bswap_64( *(buf+ 5) ); \
M6 = mm256_bswap_64( *(buf+ 6) ); \
M7 = mm256_bswap_64( *(buf+ 7) ); \
M8 = mm256_bswap_64( *(buf+ 8) ); \
M9 = mm256_bswap_64( *(buf+ 9) ); \
MA = mm256_bswap_64( *(buf+10) ); \
MB = mm256_bswap_64( *(buf+11) ); \
MC = mm256_bswap_64( *(buf+12) ); \
MD = mm256_bswap_64( *(buf+13) ); \
ME = mm256_bswap_64( *(buf+14) ); \
MF = mm256_bswap_64( *(buf+15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
@@ -1400,7 +1393,7 @@ blake512_8x64_close(void *cc, void *dst)
}
void blake512_4way_compress( blake_4x64_big_context *sc )
void blake512_4x64_compress( blake_4x64_big_context *sc )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1423,25 +1416,23 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
VD = v256_64( CB5 ^ sc->T0 );
VE = v256_64( CB6 ^ sc->T1 );
VF = v256_64( CB7 ^ sc->T1 );
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 );
MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 );
MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 );
MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 );
ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 );
MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 );
M0 = mm256_bswap_64( sc->buf[ 0] );
M1 = mm256_bswap_64( sc->buf[ 1] );
M2 = mm256_bswap_64( sc->buf[ 2] );
M3 = mm256_bswap_64( sc->buf[ 3] );
M4 = mm256_bswap_64( sc->buf[ 4] );
M5 = mm256_bswap_64( sc->buf[ 5] );
M6 = mm256_bswap_64( sc->buf[ 6] );
M7 = mm256_bswap_64( sc->buf[ 7] );
M8 = mm256_bswap_64( sc->buf[ 8] );
M9 = mm256_bswap_64( sc->buf[ 9] );
MA = mm256_bswap_64( sc->buf[10] );
MB = mm256_bswap_64( sc->buf[11] );
MC = mm256_bswap_64( sc->buf[12] );
MD = mm256_bswap_64( sc->buf[13] );
ME = mm256_bswap_64( sc->buf[14] );
MF = mm256_bswap_64( sc->buf[15] );
ROUND_B_4WAY(0);
ROUND_B_4WAY(1);
@@ -1470,7 +1461,7 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
}
void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
void blake512_4x64_prehash_le( blake512_4x64_context *sc, __m256i *midstate,
const void *data )
{
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
@@ -1562,7 +1553,7 @@ void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
midstate[15] = VF;
}
void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
void blake512_4x64_final_le( blake512_4x64_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
@@ -1685,7 +1676,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
}
void blake512_4x64_init( blake_4x64_big_context *sc )
void blake512_4x64_init( blake512_4x64_context *sc )
{
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
@@ -1798,7 +1789,7 @@ blake64_4way_close( blake_4x64_big_context *sc, void *dst )
}
// init, update & close
void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
void blake512_4x64_full( blake512_4x64_context *sc, void * dst,
const void *data, size_t len )
{
@@ -1824,7 +1815,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_4way_compress( sc );
blake512_4x64_compress( sc );
sc->ptr = 0;
}
@@ -1859,7 +1850,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_4way_compress( sc );
blake512_4x64_compress( sc );
mm256_block_bswap_64( (__m256i*)dst, sc->H );
}
@@ -1934,29 +1925,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
VE = v128_64( CB6 ^ sc->T1 );
VF = v128_64( CB7 ^ sc->T1 );
#if defined(__SSSE3__)
const v128u64_t shuf_bswap64 = v128_set64(
0x08090a0b0c0d0e0f, 0x0001020304050607 );
M0 = v128_shuffle8( sc->buf[ 0], shuf_bswap64 );
M1 = v128_shuffle8( sc->buf[ 1], shuf_bswap64 );
M2 = v128_shuffle8( sc->buf[ 2], shuf_bswap64 );
M3 = v128_shuffle8( sc->buf[ 3], shuf_bswap64 );
M4 = v128_shuffle8( sc->buf[ 4], shuf_bswap64 );
M5 = v128_shuffle8( sc->buf[ 5], shuf_bswap64 );
M6 = v128_shuffle8( sc->buf[ 6], shuf_bswap64 );
M7 = v128_shuffle8( sc->buf[ 7], shuf_bswap64 );
M8 = v128_shuffle8( sc->buf[ 8], shuf_bswap64 );
M9 = v128_shuffle8( sc->buf[ 9], shuf_bswap64 );
MA = v128_shuffle8( sc->buf[10], shuf_bswap64 );
MB = v128_shuffle8( sc->buf[11], shuf_bswap64 );
MC = v128_shuffle8( sc->buf[12], shuf_bswap64 );
MD = v128_shuffle8( sc->buf[13], shuf_bswap64 );
ME = v128_shuffle8( sc->buf[14], shuf_bswap64 );
MF = v128_shuffle8( sc->buf[15], shuf_bswap64 );
#else // SSE2 & NEON
M0 = v128_bswap64( sc->buf[ 0] );
M1 = v128_bswap64( sc->buf[ 1] );
M2 = v128_bswap64( sc->buf[ 2] );
@@ -1974,8 +1942,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
ME = v128_bswap64( sc->buf[14] );
MF = v128_bswap64( sc->buf[15] );
#endif
ROUND_B_2X64(0);
ROUND_B_2X64(1);
ROUND_B_2X64(2);

View File

@@ -54,10 +54,10 @@ int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
blake256_16way_final_rounds_le( hash32, midstate_vars, block0_hash,
blake256_16x32_final_rounds_le( hash32, midstate_vars, block0_hash,
block_buf, rounds );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -123,10 +123,10 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
// Partially prehash second block without touching nonces in block_buf[3].
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
blake256_8x32_final_rounds_le( hash32, midstate_vars, block0_hash,
block_buf, rounds );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -148,16 +148,16 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
#elif defined (BLAKECOIN_4WAY)
blake256r8_4way_context blakecoin_4w_ctx;
blake256r8_4x32_context blakecoin_4w_ctx;
void blakecoin_4way_hash(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r8_4way_context ctx;
blake256r8_4x32_context ctx;
memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
blake256r8_4way_update( &ctx, input + (64<<2), 16 );
blake256r8_4way_close( &ctx, vhash );
blake256r8_4x32_update( &ctx, input + (64<<2), 16 );
blake256r8_4x32_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
@@ -178,8 +178,8 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
HTarget = 0x7f;
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r8_4way_init( &blakecoin_4w_ctx );
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
blake256r8_4x32_init( &blakecoin_4w_ctx );
blake256r8_4x32_update( &blakecoin_4w_ctx, vdata, 64 );
do {
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );

View File

@@ -16,28 +16,27 @@ extern void pentablakehash_4way( void *output, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
blake512_4way_context ctx;
blake512_4x64_context ctx;
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, input, 80 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, input, 80 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );

View File

@@ -227,7 +227,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
v[14] = S->f[0] ^ blake2s_IV[6];
v[15] = S->f[1] ^ blake2s_IV[7];
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
v128_t *V = (v128_t*)v;
@@ -263,19 +263,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
V[3] = v128_swap64( V[3] ); \
V[2] = v128_shufll32( V[2] )
BLAKE2S_ROUND(0);
BLAKE2S_ROUND(1);
BLAKE2S_ROUND(2);
BLAKE2S_ROUND(3);
BLAKE2S_ROUND(4);
BLAKE2S_ROUND(5);
BLAKE2S_ROUND(6);
BLAKE2S_ROUND(7);
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);
#undef BLAKE2S_ROUND
#else
#define G(r,i,a,b,c,d) \
@@ -290,7 +277,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
b = SPH_ROTR32(b ^ c, 7); \
} while(0)
#define ROUND(r) \
#define BLAKE2S_ROUND(r) \
do { \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
@@ -302,24 +289,25 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
} while(0)
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );
ROUND( 4 );
ROUND( 5 );
ROUND( 6 );
ROUND( 7 );
ROUND( 8 );
ROUND( 9 );
#endif
BLAKE2S_ROUND(0);
BLAKE2S_ROUND(1);
BLAKE2S_ROUND(2);
BLAKE2S_ROUND(3);
BLAKE2S_ROUND(4);
BLAKE2S_ROUND(5);
BLAKE2S_ROUND(6);
BLAKE2S_ROUND(7);
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);
for( size_t i = 0; i < 8; ++i )
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
#undef G
#undef ROUND
#undef BLAKE2S_ROUND
return 0;
}
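
The BLAKE2S_ROUND macro that both paths now share expands eight applications of the standard BLAKE2s G quarter-round. A scalar reference of G (function and variable names here are illustrative, not from the source):

#include <stdint.h>

static inline uint32_t rotr32( uint32_t x, unsigned r )
{   return ( x >> r ) | ( x << ( 32 - r ) );   }

/* one BLAKE2s G mixing step; mx and my are the two message words
   selected for this column/diagonal by the sigma schedule */
static inline void blake2s_g_ref( uint32_t *a, uint32_t *b, uint32_t *c,
                                  uint32_t *d, uint32_t mx, uint32_t my )
{
    *a += *b + mx;  *d = rotr32( *d ^ *a, 16 );
    *c += *d;       *b = rotr32( *b ^ *c, 12 );
    *a += *b + my;  *d = rotr32( *d ^ *a,  8 );
    *c += *d;       *b = rotr32( *b ^ *c,  7 );
}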

View File

@@ -39,16 +39,14 @@
#include <stddef.h>
#include "simd-utils.h"
#define SPH_SIZE_bmw256 256
#define SPH_SIZE_bmw512 512
// BMW-256 4 way 32
#if defined(__SSE2__) || defined(__ARM_NEON)
typedef struct
{
v128_t buf[64];
v128_t H[16];
v128u32_t buf[64];
v128u32_t H[16];
size_t ptr;
uint32_t bit_count; // assume bit_count fits in 32 bits
} bmw_4way_small_context;
@@ -58,13 +56,19 @@ typedef bmw_4way_small_context bmw256_4way_context;
void bmw256_4way_init( bmw256_4way_context *ctx );
void bmw256_4way_update(void *cc, const void *data, size_t len);
#define bmw256_4way bmw256_4way_update
void bmw256_4way_close(void *cc, void *dst);
void bmw256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#define bmw256_4x32_context bmw256_4way_context
#define bmw256_4x32_init bmw256_4way_init
#define bmw256_4x32_update bmw256_4way_update
#define bmw256_4x32_close bmw256_4way_close
#endif
#if defined(__AVX2__)
// BMW-256 8 way 32
@@ -85,6 +89,11 @@ void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
#define bmw256_8way bmw256_8way_update
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
#define bmw256_8x32_context bmw256_8way_context
#define bmw256_8x32_init bmw256_8way_init
#define bmw256_8x32_update bmw256_8way_update
#define bmw256_8x32_close bmw256_8way_close
#endif
#if defined(SIMD512)
@@ -106,6 +115,11 @@ void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
size_t len );
void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );
#define bmw256_16x32_context bmw256_16way_context
#define bmw256_16x32_init bmw256_16way_init
#define bmw256_16x32_update bmw256_16way_update
#define bmw256_16x32_close bmw256_16way_close
#endif
// BMW-512 2 way 64

View File

@@ -45,7 +45,7 @@ extern "C"{
#define LPAR (
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
// BMW-256 4 way 32
/*
@@ -284,9 +284,9 @@ static const uint32_t IV256[] = {
v128_xor( M[13], H[13] ) ) )
void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
void compress_small( const v128u32_t *M, const v128u32_t H[16], v128u32_t dH[16] )
{
v128u64_t qt[32], xl, xh; \
v128u32_t qt[32], xl, xh; \
qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
@@ -428,49 +428,25 @@ static const uint32_t final_s[16][4] =
{ 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae },
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
};
/*
static const v128u64_t final_s[16] =
{
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
{ 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 },
{ 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 },
{ 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 },
{ 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 },
{ 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 },
{ 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 },
{ 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 },
{ 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 },
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
{ 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab },
{ 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac },
{ 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad },
{ 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
{ 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
};
*/
void bmw256_4way_init( bmw256_4way_context *ctx )
{
ctx->H[ 0] = v128_64( 0x4041424340414243 );
ctx->H[ 1] = v128_64( 0x4445464744454647 );
ctx->H[ 2] = v128_64( 0x48494A4B48494A4B );
ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = v128_64( 0x5051525350515253 );
ctx->H[ 5] = v128_64( 0x5455565754555657 );
ctx->H[ 6] = v128_64( 0x58595A5B58595A5B );
ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = v128_64( 0x6061626360616263 );
ctx->H[ 9] = v128_64( 0x6465666764656667 );
ctx->H[10] = v128_64( 0x68696A6B68696A6B );
ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = v128_64( 0x7071727370717273 );
ctx->H[13] = v128_64( 0x7475767774757677 );
ctx->H[14] = v128_64( 0x78797A7B78797A7B );
ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F );
// for ( int i = 0; i < 16; i++ )
// sc->H[i] = v128_32( iv[i] );
ctx->H[ 0] = v128_32( 0x40414243 );
ctx->H[ 1] = v128_32( 0x44454647 );
ctx->H[ 2] = v128_32( 0x48494A4B );
ctx->H[ 3] = v128_32( 0x4C4D4E4F );
ctx->H[ 4] = v128_32( 0x50515253 );
ctx->H[ 5] = v128_32( 0x54555657 );
ctx->H[ 6] = v128_32( 0x58595A5B );
ctx->H[ 7] = v128_32( 0x5C5D5E5F );
ctx->H[ 8] = v128_32( 0x60616263 );
ctx->H[ 9] = v128_32( 0x64656667 );
ctx->H[10] = v128_32( 0x68696A6B );
ctx->H[11] = v128_32( 0x6C6D6E6F );
ctx->H[12] = v128_32( 0x70717273 );
ctx->H[13] = v128_32( 0x74757677 );
ctx->H[14] = v128_32( 0x78797A7B );
ctx->H[15] = v128_32( 0x7C7D7E7F );
ctx->ptr = 0;
ctx->bit_count = 0;
}
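
The new v128_32 constants are only a clearer spelling of the old ones: broadcasting the 32-bit IV word to four lanes produces exactly the same 128-bit pattern as duplicating it inside a 64-bit constant. A stand-alone check (illustrative only):

#include <stdint.h>
#include <string.h>
#include <assert.h>

int main(void)
{
    uint32_t lane32[4] = { 0x40414243, 0x40414243, 0x40414243, 0x40414243 };
    uint64_t lane64[2] = { 0x4041424340414243ULL, 0x4041424340414243ULL };
    assert( memcmp( lane32, lane64, sizeof lane32 ) == 0 );   /* identical 128-bit vector */
    return 0;
}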
@@ -478,10 +454,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
static void
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
{
v128u64_t *vdata = (v128u64_t*)data;
v128u64_t *buf;
v128u64_t htmp[16];
v128u64_t *h1, *h2;
v128u32_t *vdata = (v128u32_t*)data;
v128u32_t *buf;
v128u32_t htmp[16];
v128u32_t *h1, *h2;
size_t ptr;
const int buf_size = 64; // bytes of one lane, compatible with len
@@ -503,7 +479,7 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
ptr += clen;
if ( ptr == buf_size )
{
v128u64_t *ht;
v128u32_t *ht;
compress_small( buf, h1, h2 );
ht = h1;
h1 = h2;
@@ -521,14 +497,14 @@ static void
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
v128u64_t *buf;
v128u64_t h1[16], h2[16], *h;
v128u32_t *buf;
v128u32_t h1[16], h2[16], *h;
size_t ptr, u, v;
const int buf_size = 64; // bytes of one lane, compatible with len
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
buf[ ptr>>2 ] = v128_32( 0x00000080 );
ptr += 4;
h = sc->H;
@@ -548,7 +524,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_small( buf, (v128u64_t*)final_s, h1 );
compress_small( buf, (v128u32_t*)final_s, h1 );
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
casti_v128( dst, u ) = h1[v];

View File

@@ -39,7 +39,7 @@ static void transform( cubehashParam *sp )
#elif defined(__AVX2__)
register __m256i x0, x1, x2, x3, y0, y1;
register __m256i x0, x1, x2, x3, t0;
x0 = _mm256_load_si256( (__m256i*)sp->x );
x1 = _mm256_load_si256( (__m256i*)sp->x + 1 );
@@ -50,10 +50,10 @@ static void transform( cubehashParam *sp )
{
x2 = _mm256_add_epi32( x0, x2 );
x3 = _mm256_add_epi32( x1, x3 );
y0 = mm256_rol_32( x1, 7 );
y1 = mm256_rol_32( x0, 7 );
x0 = _mm256_xor_si256( y0, x2 );
x1 = _mm256_xor_si256( y1, x3 );
t0 = mm256_rol_32( x1, 7 );
x1 = mm256_rol_32( x0, 7 );
x0 = _mm256_xor_si256( t0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = mm256_swap128_64( x2 );
x3 = mm256_swap128_64( x3 );
x2 = _mm256_add_epi32( x0, x2 );
@@ -75,7 +75,7 @@ static void transform( cubehashParam *sp )
#else // AVX, SSE2, NEON
v128_t x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
v128_t x0, x1, x2, x3, x4, x5, x6, x7, t0, t1;
x0 = casti_v128( sp->x, 0 );
x1 = casti_v128( sp->x, 1 );
@@ -92,16 +92,12 @@ static void transform( cubehashParam *sp )
x5 = v128_add32( x1, x5 );
x6 = v128_add32( x2, x6 );
x7 = v128_add32( x3, x7 );
y0 = x2;
y1 = x3;
y2 = x0;
y3 = x1;
x0 = v128_rol32( y0, 7 );
x1 = v128_rol32( y1, 7 );
x2 = v128_rol32( y2, 7 );
x3 = v128_rol32( y3, 7 );
x0 = v128_xor( x0, x4 );
x1 = v128_xor( x1, x5 );
t0 = v128_rol32( x2, 7 );
t1 = v128_rol32( x3, 7 );
x2 = v128_rol32( x0, 7 );
x3 = v128_rol32( x1, 7 );
x0 = v128_xor( t0, x4 );
x1 = v128_xor( t1, x5 );
x2 = v128_xor( x2, x6 );
x3 = v128_xor( x3, x7 );
x4 = v128_swap64( x4 );
@@ -112,19 +108,15 @@ static void transform( cubehashParam *sp )
x5 = v128_add32( x1, x5 );
x6 = v128_add32( x2, x6 );
x7 = v128_add32( x3, x7 );
y0 = x1;
y1 = x0;
y2 = x3;
y3 = x2;
x0 = v128_rol32( y0, 11 );
x1 = v128_rol32( y1, 11 );
x2 = v128_rol32( y2, 11 );
x3 = v128_rol32( y3, 11 );
x0 = v128_xor( x0, x4 );
x1 = v128_xor( x1, x5 );
x2 = v128_xor( x2, x6 );
x3 = v128_xor( x3, x7 );
x4 = v128_swap64_32( x4 );
t0 = v128_rol32( x1, 11 );
x1 = v128_rol32( x0, 11 );
t1 = v128_rol32( x3, 11 );
x3 = v128_rol32( x2, 11 );
x0 = v128_xor( t0, x4 );
x1 = v128_xor( x1, x5 );
x2 = v128_xor( t1, x6 );
x3 = v128_xor( x3, x7 );
x4 = v128_swap64_32( x4 );
x5 = v128_swap64_32( x5 );
x6 = v128_swap64_32( x6 );
x7 = v128_swap64_32( x7 );
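
The rewrite above replaces the four named copies y0..y3 with two temporaries: when a pair of lanes is exchanged while both are rotated, one scratch value per pair is enough. A scalar sketch of that pattern (rotl32 and rotate_swap_pair are illustrative helpers, not miner code):

#include <stdint.h>

static inline uint32_t rotl32( uint32_t x, unsigned r )
{   return ( x << r ) | ( x >> ( 32 - r ) );   }

/* exchange *x0 and *x1 while rotating both, using a single temporary */
static void rotate_swap_pair( uint32_t *x0, uint32_t *x1, unsigned r )
{
    uint32_t t = rotl32( *x1, r );   /* rotated old x1              */
    *x1 = rotl32( *x0, r );          /* x1 now holds rotated old x0 */
    *x0 = t;                         /* x0 now holds rotated old x1 */
}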

View File

@@ -17,7 +17,7 @@ typedef struct {
#else
hashState_groestl groestl;
#endif
sha256_8way_context sha;
sha256_8x32_context sha;
} myrgr_8way_ctx_holder;
myrgr_8way_ctx_holder myrgr_8way_ctx;
@@ -29,7 +29,7 @@ void init_myrgr_8way_ctx()
#else
init_groestl( &myrgr_8way_ctx.groestl, 64 );
#endif
sha256_8way_init( &myrgr_8way_ctx.sha );
sha256_8x32_init( &myrgr_8way_ctx.sha );
}
void myriad_8way_hash( void *output, const void *input )
@@ -96,8 +96,8 @@ void myriad_8way_hash( void *output, const void *input )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
sha256_8way_update( &ctx.sha, vhash, 64 );
sha256_8way_close( &ctx.sha, output );
sha256_8x32_update( &ctx.sha, vhash, 64 );
sha256_8x32_close( &ctx.sha, output );
}
int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
@@ -156,7 +156,7 @@ int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
typedef struct {
hashState_groestl groestl;
sha256_4way_context sha;
sha256_4x32_context sha;
} myrgr_4way_ctx_holder;
myrgr_4way_ctx_holder myrgr_4way_ctx;
@@ -164,7 +164,7 @@ myrgr_4way_ctx_holder myrgr_4way_ctx;
void init_myrgr_4way_ctx()
{
init_groestl (&myrgr_4way_ctx.groestl, 64 );
sha256_4way_init( &myrgr_4way_ctx.sha );
sha256_4x32_init( &myrgr_4way_ctx.sha );
}
void myriad_4way_hash( void *output, const void *input )
@@ -189,8 +189,8 @@ void myriad_4way_hash( void *output, const void *input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way_update( &ctx.sha, vhash, 64 );
sha256_4way_close( &ctx.sha, output );
sha256_4x32_update( &ctx.sha, vhash, 64 );
sha256_4x32_close( &ctx.sha, output );
}
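
The intrlv_*/dintrlv_* calls above repack per-lane hashes into the word-interleaved layout each vectored stage expects. A minimal two-lane model of that layout (illustrative helpers, not the real SIMD-width-aware routines):

#include <stdint.h>
#include <assert.h>

static void intrlv_2( uint32_t *v, const uint32_t *a, const uint32_t *b, int words )
{
    for ( int i = 0; i < words; i++ ) { v[2*i] = a[i]; v[2*i+1] = b[i]; }
}

static void dintrlv_2( uint32_t *a, uint32_t *b, const uint32_t *v, int words )
{
    for ( int i = 0; i < words; i++ ) { a[i] = v[2*i]; b[i] = v[2*i+1]; }
}

int main(void)
{
    uint32_t a[4] = { 1, 2, 3, 4 }, b[4] = { 5, 6, 7, 8 };
    uint32_t v[8], x[4], y[4];
    intrlv_2( v, a, b, 4 );      /* lanes side by side -> interleaved */
    dintrlv_2( x, y, v, 4 );     /* and back                          */
    assert( x[0] == 1 && y[3] == 8 );
    return 0;
}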
int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,

View File

@@ -1059,7 +1059,7 @@ void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
WRITE_STATE_BIG8( sc );
}
void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
void hamsi_8way_big_final( hamsi512_8x64_context *sc, __m512i *buf )
{
__m512i m0, m1, m2, m3, m4, m5, m6, m7;
@@ -1071,7 +1071,7 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
WRITE_STATE_BIG8( sc );
}
void hamsi512_8way_init( hamsi_8way_big_context *sc )
void hamsi512_8x64_init( hamsi512_8x64_context *sc )
{
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
@@ -1087,7 +1087,7 @@ void hamsi512_8way_init( hamsi_8way_big_context *sc )
sc->h[7] = v512_64( iv[7] );
}
void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
@@ -1099,7 +1099,7 @@ void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
sc->partial_len = len;
}
void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst )
{
__m512i pad[1];
uint32_t ch, cl;
@@ -1944,7 +1944,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void * dst,
////////////
void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
void hamsi_big( hamsi512_4x64_context *sc, __m256i *buf, size_t num )
{
DECL_STATE_BIG
uint32_t tmp;
@@ -1968,7 +1968,7 @@ void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
WRITE_STATE_BIG( sc );
}
void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
void hamsi_big_final( hamsi512_4x64_context *sc, __m256i *buf )
{
__m256i m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_BIG
@@ -1979,7 +1979,7 @@ void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
WRITE_STATE_BIG( sc );
}
void hamsi512_4way_init( hamsi_4way_big_context *sc )
void hamsi512_4x64_init( hamsi512_4x64_context *sc )
{
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
@@ -1994,7 +1994,7 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
sc->h[7] = v256_64( iv[7] );
}
void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
@@ -2006,7 +2006,7 @@ void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
sc->partial_len = len;
}
void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst )
{
__m256i pad[1];
uint32_t ch, cl;

View File

@@ -72,17 +72,17 @@ typedef struct
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_4way_big_context;
typedef hamsi_4way_big_context hamsi512_4way_context;
typedef hamsi_4way_big_context hamsi512_4x64_context;
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
void hamsi512_4x64_init( hamsi512_4x64_context *sc );
void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data,
size_t len );
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst );
#define hamsi512_4x64_context hamsi512_4way_context
#define hamsi512_4x64_init hamsi512_4way_init
#define hamsi512_4x64_update hamsi512_4way_update
#define hamsi512_4x64_close hamsi512_4way_close
#define hamsi512_4way_context hamsi512_4x64_context
#define hamsi512_4way_init hamsi512_4x64_init
#define hamsi512_4way_update hamsi512_4x64_update
#define hamsi512_4way_close hamsi512_4x64_close
// Hamsi-512 8x32
@@ -115,17 +115,17 @@ typedef struct
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_8way_big_context;
typedef hamsi_8way_big_context hamsi512_8way_context;
typedef hamsi_8way_big_context hamsi512_8x64_context;
void hamsi512_8way_init( hamsi512_8way_context *sc );
void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
void hamsi512_8x64_init( hamsi512_8x64_context *sc );
void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data,
size_t len );
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst );
#define hamsi512_8x64_context hamsi512_8way_context
#define hamsi512_8x64_init hamsi512_8way_init
#define hamsi512_8x64_update hamsi512_8way_update
#define hamsi512_8x64_close hamsi512_8way_close
#define hamsi512_8way_context hamsi512_8x64_context
#define hamsi512_8way_init hamsi512_8x64_init
#define hamsi512_8way_update hamsi512_8x64_update
#define hamsi512_8way_close hamsi512_8x64_close
// Hamsi-512 16x32
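
The header change above flips which spelling is canonical: the lane-width names (hamsi512_4x64_*, hamsi512_8x64_*) now carry the declarations and the old *_4way/*_8way names survive only as aliases, so existing call sites keep compiling. A toy illustration of the pattern (names hypothetical):

typedef struct { int dummy; } hash512_4x64_context;      /* canonical type  */
#define hash512_4way_context  hash512_4x64_context       /* legacy alias    */

void hash512_4x64_init( hash512_4x64_context *sc );      /* canonical API   */
#define hash512_4way_init  hash512_4x64_init             /* legacy alias    */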

View File

@@ -82,12 +82,15 @@ typedef struct {
typedef haval_4way_context haval256_5_4way_context;
void haval256_5_4way_init( void *cc );
void haval256_5_4way_update( void *cc, const void *data, size_t len );
//#define haval256_5_4way haval256_5_4way_update
void haval256_5_4way_close( void *cc, void *dst );
#define haval256_4x32_context haval256_5_4way_context
#define haval256_4x32_init haval256_5_4way_init
#define haval256_4x32_update haval256_5_4way_update
#define haval256_4x32_close haval256_5_4way_close
#if defined(__AVX2__)
typedef struct {
@@ -100,11 +103,14 @@ typedef struct {
typedef haval_8way_context haval256_5_8way_context;
void haval256_5_8way_init( void *cc );
void haval256_5_8way_update( void *cc, const void *data, size_t len );
void haval256_5_8way_close( void *cc, void *dst );
#define haval256_8x32_context haval256_5_8way_context
#define haval256_8x32_init haval256_5_8way_init
#define haval256_8x32_update haval256_5_8way_update
#define haval256_8x32_close haval256_5_8way_close
#endif // AVX2
#if defined(SIMD512)
@@ -119,11 +125,14 @@ typedef struct {
typedef haval_16way_context haval256_5_16way_context;
void haval256_5_16way_init( void *cc );
void haval256_5_16way_update( void *cc, const void *data, size_t len );
void haval256_5_16way_close( void *cc, void *dst );
#define haval256_16x32_context haval256_5_16way_context
#define haval256_16x32_init haval256_5_16way_init
#define haval256_16x32_update haval256_5_16way_update
#define haval256_16x32_close haval256_5_16way_close
#endif // AVX512
#ifdef __cplusplus

View File

@@ -190,7 +190,7 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
memcpy_512( dst, kc->w, m512_len );
}
void keccak256_8way_init( void *kc )
void keccak256_8x64_init( void *kc )
{
keccak64_8way_init( kc, 256 );
}

View File

@@ -9,7 +9,7 @@
void sha3d_hash_8way(void *state, const void *input)
{
uint32_t buffer[16*8] __attribute__ ((aligned (128)));
keccak256_8way_context ctx;
keccak256_8x64_context ctx;
keccak256_8x64_init( &ctx );
keccak256_8x64_update( &ctx, input, 80 );
@@ -69,7 +69,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
void sha3d_hash_4way(void *state, const void *input)
{
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
keccak256_4way_context ctx;
keccak256_4x64_context ctx;
keccak256_4x64_init( &ctx );
keccak256_4x64_update( &ctx, input, 80 );

View File

@@ -273,8 +273,6 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
uint32_t hash[8*4] __attribute((aligned(128)));
__m512i* chainv = state->chainv;
__m512i t[2];
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
/*---- blank round with m=0 ----*/
rnd512_4way( state, NULL );
@@ -289,10 +287,8 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
_mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
casti_m512i( b,0 ) = _mm512_shuffle_epi8(
casti_m512i( hash,0 ), shuff_bswap32 );
casti_m512i( b,1 ) = _mm512_shuffle_epi8(
casti_m512i( hash,1 ), shuff_bswap32 );
casti_m512i( b,0 ) = mm512_bswap_32( casti_m512i( hash,0 ) );
casti_m512i( b,1 ) = mm512_bswap_32( casti_m512i( hash,1 ) );
rnd512_4way( state, NULL );
@@ -306,10 +302,8 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
_mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
casti_m512i( b,2 ) = _mm512_shuffle_epi8(
casti_m512i( hash,0 ), shuff_bswap32 );
casti_m512i( b,3 ) = _mm512_shuffle_epi8(
casti_m512i( hash,1 ), shuff_bswap32 );
casti_m512i( b,2 ) = mm512_bswap_32( casti_m512i( hash,0 ) );
casti_m512i( b,3 ) = mm512_bswap_32( casti_m512i( hash,1 ) );
}
int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
@@ -349,16 +343,14 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
__m512i msg[2];
int i;
int blocks = (int)len >> 5;
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
state->rembytes = (int)len & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bswap_32( vdata[ 1 ] );
rnd512_4way( state, msg );
}
@@ -367,7 +359,7 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
if ( state->rembytes )
{
// remaining data bytes
buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
buffer[0] = mm512_bswap_32( vdata[0] );
buffer[1] = mm512_bcast128lo_64( 0x0000000080000000 );
}
return 0;
@@ -434,16 +426,14 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
__m512i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bswap_32( vdata[ 1 ] );
rnd512_4way( state, msg );
}
@@ -451,7 +441,7 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
rnd512_4way( state, msg );
}
@@ -479,16 +469,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
__m512i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bswap_32( vdata[ 1 ] );
rnd512_4way( state, msg );
}
@@ -496,7 +484,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
rnd512_4way( state, msg );
}
@@ -775,8 +763,6 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
uint32 hash[8*2] __attribute((aligned(64)));
__m256i* chainv = state->chainv;
__m256i t0, t1;
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
/*---- blank round with m=0 ----*/
rnd512_2way( state, NULL );
@@ -791,10 +777,8 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
_mm256_store_si256( (__m256i*)&hash[0], t0 );
_mm256_store_si256( (__m256i*)&hash[8], t1 );
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 1 ), shuff_bswap32 );
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
rnd512_2way( state, NULL );
@@ -809,10 +793,8 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
_mm256_store_si256( (__m256i*)&hash[0], t0 );
_mm256_store_si256( (__m256i*)&hash[8], t1 );
casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
casti_m256i( b, 3 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 1 ), shuff_bswap32 );
casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
}
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
@@ -847,15 +829,13 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
__m256i msg[2];
int i;
int blocks = (int)len >> 5;
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
state-> rembytes = (int)len & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}
@@ -864,7 +844,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
if ( state->rembytes )
{
// remaining data bytes
buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
buffer[0] = mm256_bswap_32( vdata[0] );
buffer[1] = mm256_bcast128lo_64( 0x0000000080000000 );
}
return 0;
@@ -916,16 +896,14 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}
@@ -933,7 +911,7 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
rnd512_2way( state, msg );
}
@@ -961,16 +939,14 @@ int luffa_2way_update_close( luffa_2way_context *state,
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}
@@ -978,7 +954,7 @@ int luffa_2way_update_close( luffa_2way_context *state,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
rnd512_2way( state, msg );
}
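
The shuffle constant deleted throughout this file selects bytes 3,2,1,0 of every 32-bit lane, i.e. a per-lane byte swap; the mm256_bswap_32/mm512_bswap_32 helpers that replace it do the same work under a clearer name. A minimal stand-alone check of the old mask's effect (SSSE3 required, illustrative only):

#include <stdint.h>
#include <assert.h>
#include <immintrin.h>      /* build with -mssse3 or higher */

int main(void)
{
    const __m128i mask = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
    const __m128i v    = _mm_set_epi32( 0x44434241, 0x34333231, 0x24232221, 0x14131211 );
    uint32_t out[4];
    _mm_storeu_si128( (__m128i*)out, _mm_shuffle_epi8( v, mask ) );
    assert( out[0] == 0x11121314 );   /* every 32-bit lane is byte swapped */
    assert( out[3] == 0x41424344 );
    return 0;
}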

View File

@@ -26,9 +26,9 @@
#if defined (ALLIUM_16WAY)
typedef union {
keccak256_8way_context keccak;
keccak256_8x64_context keccak;
cube_4way_2buf_context cube;
skein256_8way_context skein;
skein256_8x64_context skein;
#if defined(__VAES__)
groestl256_4way_context groestl;
#else
@@ -60,7 +60,7 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
uint32_t hash15[8] __attribute__ ((aligned (32)));
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -70,12 +70,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhashA);
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhashB, 32 );
keccak256_8way_close( &ctx.keccak, vhashB);
keccak256_8x64_init( &ctx.keccak );
keccak256_8x64_update( &ctx.keccak, vhashA, 32 );
keccak256_8x64_close( &ctx.keccak, vhashA);
keccak256_8x64_init( &ctx.keccak );
keccak256_8x64_update( &ctx.keccak, vhashB, 32 );
keccak256_8x64_close( &ctx.keccak, vhashB);
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
@@ -153,12 +153,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhashA, 32 );
skein256_8way_close( &ctx.skein, vhashA );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhashB, 32 );
skein256_8way_close( &ctx.skein, vhashB );
skein256_8x64_init( &ctx.skein );
skein256_8x64_update( &ctx.skein, vhashA, 32 );
skein256_8x64_close( &ctx.skein, vhashA );
skein256_8x64_init( &ctx.skein );
skein256_8x64_update( &ctx.skein, vhashB, 32 );
skein256_8x64_close( &ctx.skein, vhashB );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
@@ -251,7 +251,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -273,9 +273,9 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
#elif defined (ALLIUM_8WAY)
typedef union {
keccak256_4way_context keccak;
keccak256_4x64_context keccak;
cube_2way_context cube;
skein256_4way_context skein;
skein256_4x64_context skein;
#if defined(__VAES__)
groestl256_2way_context groestl;
#else
@@ -298,19 +298,19 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
uint64_t *hash7 = (uint64_t*)hash+28;
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
blake256_8x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhashA, 32 );
keccak256_4way_close( &ctx.keccak, vhashA );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhashB, 32 );
keccak256_4way_close( &ctx.keccak, vhashB );
keccak256_4x64_init( &ctx.keccak );
keccak256_4x64_update( &ctx.keccak, vhashA, 32 );
keccak256_4x64_close( &ctx.keccak, vhashA );
keccak256_4x64_init( &ctx.keccak );
keccak256_4x64_update( &ctx.keccak, vhashB, 32 );
keccak256_4x64_close( &ctx.keccak, vhashB );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
@@ -350,12 +350,12 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhashA, 32 );
skein256_4way_close( &ctx.skein, vhashA );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhashB, 32 );
skein256_4way_close( &ctx.skein, vhashB );
skein256_4x64_init( &ctx.skein );
skein256_4x64_update( &ctx.skein, vhashA, 32 );
skein256_4x64_close( &ctx.skein, vhashA );
skein256_4x64_init( &ctx.skein );
skein256_4x64_update( &ctx.skein, vhashB, 32 );
skein256_4x64_close( &ctx.skein, vhashB );
#if defined(__VAES__)
@@ -433,7 +433,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
n+ 3, n+ 2, n+ 1, n );
// Partialy prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -483,7 +483,7 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
uint64_t *hash3 = (uint64_t*)hash+12;
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
blake256_4way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
blake256_4x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhashA, 256 );
intrlv_2x64( vhashA, hash0, hash1, 256 );
@@ -588,7 +588,7 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
block_buf[15] = v128_32( 640 );
// Partialy prehash second block without touching nonces
blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_4way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -616,7 +616,6 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
//
// 1 way
typedef struct
{
blake256_context blake;

View File

@@ -14,12 +14,12 @@ bool lyra2h_4way_thread_init()
return ( lyra2h_4way_matrix = mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
}
static __thread blake256_4way_context l2h_4way_blake_mid;
static __thread blake256_4x32_context l2h_4way_blake_mid;
void lyra2h_4way_midstate( const void* input )
{
blake256_4way_init( &l2h_4way_blake_mid );
blake256_4way_update( &l2h_4way_blake_mid, input, 64 );
blake256_4x32_init( &l2h_4way_blake_mid );
blake256_4x32_update( &l2h_4way_blake_mid, input, 64 );
}
void lyra2h_4way_hash( void *state, const void *input )
@@ -29,11 +29,11 @@ void lyra2h_4way_hash( void *state, const void *input )
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
blake256_4x32_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
blake256_4x32_update( &ctx_blake, input + (64*4), 16 );
blake256_4x32_close( &ctx_blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

View File

@@ -7,25 +7,24 @@
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#if defined (LYRA2REV2_16WAY)
typedef struct {
blake256_16way_context blake;
keccak256_8way_context keccak;
blake256_16x32_context blake;
keccak256_8x64_context keccak;
cubehashParam cube;
skein256_8way_context skein;
bmw256_16way_context bmw;
skein256_8x64_context skein;
bmw256_16x32_context bmw;
} lyra2v2_16way_ctx_holder __attribute__ ((aligned (64)));
static lyra2v2_16way_ctx_holder l2v2_16way_ctx;
bool init_lyra2rev2_16way_ctx()
{
keccak256_8way_init( &l2v2_16way_ctx.keccak );
keccak256_8x64_init( &l2v2_16way_ctx.keccak );
cubehashInit( &l2v2_16way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &l2v2_16way_ctx.skein );
bmw256_16way_init( &l2v2_16way_ctx.bmw );
skein256_8x64_init( &l2v2_16way_ctx.skein );
bmw256_16x32_init( &l2v2_16way_ctx.bmw );
return true;
}
@@ -51,8 +50,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
lyra2v2_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_16way_ctx, sizeof(l2v2_16way_ctx) );
blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
blake256_16way_close( &ctx.blake, vhash );
blake256_16x32_update( &ctx.blake, input + (64<<4), 16 );
blake256_16x32_close( &ctx.blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7,
@@ -62,17 +61,17 @@ void lyra2rev2_16way_hash( void *state, const void *input )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
keccak256_8way_update( &ctx.keccak, vhash, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
keccak256_8x64_update( &ctx.keccak, vhash, 32 );
keccak256_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, 256 );
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhash, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
keccak256_8x64_init( &ctx.keccak );
keccak256_8x64_update( &ctx.keccak, vhash, 32 );
keccak256_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, vhash, 256 );
@@ -122,21 +121,20 @@ void lyra2rev2_16way_hash( void *state, const void *input )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
skein256_8x64_update( &ctx.skein, vhash, 32 );
skein256_8x64_close( &ctx.skein, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash8, hash9, hash10, hash11, hash12,
hash13, hash14, hash15, 256 );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
skein256_8x64_init( &ctx.skein );
skein256_8x64_update( &ctx.skein, vhash, 32 );
skein256_8x64_close( &ctx.skein, vhash );
dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
@@ -160,8 +158,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, 256 );
bmw256_16way_update( &ctx.bmw, vhash, 32 );
bmw256_16way_close( &ctx.bmw, state );
bmw256_16x32_update( &ctx.bmw, vhash, 32 );
bmw256_16x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
@@ -186,8 +184,8 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
blake256_16way_init( &l2v2_16way_ctx.blake );
blake256_16way_update( &l2v2_16way_ctx.blake, vdata, 64 );
blake256_16x32_init( &l2v2_16way_ctx.blake );
blake256_16x32_update( &l2v2_16way_ctx.blake, vdata, 64 );
do
{
@@ -214,21 +212,21 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
#elif defined (LYRA2REV2_8WAY)
typedef struct {
blake256_8way_context blake;
keccak256_4way_context keccak;
blake256_8x32_context blake;
keccak256_4x64_context keccak;
cubehashParam cube;
skein256_4way_context skein;
bmw256_8way_context bmw;
skein256_4x64_context skein;
bmw256_8x32_context bmw;
} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));
static lyra2v2_8way_ctx_holder l2v2_8way_ctx;
bool init_lyra2rev2_8way_ctx()
{
keccak256_4way_init( &l2v2_8way_ctx.keccak );
keccak256_4x64_init( &l2v2_8way_ctx.keccak );
cubehashInit( &l2v2_8way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_8way_ctx.skein );
bmw256_8way_init( &l2v2_8way_ctx.bmw );
skein256_4x64_init( &l2v2_8way_ctx.skein );
bmw256_8x32_init( &l2v2_8way_ctx.bmw );
return true;
}
@@ -246,20 +244,20 @@ void lyra2rev2_8way_hash( void *state, const void *input )
lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhash );
blake256_8x32_update( &ctx.blake, input + (64<<3), 16 );
blake256_8x32_close( &ctx.blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
keccak256_4way_update( &ctx.keccak, vhash, 32 );
keccak256_4way_close( &ctx.keccak, vhash );
keccak256_4x64_update( &ctx.keccak, vhash, 32 );
keccak256_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhash, 32 );
keccak256_4way_close( &ctx.keccak, vhash );
keccak256_4x64_init( &ctx.keccak );
keccak256_4x64_update( &ctx.keccak, vhash, 32 );
keccak256_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
@@ -282,13 +280,13 @@ void lyra2rev2_8way_hash( void *state, const void *input )
LYRA2REV2( l2v2_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
skein256_4way_update( &ctx.skein, vhash, 32 );
skein256_4way_close( &ctx.skein, vhash );
skein256_4x64_update( &ctx.skein, vhash, 32 );
skein256_4x64_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhash, 32 );
skein256_4way_close( &ctx.skein, vhash );
skein256_4x64_init( &ctx.skein );
skein256_4x64_update( &ctx.skein, vhash, 32 );
skein256_4x64_close( &ctx.skein, vhash );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
@@ -303,8 +301,8 @@ void lyra2rev2_8way_hash( void *state, const void *input )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
bmw256_8x32_update( &ctx.bmw, vhash, 32 );
bmw256_8x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
@@ -328,8 +326,8 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
blake256_8way_init( &l2v2_8way_ctx.blake );
blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
blake256_8x32_init( &l2v2_8way_ctx.blake );
blake256_8x32_update( &l2v2_8way_ctx.blake, vdata, 64 );
do
{
@@ -356,21 +354,21 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
#elif defined (LYRA2REV2_4WAY)
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
blake256_4x32_context blake;
keccak256_4x64_context keccak;
cubehashParam cube;
skein256_4way_context skein;
bmw256_4way_context bmw;
skein256_4x64_context skein;
bmw256_4x32_context bmw;
} lyra2v2_4way_ctx_holder;
static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
bool init_lyra2rev2_4way_ctx()
{
keccak256_4way_init( &l2v2_4way_ctx.keccak );
keccak256_4x64_init( &l2v2_4way_ctx.keccak );
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_4way_ctx.skein );
bmw256_4way_init( &l2v2_4way_ctx.bmw );
skein256_4x64_init( &l2v2_4way_ctx.skein );
bmw256_4x32_init( &l2v2_4way_ctx.bmw );
return true;
}
@@ -385,13 +383,13 @@ void lyra2rev2_4way_hash( void *state, const void *input )
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash );
blake256_4x32_update( &ctx.blake, input + (64<<2), 16 );
blake256_4x32_close( &ctx.blake, vhash );
rintrlv_4x32_4x64( vhash64, vhash, 256 );
keccak256_4way_update( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
keccak256_4x64_update( &ctx.keccak, vhash64, 32 );
keccak256_4x64_close( &ctx.keccak, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -410,8 +408,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )
intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way_update( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
skein256_4x64_update( &ctx.skein, vhash64, 32 );
skein256_4x64_close( &ctx.skein, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -426,8 +424,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way_update( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
bmw256_4x32_update( &ctx.bmw, vhash, 32 );
bmw256_4x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
@@ -451,8 +449,8 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 );
blake256_4x32_init( &l2v2_4way_ctx.blake );
blake256_4x32_update( &l2v2_4way_ctx.blake, vdata, 64 );
do
{

View File

@@ -9,18 +9,18 @@
#if defined (LYRA2REV3_16WAY)
typedef struct {
blake256_16way_context blake;
blake256_16x32_context blake;
cube_4way_context cube;
bmw256_16way_context bmw;
bmw256_16x32_context bmw;
} lyra2v3_16way_ctx_holder;
static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;
bool init_lyra2rev3_16way_ctx()
{
blake256_16way_init( &l2v3_16way_ctx.blake );
blake256_16x32_init( &l2v3_16way_ctx.blake );
cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
bmw256_16way_init( &l2v3_16way_ctx.bmw );
bmw256_16x32_init( &l2v3_16way_ctx.bmw );
return true;
}
@@ -46,8 +46,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );
blake256_16way_update( &ctx.blake, input + (64*16), 16 );
blake256_16way_close( &ctx.blake, vhash );
blake256_16x32_update( &ctx.blake, input + (64*16), 16 );
blake256_16x32_close( &ctx.blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -120,8 +120,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
bmw256_16way_update( &ctx.bmw, vhash, 32 );
bmw256_16way_close( &ctx.bmw, state );
bmw256_16x32_update( &ctx.bmw, vhash, 32 );
bmw256_16x32_close( &ctx.bmw, state );
}
@@ -145,8 +145,8 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake256_16way_init( &l2v3_16way_ctx.blake );
blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
blake256_16x32_init( &l2v3_16way_ctx.blake );
blake256_16x32_update( &l2v3_16way_ctx.blake, vdata, 64 );
do
{
@@ -178,18 +178,18 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
#elif defined (LYRA2REV3_8WAY)
typedef struct {
blake256_8way_context blake;
blake256_8x32_context blake;
cubehashParam cube;
bmw256_8way_context bmw;
bmw256_8x32_context bmw;
} lyra2v3_8way_ctx_holder;
static __thread lyra2v3_8way_ctx_holder l2v3_8way_ctx;
bool init_lyra2rev3_8way_ctx()
{
blake256_8way_init( &l2v3_8way_ctx.blake );
blake256_8x32_init( &l2v3_8way_ctx.blake );
cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 );
bmw256_8way_init( &l2v3_8way_ctx.bmw );
bmw256_8x32_init( &l2v3_8way_ctx.bmw );
return true;
}
@@ -207,8 +207,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64*8), 16 );
blake256_8way_close( &ctx.blake, vhash );
blake256_8x32_update( &ctx.blake, input + (64*8), 16 );
blake256_8x32_close( &ctx.blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
@@ -243,8 +243,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
bmw256_8x32_update( &ctx.bmw, vhash, 32 );
bmw256_8x32_close( &ctx.bmw, state );
}
@@ -269,8 +269,8 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
blake256_8way_init( &l2v3_8way_ctx.blake );
blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );
blake256_8x32_init( &l2v3_8way_ctx.blake );
blake256_8x32_update( &l2v3_8way_ctx.blake, vdata, 64 );
do
{
@@ -300,19 +300,18 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
#if defined (LYRA2REV3_4WAY)
typedef struct {
blake256_4way_context blake;
blake256_4x32_context blake;
cubehashParam cube;
bmw256_4way_context bmw;
bmw256_4x32_context bmw;
} lyra2v3_4way_ctx_holder;
//static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
static __thread lyra2v3_4way_ctx_holder l2v3_4way_ctx;
bool init_lyra2rev3_4way_ctx()
{
blake256_4way_init( &l2v3_4way_ctx.blake );
blake256_4x32_init( &l2v3_4way_ctx.blake );
cubehashInit( &l2v3_4way_ctx.cube, 256, 16, 32 );
bmw256_4way_init( &l2v3_4way_ctx.bmw );
bmw256_4x32_init( &l2v3_4way_ctx.bmw );
return true;
}
@@ -326,8 +325,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );
blake256_4way_update( &ctx.blake, input + (64*4), 16 );
blake256_4way_close( &ctx.blake, vhash );
blake256_4x32_update( &ctx.blake, input + (64*4), 16 );
blake256_4x32_close( &ctx.blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -349,8 +348,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way_update( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
bmw256_4x32_update( &ctx.bmw, vhash, 32 );
bmw256_4x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
@@ -374,8 +373,8 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
*noncev = _mm_set_epi32( n+3, n+2, n+1, n );
blake256_4way_init( &l2v3_4way_ctx.blake );
blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 );
blake256_4x32_init( &l2v3_4way_ctx.blake );
blake256_4x32_update( &l2v3_4way_ctx.blake, vdata, 64 );
do
{

View File

@@ -45,7 +45,7 @@ static void lyra2z_16way_hash( void *state, const void *midstate_vars,
uint32_t hash14[8] __attribute__ ((aligned (32)));
uint32_t hash15[8] __attribute__ ((aligned (32)));
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -139,7 +139,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -180,7 +180,7 @@ static void lyra2z_8way_hash( void *state, const void *midstate_vars,
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
blake256_8x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
@@ -246,7 +246,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partialy prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -279,12 +279,12 @@ bool lyra2z_4way_thread_init()
return ( lyra2z_4way_matrix = mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_4way_context l2z_4way_blake_mid;
static __thread blake256_4x32_context l2z_4way_blake_mid;
void lyra2z_4way_midstate( const void* input )
{
blake256_4way_init( &l2z_4way_blake_mid );
blake256_4way_update( &l2z_4way_blake_mid, input, 64 );
blake256_4x32_init( &l2z_4way_blake_mid );
blake256_4x32_update( &l2z_4way_blake_mid, input, 64 );
}
void lyra2z_4way_hash( void *hash, const void *midstate_vars,
@@ -295,15 +295,8 @@ void lyra2z_4way_hash( void *hash, const void *midstate_vars,
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
// blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
blake256_4way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
/*
memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
*/
blake256_4x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
@@ -357,7 +350,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
block_buf[15] = v128_32( 640 );
// Partialy prehash second block without touching nonces
blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_4way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -454,11 +447,9 @@ bool register_lyra2z_algo( algo_gate_t* gate )
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
// gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
// gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;

View File

@@ -45,7 +45,7 @@ static const uint64_t blake2b_IV[8] =
#if defined(SIMD512)
#define G2W_4X64(a,b,c,d) \
#define G2W(a,b,c,d) \
a = _mm512_add_epi64( a, b ); \
d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 32 ); \
c = _mm512_add_epi64( c, d ); \
@@ -56,27 +56,15 @@ static const uint64_t blake2b_IV[8] =
b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 63 );
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
G2W_4X64( s0, s1, s2, s3 ); \
G2W( s0, s1, s2, s3 ); \
s0 = mm512_shufll256_64( s0 ); \
s3 = mm512_swap256_128( s3); \
s3 = mm512_swap256_128( s3 ); \
s2 = mm512_shuflr256_64( s2 ); \
G2W_4X64( s0, s1, s2, s3 ); \
G2W( s0, s1, s2, s3 ); \
s0 = mm512_shuflr256_64( s0 ); \
s3 = mm512_swap256_128( s3 ); \
s2 = mm512_shufll256_64( s2 );
/*
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
G2W_4X64( s0, s1, s2, s3 ); \
s3 = mm512_shufll256_64( s3 ); \
s1 = mm512_shuflr256_64( s1); \
s2 = mm512_swap256_128( s2 ); \
G2W_4X64( s0, s1, s2, s3 ); \
s3 = mm512_shuflr256_64( s3 ); \
s1 = mm512_shufll256_64( s1 ); \
s2 = mm512_swap256_128( s2 );
*/
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -95,7 +83,7 @@ static const uint64_t blake2b_IV[8] =
#if defined(__AVX2__)
#define G_4X64(a,b,c,d) \
#define G_AVX2(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
c = _mm256_add_epi64( c, d ); \
@@ -107,27 +95,15 @@ static const uint64_t blake2b_IV[8] =
// Pivoting about s1 instead of s0 reduces latency.
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
G_AVX2( s0, s1, s2, s3 ); \
s0 = mm256_shufll_64( s0 ); \
s3 = mm256_swap_128( s3); \
s3 = mm256_swap_128( s3 ); \
s2 = mm256_shuflr_64( s2 ); \
G_4X64( s0, s1, s2, s3 ); \
G_AVX2( s0, s1, s2, s3 ); \
s0 = mm256_shuflr_64( s0 ); \
s3 = mm256_swap_128( s3 ); \
s2 = mm256_shufll_64( s2 );
/*
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s3 = mm256_shufll_64( s3 ); \
s1 = mm256_shuflr_64( s1); \
s2 = mm256_swap_128( s2 ); \
G_4X64( s0, s1, s2, s3 ); \
s3 = mm256_shuflr_64( s3 ); \
s1 = mm256_shufll_64( s1 ); \
s2 = mm256_swap_128( s2 );
*/
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -148,7 +124,7 @@ static const uint64_t blake2b_IV[8] =
// process 2 columns in parallel
// returns void, all args updated
#define G_2X64(a,b,c,d) \
#define G_128(a,b,c,d) \
a = v128_add64( a, b ); \
d = v128_ror64xor( d, a, 32 ); \
c = v128_add64( c, d ); \
@@ -161,16 +137,16 @@ static const uint64_t blake2b_IV[8] =
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
v128u64_t t; \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
G_128( s0, s2, s4, s6 ); \
G_128( s1, s3, s5, s7 ); \
t = v128_alignr64( s7, s6, 1 ); \
s6 = v128_alignr64( s6, s7, 1 ); \
s7 = t; \
t = v128_alignr64( s2, s3, 1 ); \
s2 = v128_alignr64( s3, s2, 1 ); \
s3 = t; \
G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \
G_128( s0, s2, s5, s6 ); \
G_128( s1, s3, s4, s7 ); \
t = v128_alignr64( s6, s7, 1 ); \
s6 = v128_alignr64( s7, s6, 1 ); \
s7 = t; \

View File

@@ -18,11 +18,14 @@ typedef struct {
} panama_4way_context __attribute__ ((aligned (64)));
void panama_4way_init( void *cc );
void panama_4way_update( void *cc, const void *data, size_t len );
void panama_4way_close( void *cc, void *dst );
#define panama_4x32_context panama_4way_context
#define panama_4x32_init panama_4way_init
#define panama_4x32_update panama_4way_update
#define panama_4x32_close panama_4way_close
#if defined(__AVX2__)
typedef struct {
@@ -34,10 +37,13 @@ typedef struct {
} panama_8way_context __attribute__ ((aligned (128)));
void panama_8way_init( void *cc );
void panama_8way_update( void *cc, const void *data, size_t len );
void panama_8way_close( void *cc, void *dst );
#define panama_8x32_context panama_8way_context
#define panama_8x32_init panama_8way_init
#define panama_8x32_update panama_8way_update
#define panama_8x32_close panama_8way_close
#endif
#endif

View File

@@ -31,20 +31,20 @@
union _hmq1725_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -81,7 +81,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
__m512i* vhB = (__m512i*)vhashB;
__m512i* vhC = (__m512i*)vhashC;
bmw512_8way_full( &ctx.bmw, vhash, input, 80 );
bmw512_8x64_full( &ctx.bmw, vhash, input, 80 );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
@@ -141,26 +141,26 @@ extern void hmq1725_8way_hash(void *state, const void *input)
// B
if ( likely( vh_mask & 0xff ) )
skein512_8way_full( &ctx.skein, vhashB, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhashB, vhash, 64 );
mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
// A
if ( ( vh_mask & 0xff ) != 0xff )
blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
blake512_8x64_full( &ctx.blake, vhashA, vhash, 64 );
// B
if ( vh_mask & 0xff )
bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhashB, vhash, 64 );
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -176,16 +176,16 @@ extern void hmq1725_8way_hash(void *state, const void *input)
if ( likely( ( vh_mask & 0xff ) != 0xff ) )
{
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhashA );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhashA );
}
if ( likely( vh_mask & 0xff ) )
{
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhashB );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhashB );
}
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
@@ -251,9 +251,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
// B
if ( likely( vh_mask & 0xff ) )
{
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhash, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhash, 64 );
haval256_8x32_close( &ctx.haval, vhash );
memset( &vhash[8<<3], 0, 32<<3 );
rintrlv_8x32_8x64( vhashB, vhash, 512 );
}
@@ -296,7 +296,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
#endif
blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
blake512_8x64_full( &ctx.blake, vhash, vhash, 64 );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
@@ -351,9 +351,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
@@ -429,9 +429,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
rintrlv_8x64_8x32( vhashA, vhash, 512 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhashA, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhashA, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
@@ -474,9 +474,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
// B
if ( likely( vh_mask & 0xff ) )
{
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhashB );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhashB );
}
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
@@ -509,9 +509,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
#endif
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
@@ -522,9 +522,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
{
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhash, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhash, 64 );
haval256_8x32_close( &ctx.haval, vhash );
memset( &vhash[8<<3], 0, 32<<3 );
rintrlv_8x32_8x64( vhashA, vhash, 512 );
}
@@ -551,9 +551,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
hash7 );
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, state );
bmw512_8x64_init( &ctx.bmw );
bmw512_8x64_update( &ctx.bmw, vhash, 64 );
bmw512_8x64_close( &ctx.bmw, state );
}
int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
@@ -605,12 +605,12 @@ int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
union _hmq1725_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa2;
cubehashParam cube;
@@ -620,12 +620,12 @@ union _hmq1725_4way_context_overlay
shavite512_2way_context shavite2;
simd_2way_context simd_2way;
hashState_echo echo;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
#if defined(__VAES__)
groestl512_2way_context groestl2;
echo_2way_context echo2;
@@ -652,9 +652,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, input, 80 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, input, 80 );
bmw512_4x64_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -686,17 +686,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
// B
if ( h_mask & 0xffffffff )
skein512_4way_full( &ctx.skein, vhashB, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhashB, vhash, 64 );
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
// second fork, A = blake parallel, B = bmw parallel.
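The A/B comments mark hmq1725's data-dependent forks: a bit of each lane's previous hash selects which algorithm that lane takes next. In the vectorised code a branch is computed only if at least one lane needs it (results for the other lanes are simply discarded) and the per-lane outputs are then recombined with the blend helpers. A hedged sketch with illustrative names:

   // sketch only, names illustrative, mask polarity as per mm256/mm512_blend_hash_*
   mask = fork_bit_per_lane( vh );               // one selector bit per lane
   if ( any_lane_selects_A( mask ) ) hash_A( vhashA, vhash, 64 );
   if ( any_lane_selects_B( mask ) ) hash_B( vhashB, vhash, 64 );
   blend_per_lane( vh, vhashA, vhashB, mask );   // keep A or B independently per lane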
@@ -704,13 +704,13 @@ extern void hmq1725_4way_hash(void *state, const void *input)
h_mask = _mm256_movemask_epi8( vh_mask );
if ( ( h_mask & 0xffffffff ) != 0xffffffff )
blake512_4way_full( &ctx.blake, vhashA, vhash, 64 );
blake512_4x64_full( &ctx.blake, vhashA, vhash, 64 );
if ( h_mask & 0xffffffff )
{
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
@@ -733,16 +733,16 @@ extern void hmq1725_4way_hash(void *state, const void *input)
if ( ( h_mask & 0xffffffff ) != 0xffffffff )
{
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhashA );
}
if ( h_mask & 0xffffffff )
{
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
@@ -778,9 +778,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
// B
if ( h_mask & 0xffffffff )
{
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhash, 64 );
haval256_5_4way_close( &ctx.haval, vhash );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhash, 64 );
haval256_4x32_close( &ctx.haval, vhash );
memset( &vhash[8<<2], 0, 32<<2 );
rintrlv_4x32_4x64( vhashB, vhash, 512 );
}
@@ -813,7 +813,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
#endif
blake512_4way_full( &ctx.blake, vhash, vhash, 64 );
blake512_4x64_full( &ctx.blake, vhash, vhash, 64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -845,9 +845,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -890,9 +890,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -921,9 +921,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
if ( h_mask & 0xffffffff )
{
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhashB );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
@@ -950,9 +950,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
#endif
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
// A = haval parallel, B = Whirlpool serial
@@ -964,9 +964,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
if ( ( h_mask & 0xffffffff ) != 0xffffffff )
{
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhash, 64 );
haval256_5_4way_close( &ctx.haval, vhash );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhash, 64 );
haval256_4x32_close( &ctx.haval, vhash );
memset( &vhash[8<<2], 0, 32<<2 );
rintrlv_4x32_4x64( vhashA, vhash, 512 );
}
@@ -984,9 +984,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, state );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, state );
}
int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,

View File

@@ -13,7 +13,7 @@
#if defined(LBRY_16WAY)
static __thread sha256_16way_context sha256_16w_mid;
static __thread sha256_16x32_context sha256_16w_mid;
void lbry_16way_hash( void* output, const void* input )
{
@@ -36,17 +36,17 @@ void lbry_16way_hash( void* output, const void* input )
uint32_t _ALIGN(64) h13[32];
uint32_t _ALIGN(64) h14[32];
uint32_t _ALIGN(64) h15[32];
sha256_16way_context ctx_sha256 __attribute__ ((aligned (64)));
sha512_8way_context ctx_sha512;
ripemd160_16way_context ctx_ripemd;
sha256_16x32_context ctx_sha256 __attribute__ ((aligned (64)));
sha512_8x64_context ctx_sha512;
ripemd160_16x32_context ctx_ripemd;
memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) );
sha256_16way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
sha256_16way_close( &ctx_sha256, vhashA );
sha256_16x32_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
sha256_16x32_close( &ctx_sha256, vhashA );
sha256_16way_init( &ctx_sha256 );
sha256_16way_update( &ctx_sha256, vhashA, 32 );
sha256_16way_close( &ctx_sha256, vhashA );
sha256_16x32_init( &ctx_sha256 );
sha256_16x32_update( &ctx_sha256, vhashA, 32 );
sha256_16x32_close( &ctx_sha256, vhashA );
// reinterleave to do sha512 8-way 64 bit twice.
dintrlv_16x32( h0, h1, h2, h3, h4, h5, h6, h7,
@@ -54,13 +54,13 @@ void lbry_16way_hash( void* output, const void* input )
intrlv_8x64( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 256 );
intrlv_8x64( vhashB, h8, h9, h10, h11, h12, h13, h14, h15, 256 );
sha512_8way_init( &ctx_sha512 );
sha512_8way_update( &ctx_sha512, vhashA, 32 );
sha512_8way_close( &ctx_sha512, vhashA );
sha512_8x64_init( &ctx_sha512 );
sha512_8x64_update( &ctx_sha512, vhashA, 32 );
sha512_8x64_close( &ctx_sha512, vhashA );
sha512_8way_init( &ctx_sha512 );
sha512_8way_update( &ctx_sha512, vhashB, 32 );
sha512_8way_close( &ctx_sha512, vhashB );
sha512_8x64_init( &ctx_sha512 );
sha512_8x64_update( &ctx_sha512, vhashB, 32 );
sha512_8x64_close( &ctx_sha512, vhashB );
// back to 16-way 32 bit
dintrlv_8x64( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 512 );
@@ -68,22 +68,22 @@ void lbry_16way_hash( void* output, const void* input )
intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7,
h8, h9, h10, h11, h12, h13, h14, h15, 512 );
ripemd160_16way_init( &ctx_ripemd );
ripemd160_16way_update( &ctx_ripemd, vhashA, 32 );
ripemd160_16way_close( &ctx_ripemd, vhashB );
ripemd160_16x32_init( &ctx_ripemd );
ripemd160_16x32_update( &ctx_ripemd, vhashA, 32 );
ripemd160_16x32_close( &ctx_ripemd, vhashB );
ripemd160_16way_init( &ctx_ripemd );
ripemd160_16way_update( &ctx_ripemd, vhashA+(8<<4), 32 );
ripemd160_16way_close( &ctx_ripemd, vhashC );
ripemd160_16x32_init( &ctx_ripemd );
ripemd160_16x32_update( &ctx_ripemd, vhashA+(8<<4), 32 );
ripemd160_16x32_close( &ctx_ripemd, vhashC );
sha256_16way_init( &ctx_sha256 );
sha256_16way_update( &ctx_sha256, vhashB, 20 );
sha256_16way_update( &ctx_sha256, vhashC, 20 );
sha256_16way_close( &ctx_sha256, vhashA );
sha256_16x32_init( &ctx_sha256 );
sha256_16x32_update( &ctx_sha256, vhashB, 20 );
sha256_16x32_update( &ctx_sha256, vhashC, 20 );
sha256_16x32_close( &ctx_sha256, vhashA );
sha256_16way_init( &ctx_sha256 );
sha256_16way_update( &ctx_sha256, vhashA, 32 );
sha256_16way_close( &ctx_sha256, output );
sha256_16x32_init( &ctx_sha256 );
sha256_16x32_update( &ctx_sha256, vhashA, 32 );
sha256_16x32_close( &ctx_sha256, output );
}
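The interleave/de-interleave churn above exists because SHA-256 and RIPEMD-160 operate on 32-bit words (16 or 8 lanes) while SHA-512 operates on 64-bit words (8 or 4 lanes), so the lane layout has to be rebuilt between stages. In outline, for the 16-way path (the 8-way path below is the same at half width; the ellipses stand for the full argument lists):

   // 16 lanes x 32-bit -> 2 groups of 8 lanes x 64-bit -> back to 16 lanes x 32-bit
   dintrlv_16x32( h0, ..., h15, vhashA, 256 );   // split into per-lane buffers
   intrlv_8x64( vhashA, h0, ..., h7, 256 );      // repack lanes 0-7 for sha512_8x64
   intrlv_8x64( vhashB, h8, ..., h15, 256 );     // repack lanes 8-15
   // ... two sha512_8x64 passes ...
   intrlv_16x32( vhashA, h0, ..., h15, 512 );    // repack for ripemd160_16x32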
int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
@@ -115,8 +115,8 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
sha256_16way_init( &sha256_16w_mid );
sha256_16way_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE );
sha256_16x32_init( &sha256_16w_mid );
sha256_16x32_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE );
do
{
@@ -144,7 +144,7 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
#elif defined(LBRY_8WAY)
static __thread sha256_8way_context sha256_8w_mid;
static __thread sha256_8x32_context sha256_8w_mid;
void lbry_8way_hash( void* output, const void* input )
{
@@ -159,52 +159,52 @@ void lbry_8way_hash( void* output, const void* input )
uint32_t _ALIGN(32) h5[32];
uint32_t _ALIGN(32) h6[32];
uint32_t _ALIGN(32) h7[32];
sha256_8way_context ctx_sha256 __attribute__ ((aligned (64)));
sha512_4way_context ctx_sha512;
ripemd160_8way_context ctx_ripemd;
sha256_8x32_context ctx_sha256 __attribute__ ((aligned (64)));
sha512_4x64_context ctx_sha512;
ripemd160_8x32_context ctx_ripemd;
memcpy( &ctx_sha256, &sha256_8w_mid, sizeof(ctx_sha256) );
sha256_8way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
sha256_8way_close( &ctx_sha256, vhashA );
sha256_8x32_update( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
sha256_8x32_close( &ctx_sha256, vhashA );
sha256_8way_init( &ctx_sha256 );
sha256_8way_update( &ctx_sha256, vhashA, 32 );
sha256_8way_close( &ctx_sha256, vhashA );
sha256_8x32_init( &ctx_sha256 );
sha256_8x32_update( &ctx_sha256, vhashA, 32 );
sha256_8x32_close( &ctx_sha256, vhashA );
// reinterleave to do sha512 4-way 64 bit twice.
dintrlv_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
intrlv_4x64( vhashA, h0, h1, h2, h3, 256 );
intrlv_4x64( vhashB, h4, h5, h6, h7, 256 );
sha512_4way_init( &ctx_sha512 );
sha512_4way_update( &ctx_sha512, vhashA, 32 );
sha512_4way_close( &ctx_sha512, vhashA );
sha512_4x64_init( &ctx_sha512 );
sha512_4x64_update( &ctx_sha512, vhashA, 32 );
sha512_4x64_close( &ctx_sha512, vhashA );
sha512_4way_init( &ctx_sha512 );
sha512_4way_update( &ctx_sha512, vhashB, 32 );
sha512_4way_close( &ctx_sha512, vhashB );
sha512_4x64_init( &ctx_sha512 );
sha512_4x64_update( &ctx_sha512, vhashB, 32 );
sha512_4x64_close( &ctx_sha512, vhashB );
// back to 8-way 32 bit
dintrlv_4x64( h0, h1, h2, h3, vhashA, 512 );
dintrlv_4x64( h4, h5, h6, h7, vhashB, 512 );
intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
ripemd160_8way_init( &ctx_ripemd );
ripemd160_8way_update( &ctx_ripemd, vhashA, 32 );
ripemd160_8way_close( &ctx_ripemd, vhashB );
ripemd160_8x32_init( &ctx_ripemd );
ripemd160_8x32_update( &ctx_ripemd, vhashA, 32 );
ripemd160_8x32_close( &ctx_ripemd, vhashB );
ripemd160_8way_init( &ctx_ripemd );
ripemd160_8way_update( &ctx_ripemd, vhashA+(8<<3), 32 );
ripemd160_8way_close( &ctx_ripemd, vhashC );
ripemd160_8x32_init( &ctx_ripemd );
ripemd160_8x32_update( &ctx_ripemd, vhashA+(8<<3), 32 );
ripemd160_8x32_close( &ctx_ripemd, vhashC );
sha256_8way_init( &ctx_sha256 );
sha256_8way_update( &ctx_sha256, vhashB, 20 );
sha256_8way_update( &ctx_sha256, vhashC, 20 );
sha256_8way_close( &ctx_sha256, vhashA );
sha256_8x32_init( &ctx_sha256 );
sha256_8x32_update( &ctx_sha256, vhashB, 20 );
sha256_8x32_update( &ctx_sha256, vhashC, 20 );
sha256_8x32_close( &ctx_sha256, vhashA );
sha256_8way_init( &ctx_sha256 );
sha256_8way_update( &ctx_sha256, vhashA, 32 );
sha256_8way_close( &ctx_sha256, output );
sha256_8x32_init( &ctx_sha256 );
sha256_8x32_update( &ctx_sha256, vhashA, 32 );
sha256_8x32_close( &ctx_sha256, output );
}
int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
@@ -235,8 +235,8 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );
sha256_8way_init( &sha256_8w_mid );
sha256_8way_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
sha256_8x32_init( &sha256_8w_mid );
sha256_8x32_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
do
{

View File

@@ -57,7 +57,7 @@ do{ \
#define ROUND2(a, b, c, d, e, f, s, r, k) \
RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
static void ripemd160_4way_round( ripemd160_4way_context *sc )
static void ripemd160_4x32_round( ripemd160_4x32_context *sc )
{
const __m128i *in = (__m128i*)sc->buf;
__m128i *h = (__m128i*)sc->val;
@@ -249,7 +249,7 @@ static void ripemd160_4way_round( ripemd160_4way_context *sc )
h[0] = tmp;
}
void ripemd160_4way_init( ripemd160_4way_context *sc )
void ripemd160_4x32_init( ripemd160_4x32_context *sc )
{
sc->val[0] = _mm_set1_epi64x( 0x6745230167452301 );
sc->val[1] = _mm_set1_epi64x( 0xEFCDAB89EFCDAB89 );
@@ -259,7 +259,7 @@ void ripemd160_4way_init( ripemd160_4way_context *sc )
sc->count_high = sc->count_low = 0;
}
void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
void ripemd160_4x32_update( ripemd160_4x32_context *sc, const void *data,
size_t len )
{
__m128i *vdata = (__m128i*)data;
@@ -281,7 +281,7 @@ void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
len -= clen;
if ( ptr == block_size )
{
ripemd160_4way_round( sc );
ripemd160_4x32_round( sc );
ptr = 0;
}
clow = sc->count_low;
@@ -292,7 +292,7 @@ void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
}
}
void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
void ripemd160_4x32_close( ripemd160_4x32_context *sc, void *dst )
{
unsigned ptr, u;
uint32_t low, high;
@@ -306,7 +306,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_128( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
ripemd160_4way_round( sc );
ripemd160_4x32_round( sc );
memset_zero_128( sc->buf, pad>>2 );
}
else
@@ -317,7 +317,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
low = low << 3;
sc->buf[ pad>>2 ] = _mm_set1_epi32( low );
sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
ripemd160_4way_round( sc );
ripemd160_4x32_round( sc );
for (u = 0; u < 5; u ++)
casti_v128u32( dst, u ) = sc->val[u];
}
@@ -357,7 +357,7 @@ do{ \
#define ROUND2_8W(a, b, c, d, e, f, s, r, k) \
RR_8W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
static void ripemd160_8way_round( ripemd160_8way_context *sc )
static void ripemd160_8x32_round( ripemd160_8x32_context *sc )
{
const __m256i *in = (__m256i*)sc->buf;
__m256i *h = (__m256i*)sc->val;
@@ -550,7 +550,7 @@ static void ripemd160_8way_round( ripemd160_8way_context *sc )
}
void ripemd160_8way_init( ripemd160_8way_context *sc )
void ripemd160_8x32_init( ripemd160_8x32_context *sc )
{
sc->val[0] = _mm256_set1_epi64x( 0x6745230167452301 );
sc->val[1] = _mm256_set1_epi64x( 0xEFCDAB89EFCDAB89 );
@@ -560,7 +560,7 @@ void ripemd160_8way_init( ripemd160_8way_context *sc )
sc->count_high = sc->count_low = 0;
}
void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
void ripemd160_8x32_update( ripemd160_8x32_context *sc, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
@@ -582,7 +582,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
len -= clen;
if ( ptr == block_size )
{
ripemd160_8way_round( sc );
ripemd160_8x32_round( sc );
ptr = 0;
}
clow = sc->count_low;
@@ -593,7 +593,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
}
}
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
void ripemd160_8x32_close( ripemd160_8x32_context *sc, void *dst )
{
unsigned ptr, u;
uint32_t low, high;
@@ -607,7 +607,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_256( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
ripemd160_8way_round( sc );
ripemd160_8x32_round( sc );
memset_zero_256( sc->buf, pad>>2 );
}
else
@@ -618,7 +618,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
low = low << 3;
sc->buf[ pad>>2 ] = _mm256_set1_epi32( low );
sc->buf[ (pad>>2) + 1 ] = _mm256_set1_epi32( high );
ripemd160_8way_round( sc );
ripemd160_8x32_round( sc );
for (u = 0; u < 5; u ++)
casti_m256i( dst, u ) = sc->val[u];
}
@@ -629,7 +629,6 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
// RIPEMD-160 16 way
#define F16W_1(x, y, z) \
_mm512_xor_si512( _mm512_xor_si512( x, y ), z )
@@ -659,7 +658,7 @@ do{ \
#define ROUND2_16W(a, b, c, d, e, f, s, r, k) \
RR_16W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
static void ripemd160_16way_round( ripemd160_16way_context *sc )
static void ripemd160_16x32_round( ripemd160_16x32_context *sc )
{
const __m512i *in = (__m512i*)sc->buf;
__m512i *h = (__m512i*)sc->val;
@@ -851,7 +850,7 @@ static void ripemd160_16way_round( ripemd160_16way_context *sc )
h[0] = tmp;
}
void ripemd160_16way_init( ripemd160_16way_context *sc )
void ripemd160_16x32_init( ripemd160_16x32_context *sc )
{
sc->val[0] = _mm512_set1_epi64( 0x6745230167452301 );
sc->val[1] = _mm512_set1_epi64( 0xEFCDAB89EFCDAB89 );
@@ -861,7 +860,7 @@ void ripemd160_16way_init( ripemd160_16way_context *sc )
sc->count_high = sc->count_low = 0;
}
void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
void ripemd160_16x32_update( ripemd160_16x32_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
@@ -883,7 +882,7 @@ void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
len -= clen;
if ( ptr == block_size )
{
ripemd160_16way_round( sc );
ripemd160_16x32_round( sc );
ptr = 0;
}
clow = sc->count_low;
@@ -894,7 +893,7 @@ void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
}
}
void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
void ripemd160_16x32_close( ripemd160_16x32_context *sc, void *dst )
{
unsigned ptr, u;
uint32_t low, high;
@@ -908,7 +907,7 @@ void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_512( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
ripemd160_16way_round( sc );
ripemd160_16x32_round( sc );
memset_zero_512( sc->buf, pad>>2 );
}
else
@@ -919,7 +918,7 @@ void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
low = low << 3;
sc->buf[ pad>>2 ] = _mm512_set1_epi32( low );
sc->buf[ (pad>>2) + 1 ] = _mm512_set1_epi32( high );
ripemd160_16way_round( sc );
ripemd160_16x32_round( sc );
for (u = 0; u < 5; u ++)
casti_m512i( dst, u ) = sc->val[u];
}

View File

@@ -12,12 +12,12 @@ typedef struct
__m128i buf[64>>2];
__m128i val[5];
uint32_t count_high, count_low;
} __attribute__ ((aligned (64))) ripemd160_4way_context;
} __attribute__ ((aligned (64))) ripemd160_4x32_context;
void ripemd160_4way_init( ripemd160_4way_context *sc );
void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
void ripemd160_4x32_init( ripemd160_4x32_context *sc );
void ripemd160_4x32_update( ripemd160_4x32_context *sc, const void *data,
size_t len );
void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst );
void ripemd160_4x32_close( ripemd160_4x32_context *sc, void *dst );
#if defined (__AVX2__)
@@ -26,12 +26,12 @@ typedef struct
__m256i buf[64>>2];
__m256i val[5];
uint32_t count_high, count_low;
} __attribute__ ((aligned (128))) ripemd160_8way_context;
} __attribute__ ((aligned (128))) ripemd160_8x32_context;
void ripemd160_8way_init( ripemd160_8way_context *sc );
void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
void ripemd160_8x32_init( ripemd160_8x32_context *sc );
void ripemd160_8x32_update( ripemd160_8x32_context *sc, const void *data,
size_t len );
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );
void ripemd160_8x32_close( ripemd160_8x32_context *sc, void *dst );
#if defined(SIMD512)
@@ -40,12 +40,12 @@ typedef struct
__m512i buf[64>>2];
__m512i val[5];
uint32_t count_high, count_low;
} __attribute__ ((aligned (128))) ripemd160_16way_context;
} __attribute__ ((aligned (128))) ripemd160_16x32_context;
void ripemd160_16way_init( ripemd160_16way_context *sc );
void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
void ripemd160_16x32_init( ripemd160_16x32_context *sc );
void ripemd160_16x32_update( ripemd160_16x32_context *sc, const void *data,
size_t len );
void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst );
void ripemd160_16x32_close( ripemd160_16x32_context *sc, void *dst );
#endif // AVX512
#endif // __AVX2__

View File

@@ -597,6 +597,45 @@ static void blake2s_compress(blake2s_state *S, const void *buf) {
v[13] = S->t[1] ^ blake2s_IV[5];
v[14] = S->f[0] ^ blake2s_IV[6];
v[15] = S->f[1] ^ blake2s_IV[7];
#if defined(__SSE2__) || defined(__ARM_NEON)
v128_t *V = (v128_t*)v;
#define ROUND( r ) \
V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
m[blake2s_sigma[r][ 6]], m[blake2s_sigma[r][ 4]], \
m[blake2s_sigma[r][ 2]], m[blake2s_sigma[r][ 0]] ) ) ); \
V[3] = v128_ror32( v128_xor( V[3], V[0] ), 16 ); \
V[2] = v128_add32( V[2], V[3] ); \
V[1] = v128_ror32( v128_xor( V[1], V[2] ), 12 ); \
V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
m[blake2s_sigma[r][ 7]], m[blake2s_sigma[r][ 5]], \
m[blake2s_sigma[r][ 3]], m[blake2s_sigma[r][ 1]] ) ) ); \
V[3] = v128_ror32( v128_xor( V[3], V[0] ), 8 ); \
V[2] = v128_add32( V[2], V[3] ); \
V[1] = v128_ror32( v128_xor( V[1], V[2] ), 7 ); \
V[0] = v128_shufll32( V[0] ); \
V[3] = v128_swap64( V[3] ); \
V[2] = v128_shuflr32( V[2] ); \
V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
m[blake2s_sigma[r][12]], m[blake2s_sigma[r][10]], \
m[blake2s_sigma[r][ 8]], m[blake2s_sigma[r][14]] ) ) ); \
V[3] = v128_ror32( v128_xor( V[3], V[0] ), 16 ); \
V[2] = v128_add32( V[2], V[3] ); \
V[1] = v128_ror32( v128_xor( V[1], V[2] ), 12 ); \
V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
m[blake2s_sigma[r][13]], m[blake2s_sigma[r][11]], \
m[blake2s_sigma[r][ 9]], m[blake2s_sigma[r][15]] ) ) ); \
V[3] = v128_ror32( v128_xor( V[3], V[0] ), 8 ); \
V[2] = v128_add32( V[2], V[3] ); \
V[1] = v128_ror32( v128_xor( V[1], V[2] ), 7 ); \
V[0] = v128_shuflr32( V[0] ); \
V[3] = v128_swap64( V[3] ); \
V[2] = v128_shufll32( V[2] )
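// Illustrative summary (not from the original source): each V[i] holds one row of
// the 4x4 BLAKE2s state, so the add/xor/rotate sequence above is the column step
// applied to all four columns at once; the lane shuffles then diagonalise the
// state (rotating rows 0, 2 and 3 while leaving row 1 in place), the second
// sequence is the diagonal step, and the final shuffles undo the diagonalisation.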
#else
#define G(r,i,a,b,c,d) \
do { \
a = a + b + m[blake2s_sigma[r][2*i+0]]; \
@@ -619,6 +658,9 @@ static void blake2s_compress(blake2s_state *S, const void *buf) {
G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \
G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \
} while(0)
#endif
ROUND(0);
ROUND(1);
ROUND(2);

View File

@@ -274,9 +274,6 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
#endif // SHA
static const uint32_t keypad_4way[ 4*12 ] __attribute((aligned(32))) =
{
0x80000000, 0x80000000, 0x80000000, 0x80000000,
@@ -339,7 +336,7 @@ static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = {
};
*/
static inline void sha256_4way_init_state( void *state )
static inline void sha256_4x32_init_state( void *state )
{
casti_v128( state, 0 ) = v128_32( 0x6A09E667 );
casti_v128( state, 1 ) = v128_32( 0xBB67AE85 );
@@ -362,21 +359,21 @@ static inline void HMAC_SHA256_80_init_4way( const uint32_t *key,
memcpy( pad, key + 4*16, 4*16 );
memcpy( pad + 4*4, keypad_4way, 4*48 );
sha256_4way_transform_le( (v128_t*)ihash, (v128_t*)pad,
sha256_4x32_transform_le( (v128_t*)ihash, (v128_t*)pad,
(const v128_t*)tstate );
sha256_4way_init_state( tstate );
sha256_4x32_init_state( tstate );
for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
for ( ; i < 4*16; i++ ) pad[i] = 0x5c5c5c5c;
sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)pad,
sha256_4x32_transform_le( (v128_t*)ostate, (v128_t*)pad,
(const v128_t*)tstate );
for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
for ( ; i < 4*16; i++ ) pad[i] = 0x36363636;
sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)pad,
sha256_4x32_transform_le( (v128_t*)tstate, (v128_t*)pad,
(const v128_t*)tstate );
}
@@ -389,7 +386,7 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
uint32_t _ALIGN(16) obuf[4 * 16];
int i, j;
sha256_4way_transform_le( (v128_t*)istate, (v128_t*)salt,
sha256_4x32_transform_le( (v128_t*)istate, (v128_t*)salt,
(const v128_t*)tstate );
memcpy(ibuf, salt + 4 * 16, 4 * 16);
@@ -403,10 +400,10 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
ibuf[4 * 4 + 2] = i + 1;
ibuf[4 * 4 + 3] = i + 1;
sha256_4way_transform_le( (v128_t*)obuf, (v128_t*)ibuf,
sha256_4x32_transform_le( (v128_t*)obuf, (v128_t*)ibuf,
(const v128_t*)istate );
sha256_4way_transform_le( (v128_t*)ostate2, (v128_t*)obuf,
sha256_4x32_transform_le( (v128_t*)ostate2, (v128_t*)obuf,
(const v128_t*)ostate );
for ( j = 0; j < 4 * 8; j++ )
@@ -421,9 +418,9 @@ static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate,
uint32_t _ALIGN(64) buf[4 * 16];
int i;
sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)salt,
sha256_4x32_transform_be( (v128_t*)tstate, (v128_t*)salt,
(const v128_t*)tstate );
sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)( salt + 4*16),
sha256_4x32_transform_be( (v128_t*)tstate, (v128_t*)( salt + 4*16),
(const v128_t*)tstate );
final[ 0] = v128_32( 0x00000001 );
@@ -434,20 +431,20 @@ static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate,
= v128_xor( final[ 0], final[ 0] ); //_mm_setzero_si128();
final[15] = v128_32 ( 0x00000620 );
sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)final,
sha256_4x32_transform_le( (v128_t*)tstate, (v128_t*)final,
(const v128_t*)tstate );
memcpy(buf, tstate, 4 * 32);
memcpy(buf + 4 * 8, outerpad_4way, 4 * 32);
sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)buf,
sha256_4x32_transform_le( (v128_t*)ostate, (v128_t*)buf,
(const v128_t*)ostate );
for ( i = 0; i < 4 * 8; i++ )
output[i] = bswap_32( ostate[i] );
}
#ifdef HAVE_SHA256_8WAY
#if defined(__AVX2__)
/*
static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = {
@@ -470,7 +467,7 @@ static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = {
};
*/
static inline void sha256_8way_init_state( void *state )
static inline void sha256_8x32_init_state( void *state )
{
casti_m256i( state, 0 ) = _mm256_set1_epi32( 0x6A09E667 );
casti_m256i( state, 1 ) = _mm256_set1_epi32( 0xBB67AE85 );
@@ -494,21 +491,21 @@ static inline void HMAC_SHA256_80_init_8way( const uint32_t *key,
memset( pad + 8*5, 0x00, 8*40 );
for ( i = 0; i < 8; i++ ) pad[ 8*15 + i ] = 0x00000280;
sha256_8way_transform_le( (__m256i*)ihash, (__m256i*)pad,
sha256_8x32_transform_le( (__m256i*)ihash, (__m256i*)pad,
(const __m256i*)tstate );
sha256_8way_init_state( tstate );
sha256_8x32_init_state( tstate );
for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
for ( ; i < 8*16; i++ ) pad[i] = 0x5c5c5c5c;
sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)pad,
sha256_8x32_transform_le( (__m256i*)ostate, (__m256i*)pad,
(const __m256i*)tstate );
for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
for ( ; i < 8*16; i++ ) pad[i] = 0x36363636;
sha256_8way_transform_le( (__m256i*)tstate, (__m256i*)pad,
sha256_8x32_transform_le( (__m256i*)tstate, (__m256i*)pad,
(const __m256i*)tstate );
}
@@ -521,7 +518,7 @@ static inline void PBKDF2_SHA256_80_128_8way( const uint32_t *tstate,
uint32_t _ALIGN(32) obuf[8 * 16];
int i, j;
sha256_8way_transform_le( (__m256i*)istate, (__m256i*)salt,
sha256_8x32_transform_le( (__m256i*)istate, (__m256i*)salt,
(const __m256i*)tstate );
memcpy( ibuf, salt + 8*16, 8*16 );
@@ -544,10 +541,10 @@ static inline void PBKDF2_SHA256_80_128_8way( const uint32_t *tstate,
ibuf[8 * 4 + 6] = i + 1;
ibuf[8 * 4 + 7] = i + 1;
sha256_8way_transform_le( (__m256i*)obuf, (__m256i*)ibuf,
sha256_8x32_transform_le( (__m256i*)obuf, (__m256i*)ibuf,
(const __m256i*)istate );
sha256_8way_transform_le( (__m256i*)ostate2, (__m256i*)obuf,
sha256_8x32_transform_le( (__m256i*)ostate2, (__m256i*)obuf,
(const __m256i*)ostate );
for ( j = 0; j < 8*8; j++ )
@@ -562,9 +559,9 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
uint32_t _ALIGN(128) buf[ 8*16 ];
int i;
sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)salt,
sha256_8x32_transform_be( (__m256i*)tstate, (__m256i*)salt,
(const __m256i*)tstate );
sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)( salt + 8*16),
sha256_8x32_transform_be( (__m256i*)tstate, (__m256i*)( salt + 8*16),
(const __m256i*)tstate );
final[ 0] = _mm256_set1_epi32( 0x00000001 );
@@ -575,7 +572,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
= _mm256_setzero_si256();
final[15] = _mm256_set1_epi32 ( 0x00000620 );
sha256_8way_transform_le( (__m256i*)tstate, final,
sha256_8x32_transform_le( (__m256i*)tstate, final,
(const __m256i*)tstate );
memcpy( buf, tstate, 8*32 );
@@ -583,18 +580,18 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
memset( buf + 8*9, 0x00, 8*24 );
for ( i = 0; i < 8; i++ ) buf[ 8*15 + i ] = 0x00000300;
sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)buf,
sha256_8x32_transform_le( (__m256i*)ostate, (__m256i*)buf,
(const __m256i*)ostate );
for (i = 0; i < 8 * 8; i++)
output[i] = bswap_32(ostate[i]);
}
#endif /* HAVE_SHA256_8WAY */
#endif //AVX2
#if defined(SIMD512)
static inline void sha256_16way_init_state( void *state )
static inline void sha256_16x32_init_state( void *state )
{
casti_m512i( state, 0 ) = _mm512_set1_epi32( 0x6A09E667 );
casti_m512i( state, 1 ) = _mm512_set1_epi32( 0xBB67AE85 );
@@ -618,21 +615,21 @@ static inline void HMAC_SHA256_80_init_16way( const uint32_t *key,
memset( pad + 16*5, 0x00, 16*40 );
for ( i = 0; i < 16; i++ ) pad[ 16*15 + i ] = 0x00000280;
sha256_16way_transform_le( (__m512i*)ihash, (__m512i*)pad,
sha256_16x32_transform_le( (__m512i*)ihash, (__m512i*)pad,
(const __m512i*)tstate );
sha256_16way_init_state( tstate );
sha256_16x32_init_state( tstate );
for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
for ( ; i < 16*16; i++ ) pad[i] = 0x5c5c5c5c;
sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)pad,
sha256_16x32_transform_le( (__m512i*)ostate, (__m512i*)pad,
(const __m512i*)tstate );
for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
for ( ; i < 16*16; i++ ) pad[i] = 0x36363636;
sha256_16way_transform_le( (__m512i*)tstate, (__m512i*)pad,
sha256_16x32_transform_le( (__m512i*)tstate, (__m512i*)pad,
(const __m512i*)tstate );
}
@@ -645,7 +642,7 @@ static inline void PBKDF2_SHA256_80_128_16way( const uint32_t *tstate,
uint32_t _ALIGN(128) ostate2[ 16*8 ];
int i, j;
sha256_16way_transform_le( (__m512i*)istate, (__m512i*)salt,
sha256_16x32_transform_le( (__m512i*)istate, (__m512i*)salt,
(const __m512i*)tstate );
memcpy( ibuf, salt + 16*16, 16*16 );
@@ -676,10 +673,10 @@ static inline void PBKDF2_SHA256_80_128_16way( const uint32_t *tstate,
ibuf[ 16*4 + 14 ] = i + 1;
ibuf[ 16*4 + 15 ] = i + 1;
sha256_16way_transform_le( (__m512i*)obuf, (__m512i*)ibuf,
sha256_16x32_transform_le( (__m512i*)obuf, (__m512i*)ibuf,
(const __m512i*)istate );
sha256_16way_transform_le( (__m512i*)ostate2, (__m512i*)obuf,
sha256_16x32_transform_le( (__m512i*)ostate2, (__m512i*)obuf,
(const __m512i*)ostate );
for ( j = 0; j < 16*8; j++ )
@@ -694,9 +691,9 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
uint32_t _ALIGN(128) buf[ 16*16 ];
int i;
sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)salt,
sha256_16x32_transform_be( (__m512i*)tstate, (__m512i*)salt,
(const __m512i*)tstate );
sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)( salt + 16*16),
sha256_16x32_transform_be( (__m512i*)tstate, (__m512i*)( salt + 16*16),
(const __m512i*)tstate );
final[ 0] = _mm512_set1_epi32( 0x00000001 );
@@ -707,7 +704,7 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
= _mm512_setzero_si512();
final[15] = _mm512_set1_epi32 ( 0x00000620 );
sha256_16way_transform_le( (__m512i*)tstate, final,
sha256_16x32_transform_le( (__m512i*)tstate, final,
(const __m512i*)tstate );
memcpy( buf, tstate, 16*32 );
@@ -715,7 +712,7 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
memset( buf + 16*9, 0x00, 16*24 );
for ( i = 0; i < 16; i++ ) buf[ 16*15 + i ] = 0x00000300;
sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)buf,
sha256_16x32_transform_le( (__m512i*)ostate, (__m512i*)buf,
(const __m512i*)ostate );
for ( i = 0; i < 16*8; i++ )
@@ -724,25 +721,10 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
#endif // AVX512
#define SCRYPT_MAX_WAYS 12
#define HAVE_SCRYPT_3WAY 1
void scrypt_core(uint32_t *X, uint32_t *V, int N);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
#if defined(__AVX2__)
#undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24
#define HAVE_SCRYPT_6WAY 1
void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
#endif
#ifndef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 1
#endif
#include "scrypt-core-4way.h"
/*
#if ( SCRYPT_THROUGHPUT == 1 )
static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thr_id )
{
@@ -752,15 +734,12 @@ static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
memcpy(tstate, midstate, 32);
HMAC_SHA256_80_init(input, tstate, ostate);
PBKDF2_SHA256_80_128(tstate, ostate, input, X);
scrypt_core_simd128( X, scratchbuf, N ); // working
// scrypt_core_1way( X, V, N ); // working
// scrypt_core(X, V, N);
scrypt_core_1way( X, scratchbuf, N );
PBKDF2_SHA256_128_32(tstate, ostate, X, output);
return true;
}
*/
#endif
#if ( SCRYPT_THROUGHPUT == 8 )
@@ -1201,20 +1180,6 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
********************/
/*
scrypt_core_3way( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_3way( X+ 96, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_3way( X+256, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_3way( X+352, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
*/
if ( work_restart[thrid].restart ) return 0;
@@ -1321,8 +1286,7 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
return 1;
}
#else
// SSE2
#elif defined(__SSE2__) || defined(__ARM_NEON)
static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thrid )
@@ -1481,7 +1445,7 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | SHA256_OPT | NEON_OPT;
#else
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#endif
@@ -1491,31 +1455,31 @@ bool register_scrypt_algo( algo_gate_t* gate )
opt_param_n = opt_param_n ? opt_param_n : 1024;
applog( LOG_INFO, "Scrypt parameters: N= %d, R= 1", opt_param_n );
// scrypt_throughput is defined at compile time and is used in place of
// MAX_WAYS to reduce memory usage.
#if defined(SIMD512)
// scrypt_throughput = 16;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
// scrypt_throughput = 2;
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
#elif defined(__AVX2__)
// scrypt_throughput = 8;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
#else
// scrypt_throughput = 4;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
#endif
switch ( SCRYPT_THROUGHPUT )
{
case 16: // AVX512
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
break;
case 2: // SHA256
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
break;
case 8: // AVX2
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
break;
case 4: // SSE2, NEON
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
break;
default:
scratchbuf_size = opt_param_n; // 1 way
}
char t_units[4] = {0};
char d_units[4] = {0};
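As a worked example of the sizing above, with the default opt_param_n = 1024 (so the N <= 0x4000 branches are taken):

   SCRYPT_THROUGHPUT 16 (AVX512):    1024 * 4 * 128 = 524288 bytes (512 KiB)
   SCRYPT_THROUGHPUT  8 (AVX2):      1024 * 2 * 128 = 262144 bytes (256 KiB)
   SCRYPT_THROUGHPUT  4 (SSE2/NEON): 1024 * 4 * 128 = 524288 bytes (512 KiB)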

View File

@@ -31,7 +31,7 @@
#include "hmac-sha256-hash-4way.h"
#include "compat.h"
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
// HMAC 4-way SSE2
/**
@@ -62,30 +62,30 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
sha256_4way_init( &ctx->ictx );
sha256_4way_update( &ctx->ictx, K, Klen );
sha256_4way_close( &ctx->ictx, khash );
sha256_4x32_init( &ctx->ictx );
sha256_4x32_update( &ctx->ictx, K, Klen );
sha256_4x32_close( &ctx->ictx, khash );
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
sha256_4way_init( &ctx->ictx );
sha256_4x32_init( &ctx->ictx );
memset( pad, 0x36, 64*4 );
for ( i = 0; i < Klen; i++ )
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->ictx, pad, 64 );
sha256_4x32_update( &ctx->ictx, pad, 64 );
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
sha256_4way_init( &ctx->octx );
sha256_4x32_init( &ctx->octx );
memset( pad, 0x5c, 64*4 );
for ( i = 0; i < Klen/4; i++ )
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->octx, pad, 64 );
casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4x32_update( &ctx->octx, pad, 64 );
}
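For reference, the construction these 4/8/16-lane routines parallelise is ordinary HMAC-SHA256, which matches the ipad/opad handling above (notation only, not code):

   K'    = SHA256(K) if len(K) > 64 bytes, otherwise K zero-padded to 64 bytes
   inner = SHA256( (K' xor 0x36...36) || message )
   HMAC(K, message) = SHA256( (K' xor 0x5c...5c) || inner )

which is exactly the ictx/octx split kept in the context struct.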
/* Add bytes to the HMAC-SHA256 operation. */
@@ -94,7 +94,7 @@ hmac_sha256_4way_update( hmac_sha256_4way_context *ctx, const void *in,
size_t len )
{
/* Feed data to the inner SHA256 operation. */
sha256_4way_update( &ctx->ictx, in, len );
sha256_4x32_update( &ctx->ictx, in, len );
}
/* Finish an HMAC-SHA256 operation. */
@@ -104,13 +104,13 @@ hmac_sha256_4way_close( hmac_sha256_4way_context *ctx, void *digest )
unsigned char ihash[32*4] __attribute__ ((aligned (64)));
/* Finish the inner SHA256 operation. */
sha256_4way_close( &ctx->ictx, ihash );
sha256_4x32_close( &ctx->ictx, ihash );
/* Feed the inner hash to the outer SHA256 operation. */
sha256_4way_update( &ctx->octx, ihash, 32 );
sha256_4x32_update( &ctx->octx, ihash, 32 );
/* Finish the outer SHA256 operation. */
sha256_4way_close( &ctx->octx, digest );
sha256_4x32_close( &ctx->octx, digest );
}
/**
@@ -126,7 +126,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
hmac_sha256_4way_context PShctx, hctx;
uint8_t _ALIGN(128) T[32*4];
uint8_t _ALIGN(128) U[32*4];
__m128i ivec;
v128u32_t ivec;
size_t i, clen;
uint64_t j;
int k;
@@ -139,7 +139,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
for ( i = 0; i * 32 < dkLen; i++ )
{
/* Generate INT(i + 1). */
ivec = _mm_set1_epi32( bswap_32( i+1 ) );
ivec = v128_32( bswap_32( i+1 ) );
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy( &hctx, &PShctx, sizeof(hmac_sha256_4way_context) );
@@ -158,8 +158,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
/* ... xor U_j ... */
for ( k = 0; k < 8; k++ )
casti_v128u32( T, k ) = _mm_xor_si128( casti_v128u32( T, k ),
casti_v128u32( U, k ) );
casti_v128u32( T, k ) = v128_xor( casti_v128u32( T, k ),
casti_v128u32( U, k ) );
}
/* Copy as many bytes as necessary into buf. */
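The loop above is the standard PBKDF2 block function; in RFC 2898 notation each 32-byte output block is

   T_i = U_1 xor U_2 xor ... xor U_c
   U_1 = PRF( P, S || INT(i) ),   U_j = PRF( P, U_(j-1) )

with PRF = HMAC-SHA256 computed four lanes at a time here (scrypt itself calls PBKDF2 with an iteration count of 1, so T_i reduces to U_1).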
@@ -199,30 +199,30 @@ hmac_sha256_8way_init( hmac_sha256_8way_context *ctx, const void *_K,
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
sha256_8way_init( &ctx->ictx );
sha256_8way_update( &ctx->ictx, K, Klen );
sha256_8way_close( &ctx->ictx, khash );
sha256_8x32_init( &ctx->ictx );
sha256_8x32_update( &ctx->ictx, K, Klen );
sha256_8x32_close( &ctx->ictx, khash );
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
sha256_8way_init( &ctx->ictx );
sha256_8x32_init( &ctx->ictx );
memset( pad, 0x36, 64*8);
for ( i = 0; i < Klen/4; i++ )
casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
casti_m256i( K, i ) );
sha256_8way_update( &ctx->ictx, pad, 64 );
sha256_8x32_update( &ctx->ictx, pad, 64 );
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
sha256_8way_init( &ctx->octx );
sha256_8x32_init( &ctx->octx );
memset( pad, 0x5c, 64*8 );
for ( i = 0; i < Klen/4; i++ )
casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
casti_m256i( K, i ) );
sha256_8way_update( &ctx->octx, pad, 64 );
sha256_8x32_update( &ctx->octx, pad, 64 );
}
void
@@ -230,7 +230,7 @@ hmac_sha256_8way_update( hmac_sha256_8way_context *ctx, const void *in,
size_t len )
{
/* Feed data to the inner SHA256 operation. */
sha256_8way_update( &ctx->ictx, in, len );
sha256_8x32_update( &ctx->ictx, in, len );
}
/* Finish an HMAC-SHA256 operation. */
@@ -240,13 +240,13 @@ hmac_sha256_8way_close( hmac_sha256_8way_context *ctx, void *digest )
unsigned char ihash[32*8] __attribute__ ((aligned (128)));
/* Finish the inner SHA256 operation. */
sha256_8way_close( &ctx->ictx, ihash );
sha256_8x32_close( &ctx->ictx, ihash );
/* Feed the inner hash to the outer SHA256 operation. */
sha256_8way_update( &ctx->octx, ihash, 32 );
sha256_8x32_update( &ctx->octx, ihash, 32 );
/* Finish the outer SHA256 operation. */
sha256_8way_close( &ctx->octx, digest );
sha256_8x32_close( &ctx->octx, digest );
}
/**
@@ -332,21 +332,21 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K,
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
sha256_16way_init( &ctx->ictx );
sha256_16way_update( &ctx->ictx, K, Klen );
sha256_16way_close( &ctx->ictx, khash );
sha256_16x32_init( &ctx->ictx );
sha256_16x32_update( &ctx->ictx, K, Klen );
sha256_16x32_close( &ctx->ictx, khash );
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
sha256_16way_init( &ctx->ictx );
sha256_16x32_init( &ctx->ictx );
memset( pad, 0x36, 64*16 );
for ( i = 0; i < Klen; i++ )
casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
casti_m512i( K, i ) );
sha256_16way_update( &ctx->ictx, pad, 64 );
sha256_16x32_update( &ctx->ictx, pad, 64 );
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
sha256_16way_init( &ctx->octx );
@@ -354,7 +354,7 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K,
for ( i = 0; i < Klen/4; i++ )
casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
casti_m512i( K, i ) );
sha256_16way_update( &ctx->octx, pad, 64 );
sha256_16x32_update( &ctx->octx, pad, 64 );
}
void
@@ -362,7 +362,7 @@ hmac_sha256_16way_update( hmac_sha256_16way_context *ctx, const void *in,
size_t len )
{
/* Feed data to the inner SHA256 operation. */
sha256_16way_update( &ctx->ictx, in, len );
sha256_16x32_update( &ctx->ictx, in, len );
}
/* Finish an HMAC-SHA256 operation. */
@@ -372,13 +372,13 @@ hmac_sha256_16way_close( hmac_sha256_16way_context *ctx, void *digest )
unsigned char ihash[32*16] __attribute__ ((aligned (128)));
/* Finish the inner SHA256 operation. */
sha256_16way_close( &ctx->ictx, ihash );
sha256_16x32_close( &ctx->ictx, ihash );
/* Feed the inner hash to the outer SHA256 operation. */
sha256_16way_update( &ctx->octx, ihash, 32 );
sha256_16x32_update( &ctx->octx, ihash, 32 );
/* Finish the outer SHA256 operation. */
sha256_16way_close( &ctx->octx, digest );
sha256_16x32_close( &ctx->octx, digest );
}
/**

View File

@@ -1,6 +1,6 @@
/*-
* Copyright 2005,2007,2009 Colin Percival
* Copyright 2020 JayDDee@gmail.com
* Copyright 2020 JayDDee246@gmail.com
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -38,11 +38,12 @@
#include "simd-utils.h"
#include "sha256-hash.h"
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
typedef struct _hmac_sha256_4way_context
{
sha256_4way_context ictx;
sha256_4way_context octx;
sha256_4x32_context ictx;
sha256_4x32_context octx;
} hmac_sha256_4way_context;
//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
@@ -67,8 +68,8 @@ void pbkdf2_sha256_4way( uint8_t *, size_t, const uint8_t *, size_t,
typedef struct _hmac_sha256_8way_context
{
sha256_8way_context ictx;
sha256_8way_context octx;
sha256_8x32_context ictx;
sha256_8x32_context octx;
} hmac_sha256_8way_context;
//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
@@ -88,8 +89,8 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t,
typedef struct _hmac_sha256_16way_context
{
sha256_16way_context ictx;
sha256_16way_context octx;
sha256_16x32_context ictx;
sha256_16x32_context octx;
} hmac_sha256_16way_context;
//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );

View File

@@ -30,6 +30,7 @@ static const uint32_t K256[64] =
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
#if defined(__SSE2__) || defined(__ARM_NEON)
// SHA-256 4 way SSE2
#define CHs(X, Y, Z) \
@@ -309,142 +310,6 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data,
v128_store( state_out + 7, H );
}
# if 0
// Working correctly but still slower
int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const uint32_t *target )
{
v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
v128_t vmask, targ, hash;
int t6_mask, flip;
v128_t W[16]; v128_memcpy( W, data, 16 );
A = v128_load( state_in );
B = v128_load( state_in+1 );
C = v128_load( state_in+2 );
D = v128_load( state_in+3 );
E = v128_load( state_in+4 );
F = v128_load( state_in+5 );
G = v128_load( state_in+6 );
H = v128_load( state_in+7 );
const v128_t IV7 = H;
const v128_t IV6 = G;
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
SHA256_4X32_MSG_EXPANSION( W );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256_4X32_MSG_EXPANSION( W );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
W[ 0] = SHA256_4X32_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
W[ 1] = SHA256_4X32_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA256_4X32_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA256_4X32_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA256_4X32_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA256_4X32_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA256_4X32_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA256_4X32_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA256_4X32_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA256_4X32_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA256_4X32_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA256_4X32_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_4X32_MEXP( W[10], W[ 5], W[13], W[12] );
v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C );
SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
SHA256_4X32_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
SHA256_4X32_ROUND( F, G, H, A, B, C, D, E, 3, 48 );
SHA256_4X32_ROUND( E, F, G, H, A, B, C, D, 4, 48 );
SHA256_4X32_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 8, 48 );
SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
T0 = v128_add32( v128_32( K256[58] ),
v128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
B = v128_add32( B, T0 );
T1 = v128_add32( v128_32( K256[59] ),
v128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
A = v128_add32( A, T1 );
T2 = v128_add32( v128_32( K256[60] ),
v128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
H = v128_add32( H, T2 );
targ = v128_32( target[7] );
hash = v128_bswap32( v128_add32( H, IV7 ) );
flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
if ( likely(
0xf == ( flip ^ v128_movmask32( v128_cmpgt32( hash, targ ) ) ) ))
return 0;
t6_mask = v128_movmask32( vmask = v128_cmpeq32( hash, targ ) );
// round 58 part 2
F = v128_add32( T0, v128_add32( BSG2_0( G ), MAJs( G, H, A ) ) );
// round 61 part 1
W[13] = SHA256_4X32_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = v128_add32( v128_32( K256[61] ),
v128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
G = v128_add32( G, T0 );
if ( t6_mask )
{
targ = v128_and( vmask, v128_32( target[6] ) );
hash = v128_bswap32( v128_add32( G, IV6 ) );
if ( ( 0 != ( t6_mask & v128_movmask32( v128_cmpeq32( hash, targ ) ) ) ))
return 0;
else
{
flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
if ( 0 != ( t6_mask & ( flip ^ v128_movmask32(
v128_cmpgt32( hash, targ ) ) ) ) )
return 0;
else if ( target[6] == 0x80000000 )
{
if ( 0 == ( t6_mask & v128_movmask32(
v128_cmpgt32( hash, v128_xor( hash, hash ) ) ) ) )
return 0;
}
}
}
// rounds 59 to 61 part 2
E = v128_add32( T1, v128_add32( BSG2_0( F ), MAJs( F, G, H ) ) );
D = v128_add32( T2, v128_add32( BSG2_0( E ), MAJs( E, F, G ) ) );
C = v128_add32( T0, v128_add32( BSG2_0( D ), MAJs( D, E, F ) ) );
// rounds 62 & 63
W[14] = SHA256_4X32_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA256_4X32_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 14, 48 );
SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 15, 48 );
state_out[0] = v128_add32( state_in[0], A );
state_out[1] = v128_add32( state_in[1], B );
state_out[2] = v128_add32( state_in[2], C );
state_out[3] = v128_add32( state_in[3], D );
state_out[4] = v128_add32( state_in[4], E );
state_out[5] = v128_add32( state_in[5], F );
state_out[6] = v128_add32( state_in[6], G );
state_out[7] = v128_add32( state_in[7], H );
return 1;
}
#endif
void sha256_4x32_init( sha256_4x32_context *sc )
{
sc->count_high = sc->count_low = 0;
@@ -529,29 +394,31 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
sha256_4x32_close( &ctx, dst );
}
#endif // SSE2 || NEON
#if defined(__AVX2__)
// SHA-256 8 way
#define BSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 2 ), \
mm256_ror_32( x, 13 ) ), \
mm256_ror_32( x, 22 ) )
mm256_xor3( mm256_ror_32( x, 2 ), \
mm256_ror_32( x, 13 ), \
mm256_ror_32( x, 22 ) )
#define BSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 6 ), \
mm256_ror_32( x, 11 ) ), \
mm256_ror_32( x, 25 ) )
mm256_xor3( mm256_ror_32( x, 6 ), \
mm256_ror_32( x, 11 ), \
mm256_ror_32( x, 25 ) )
#define SSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 7 ), \
mm256_ror_32( x, 18 ) ), \
_mm256_srli_epi32( x, 3 ) )
mm256_xor3( mm256_ror_32( x, 7 ), \
mm256_ror_32( x, 18 ), \
_mm256_srli_epi32( x, 3 ) )
#define SSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 17 ), \
mm256_ror_32( x, 19 ) ), \
_mm256_srli_epi32( x, 10 ) )
mm256_xor3( mm256_ror_32( x, 17 ), \
mm256_ror_32( x, 19 ), \
_mm256_srli_epi32( x, 10 ) )
#define SHA256_8WAY_MEXP( a, b, c, d ) \
mm256_add4_32( SSG2_1x( a ), b, SSG2_0x( c ), d );
@@ -574,13 +441,8 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] ); \
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
// With AVX512VL, ternary logic optimizations are available.
// If not, optimize by forwarding the result of X^Y in MAJ to the next round
// to avoid recalculating it as Y^Z. This optimization is not applicable
// when MAJ is optimized with ternary logic.
#if defined(VL256)
// AVX512 or AVX10-256
#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )
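
For context on the comment above: _mm256_ternarylogic_epi32 takes an 8-bit truth table, and 0xca encodes CH(X,Y,Z) = (X & Y) | (~X & Z). The generic truth-table immediates for the other SHA-256 helpers are 0xe8 for MAJ(X,Y,Z) = (X & Y) | (X & Z) | (Y & Z) and 0x96 for X ^ Y ^ Z; only the CH case is shown in this hunk, so the other two values are stated here for reference rather than quoted from this file. Without ternary logic, the forwarding trick works because the round rotation makes this round's X^Y equal to the next round's Y^Z. A scalar sketch with illustrative names:

   // MAJ(X,Y,Z) = Y ^ ((X ^ Y) & (Y ^ Z)). y_xor_z is forwarded from the
   // previous round; x_xor_y is produced here and becomes the next round's
   // y_xor_z, so each XOR is computed only once.
   static inline uint32_t maj_forwarded( uint32_t X, uint32_t Y,
                                         uint32_t y_xor_z, uint32_t *x_xor_y )
   {
      *x_xor_y = X ^ Y;
      return Y ^ ( *x_xor_y & y_xor_z );
   }

Before the first round the forwarded value is seeded with B ^ C, as the deleted 4x32 code above shows with Y_xor_Z = v128_xor( B, C ).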
@@ -745,7 +607,7 @@ static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W,
}
// accepts LE input data
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
void sha256_8x32_transform_le( __m256i *state_out, const __m256i *data,
const __m256i *state_in )
{
__m256i W[16];
@@ -754,7 +616,7 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
}
// Accepts BE input data, need to bswap
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
void sha256_8x32_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in )
{
__m256i W[16];
@@ -764,7 +626,7 @@ void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
}
// Aggressive prehashing, LE byte order
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
void sha256_8x32_prehash_3rounds( __m256i *state_mid, __m256i *X,
const __m256i *W, const __m256i *state_in )
{
__m256i A, B, C, D, E, F, G, H, T1;
@@ -813,7 +675,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
_mm256_store_si256( state_mid + 7, H );
}
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid, const __m256i *X )
{
__m256i A, B, C, D, E, F, G, H;
@@ -914,14 +776,12 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
_mm256_store_si256( state_out + 7, H );
}
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target )
{
__m256i A, B, C, D, E, F, G, H, T0, T1, T2;
__m256i vmask, targ, hash;
__m256i W[16]; memcpy_256( W, data, 16 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
uint8_t flip, t6_mask;
A = _mm256_load_si256( state_in );
@@ -1012,7 +872,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
// Got H, test it.
targ = v256_32( target[7] );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( H, IV7 ), bswap_shuf );
hash = mm256_bswap_32( _mm256_add_epi32( H, IV7 ) );
if ( target[7] )
{
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
@@ -1035,7 +895,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
{
// Testing H was inconclusive: hash7 == target7, need to test G
targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
hash = mm256_bswap_32( _mm256_add_epi32( G, IV6 ) );
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
_mm256_cmpeq_epi32( hash, targ ) ) ) ))
@@ -1083,8 +943,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
return 1;
}
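
The "_short" transforms above abort as soon as no lane can still meet the target: they complete only the rounds needed to produce hash word 7 (and word 6 when word 7 ties), compare against the corresponding target words, and skip the remaining rounds otherwise. A per-lane scalar sketch of that test, assuming the bswap_32() helper used elsewhere in this codebase:

   // Returns 0 when this lane cannot beat the target (abort the hash),
   // non-zero when the full hash is still worth completing and checking.
   static inline int target_word_check( uint32_t h7, uint32_t h6,
                                        const uint32_t *target )
   {
      uint32_t w7 = bswap_32( h7 );       // hash words are compared big-endian
      if ( w7 > target[7] ) return 0;     // most significant word too large
      if ( w7 < target[7] ) return 1;     // definitely below, keep going
      return bswap_32( h6 ) <= target[6]; // tie on word 7: decide on word 6
   }

The vector versions do the same comparison across all lanes at once and only fall through to the remaining rounds when at least one lane survives.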
void sha256_8way_init( sha256_8way_context *sc )
void sha256_8x32_init( sha256_8x32_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = v256_32( sha256_iv[0] );
@@ -1100,7 +959,7 @@ void sha256_8way_init( sha256_8way_context *sc )
// need to handle odd byte length for yespower.
// Assume only last update is odd.
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
void sha256_8x32_update( sha256_8x32_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
size_t ptr;
@@ -1121,7 +980,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
len -= clen;
if ( ptr == buf_size )
{
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
ptr = 0;
}
clow = sc->count_low;
@@ -1132,7 +991,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
}
}
void sha256_8way_close( sha256_8way_context *sc, void *dst )
void sha256_8x32_close( sha256_8x32_context *sc, void *dst )
{
unsigned ptr;
uint32_t low, high;
@@ -1146,7 +1005,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
memset_zero_256( sc->buf, pad >> 2 );
}
else
@@ -1159,17 +1018,17 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
sc->buf[ pad >> 2 ] = v256_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = v256_32( bswap_32( low ) );
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
mm256_block_bswap_32( dst, sc->val );
}
void sha256_8way_full( void *dst, const void *data, size_t len )
void sha256_8x32_full( void *dst, const void *data, size_t len )
{
sha256_8way_context ctx;
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, data, len );
sha256_8way_close( &ctx, dst );
sha256_8x32_context ctx;
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, data, len );
sha256_8x32_close( &ctx, dst );
}
#if defined(SIMD512)
@@ -1302,7 +1161,7 @@ static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W,
}
// accepts LE input data
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
void sha256_16x32_transform_le( __m512i *state_out, const __m512i *data,
const __m512i *state_in )
{
__m512i W[16];
@@ -1311,7 +1170,7 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
}
// Accepts BE input data, need to bswap
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
void sha256_16x32_transform_be( __m512i *state_out, const __m512i *data,
const __m512i *state_in )
{
__m512i W[16];
@@ -1321,7 +1180,7 @@ void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
}
// Aggressive prehashing, LE byte order
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
void sha256_16x32_prehash_3rounds( __m512i *state_mid, __m512i *X,
const __m512i *W, const __m512i *state_in )
{
__m512i A, B, C, D, E, F, G, H, T1;
@@ -1369,7 +1228,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
_mm512_store_si512( state_mid + 7, H );
}
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid, const __m512i *X )
{
__m512i A, B, C, D, E, F, G, H;
@@ -1470,15 +1329,13 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
// returns 0 if hash aborted early and invalid,
// returns 1 for completed hash with at least one valid candidate.
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target )
{
__m512i A, B, C, D, E, F, G, H, hash, targ;
__m512i T0, T1, T2;
__m512i W[16]; memcpy_512( W, data, 16 );
__mmask16 t6_mask;
const __m512i bswap_shuf = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in+1 );
@@ -1588,7 +1445,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
H = _mm512_add_epi32( H, T2 );
// got H, test it against target[7]
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
hash = mm512_bswap_32( _mm512_add_epi32( H , IV7 ) );
targ = v512_32( target[7] );
if ( target[7] )
if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) ))
@@ -1608,7 +1465,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
// got G, test it against target[6] if indicated
if ( (uint16_t)t6_mask )
{
hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
hash = mm512_bswap_32( _mm512_add_epi32( G, IV6 ) );
targ = v512_32( target[6] );
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
return 0;
@@ -1644,7 +1501,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
return 1;
}
void sha256_16way_init( sha256_16way_context *sc )
void sha256_16x32_init( sha256_16x32_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = v512_32( sha256_iv[0] );
@@ -1657,7 +1514,7 @@ void sha256_16way_init( sha256_16way_context *sc )
sc->val[7] = v512_32( sha256_iv[7] );
}
void sha256_16way_update( sha256_16way_context *sc, const void *data,
void sha256_16x32_update( sha256_16x32_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
@@ -1679,7 +1536,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data,
len -= clen;
if ( ptr == buf_size )
{
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
ptr = 0;
}
clow = sc->count_low;
@@ -1690,7 +1547,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data,
}
}
void sha256_16way_close( sha256_16way_context *sc, void *dst )
void sha256_16x32_close( sha256_16x32_context *sc, void *dst )
{
unsigned ptr;
uint32_t low, high;
@@ -1704,7 +1561,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
memset_zero_512( sc->buf, pad >> 2 );
}
else
@@ -1717,17 +1574,17 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
sc->buf[ pad >> 2 ] = v512_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = v512_32( bswap_32( low ) );
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
mm512_block_bswap_32( dst, sc->val );
}
void sha256_16way_full( void *dst, const void *data, size_t len )
void sha256_16x32_full( void *dst, const void *data, size_t len )
{
sha256_16way_context ctx;
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, data, len );
sha256_16way_close( &ctx, dst );
sha256_16x32_context ctx;
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, data, len );
sha256_16x32_close( &ctx, dst );
}
#undef CH

View File

@@ -180,20 +180,9 @@ void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target );
// Temporary API during naming transition
#define sha256_8way_context sha256_8x32_context
#define sha256_8way_init sha256_8x32_init
#define sha256_8way_update sha256_8x32_update
#define sha256_8way_close sha256_8x32_close
#define sha256_8way_full sha256_8x32_full
#define sha256_8way_transform_le sha256_8x32_transform_le
#define sha256_8way_transform_be sha256_8x32_transform_be
#define sha256_8way_prehash_3rounds sha256_8x32_prehash_3rounds
#define sha256_8way_final_rounds sha256_8x32_final_rounds
#define sha256_8way_transform_le_short sha256_8x32_transform_le_short
#endif // AVX2
#if defined(__SSE2__) || defined(__ARM_NEON)
// SHA-256 4 way x86_64 with SSE2 or AArch64 with NEON
typedef struct
@@ -219,16 +208,5 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data,
int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const uint32_t *target );
// Temporary API during naming transition
#define sha256_4way_context sha256_4x32_context
#define sha256_4way_init sha256_4x32_init
#define sha256_4way_update sha256_4x32_update
#define sha256_4way_close sha256_4x32_close
#define sha256_4way_full sha256_4x32_full
#define sha256_4way_transform_le sha256_4x32_transform_le
#define sha256_4way_transform_be sha256_4x32_transform_be
#define sha256_4way_prehash_3rounds sha256_4x32_prehash_3rounds
#define sha256_4way_final_rounds sha256_4x32_final_rounds
#define sha256_4way_transform_le_short sha256_4x32_transform_le_short
#endif
#endif // SSE2 || NEON
#endif // SHA256_HASH_H__

View File

@@ -32,8 +32,6 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_transform_le( mstatea, pdata, sha256_iv );
@@ -69,10 +67,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_v128( hasha, 0 ) =
_mm_shuffle_epi8( casti_v128( hasha, 0 ), shuf_bswap32 );
casti_v128( hasha, 1 ) =
_mm_shuffle_epi8( casti_v128( hasha, 1 ), shuf_bswap32 );
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
@@ -81,10 +77,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_v128( hashb, 0 ) =
_mm_shuffle_epi8( casti_v128( hashb, 0 ), shuf_bswap32 );
casti_v128( hashb, 1 ) =
_mm_shuffle_epi8( casti_v128( hashb, 1 ), shuf_bswap32 );
casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
@@ -204,8 +198,6 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
const int thr_id = mythr->id;
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
// prehash first block directly from pdata
sha256_transform_le( phash, pdata, sha256_iv );
@@ -231,7 +223,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
buf[15] = v512_32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
sha256_16x32_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for second hash
istate[0] = v512_32( sha256_iv[0] );
@@ -250,15 +242,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
do
{
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_16way_transform_le_short(
sha256_16x32_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_16x32_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
for ( int lane = 0; lane < 16; lane++ )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -299,8 +290,6 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = v256_32( pdata[i] );
@@ -325,22 +314,22 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
istate[6] = v256_32( sha256_iv[6] );
istate[7] = v256_32( sha256_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
sha256_8x32_transform_le( mstate1, vdata, istate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
sha256_8x32_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
do
{
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_8way_transform_le_short( hash32, block,
sha256_8x32_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_8x32_transform_le_short( hash32, block,
istate, ptarget ) ) )
{
for ( int lane = 0; lane < 8; lane++ )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
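
The 3-round prehash used here works because the nonce occupies bytes 12-15 of the second 64-byte block, i.e. message word W[3] (pdata[19]). SHA-256 round t consumes W[t] for t < 16, so rounds 0-2 depend only on nonce-free data and can be hashed once per scan, leaving only the final rounds to be redone for each nonce vector inside the loop.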

View File

@@ -12,7 +12,7 @@
#define SHA256D_NEON_SHA2 1
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA256D_4WAY 1
#endif

View File

@@ -17,7 +17,6 @@
#elif defined (__SSE2__) || defined(__ARM_NEON)
#define SHA256DT_4X32 1
#endif
// else ref, should never happen
static const uint32_t sha256dt_iv[8] __attribute__ ((aligned (32))) =
{
@@ -205,8 +204,6 @@ int scanhash_sha256dt_16x32( struct work *work, const uint32_t max_nonce,
const int thr_id = mythr->id;
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
// prehash first block directly from pdata
sha256_transform_le( phash, pdata, sha256dt_iv );
@@ -258,8 +255,7 @@ int scanhash_sha256dt_16x32( struct work *work, const uint32_t max_nonce,
for ( int lane = 0; lane < 16; lane++ )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -298,8 +294,6 @@ int scanhash_sha256dt_8x32( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = v256_32( pdata[i] );
@@ -339,7 +333,7 @@ int scanhash_sha256dt_8x32( struct work *work, const uint32_t max_nonce,
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -406,7 +400,6 @@ int scanhash_sha256dt_4x32( struct work *work, const uint32_t max_nonce,
do
{
sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
// sha256_4x32_transform_le( block, vdata+16, mhash1 );
sha256_4x32_transform_le( hash32, block, iv );
for ( int lane = 0; lane < 4; lane++ )

View File

@@ -7,28 +7,28 @@
#if defined(SHA256T_16WAY)
static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64)));
static __thread sha256_16x32_context sha256_ctx16 __attribute__ ((aligned (64)));
void sha256q_16way_hash( void* output, const void* input )
{
uint32_t vhash[8*16] __attribute__ ((aligned (64)));
sha256_16way_context ctx;
sha256_16x32_context ctx;
memcpy( &ctx, &sha256_ctx16, sizeof ctx );
sha256_16way_update( &ctx, input + (64<<4), 16 );
sha256_16way_close( &ctx, vhash );
sha256_16x32_update( &ctx, input + (64<<4), 16 );
sha256_16x32_close( &ctx, vhash );
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, vhash );
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, vhash, 32 );
sha256_16x32_close( &ctx, vhash );
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, vhash );
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, vhash, 32 );
sha256_16x32_close( &ctx, vhash );
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, output );
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, vhash, 32 );
sha256_16x32_close( &ctx, output );
}
int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
@@ -51,8 +51,8 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
sha256_16way_init( &sha256_ctx16 );
sha256_16way_update( &sha256_ctx16, vdata, 64 );
sha256_16x32_init( &sha256_ctx16 );
sha256_16x32_update( &sha256_ctx16, vdata, 64 );
do
{
@@ -80,28 +80,28 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
#if defined(SHA256T_8WAY)
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
static __thread sha256_8x32_context sha256_ctx8 __attribute__ ((aligned (64)));
void sha256q_8way_hash( void* output, const void* input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
sha256_8way_context ctx;
sha256_8x32_context ctx;
memcpy( &ctx, &sha256_ctx8, sizeof ctx );
sha256_8way_update( &ctx, input + (64<<3), 16 );
sha256_8way_close( &ctx, vhash );
sha256_8x32_update( &ctx, input + (64<<3), 16 );
sha256_8x32_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, vhash, 32 );
sha256_8x32_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, vhash, 32 );
sha256_8x32_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, vhash, 32 );
sha256_8x32_close( &ctx, output );
}
int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
@@ -123,8 +123,8 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
sha256_8way_init( &sha256_ctx8 );
sha256_8way_update( &sha256_ctx8, vdata, 64 );
sha256_8x32_init( &sha256_ctx8 );
sha256_8x32_update( &sha256_ctx8, vdata, 64 );
do
{
@@ -152,28 +152,28 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
#if defined(SHA256T_4WAY)
static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
static __thread sha256_4x32_context sha256_ctx4 __attribute__ ((aligned (64)));
void sha256q_4way_hash( void* output, const void* input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
sha256_4way_context ctx;
sha256_4x32_context ctx;
memcpy( &ctx, &sha256_ctx4, sizeof ctx );
sha256_4way_update( &ctx, input + (64<<2), 16 );
sha256_4way_close( &ctx, vhash );
sha256_4x32_update( &ctx, input + (64<<2), 16 );
sha256_4x32_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4x32_init( &ctx );
sha256_4x32_update( &ctx, vhash, 32 );
sha256_4x32_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4x32_init( &ctx );
sha256_4x32_update( &ctx, vhash, 32 );
sha256_4x32_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, output );
sha256_4x32_init( &ctx );
sha256_4x32_update( &ctx, vhash, 32 );
sha256_4x32_close( &ctx, output );
}
int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
@@ -205,8 +205,8 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
0 };
v128_bswap32_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way_update( &sha256_ctx4, vdata, 64 );
sha256_4x32_init( &sha256_ctx4 );
sha256_4x32_update( &sha256_ctx4, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{

View File

@@ -35,8 +35,6 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
const int thr_id = mythr->id;
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
// prehash first block directly from pdata
sha256_transform_le( phash, pdata, sha256_iv );
@@ -62,7 +60,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
buf[15] = v512_32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
sha256_16x32_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd & 3rd sha256
istate[0] = v512_32( sha256_iv[0] );
@@ -81,18 +79,17 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
do
{
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
sha256_16x32_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
sha256_16way_transform_le( block, block, istate );
sha256_16x32_transform_le( block, block, istate );
if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
if ( sha256_16x32_transform_le_short( hash32, block, istate, ptarget ) )
{
for ( int lane = 0; lane < 16; lane++ )
if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -301,8 +298,6 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = v256_32( pdata[i] );
@@ -327,29 +322,29 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
istate[6] = v256_32( sha256_iv[6] );
istate[7] = v256_32( sha256_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
sha256_8x32_transform_le( mstate1, vdata, istate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
sha256_8x32_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
do
{
// 1. final 16 bytes of data, with padding
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
sha256_8x32_final_rounds( block, vdata+16, mstate1, mstate2,
mexp_pre );
// 2. 32 byte hash from 1.
sha256_8way_transform_le( block, block, istate );
sha256_8x32_transform_le( block, block, istate );
// 3. 32 byte hash from 2.
if ( unlikely( sha256_8way_transform_le_short(
if ( unlikely( sha256_8x32_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
for ( int lane = 0; lane < 8; lane++ )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -419,8 +414,8 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
do
{
sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
sha256_4way_transform_le( block, block, iv );
sha256_4way_transform_le( hash32, block, iv );
sha256_4x32_transform_le( block, block, iv );
sha256_4x32_transform_le( hash32, block, iv );
for ( int lane = 0; lane < 4; lane++ )
{

View File

@@ -83,15 +83,13 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in )
{
__m256i STATE0, STATE1;
__m256i MSG, TMP, BSWAP64;
__m256i MSG, TMP;
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
__m256i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f,
0x0001020304050607 ) );
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
@@ -103,7 +101,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
// Rounds 0-3
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 );
TMSG0 = mm256_bswap_64( TMSG0 );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128 (MSG ) );
@@ -113,7 +111,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 );
TMSG1 = mm256_bswap_64( TMSG1 );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
@@ -124,7 +122,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 );
TMSG2 = mm256_bswap_64( TMSG2 );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
@@ -135,7 +133,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 );
TMSG3 = mm256_bswap_64( TMSG3 );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
@@ -735,8 +733,6 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst )
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
const __m512i shuff_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = v512_64( 0x80 );
@@ -750,10 +746,8 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst )
else
memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
v512_64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
v512_64( sc->count << 3 ), shuff_bswap64 );
sc->buf[ pad >> 3 ] = v512_64( bswap_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v512_64( bswap_64( sc->count << 3 ) );
sha512_8x64_round( sc, sc->buf, sc->val );
mm512_block_bswap_64( dst, sc->val );
@@ -957,8 +951,6 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst )
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
const __m256i shuff_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = v256_64( 0x80 );
@@ -972,10 +964,8 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst )
else
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = _mm256_shuffle_epi8(
v256_64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm256_shuffle_epi8(
v256_64( sc->count << 3 ), shuff_bswap64 );
sc->buf[ pad >> 3 ] = v256_64( bswap_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v256_64( bswap_64( sc->count << 3 ) );
sha512_4x64_round( sc, sc->buf, sc->val );
mm256_block_bswap_64( dst, sc->val );
@@ -1138,8 +1128,8 @@ void sha512_2x64_close( sha512_2x64_context *sc, void *dst )
else
v128_memset_zero( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = v128_bswap64( v128_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v128_bswap64( v128_64( sc->count << 3 ) );
sc->buf[ pad >> 3 ] = v128_64( bswap_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v128_64( bswap_64( sc->count << 3 ) );
sha512_2x64_round( sc, sc->buf, sc->val );
v128_block_bswap64( castp_v128u64( dst ), sc->val );
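
A note on the length encoding visible in these hunks: sc->count holds the message length in bytes, while SHA-512 padding stores the length in bits as a 128-bit big-endian value, so

   bits         = count * 8
   low 64 bits  = count << 3     (bits mod 2^64)
   high 64 bits = count >> 61    (bits / 2^64)

which is why the close functions above write bswap_64( sc->count >> 61 ) followed by bswap_64( sc->count << 3 ) into the last two words of the pad block.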

View File

@@ -36,7 +36,6 @@ typedef struct
uint64_t count;
bool initialized;
} sha512_8x64_context __attribute__ ((aligned (128)));
#define sha512_8way_context sha512_8x64_context
void sha512_8x64_init( sha512_8x64_context *sc);
void sha512_8x64_update( sha512_8x64_context *sc, const void *data,
@@ -45,10 +44,6 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst );
void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data,
size_t len );
#define sha512_8way_init sha512_8x64_init
#define sha512_8way_update sha512_8x64_update
#define sha512_8way_close sha512_8x64_close
#endif // AVX512
#if defined (__AVX2__)
@@ -62,7 +57,6 @@ typedef struct
uint64_t count;
bool initialized;
} sha512_4x64_context __attribute__ ((aligned (64)));
#define sha512_4way_context sha512_4x64_context
void sha512_4x64_init( sha512_4x64_context *sc);
void sha512_4x64_update( sha512_4x64_context *sc, const void *data,
@@ -71,10 +65,6 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst );
void sha512_4x64_ctx( sha512_4x64_context *sc, void *dst, const void *data,
size_t len );
#define sha512_4way_init sha512_4x64_init
#define sha512_4way_update sha512_4x64_update
#define sha512_4way_close sha512_4x64_close
#endif // AVX2
typedef struct

View File

@@ -14,7 +14,7 @@
#if defined(SHA512256D_8WAY)
static void sha512256d_8way_init( sha512_8way_context *ctx )
static void sha512256d_8x64_init( sha512_8x64_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
@@ -33,7 +33,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
{
uint64_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
sha512_8way_context ctx;
sha512_8x64_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*8]);
uint32_t *pdata = work->data;
@@ -53,13 +53,13 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
do
{
sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, vdata, 80 );
sha512_8way_close( &ctx, hash );
sha512256d_8x64_init( &ctx );
sha512_8x64_update( &ctx, vdata, 80 );
sha512_8x64_close( &ctx, hash );
sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, hash, 32 );
sha512_8way_close( &ctx, hash );
sha512256d_8x64_init( &ctx );
sha512_8x64_update( &ctx, hash, 32 );
sha512_8x64_close( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash_q3[ lane ] <= targ_q3 && !bench ) )
@@ -82,7 +82,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
#elif defined(SHA512256D_4WAY)
static void sha512256d_4way_init( sha512_4way_context *ctx )
static void sha512256d_4x64_init( sha512_4x64_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
@@ -101,7 +101,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
{
uint64_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
sha512_4way_context ctx;
sha512_4x64_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*4]);
uint32_t *pdata = work->data;
@@ -119,13 +119,13 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), casti_m256i( vdata,9 ) );
do
{
sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, vdata, 80 );
sha512_4way_close( &ctx, hash );
sha512256d_4x64_init( &ctx );
sha512_4x64_update( &ctx, vdata, 80 );
sha512_4x64_close( &ctx, hash );
sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, hash, 32 );
sha512_4way_close( &ctx, hash );
sha512256d_4x64_init( &ctx );
sha512_4x64_update( &ctx, hash, 32 );
sha512_4x64_close( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )

View File

@@ -430,9 +430,9 @@ do { \
} while (0)
static void
shabal_16way_init( void *cc, unsigned size )
shabal_16x32_init( void *cc, unsigned size )
{
shabal_16way_context *sc = (shabal_16way_context*)cc;
shabal_16x32_context *sc = (shabal_16x32_context*)cc;
if ( size == 512 )
{ // copy immediate constants directly to working registers later.
@@ -494,9 +494,9 @@ shabal_16way_init( void *cc, unsigned size )
}
static void
shabal_16way_core( void *cc, const unsigned char *data, size_t len )
shabal_16x32_core( void *cc, const unsigned char *data, size_t len )
{
shabal_16way_context *sc = (shabal_16way_context*)cc;
shabal_16x32_context *sc = (shabal_16x32_context*)cc;
__m512i *buf;
__m512i *vdata = (__m512i*)data;
const int buf_size = 64;
@@ -544,10 +544,10 @@ shabal_16way_core( void *cc, const unsigned char *data, size_t len )
}
static void
shabal_16way_close( void *cc, unsigned ub, unsigned n, void *dst,
shabal_16x32_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_16way_context *sc = (shabal_16way_context*)cc;
shabal_16x32_context *sc = (shabal_16x32_context*)cc;
__m512i *buf;
const int buf_size = 64;
size_t ptr;
@@ -590,52 +590,39 @@ shabal_16way_close( void *cc, unsigned ub, unsigned n, void *dst,
}
void
shabal256_16way_init( void *cc )
shabal256_16x32_init( void *cc )
{
shabal_16way_init(cc, 256);
shabal_16x32_init(cc, 256);
}
void
shabal256_16way_update( void *cc, const void *data, size_t len )
shabal256_16x32_update( void *cc, const void *data, size_t len )
{
shabal_16way_core( cc, data, len );
shabal_16x32_core( cc, data, len );
}
void
shabal256_16way_close( void *cc, void *dst )
shabal256_16x32_close( void *cc, void *dst )
{
shabal_16way_close(cc, 0, 0, dst, 8);
shabal_16x32_close(cc, 0, 0, dst, 8);
}
void
shabal256_16way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst )
shabal512_16x32_init(void *cc)
{
shabal_16way_close(cc, ub, n, dst, 8);
shabal_16x32_init(cc, 512);
}
void
shabal512_16way_init(void *cc)
shabal512_16x32_update(void *cc, const void *data, size_t len)
{
shabal_16way_init(cc, 512);
shabal_16x32_core(cc, data, len);
}
void
shabal512_16way_update(void *cc, const void *data, size_t len)
shabal512_16x32_close(void *cc, void *dst)
{
shabal_16way_core(cc, data, len);
}
void
shabal512_16way_close(void *cc, void *dst)
{
shabal_16way_close(cc, 0, 0, dst, 16);
}
void
shabal512_16way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_16way_close(cc, ub, n, dst, 16);
shabal_16x32_close(cc, 0, 0, dst, 16);
}
#endif
@@ -1031,9 +1018,9 @@ do { \
} while (0)
static void
shabal_8way_init( void *cc, unsigned size )
shabal_8x32_init( void *cc, unsigned size )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
shabal_8x32_context *sc = (shabal_8x32_context*)cc;
if ( size == 512 )
{ // copy immediate constants directly to working registers later.
@@ -1095,9 +1082,9 @@ shabal_8way_init( void *cc, unsigned size )
}
static void
shabal_8way_core( void *cc, const unsigned char *data, size_t len )
shabal_8x32_core( void *cc, const unsigned char *data, size_t len )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
shabal_8x32_context *sc = (shabal_8x32_context*)cc;
__m256i *buf;
__m256i *vdata = (__m256i*)data;
const int buf_size = 64;
@@ -1146,10 +1133,10 @@ shabal_8way_core( void *cc, const unsigned char *data, size_t len )
}
static void
shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst,
shabal_8x32_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
shabal_8x32_context *sc = (shabal_8x32_context*)cc;
__m256i *buf;
const int buf_size = 64;
size_t ptr;
@@ -1192,52 +1179,39 @@ shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst,
}
void
shabal256_8way_init( void *cc )
shabal256_8x32_init( void *cc )
{
shabal_8way_init(cc, 256);
shabal_8x32_init(cc, 256);
}
void
shabal256_8way_update( void *cc, const void *data, size_t len )
shabal256_8x32_update( void *cc, const void *data, size_t len )
{
shabal_8way_core( cc, data, len );
shabal_8x32_core( cc, data, len );
}
void
shabal256_8way_close( void *cc, void *dst )
shabal256_8x32_close( void *cc, void *dst )
{
shabal_8way_close(cc, 0, 0, dst, 8);
shabal_8x32_close(cc, 0, 0, dst, 8);
}
void
shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst )
shabal512_8x32_init(void *cc)
{
shabal_8way_close(cc, ub, n, dst, 8);
shabal_8x32_init(cc, 512);
}
void
shabal512_8way_init(void *cc)
shabal512_8x32_update(void *cc, const void *data, size_t len)
{
shabal_8way_init(cc, 512);
shabal_8x32_core(cc, data, len);
}
void
shabal512_8way_update(void *cc, const void *data, size_t len)
shabal512_8x32_close(void *cc, void *dst)
{
shabal_8way_core(cc, data, len);
}
void
shabal512_8way_close(void *cc, void *dst)
{
shabal_8way_close(cc, 0, 0, dst, 16);
}
void
shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_8way_close(cc, ub, n, dst, 16);
shabal_8x32_close(cc, 0, 0, dst, 16);
}
#endif // AVX2
@@ -1674,9 +1648,9 @@ static const sph_u32 C_init_512[] = {
*/
static void
shabal_4way_init( void *cc, unsigned size )
shabal_4x32_init( void *cc, unsigned size )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
shabal_4x32_context *sc = (shabal_4x32_context*)cc;
if ( size == 512 )
{ // copy immediate constants directly to working registers later.
@@ -1786,9 +1760,9 @@ shabal_4way_init( void *cc, unsigned size )
}
static void
shabal_4way_core( void *cc, const unsigned char *data, size_t len )
shabal_4x32_core( void *cc, const unsigned char *data, size_t len )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
shabal_4x32_context *sc = (shabal_4x32_context*)cc;
v128_t *buf;
v128_t *vdata = (v128_t*)data;
const int buf_size = 64;
@@ -1838,10 +1812,10 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len )
}
static void
shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
shabal_4x32_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
shabal_4x32_context *sc = (shabal_4x32_context*)cc;
v128_t *buf;
const int buf_size = 64;
size_t ptr;
@@ -1884,52 +1858,39 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
}
void
shabal256_4way_init( void *cc )
shabal256_4x32_init( void *cc )
{
shabal_4way_init(cc, 256);
shabal_4x32_init(cc, 256);
}
void
shabal256_4way_update( void *cc, const void *data, size_t len )
shabal256_4x32_update( void *cc, const void *data, size_t len )
{
shabal_4way_core( cc, data, len );
shabal_4x32_core( cc, data, len );
}
void
shabal256_4way_close( void *cc, void *dst )
shabal256_4x32_close( void *cc, void *dst )
{
shabal_4way_close(cc, 0, 0, dst, 8);
shabal_4x32_close(cc, 0, 0, dst, 8);
}
void
shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst )
shabal512_4x32_init(void *cc)
{
shabal_4way_close(cc, ub, n, dst, 8);
shabal_4x32_init(cc, 512);
}
void
shabal512_4way_init(void *cc)
shabal512_4x32_update(void *cc, const void *data, size_t len)
{
shabal_4way_init(cc, 512);
shabal_4x32_core(cc, data, len);
}
void
shabal512_4way_update(void *cc, const void *data, size_t len)
shabal512_4x32_close(void *cc, void *dst)
{
shabal_4way_core(cc, data, len);
}
void
shabal512_4way_close(void *cc, void *dst)
{
shabal_4way_close(cc, 0, 0, dst, 16);
}
void
shabal512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_4way_close(cc, ub, n, dst, 16);
shabal_4x32_close(cc, 0, 0, dst, 16);
}
#endif

View File

@@ -4,10 +4,6 @@
#include <stddef.h>
#include "simd-utils.h"
#define SPH_SIZE_shabal256 256
#define SPH_SIZE_shabal512 512
#if defined(SIMD512)
typedef struct {
@@ -16,22 +12,27 @@ typedef struct {
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_16way_context __attribute__ ((aligned (64)));
} shabal_16x32_context __attribute__ ((aligned (64)));
typedef shabal_16way_context shabal256_16way_context;
typedef shabal_16way_context shabal512_16way_context;
typedef shabal_16x32_context shabal256_16x32_context;
typedef shabal_16x32_context shabal512_16x32_context;
void shabal256_16way_init( void *cc );
void shabal256_16way_update( void *cc, const void *data, size_t len );
void shabal256_16way_close( void *cc, void *dst );
void shabal256_16way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal256_16x32_init( void *cc );
void shabal256_16x32_update( void *cc, const void *data, size_t len );
void shabal256_16x32_close( void *cc, void *dst );
void shabal512_16way_init( void *cc );
void shabal512_16way_update( void *cc, const void *data, size_t len );
void shabal512_16way_close( void *cc, void *dst );
void shabal512_16way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_16x32_init( void *cc );
void shabal512_16x32_update( void *cc, const void *data, size_t len );
void shabal512_16x32_close( void *cc, void *dst );
#define shabal256_16way_context shabal256_16x32_context
#define shabal256_16way_init shabal256_16x32_init
#define shabal256_16way_update shabal256_16x32_update
#define shabal256_16way_close shabal256_16x32_close
#define shabal512_16way_context shabal512_16x32_context
#define shabal512_16way_init shabal512_16x32_init
#define shabal512_16way_update shabal512_16x32_update
#define shabal512_16way_close shabal512_16x32_close
#endif
@@ -43,22 +44,27 @@ typedef struct {
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_8way_context __attribute__ ((aligned (64)));
} shabal_8x32_context __attribute__ ((aligned (64)));
typedef shabal_8way_context shabal256_8way_context;
typedef shabal_8way_context shabal512_8way_context;
typedef shabal_8x32_context shabal256_8x32_context;
typedef shabal_8x32_context shabal512_8x32_context;
void shabal256_8way_init( void *cc );
void shabal256_8way_update( void *cc, const void *data, size_t len );
void shabal256_8way_close( void *cc, void *dst );
void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal256_8x32_init( void *cc );
void shabal256_8x32_update( void *cc, const void *data, size_t len );
void shabal256_8x32_close( void *cc, void *dst );
void shabal512_8way_init( void *cc );
void shabal512_8way_update( void *cc, const void *data, size_t len );
void shabal512_8way_close( void *cc, void *dst );
void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_8x32_init( void *cc );
void shabal512_8x32_update( void *cc, const void *data, size_t len );
void shabal512_8x32_close( void *cc, void *dst );
#define shabal256_8way_context shabal256_8x32_context
#define shabal256_8way_init shabal256_8x32_init
#define shabal256_8way_update shabal256_8x32_update
#define shabal256_8way_close shabal256_8x32_close
#define shabal512_8way_context shabal512_8x32_context
#define shabal512_8way_init shabal512_8x32_init
#define shabal512_8way_update shabal512_8x32_update
#define shabal512_8way_close shabal512_8x32_close
#endif
@@ -70,59 +76,29 @@ typedef struct {
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_4way_context;
} shabal_4x32_context;
typedef shabal_4way_context shabal256_4way_context;
typedef shabal_4way_context shabal512_4way_context;
typedef shabal_4x32_context shabal256_4x32_context;
typedef shabal_4x32_context shabal512_4x32_context;
void shabal256_4way_init( void *cc );
void shabal256_4way_update( void *cc, const void *data, size_t len );
void shabal256_4way_close( void *cc, void *dst );
void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal256_4x32_init( void *cc );
void shabal256_4x32_update( void *cc, const void *data, size_t len );
void shabal256_4x32_close( void *cc, void *dst );
void shabal512_4way_init( void *cc );
void shabal512_4way_update( void *cc, const void *data, size_t len );
void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_4x32_init( void *cc );
void shabal512_4x32_update( void *cc, const void *data, size_t len );
void shabal512_4x32_close( void *cc, void *dst );
#define shabal256_4way_context shabal256_4x32_context
#define shabal256_4way_init shabal256_4x32_init
#define shabal256_4way_update shabal256_4x32_update
#define shabal256_4way_close shabal256_4x32_close
#define shabal512_4way_context shabal512_4x32_context
#define shabal512_4way_init shabal512_4x32_init
#define shabal512_4way_update shabal512_4x32_update
#define shabal512_4way_close shabal512_4x32_close
#endif
// SSE or NEON
/* No __mullo_pi32
typedef struct
{
v64_t buf[16] __attribute__ ((aligned (64)));
v64_t A[12], B[16], C[16];
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_2x32_context;
typedef shabal_2x32_context shabal256_2x32_context;
typedef shabal_2x32_context shabal512_2x32_context;
void shabal256_2x32_init( void *cc );
void shabal256_2x32_update( void *cc, const void *data, size_t len );
void shabal256_2x32_close( void *cc, void *dst );
void shabal256_2x32_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_2x32_init( shabal512_2x32_context *cc );
void shabal512_2x32_update( shabal512_2x32_context *cc, const void *data,
size_t len );
void shabal512_2x32_close( shabal512_2x32_context *cc, void *dst );
void shabal512_2x32_addbits_and_close( shabal512_2x32_context *cc,
unsigned ub, unsigned n, void *dst );
void shabal512_2x32_ctx( shabal512_2x32_context *cc, void *dst,
const void *data, size_t len );
void shabal512_2x32( shabal512_2x32_context *dst, const void *data,
size_t len );
*/
#endif

View File

@@ -6,23 +6,23 @@
#if defined (SKEIN_8WAY)
static __thread skein512_8way_context skein512_8way_ctx
static __thread skein512_8x64_context skein512_8x64_ctx
__attribute__ ((aligned (64)));
void skeinhash_8way( void *state, const void *input )
{
uint64_t vhash64[8*8] __attribute__ ((aligned (128)));
skein512_8way_context ctx_skein;
memcpy( &ctx_skein, &skein512_8way_ctx, sizeof( ctx_skein ) );
skein512_8x64_context ctx_skein;
memcpy( &ctx_skein, &skein512_8x64_ctx, sizeof( ctx_skein ) );
uint32_t vhash32[16*8] __attribute__ ((aligned (128)));
sha256_8way_context ctx_sha256;
sha256_8x32_context ctx_sha256;
skein512_8way_final16( &ctx_skein, vhash64, input + (64*8) );
skein512_8x64_final16( &ctx_skein, vhash64, input + (64*8) );
rintrlv_8x64_8x32( vhash32, vhash64, 512 );
sha256_8way_init( &ctx_sha256 );
sha256_8way_update( &ctx_sha256, vhash32, 64 );
sha256_8way_close( &ctx_sha256, state );
sha256_8x32_init( &ctx_sha256 );
sha256_8x32_update( &ctx_sha256, vhash32, 64 );
sha256_8x32_close( &ctx_sha256, state );
}
int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
@@ -46,7 +46,7 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
skein512_8way_prehash64( &skein512_8way_ctx, vdata );
skein512_8x64_prehash64( &skein512_8x64_ctx, vdata );
do
{
skeinhash_8way( hash, vdata );
@@ -73,14 +73,14 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
#elif defined (SKEIN_4WAY)
static __thread skein512_4way_context skein512_4way_ctx
static __thread skein512_4x64_context skein512_4x64_ctx
__attribute__ ((aligned (64)));
void skeinhash_4way( void *state, const void *input )
{
uint64_t vhash64[8*4] __attribute__ ((aligned (128)));
skein512_4way_context ctx_skein;
memcpy( &ctx_skein, &skein512_4way_ctx, sizeof( ctx_skein ) );
skein512_4x64_context ctx_skein;
memcpy( &ctx_skein, &skein512_4x64_ctx, sizeof( ctx_skein ) );
#if defined(__SHA__)
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
@@ -88,10 +88,10 @@ void skeinhash_4way( void *state, const void *input )
uint32_t hash3[16] __attribute__ ((aligned (64)));
#else
uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
sha256_4way_context ctx_sha256;
sha256_4x32_context ctx_sha256;
#endif
skein512_4way_final16( &ctx_skein, vhash64, input + (64*4) );
skein512_4x64_final16( &ctx_skein, vhash64, input + (64*4) );
#if defined(__SHA__)
@@ -107,9 +107,9 @@ void skeinhash_4way( void *state, const void *input )
#else
rintrlv_4x64_4x32( vhash32, vhash64, 512 );
sha256_4way_init( &ctx_sha256 );
sha256_4way_update( &ctx_sha256, vhash32, 64 );
sha256_4way_close( &ctx_sha256, state );
sha256_4x32_init( &ctx_sha256 );
sha256_4x32_update( &ctx_sha256, vhash32, 64 );
sha256_4x32_close( &ctx_sha256, state );
#endif
}
@@ -132,7 +132,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &skein512_4way_ctx, vdata );
skein512_4x64_prehash64( &skein512_4x64_ctx, vdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );

View File

@@ -513,7 +513,7 @@ do { \
#if defined(SIMD512)
void skein256_8way_init( skein256_8way_context *sc )
void skein256_8x64_init( skein256_8x64_context *sc )
{
sc->h0 = _mm512_set1_epi64( 0xCCD044A12FDB3E13 );
sc->h1 = _mm512_set1_epi64( 0xE83590301A79A9EB );
@@ -527,7 +527,7 @@ void skein256_8way_init( skein256_8way_context *sc )
sc->ptr = 0;
}
void skein512_8way_init( skein512_8way_context *sc )
void skein512_8x64_init( skein512_8x64_context *sc )
{
sc->h0 = _mm512_set1_epi64( 0x4903ADFF749C51CE );
sc->h1 = _mm512_set1_epi64( 0x0D95DE399746DF03 );
@@ -542,7 +542,7 @@ void skein512_8way_init( skein512_8way_context *sc )
}
static void
skein_big_core_8way( skein512_8way_context *sc, const void *data,
skein_big_core_8x64( skein512_8x64_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
@@ -587,7 +587,7 @@ skein_big_core_8way( skein512_8way_context *sc, const void *data,
}
static void
skein_big_close_8way( skein512_8way_context *sc, unsigned ub, unsigned n,
skein_big_close_8x64( skein512_8x64_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_len )
{
__m512i *buf;
@@ -621,7 +621,7 @@ skein_big_close_8way( skein512_8way_context *sc, unsigned ub, unsigned n,
memcpy_512( dst, buf, out_len >> 3 );
}
void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,
void skein512_8x64_full( skein512_8x64_context *sc, void *out, const void *data,
size_t len )
{
__m512i h0, h1, h2, h3, h4, h5, h6, h7;
@@ -698,7 +698,7 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,
}
void
skein512_8way_prehash64( skein512_8way_context *sc, const void *data )
skein512_8x64_prehash64( skein512_8x64_context *sc, const void *data )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf = sc->buf;
@@ -732,7 +732,7 @@ skein512_8way_prehash64( skein512_8way_context *sc, const void *data )
}
void
skein512_8way_final16( skein512_8way_context *sc, void *output,
skein512_8x64_final16( skein512_8x64_context *sc, void *output,
const void *data )
{
__m512i *in = (__m512i*)data;
@@ -778,34 +778,34 @@ skein512_8way_final16( skein512_8way_context *sc, void *output,
void
skein256_8way_update(void *cc, const void *data, size_t len)
skein256_8x64_update(void *cc, const void *data, size_t len)
{
skein_big_core_8way(cc, data, len);
skein_big_core_8x64(cc, data, len);
}
void
skein256_8way_close(void *cc, void *dst)
skein256_8x64_close(void *cc, void *dst)
{
skein_big_close_8way(cc, 0, 0, dst, 32);
skein_big_close_8x64(cc, 0, 0, dst, 32);
}
void
skein512_8way_update(void *cc, const void *data, size_t len)
skein512_8x64_update(void *cc, const void *data, size_t len)
{
skein_big_core_8way(cc, data, len);
skein_big_core_8x64(cc, data, len);
}
void
skein512_8way_close(void *cc, void *dst)
skein512_8x64_close(void *cc, void *dst)
{
skein_big_close_8way(cc, 0, 0, dst, 64);
skein_big_close_8x64(cc, 0, 0, dst, 64);
}
#endif // AVX512
#if defined(__AVX2__)
void skein256_4way_init( skein256_4way_context *sc )
void skein256_4x64_init( skein256_4x64_context *sc )
{
sc->h0 = _mm256_set1_epi64x( 0xCCD044A12FDB3E13 );
sc->h1 = _mm256_set1_epi64x( 0xE83590301A79A9EB );
@@ -819,7 +819,7 @@ void skein256_4way_init( skein256_4way_context *sc )
sc->ptr = 0;
}
void skein512_4way_init( skein512_4way_context *sc )
void skein512_4x64_init( skein512_4x64_context *sc )
{
sc->h0 = _mm256_set1_epi64x( 0x4903ADFF749C51CE );
sc->h1 = _mm256_set1_epi64x( 0x0D95DE399746DF03 );
@@ -835,7 +835,7 @@ void skein512_4way_init( skein512_4way_context *sc )
// Do not use for 128 bit data length
static void
skein_big_core_4way( skein512_4way_context *sc, const void *data,
skein_big_core_4x64( skein512_4x64_context *sc, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
@@ -882,7 +882,7 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data,
}
static void
skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
skein_big_close_4x64( skein512_4x64_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_len )
{
__m256i *buf;
@@ -920,7 +920,7 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
}
void
skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
skein512_4x64_full( skein512_4x64_context *sc, void *out, const void *data,
size_t len )
{
__m256i h0, h1, h2, h3, h4, h5, h6, h7;
@@ -995,7 +995,7 @@ skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
}
void
skein512_4way_prehash64( skein512_4way_context *sc, const void *data )
skein512_4x64_prehash64( skein512_4x64_context *sc, const void *data )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf = sc->buf;
@@ -1029,7 +1029,7 @@ skein512_4way_prehash64( skein512_4way_context *sc, const void *data )
}
void
skein512_4way_final16( skein512_4way_context *sc, void *out, const void *data )
skein512_4x64_final16( skein512_4x64_context *sc, void *out, const void *data )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf = sc->buf;
@@ -1073,29 +1073,29 @@ skein512_4way_final16( skein512_4way_context *sc, void *out, const void *data )
// Broken for 80 bytes, use prehash.
void
skein256_4way_update(void *cc, const void *data, size_t len)
skein256_4x64_update(void *cc, const void *data, size_t len)
{
skein_big_core_4way(cc, data, len);
skein_big_core_4x64(cc, data, len);
}
void
skein256_4way_close(void *cc, void *dst)
skein256_4x64_close(void *cc, void *dst)
{
skein_big_close_4way(cc, 0, 0, dst, 32);
skein_big_close_4x64(cc, 0, 0, dst, 32);
}
// Broken for 80 & 128 bytes, use prehash or full
void
skein512_4way_update(void *cc, const void *data, size_t len)
skein512_4x64_update(void *cc, const void *data, size_t len)
{
skein_big_core_4way(cc, data, len);
skein_big_core_4x64(cc, data, len);
}
void
skein512_4way_close(void *cc, void *dst)
skein512_4x64_close(void *cc, void *dst)
{
skein_big_close_4way(cc, 0, 0, dst, 64);
skein_big_close_4x64(cc, 0, 0, dst, 64);
}
#endif // AVX2
@@ -1231,7 +1231,7 @@ void skein512_2x64_init( skein512_2x64_context *sc )
}
static void
skein_big_core_2way( skein512_2x64_context *sc, const void *data,
skein_big_core_2x64( skein512_2x64_context *sc, const void *data,
size_t len )
{
v128u64_t *vdata = (v128u64_t*)data;
@@ -1278,7 +1278,7 @@ skein_big_core_2way( skein512_2x64_context *sc, const void *data,
}
static void
skein_big_close_2way( skein512_2x64_context *sc, unsigned ub, unsigned n,
skein_big_close_2x64( skein512_2x64_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_len )
{
v128u64_t *buf;
@@ -1471,13 +1471,13 @@ skein512_2x64_final16( skein512_2x64_context *sc, void *out, const void *data )
void
skein256_2x64_update(void *cc, const void *data, size_t len)
{
skein_big_core_2way(cc, data, len);
skein_big_core_2x64(cc, data, len);
}
void
skein256_2x64_close(void *cc, void *dst)
{
skein_big_close_2way(cc, 0, 0, dst, 32);
skein_big_close_2x64(cc, 0, 0, dst, 32);
}
@@ -1485,13 +1485,12 @@ skein256_2x64_close(void *cc, void *dst)
void
skein512_2x64_update(void *cc, const void *data, size_t len)
{
skein_big_core_2way(cc, data, len);
skein_big_core_2x64(cc, data, len);
}
void
skein512_2x64_close(void *cc, void *dst)
{
skein_big_close_2way(cc, 0, 0, dst, 64);
skein_big_close_2x64(cc, 0, 0, dst, 64);
}
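
A minimal sketch (hypothetical helper, not part of the diff), assuming an SSE2/NEON build where the 2x64 Skein declarations from the header apply: it runs one 64-byte block per lane through the renamed streaming API; for 80- or 128-byte inputs the comments on the wider paths recommend the prehash or _full entry points instead.

// Sketch only: two interleaved lanes of Skein-512 over one 64-byte block each.
// "skein512_2lanes" is a hypothetical name; init/update/close are the renamed
// 2x64 functions declared in this tree's skein headers.
static void skein512_2lanes( void *interleaved_digest,
                             const void *interleaved_msg )
{
    skein512_2x64_context ctx;
    skein512_2x64_init( &ctx );
    skein512_2x64_update( &ctx, interleaved_msg, 64 );   // 64 bytes per lane
    skein512_2x64_close( &ctx, interleaved_digest );     // 2 x 512-bit digests
}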

View File

@@ -52,24 +52,36 @@ typedef struct
__m512i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
uint64_t bcount;
} skein_8way_big_context __attribute__ ((aligned (128)));
} skein_8x64_big_context __attribute__ ((aligned (128)));
typedef skein_8way_big_context skein512_8way_context;
typedef skein_8way_big_context skein256_8way_context;
typedef skein_8x64_big_context skein512_8x64_context;
typedef skein_8x64_big_context skein256_8x64_context;
void skein512_8way_full( skein512_8way_context *sc, void *out,
void skein512_8x64_full( skein512_8x64_context *sc, void *out,
const void *data, size_t len );
void skein512_8way_init( skein512_8way_context *sc );
void skein512_8way_update( void *cc, const void *data, size_t len );
void skein512_8way_close( void *cc, void *dst );
void skein512_8x64_init( skein512_8x64_context *sc );
void skein512_8x64_update( void *cc, const void *data, size_t len );
void skein512_8x64_close( void *cc, void *dst );
void skein512_8way_prehash64( skein512_8way_context *sc, const void *data );
void skein512_8way_final16( skein512_8way_context *sc, void *out,
void skein512_8x64_prehash64( skein512_8x64_context *sc, const void *data );
void skein512_8x64_final16( skein512_8x64_context *sc, void *out,
const void *data );
void skein256_8way_init( skein256_8way_context *sc );
void skein256_8way_update( void *cc, const void *data, size_t len );
void skein256_8way_close( void *cc, void *dst );
void skein256_8x64_init( skein256_8x64_context *sc );
void skein256_8x64_update( void *cc, const void *data, size_t len );
void skein256_8x64_close( void *cc, void *dst );
#define skein512_8way_context skein512_8x64_context
#define skein512_8way_full skein512_8x64_full
#define skein512_8way_init skein512_8x64_init
#define skein512_8way_update skein512_8x64_update
#define skein512_8way_close skein512_8x64_close
#define skein512_8way_prehash64 skein512_8x64_prehash64
#define skein512_8way_final16 skein512_8x64_final16
#define skein256_8way_context skein256_8x64_context
#define skein256_8way_init skein256_8x64_init
#define skein256_8way_update skein256_8x64_update
#define skein256_8way_close skein256_8x64_close
#endif // AVX512
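
A minimal sketch (hypothetical wrapper), assuming an AVX-512 build and the 8x64 declarations above: the 80-byte block header is split so the invariant first 64 bytes go through prehash64 once and only the nonce-bearing last 16 bytes are reprocessed by final16. The scanhash loops in this diff use the same pattern but hoist the prehash out of the nonce loop and memcpy the saved context each iteration.

// Sketch only: "skein512_80byte_8lanes" is a hypothetical name; the input is an
// 8x64-interleaved 80-byte block, so the per-lane 64-byte boundary sits at
// byte offset 64*8 in the interleaved buffer.
static void skein512_80byte_8lanes( void *out, const void *interleaved80 )
{
    skein512_8x64_context ctx;
    skein512_8x64_prehash64( &ctx, interleaved80 );              // bytes 0..63
    skein512_8x64_final16( &ctx, out,
                           (const char*)interleaved80 + 64*8 );  // bytes 64..79
}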
@@ -81,25 +93,35 @@ typedef struct
__m256i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
uint64_t bcount;
} skein_4way_big_context __attribute__ ((aligned (128)));
} skein_4x64_big_context __attribute__ ((aligned (128)));
typedef skein_4way_big_context skein512_4way_context;
typedef skein_4way_big_context skein256_4way_context;
typedef skein_4x64_big_context skein512_4x64_context;
typedef skein_4x64_big_context skein256_4x64_context;
void skein512_4way_init( skein512_4way_context *sc );
void skein512_4way_full( skein512_4way_context *sc, void *out,
void skein512_4x64_init( skein512_4x64_context *sc );
void skein512_4x64_full( skein512_4x64_context *sc, void *out,
const void *data, size_t len );
void skein512_4way_update( void *cc, const void *data, size_t len );
void skein512_4way_close( void *cc, void *dst );
void skein256_4way_init( skein256_4way_context *sc );
void skein256_4way_update( void *cc, const void *data, size_t len );
void skein256_4way_close( void *cc, void *dst );
void skein512_4way_prehash64( skein512_4way_context *sc, const void *data );
void skein512_4way_final16( skein512_4way_context *sc, void *out,
void skein512_4x64_update( void *cc, const void *data, size_t len );
void skein512_4x64_close( void *cc, void *dst );
void skein512_4x64_prehash64( skein512_4x64_context *sc, const void *data );
void skein512_4x64_final16( skein512_4x64_context *sc, void *out,
const void *data );
void skein256_4x64_init( skein256_4x64_context *sc );
void skein256_4x64_update( void *cc, const void *data, size_t len );
void skein256_4x64_close( void *cc, void *dst );
#define skein512_4way_context skein512_4x64_context
#define skein512_4way_full skein512_4x64_full
#define skein512_4way_init skein512_4x64_init
#define skein512_4way_update skein512_4x64_update
#define skein512_4way_close skein512_4x64_close
#define skein512_4way_prehash64 skein512_4x64_prehash64
#define skein512_4way_final16 skein512_4x64_final16
#define skein256_4way_context skein256_4x64_context
#define skein256_4way_init skein256_4x64_init
#define skein256_4way_update skein256_4x64_update
#define skein256_4way_close skein256_4x64_close
#endif
@@ -109,10 +131,10 @@ typedef struct
v128u64_t h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
uint64_t bcount;
} skein_2way_big_context __attribute__ ((aligned (128)));
} skein_2x64_big_context __attribute__ ((aligned (128)));
typedef skein_2way_big_context skein512_2x64_context;
typedef skein_2way_big_context skein256_2x64_context;
typedef skein_2x64_big_context skein512_2x64_context;
typedef skein_2x64_big_context skein256_2x64_context;
void skein512_2x64_init( skein512_2x64_context *sc );
void skein512_2x64_full( skein512_2x64_context *sc, void *out,

View File

@@ -21,17 +21,17 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
__m512i *noncev = (__m512i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
skein512_8way_context ctx;
skein512_8x64_context ctx;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
skein512_8way_prehash64( &ctx, vdata );
skein512_8x64_prehash64( &ctx, vdata );
do
{
skein512_8way_final16( &ctx, hash, vdata + (16*8) );
skein512_8way_full( &ctx, hash, hash, 64 );
skein512_8x64_final16( &ctx, hash, vdata + (16*8) );
skein512_8x64_full( &ctx, hash, hash, 64 );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hashq3[ lane ] <= targq3 && !bench ) )
@@ -71,16 +71,16 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
skein512_4way_context ctx;
skein512_4x64_context ctx;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &ctx, vdata );
skein512_4x64_prehash64( &ctx, vdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
skein512_4way_final16( &ctx, hash, vdata + (16*4) );
skein512_4way_full( &ctx, hash, hash, 64 );
skein512_4x64_final16( &ctx, hash, vdata + (16*4) );
skein512_4x64_full( &ctx, hash, hash, 64 );
for ( int lane = 0; lane < 4; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )

View File

@@ -189,7 +189,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
v128_bswap32_80( edata, pdata );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = swab32(pdata[17]);
uint32_t ntime = bswap_32(pdata[17]);
if ( s_ntime != ntime )
{
hex_getAlgoString( (const uint32_t*) (&edata[1]), x16r_hash_order );

View File

@@ -31,18 +31,18 @@ void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order )
{
case JH:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
jh512_8way_init( &x16r_ctx.jh );
jh512_8way_update( &x16r_ctx.jh, vdata, 64 );
jh512_8x64_init( &x16r_ctx.jh );
jh512_8x64_update( &x16r_ctx.jh, vdata, 64 );
break;
case KECCAK:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
keccak512_8way_init( &x16r_ctx.keccak );
keccak512_8way_update( &x16r_ctx.keccak, vdata, 72 );
keccak512_8x64_init( &x16r_ctx.keccak );
keccak512_8x64_update( &x16r_ctx.keccak, vdata, 72 );
break;
case SKEIN:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
skein512_8way_init( &x16r_ctx.skein );
skein512_8way_update( &x16r_ctx.skein, vdata, 64 );
skein512_8x64_init( &x16r_ctx.skein );
skein512_8x64_update( &x16r_ctx.skein, vdata, 64 );
break;
case LUFFA:
{
@@ -78,8 +78,8 @@ void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order )
break;
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16r_ctx.hamsi );
hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 72 );
hamsi512_8x64_init( &x16r_ctx.hamsi );
hamsi512_8x64_update( &x16r_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
@@ -90,8 +90,8 @@ void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order )
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
shabal512_8way_init( &x16r_ctx.shabal );
shabal512_8way_update( &x16r_ctx.shabal, vdata2, 64 );
shabal512_8x32_init( &x16r_ctx.shabal );
shabal512_8x32_update( &x16r_ctx.shabal, vdata2, 64 );
rintrlv_8x32_8x64( vdata, vdata2, 640 );
break;
case WHIRLPOOL:
@@ -146,27 +146,27 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
{
case BLAKE:
if ( i == 0 )
blake512_8way_full( &ctx.blake, vhash, input, size );
blake512_8x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
blake512_8way_full( &ctx.blake, vhash, vhash, size );
blake512_8x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7, vhash );
break;
case BMW:
bmw512_8way_init( &ctx.bmw );
bmw512_8x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_8way_update( &ctx.bmw, input, size );
bmw512_8x64_update( &ctx.bmw, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
bmw512_8way_update( &ctx.bmw, vhash, size );
bmw512_8x64_update( &ctx.bmw, vhash, size );
}
bmw512_8way_close( &ctx.bmw, vhash );
bmw512_8x64_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -191,43 +191,43 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
break;
case JH:
if ( i == 0 )
jh512_8way_update( &ctx.jh, input + (64<<3), 16 );
jh512_8x64_update( &ctx.jh, input + (64<<3), 16 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, size );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, size );
}
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case KECCAK:
if ( i == 0 )
keccak512_8way_update( &ctx.keccak, input + (72<<3), 8 );
keccak512_8x64_update( &ctx.keccak, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, size );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, size );
}
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case SKEIN:
if ( i == 0 )
skein512_8way_update( &ctx.skein, input + (64<<3), 16 );
skein512_8x64_update( &ctx.skein, input + (64<<3), 16 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, size );
skein512_8x64_init( &ctx.skein );
skein512_8x64_update( &ctx.skein, vhash, size );
}
skein512_8way_close( &ctx.skein, vhash );
skein512_8x64_close( &ctx.skein, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -333,15 +333,15 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
break;
case HAMSI:
if ( i == 0 )
hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
hamsi512_8x64_update( &ctx.hamsi, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, size );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -388,13 +388,13 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
if ( i == 0 )
shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 );
shabal512_8x32_update( &ctx.shabal, vhash + (16<<3), 16 );
else
{
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, size );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, size );
}
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -438,16 +438,16 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid,
}
break;
case SHA_512:
sha512_8way_init( &ctx.sha512 );
sha512_8x64_init( &ctx.sha512 );
if ( i == 0 )
sha512_8way_update( &ctx.sha512, input, size );
sha512_8x64_update( &ctx.sha512, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
sha512_8way_update( &ctx.sha512, vhash, size );
sha512_8x64_update( &ctx.sha512, vhash, size );
}
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -556,17 +556,17 @@ void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order )
{
case JH:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
jh512_4way_init( &x16r_ctx.jh );
jh512_4way_update( &x16r_ctx.jh, vdata, 64 );
jh512_4x64_init( &x16r_ctx.jh );
jh512_4x64_update( &x16r_ctx.jh, vdata, 64 );
break;
case KECCAK:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
keccak512_4way_init( &x16r_ctx.keccak );
keccak512_4way_update( &x16r_ctx.keccak, vdata, 72 );
keccak512_4x64_init( &x16r_ctx.keccak );
keccak512_4x64_update( &x16r_ctx.keccak, vdata, 72 );
break;
case SKEIN:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &x16r_ctx.skein, vdata );
skein512_4x64_prehash64( &x16r_ctx.skein, vdata );
break;
case LUFFA:
{
@@ -599,8 +599,8 @@ void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order )
break;
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16r_ctx.hamsi );
hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 72 );
hamsi512_4x64_init( &x16r_ctx.hamsi );
hamsi512_4x64_update( &x16r_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
@@ -610,8 +610,8 @@ void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order )
break;
case SHABAL:
v128_bswap32_intrlv80_4x32( vdata2, pdata );
shabal512_4way_init( &x16r_ctx.shabal );
shabal512_4way_update( &x16r_ctx.shabal, vdata2, 64 );
shabal512_4x32_init( &x16r_ctx.shabal );
shabal512_4x32_update( &x16r_ctx.shabal, vdata2, 64 );
rintrlv_4x32_4x64( vdata, vdata2, 640 );
break;
case WHIRLPOOL:
@@ -652,24 +652,24 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
{
case BLAKE:
if ( i == 0 )
blake512_4way_full( &ctx.blake, vhash, input, size );
blake512_4x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
blake512_4way_full( &ctx.blake, vhash, vhash, size );
blake512_4x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case BMW:
bmw512_4way_init( &ctx.bmw );
bmw512_4x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_4way_update( &ctx.bmw, input, size );
bmw512_4x64_update( &ctx.bmw, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
bmw512_4way_update( &ctx.bmw, vhash, size );
bmw512_4x64_update( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_close( &ctx.bmw, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case GROESTL:
@@ -689,35 +689,35 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
break;
case JH:
if ( i == 0 )
jh512_4way_update( &ctx.jh, input + (64<<2), 16 );
jh512_4x64_update( &ctx.jh, input + (64<<2), 16 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, size );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_close( &ctx.jh, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case KECCAK:
if ( i == 0 )
keccak512_4way_update( &ctx.keccak, input + (72<<2), 8 );
keccak512_4x64_update( &ctx.keccak, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, size );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, size );
}
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case SKEIN:
if ( i == 0 )
skein512_4way_final16( &ctx.skein, vhash, input + (64*4) );
skein512_4x64_final16( &ctx.skein, vhash, input + (64*4) );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way_full( &ctx.skein, vhash, vhash, size );
skein512_4x64_full( &ctx.skein, vhash, vhash, size );
}
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
@@ -809,14 +809,14 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
break;
case HAMSI:
if ( i == 0 )
hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
hamsi512_4x64_update( &ctx.hamsi, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, size );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case FUGUE:
@@ -845,13 +845,13 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
if ( i == 0 )
shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 );
shabal512_4x32_update( &ctx.shabal, vhash + (16<<2), 16 );
else
{
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, size );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, size );
}
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
break;
case WHIRLPOOL:
@@ -878,16 +878,16 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid,
}
break;
case SHA_512:
sha512_4way_init( &ctx.sha512 );
sha512_4x64_init( &ctx.sha512 );
if ( i == 0 )
sha512_4way_update( &ctx.sha512, input, size );
sha512_4x64_update( &ctx.sha512, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, size );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, size );
}
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_close( &ctx.sha512, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
}

View File

@@ -125,19 +125,19 @@ bool register_x21s__algo( algo_gate_t* gate );
union _x16r_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
sha512_8x64_context sha512;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -170,8 +170,8 @@ int scanhash_x16r_8way( struct work *, uint32_t ,
union _x16r_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
shavite512_2way_context shavite;
@@ -181,17 +181,17 @@ union _x16r_4way_context_overlay
shavite512_context shavite;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
sha512_4x64_context sha512;
} __attribute__ ((aligned (64)));
#define _x16r_4x64_context_overlay _x16r_4way_context_overlay

View File

@@ -20,7 +20,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
v128_bswap32_80( edata, pdata );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80;
uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80;
if ( s_ntime != masked_ntime )
{
x16rt_getTimeHash( masked_ntime, &timeHash );
@@ -28,7 +28,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
s_ntime = masked_ntime;
if ( !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
x16r_hash_order, swab32( pdata[17] ), timeHash );
x16r_hash_order, bswap_32( pdata[17] ), timeHash );
}
x16r_prehash( edata, pdata, x16r_hash_order );
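
For context, a minimal sketch (hypothetical helper and values) of the ntime masking used above: clearing the low seven bits of the byte-swapped timestamp groups it into 128-second windows, so the hash order and time hash only need recomputing when the window changes.

#include <stdint.h>

// Sketch only: swap32 stands in for whatever byte-swap helper the build uses.
static uint32_t swap32( uint32_t x )
{
    return (x >> 24) | ((x >> 8) & 0x0000ff00)
         | ((x << 8) & 0x00ff0000) | (x << 24);
}

static int time_window_changed( uint32_t prev_masked, uint32_t ntime_le )
{
    return ( swap32( ntime_le ) & 0xffffff80 ) != prev_masked;  // 128 s windows
}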

View File

@@ -14,19 +14,19 @@
union _x16rv2_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cubehashParam cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
sha512_8x64_context sha512;
sph_tiger_context tiger;
#if defined(__VAES__)
groestl512_4way_context groestl;
@@ -76,29 +76,29 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
switch ( algo )
{
case BLAKE:
blake512_8way_init( &ctx.blake );
blake512_8x64_init( &ctx.blake );
if ( i == 0 )
blake512_8way_full( &ctx.blake, vhash, input, size );
blake512_8x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
blake512_8way_full( &ctx.blake, vhash, vhash, size );
blake512_8x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7, vhash );
break;
case BMW:
bmw512_8way_init( &ctx.bmw );
bmw512_8x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_8way_update( &ctx.bmw, input, size );
bmw512_8x64_update( &ctx.bmw, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
bmw512_8way_update( &ctx.bmw, vhash, size );
bmw512_8x64_update( &ctx.bmw, vhash, size );
}
bmw512_8way_close( &ctx.bmw, vhash );
bmw512_8x64_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -123,15 +123,15 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
break;
case JH:
if ( i == 0 )
jh512_8way_update( &ctx.jh, input + (64<<3), 16 );
jh512_8x64_update( &ctx.jh, input + (64<<3), 16 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, size );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, size );
}
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -165,30 +165,30 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
}
else
{
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in2, size );
sph_tiger_close( &ctx.tiger, hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in3, size );
sph_tiger_close( &ctx.tiger, hash3 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in4, size );
sph_tiger_close( &ctx.tiger, hash4 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in5, size );
sph_tiger_close( &ctx.tiger, hash5 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in6, size );
sph_tiger_close( &ctx.tiger, hash6 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in7, size );
sph_tiger_close( &ctx.tiger, hash7 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in2, size );
sph_tiger_close( &ctx.tiger, hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in3, size );
sph_tiger_close( &ctx.tiger, hash3 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in4, size );
sph_tiger_close( &ctx.tiger, hash4 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in5, size );
sph_tiger_close( &ctx.tiger, hash5 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in6, size );
sph_tiger_close( &ctx.tiger, hash6 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in7, size );
sph_tiger_close( &ctx.tiger, hash7 );
}
for ( int i = (24/4); i < (64/4); i++ )
@@ -197,23 +197,23 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case SKEIN:
if ( i == 0 )
skein512_8way_update( &ctx.skein, input + (64<<3), 16 );
skein512_8x64_update( &ctx.skein, input + (64<<3), 16 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, size );
skein512_8x64_init( &ctx.skein );
skein512_8x64_update( &ctx.skein, vhash, size );
}
skein512_8way_close( &ctx.skein, vhash );
skein512_8x64_close( &ctx.skein, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -395,16 +395,16 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
hamsi512_8x64_update( &ctx.hamsi, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, size );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -451,13 +451,13 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
if ( i == 0 )
shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 );
shabal512_8x32_update( &ctx.shabal, vhash + (16<<3), 16 );
else
{
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, size );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, size );
}
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -562,9 +562,9 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
@@ -623,8 +623,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
{
case JH:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
jh512_8way_init( &x16rv2_ctx.jh );
jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 );
jh512_8x64_init( &x16rv2_ctx.jh );
jh512_8x64_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
@@ -637,8 +637,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
break;
case SKEIN:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
skein512_8way_init( &x16rv2_ctx.skein );
skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 );
skein512_8x64_init( &x16rv2_ctx.skein );
skein512_8x64_update( &x16rv2_ctx.skein, vdata, 64 );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
@@ -649,8 +649,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
break;
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16rv2_ctx.hamsi );
hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 72 );
hamsi512_8x64_init( &x16rv2_ctx.hamsi );
hamsi512_8x64_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
@@ -661,8 +661,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
shabal512_8way_init( &x16rv2_ctx.shabal );
shabal512_8way_update( &x16rv2_ctx.shabal, vdata2, 64 );
shabal512_8x32_init( &x16rv2_ctx.shabal );
shabal512_8x32_update( &x16rv2_ctx.shabal, vdata2, 64 );
rintrlv_8x32_8x64( vdata, vdata2, 640 );
break;
case WHIRLPOOL:
@@ -701,8 +701,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
union _x16rv2_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
shavite512_2way_context shavite;
@@ -712,17 +712,17 @@ union _x16rv2_4way_context_overlay
shavite512_context shavite;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
sha512_4x64_context sha512;
sph_tiger_context tiger;
};
typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
@@ -761,24 +761,24 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
{
case BLAKE:
if ( i == 0 )
blake512_4way_full( &ctx.blake, vhash, input, size );
blake512_4x64_full( &ctx.blake, vhash, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
blake512_4way_full( &ctx.blake, vhash, vhash, size );
blake512_4x64_full( &ctx.blake, vhash, vhash, size );
}
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case BMW:
bmw512_4way_init( &ctx.bmw );
bmw512_4x64_init( &ctx.bmw );
if ( i == 0 )
bmw512_4way_update( &ctx.bmw, input, size );
bmw512_4x64_update( &ctx.bmw, input, size );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
bmw512_4way_update( &ctx.bmw, vhash, size );
bmw512_4x64_update( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_close( &ctx.bmw, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case GROESTL:
@@ -798,14 +798,14 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
break;
case JH:
if ( i == 0 )
jh512_4way_update( &ctx.jh, input + (64<<2), 16 );
jh512_4x64_update( &ctx.jh, input + (64<<2), 16 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, size );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_close( &ctx.jh, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case KECCAK:
@@ -842,20 +842,20 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0;
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case SKEIN:
if ( i == 0 )
skein512_4way_final16( &ctx.skein, vhash, input + (64*4) );
skein512_4x64_final16( &ctx.skein, vhash, input + (64*4) );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, vhash, size );
skein512_4way_close( &ctx.skein, vhash );
skein512_4x64_init( &ctx.skein );
skein512_4x64_update( &ctx.skein, vhash, size );
skein512_4x64_close( &ctx.skein, vhash );
}
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
@@ -976,14 +976,14 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
hamsi512_4x64_update( &ctx.hamsi, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, size );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, size );
}
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case FUGUE:
@@ -1012,13 +1012,13 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
if ( i == 0 )
shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 );
shabal512_4x32_update( &ctx.shabal, vhash + (16<<2), 16 );
else
{
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, size );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, size );
}
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
break;
case WHIRLPOOL:
@@ -1078,9 +1078,9 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0;
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
}
@@ -1133,8 +1133,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
{
case JH:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
jh512_4way_init( &x16rv2_ctx.jh );
jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 );
jh512_4x64_init( &x16rv2_ctx.jh );
jh512_4x64_update( &x16rv2_ctx.jh, vdata, 64 );
break;
case KECCAK:
case LUFFA:
@@ -1146,7 +1146,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
break;
case SKEIN:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
skein512_4way_prehash64( &x16rv2_ctx.skein, vdata );
skein512_4x64_prehash64( &x16rv2_ctx.skein, vdata );
break;
case CUBEHASH:
v128_bswap32_80( edata, pdata );
@@ -1156,8 +1156,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
break;
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16rv2_ctx.hamsi );
hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 72 );
hamsi512_4x64_init( &x16rv2_ctx.hamsi );
hamsi512_4x64_update( &x16rv2_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
@@ -1167,8 +1167,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
break;
case SHABAL:
v128_bswap32_intrlv80_4x32( vdata32, pdata );
shabal512_4way_init( &x16rv2_ctx.shabal );
shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 );
shabal512_4x32_init( &x16rv2_ctx.shabal );
shabal512_4x32_update( &x16rv2_ctx.shabal, vdata32, 64 );
rintrlv_4x32_4x64( vdata, vdata32, 640 );
break;
case WHIRLPOOL:

View File

@@ -168,7 +168,7 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
static __thread uint32_t s_ntime = UINT32_MAX;
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
uint32_t ntime = bswap_32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*) (&edata[1]), x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )

View File

@@ -21,10 +21,10 @@ static __thread uint64_t* x21s_8way_matrix;
union _x21s_8way_context_overlay
{
haval256_5_8way_context haval;
haval256_8x32_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
sha256_8way_context sha256;
sha256_8x32_context sha256;
} __attribute__ ((aligned (64)));
typedef union _x21s_8way_context_overlay x21s_8way_context_overlay;
@@ -50,9 +50,9 @@ int x21s_8way_hash( void* output, const void* input, int thrid )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhash, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhash, 64 );
haval256_8x32_close( &ctx.haval, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
@@ -122,9 +122,9 @@ int x21s_8way_hash( void* output, const void* input, int thrid )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
sha256_8way_init( &ctx.sha256 );
sha256_8way_update( &ctx.sha256, vhash, 64 );
sha256_8way_close( &ctx.sha256, output );
sha256_8x32_init( &ctx.sha256 );
sha256_8x32_update( &ctx.sha256, vhash, 64 );
sha256_8x32_close( &ctx.sha256, output );
return 1;
}
@@ -202,11 +202,11 @@ static __thread uint64_t* x21s_4way_matrix;
union _x21s_4way_context_overlay
{
haval256_5_4way_context haval;
haval256_4x32_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
#if !defined(__SHA__)
sha256_4way_context sha256;
sha256_4x32_context sha256;
#endif
} __attribute__ ((aligned (64)));
@@ -228,9 +228,9 @@ int x21s_4way_hash( void* output, const void* input, int thrid )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhash, 64 );
haval256_5_4way_close( &ctx.haval, vhash );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhash, 64 );
haval256_4x32_close( &ctx.haval, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -279,9 +279,9 @@ int x21s_4way_hash( void* output, const void* input, int thrid )
#else
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way_init( &ctx.sha256 );
sha256_4way_update( &ctx.sha256, vhash, 64 );
sha256_4way_close( &ctx.sha256, vhash );
sha256_4x32_init( &ctx.sha256 );
sha256_4x32_update( &ctx.sha256, vhash, 64 );
sha256_4x32_close( &ctx.sha256, vhash );
dintrlv_4x32( output, output+32, output+64,output+96, vhash, 256 );
#endif

View File

@@ -78,7 +78,7 @@ int scanhash_x21s( struct work *work, uint32_t max_nonce,
static __thread uint32_t s_ntime = UINT32_MAX;
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
uint32_t ntime = bswap_32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
s_ntime = ntime;
if ( opt_debug && !thr_id )

View File

@@ -31,20 +31,20 @@
union _sonoa_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -75,9 +75,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
// 1
blake512_8way_full( &ctx.blake, vhash, input, 80 );
blake512_8x64_full( &ctx.blake, vhash, input, 80 );
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -107,15 +107,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -189,7 +189,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
if ( work_restart[thr_id].restart ) return 0;
// 2
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -219,15 +219,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -298,14 +298,14 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
if ( work_restart[thr_id].restart ) return 0;
// 3
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -335,17 +335,17 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
skein512_8x64_init( &ctx.skein );
skein512_8x64_update( &ctx.skein, vhash, 64 );
skein512_8x64_close( &ctx.skein, vhash );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -416,9 +416,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -438,7 +438,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -468,15 +468,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -547,9 +547,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -566,15 +566,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
rintrlv_8x32_8x64( vhashA, vhash, 512 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhashA, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhashA, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
#if defined(__VAES__)
@@ -633,13 +633,13 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
if ( work_restart[thr_id].restart ) return 0;
// 5
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
rintrlv_8x64_8x32( vhashA, vhash, 512 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhashA, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhashA, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
#if defined(__VAES__)
@@ -669,15 +669,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -748,9 +748,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -767,9 +767,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -789,7 +789,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -819,15 +819,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -898,9 +898,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -917,9 +917,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -936,9 +936,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -958,7 +958,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -988,15 +988,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -1067,9 +1067,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -1086,9 +1086,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -1105,15 +1105,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
rintrlv_8x64_8x32( vhashA, vhash, 512 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, 64 );
haval256_5_8way_close( &ctx.haval, state );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, 64 );
haval256_8x32_close( &ctx.haval, state );
return 1;
}
@@ -1122,8 +1122,8 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id )
union _sonoa_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
echo512_2way_context echo;
@@ -1131,19 +1131,19 @@ union _sonoa_4way_context_overlay
hashState_groestl groestl;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
};
typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;
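
The *_context_overlay unions above and below exist because the pipeline stages run strictly one after another, so only one hash context is ever live at a time; overlaying them in a union keeps the stack footprint at the size of the largest context instead of the sum of all of them. A minimal sketch of the idea, assuming the project's headers; the union and function names here are hypothetical, only the per-stage call signatures are taken from this diff.

union tiny_overlay                      /* hypothetical two-member overlay */
{
    jh512_4x64_context     jh;
    keccak512_4x64_context keccak;
};

static void two_stages( void *vhash )   /* illustrative only */
{
    union tiny_overlay ctx;
    jh512_4x64_init( &ctx.jh );
    jh512_4x64_update( &ctx.jh, vhash, 64 );
    jh512_4x64_close( &ctx.jh, vhash );       /* jh is finished, its storage can be reused */
    keccak512_4x64_init( &ctx.keccak );       /* keccak overlays the same bytes */
    keccak512_4x64_update( &ctx.keccak, vhash, 64 );
    keccak512_4x64_close( &ctx.keccak, vhash );
}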
@@ -1161,11 +1161,11 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
// 1
blake512_4way_full( &ctx.blake, vhash, input, 80 );
blake512_4x64_full( &ctx.blake, vhash, input, 80 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -1189,15 +1189,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1241,9 +1241,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
if ( work_restart[thr_id].restart ) return 0;
// 2
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -1267,15 +1267,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1316,16 +1316,16 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
if ( work_restart[thr_id].restart ) return 0;
// 3
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -1349,15 +1349,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1398,9 +1398,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1413,9 +1413,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
// 4
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -1439,15 +1439,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1488,9 +1488,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1501,15 +1501,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
rintrlv_4x32_4x64( vhashB, vhash, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhashB, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhashB, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
#if defined(__VAES__)
@@ -1545,15 +1545,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
// 5
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
rintrlv_4x64_4x32( vhashB, vhash, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhashB, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhashB, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
#if defined(__VAES__)
@@ -1580,15 +1580,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1629,9 +1629,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1642,9 +1642,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
@@ -1658,9 +1658,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -1684,15 +1684,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1733,9 +1733,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1746,9 +1746,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
@@ -1759,9 +1759,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1775,9 +1775,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -1801,15 +1801,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1850,9 +1850,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1863,9 +1863,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
@@ -1876,15 +1876,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
rintrlv_4x64_4x32( vhashB, vhash, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashB, 64 );
haval256_4x32_close( &ctx.haval, state );
return 1;
}

View File

@@ -31,11 +31,11 @@
union _x17_16way_context_overlay
{
blake512_8way_context blake;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_2buf_context cube;
#if defined(__VAES__)
@@ -48,17 +48,17 @@ union _x17_16way_context_overlay
hashState_echo echo;
#endif
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_16way_context shabal;
shabal512_16x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_16way_context haval;
sha512_8x64_context sha512;
haval256_16x32_context haval;
} __attribute__ ((aligned (64)));
typedef union _x17_16way_context_overlay x17_16way_context_overlay;
static __thread __m512i x17_16way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
static __thread blake512_8x64_context blake512_8x64_ctx __attribute__((aligned(64)));
int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
int thr_id )
@@ -85,13 +85,10 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
uint64_t hash15[8] __attribute__ ((aligned (32)));
x17_16way_context_overlay ctx;
memcpy( &ctx.blake, &blake512_8way_ctx, sizeof (blake512_8way_ctx) );
blake512_8way_final_le( &blake512_8way_ctx, vhashA, nonceA,
memcpy( &ctx.blake, &blake512_8x64_ctx, sizeof (blake512_8x64_ctx) );
blake512_8x64_final_le( &blake512_8x64_ctx, vhashA, nonceA,
x17_16way_midstate );
blake512_8way_final_le( &ctx.blake, vhashB, nonceB,
blake512_8x64_final_le( &ctx.blake, vhashB, nonceB,
x17_16way_midstate );
bmw512_8x64_full( &ctx.bmw, vhashA, vhashA, 64 );
@@ -140,22 +137,22 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
#endif
skein512_8way_full( &ctx.skein, vhashA, vhashA, 64 );
skein512_8way_full( &ctx.skein, vhashB, vhashB, 64 );
skein512_8x64_full( &ctx.skein, vhashA, vhashA, 64 );
skein512_8x64_full( &ctx.skein, vhashB, vhashB, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhashA, 64 );
jh512_8way_close( &ctx.jh, vhashA );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhashB, 64 );
jh512_8way_close( &ctx.jh, vhashB );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhashA, 64 );
jh512_8x64_close( &ctx.jh, vhashA );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhashB, 64 );
jh512_8x64_close( &ctx.jh, vhashB );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhashA, 64 );
keccak512_8way_close( &ctx.keccak, vhashA );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhashB, 64 );
keccak512_8way_close( &ctx.keccak, vhashB );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhashA, 64 );
keccak512_8x64_close( &ctx.keccak, vhashA );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhashB, 64 );
keccak512_8x64_close( &ctx.keccak, vhashB );
//
rintrlv_8x64_4x128( vhashC, vhashD, vhashA, 512 );
@@ -310,18 +307,17 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
*/
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhashA, 64 );
hamsi512_8way_close( &ctx.hamsi, vhashA );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhashA, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhashA );
dintrlv_8x64_512( hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07, vhashA );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhashB, 64 );
hamsi512_8way_close( &ctx.hamsi, vhashB );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhashB, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhashB );
dintrlv_8x64_512( hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15, vhashB );
fugue512_full( &ctx.fugue, hash00, hash00, 64 );
fugue512_full( &ctx.fugue, hash01, hash01, 64 );
fugue512_full( &ctx.fugue, hash02, hash02, 64 );
@@ -344,9 +340,9 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15 );
shabal512_16way_init( &ctx.shabal );
shabal512_16way_update( &ctx.shabal, vhashA, 64 );
shabal512_16way_close( &ctx.shabal, vhashA );
shabal512_16x32_init( &ctx.shabal );
shabal512_16x32_update( &ctx.shabal, vhashA, 64 );
shabal512_16x32_close( &ctx.shabal, vhashA );
dintrlv_16x32_512( hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07,
@@ -375,12 +371,12 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
intrlv_8x64_512( vhashB, hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhashA, 64 );
sha512_8way_close( &ctx.sha512, vhashA );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhashB, 64 );
sha512_8way_close( &ctx.sha512, vhashB );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhashA, 64 );
sha512_8x64_close( &ctx.sha512, vhashA );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhashB, 64 );
sha512_8x64_close( &ctx.sha512, vhashB );
dintrlv_8x64_512( hash00, hash01, hash02, hash03,
hash04, hash05, hash06, hash07, vhashA );
@@ -391,9 +387,9 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB,
hash08, hash09, hash10, hash11,
hash12, hash13, hash14, hash15 );
haval256_5_16way_init( &ctx.haval );
haval256_5_16way_update( &ctx.haval, vhashA, 64 );
haval256_5_16way_close( &ctx.haval, state );
haval256_16x32_init( &ctx.haval );
haval256_16x32_update( &ctx.haval, vhashA, 64 );
haval256_16x32_close( &ctx.haval, state );
return 1;
}
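
The blake512 calls in this function pair with the scanhash below: blake512_8x64_prehash_le hashes the nonce-independent part of the 80-byte block header once per work unit and stores the result in the thread-local midstate, so blake512_8x64_final_le only has to process the final block for each batch of nonces. A minimal sketch of that split, assuming AVX-512 and the project's headers; the loop structure and variable names are illustrative, while the two blake512 signatures match the calls in this diff.

static __thread __m512i midstate[16] __attribute__((aligned(64)));
static __thread blake512_8x64_context blake_ctx __attribute__((aligned(64)));

static void scan_sketch( void *vdata, void *vhash, __m512i nonces )
{
    /* once per work unit: hash everything that does not depend on the nonce */
    blake512_8x64_prehash_le( &blake_ctx, midstate, vdata );
    /* per batch of 8 nonces: finish blake512 from the stored midstate */
    blake512_8x64_final_le( &blake_ctx, vhash, nonces, midstate );
    /* ... the remaining x17 stages then run on vhash as shown above ... */
}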
@@ -425,7 +421,7 @@ int scanhash_x17_16x32( struct work *work, uint32_t max_nonce,
edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
blake512_8way_prehash_le( &blake512_8way_ctx, x17_16way_midstate, vdata );
blake512_8x64_prehash_le( &blake512_8x64_ctx, x17_16way_midstate, vdata );
nonceA = _mm512_add_epi32( casti_m512i( vdata, 9 ),
_mm512_set_epi64( 7, 6, 5, 4, 3, 2, 1, 0 ) );
@@ -456,11 +452,11 @@ int scanhash_x17_16x32( struct work *work, uint32_t max_nonce,
union _x17_8way_context_overlay
{
blake512_8way_context blake;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_2buf_context cube;
#if defined(__VAES__)
@@ -473,17 +469,17 @@ union _x17_8way_context_overlay
hashState_echo echo;
#endif
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
} __attribute__ ((aligned (64)));
typedef union _x17_8way_context_overlay x17_8way_context_overlay;
static __thread __m512i x17_8way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
static __thread blake512_8x64_context blake512_8x64_ctx __attribute__((aligned(64)));
int x17_8x64_hash( void *state, const void *input, int thr_id )
{
@@ -500,7 +496,7 @@ int x17_8x64_hash( void *state, const void *input, int thr_id )
uint64_t hash7[8] __attribute__ ((aligned (32)));
x17_8way_context_overlay ctx;
blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
blake512_8x64_final_le( &blake512_8x64_ctx, vhash, casti_m512i( input, 9 ),
x17_8way_midstate );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
@@ -533,15 +529,15 @@ int x17_8x64_hash( void *state, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -611,9 +607,9 @@ int x17_8x64_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -629,9 +625,9 @@ int x17_8x64_hash( void *state, const void *input, int thr_id )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -648,15 +644,15 @@ int x17_8x64_hash( void *state, const void *input, int thr_id )
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
rintrlv_8x64_8x32( vhashA, vhash, 512 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, 64 );
haval256_5_8way_close( &ctx.haval, state );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, 64 );
haval256_8x32_close( &ctx.haval, state );
return 1;
}
@@ -690,7 +686,7 @@ int scanhash_x17_8x64( struct work *work, uint32_t max_nonce,
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
0,7, 0,6, 0,5, 0,4, 0,3, 0,2, 0,1, 0,0 ) );
blake512_8way_prehash_le( &blake512_8way_ctx, x17_8way_midstate, vdata );
blake512_8x64_prehash_le( &blake512_8x64_ctx, x17_8way_midstate, vdata );
do
{
@@ -717,7 +713,7 @@ int scanhash_x17_8x64( struct work *work, uint32_t max_nonce,
union _x17_4way_context_overlay
{
blake512_4way_context blake;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
@@ -726,24 +722,24 @@ union _x17_4way_context_overlay
hashState_groestl groestl;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
};
typedef union _x17_4way_context_overlay x17_4way_context_overlay;
static __thread __m256i x17_4way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
static __thread blake512_4x64_context blake512_4x64_ctx __attribute__((aligned(64)));
int x17_4x64_hash( void *state, const void *input, int thr_id )
{
@@ -756,11 +752,9 @@ int x17_4x64_hash( void *state, const void *input, int thr_id )
uint64_t hash3[8] __attribute__ ((aligned (32)));
x17_4way_context_overlay ctx;
blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
blake512_4x64_final_le( &blake512_4x64_ctx, vhash, casti_m256i( input, 9 ),
x17_4way_midstate );
// blake512_4way_full( &ctx.blake, vhash, input, 80 );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
@@ -789,13 +783,13 @@ int x17_4x64_hash( void *state, const void *input, int thr_id )
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -836,9 +830,9 @@ int x17_4x64_hash( void *state, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -849,9 +843,9 @@ int x17_4x64_hash( void *state, const void *input, int thr_id )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
@@ -862,15 +856,15 @@ int x17_4x64_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
rintrlv_4x64_4x32( vhashB, vhash, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashB, 64 );
haval256_4x32_close( &ctx.haval, state );
return 1;
}
@@ -903,7 +897,7 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
mm256_intrlv80_4x64( vdata, edata );
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32( 0,3,0,2, 0,1,0,0 ) );
blake512_4way_prehash_le( &blake512_4way_ctx, x17_4way_midstate, vdata );
blake512_4x64_prehash_le( &blake512_4x64_ctx, x17_4way_midstate, vdata );
do
{

View File

@@ -6,10 +6,8 @@
#if defined(SIMD512)
#define X17_8WAY 1
// #define X17_16X32 1
#elif defined(__AVX2__) && defined(__AES__)
#define X17_4WAY 1
#define X17_8X32 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X17_2X64 1
#endif
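
These compile-time width macros select which of the implementations in this diff gets built: SIMD512 (AVX-512) enables the 8x64 path, the AVX2/AES branch the 4-lane variants, and plain SSE2 or NEON falls back to 2x64. Below is a hedged sketch of the kind of dispatch they drive; the macro-to-function pairing is assumed and the *_impl aliases are hypothetical, only x17_8x64_hash, scanhash_x17_8x64, x17_4x64_hash and scanhash_x17_4x64 are taken from this diff.

/* Illustrative dispatch only, not the project's actual gate wiring. */
#if defined(X17_8WAY)
  #define x17_hash_impl      x17_8x64_hash       /* AVX-512: 8 lanes of 64-bit words */
  #define x17_scanhash_impl  scanhash_x17_8x64
#elif defined(X17_4WAY)
  #define x17_hash_impl      x17_4x64_hash       /* AVX2: 4 lanes of 64-bit words */
  #define x17_scanhash_impl  scanhash_x17_4x64
#endif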

View File

@@ -31,20 +31,20 @@
union _xevan_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -73,10 +73,10 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
const int dataLen = 128;
xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));
blake512_8way_full( &ctx.blake, vhash, input, 80 );
blake512_8x64_full( &ctx.blake, vhash, input, 80 );
memset( &vhash[8<<3], 0, 64<<3 );
bmw512_8way_full( &ctx.bmw, vhash, vhash, dataLen );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, dataLen );
#if defined(__VAES__)
@@ -106,15 +106,15 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, dataLen );
skein512_8x64_full( &ctx.skein, vhash, vhash, dataLen );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, dataLen );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, dataLen );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, dataLen );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, dataLen );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
@@ -185,9 +185,9 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
@@ -204,9 +204,9 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, dataLen );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, dataLen );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
@@ -223,23 +223,23 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, dataLen );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, dataLen );
sha512_8x64_close( &ctx.sha512, vhash );
rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
haval256_5_8way_close( &ctx.haval, vhashA );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, dataLen );
haval256_8x32_close( &ctx.haval, vhashA );
rintrlv_8x32_8x64( vhash, vhashA, dataLen<<3 );
memset( &vhash[ 4<<3 ], 0, (dataLen-32) << 3 );
blake512_8way_full( &ctx.blake, vhash, vhash, dataLen );
blake512_8x64_full( &ctx.blake, vhash, vhash, dataLen );
bmw512_8way_full( &ctx.bmw, vhash, vhash, dataLen );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, dataLen );
#if defined(__VAES__)
@@ -269,15 +269,15 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, dataLen );
skein512_8x64_full( &ctx.skein, vhash, vhash, dataLen );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, dataLen );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, dataLen );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, dataLen );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, dataLen );
keccak512_8x64_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
@@ -348,9 +348,9 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
#endif
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
@@ -367,9 +367,9 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, dataLen );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, dataLen );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
@@ -386,15 +386,15 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, dataLen );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, dataLen );
sha512_8x64_close( &ctx.sha512, vhash );
rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
haval256_5_8way_close( &ctx.haval, output );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, dataLen );
haval256_8x32_close( &ctx.haval, output );
return 1;
}
@@ -403,28 +403,28 @@ int xevan_8way_hash( void *output, const void *input, int thr_id )
union _xevan_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
echo_2way_context echo;
#else
hashState_groestl groestl;
hashState_groestl groestl;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
};
typedef union _xevan_4way_context_overlay xevan_4way_context_overlay;
@@ -440,12 +440,12 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
const int dataLen = 128;
xevan_4way_context_overlay ctx __attribute__ ((aligned (64)));
blake512_4way_full( &ctx.blake, vhash, input, 80 );
blake512_4x64_full( &ctx.blake, vhash, input, 80 );
memset( &vhash[8<<2], 0, 64<<2 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, dataLen );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -469,15 +469,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, dataLen );
skein512_4x64_full( &ctx.skein, vhash, vhash, dataLen );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, dataLen );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, dataLen );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
@@ -518,9 +518,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -532,9 +532,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
// Parallel 4way 32 bit
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, dataLen );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -546,27 +546,27 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, dataLen );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, dataLen );
sha512_4x64_close( &ctx.sha512, vhash );
rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashA, dataLen );
haval256_5_4way_close( &ctx.haval, vhashA );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashA, dataLen );
haval256_4x32_close( &ctx.haval, vhashA );
rintrlv_4x32_4x64( vhash, vhashA, dataLen<<3 );
memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 );
blake512_4way_init( &ctx.blake );
blake512_4way_update( &ctx.blake, vhash, dataLen );
blake512_4way_close(&ctx.blake, vhash);
blake512_4x64_init( &ctx.blake );
blake512_4x64_update( &ctx.blake, vhash, dataLen );
blake512_4x64_close(&ctx.blake, vhash);
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, dataLen );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -590,15 +590,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, dataLen );
skein512_4x64_full( &ctx.skein, vhash, vhash, dataLen );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, dataLen );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, dataLen );
keccak512_4x64_close( &ctx.keccak, vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
@@ -639,9 +639,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
#endif
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -652,9 +652,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, dataLen );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -665,15 +665,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, dataLen );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, dataLen );
sha512_4x64_close( &ctx.sha512, vhash );
rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashA, dataLen );
haval256_5_4way_close( &ctx.haval, output );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashA, dataLen );
haval256_4x32_close( &ctx.haval, output );
return 1;
}

View File

@@ -32,24 +32,24 @@
union _x22i_8way_ctx_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
#if !defined(X22I_8WAY_SHA)
sha256_8way_context sha256;
sha256_8x32_context sha256;
#endif
#if defined(__VAES__)
groestl512_4way_context groestl;
@@ -88,9 +88,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
unsigned char hashA7[64] __attribute__((aligned(32))) = {0};
x22i_8way_ctx_overlay ctx;
blake512_8way_full( &ctx.blake, vhash, input, 80 );
blake512_8x64_full( &ctx.blake, vhash, input, 80 );
bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 );
#if defined(__VAES__)
@@ -120,15 +120,15 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
#endif
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
if ( work_restart[thrid].restart ) return 0;
@@ -219,9 +219,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
if ( work_restart[thrid].restart ) return 0;
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
@@ -238,9 +238,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( &hash0[8], &hash1[8], &hash2[8], &hash3[8],
&hash4[8], &hash5[8], &hash6[8], &hash7[8], vhash );
@@ -273,9 +273,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
intrlv_8x64_512( vhash, &hash0[16], &hash1[16], &hash2[16], &hash3[16],
&hash4[16], &hash5[16], &hash6[16], &hash7[16] );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
dintrlv_8x64_512( &hash0[24], &hash1[24], &hash2[24], &hash3[24],
&hash4[24], &hash5[24], &hash6[24], &hash7[24], vhash );
@@ -294,9 +294,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
memset( vhash, 0, 64*8 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, 64 );
haval256_8x32_close( &ctx.haval, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
@@ -400,9 +400,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
sha256_8way_init( &ctx.sha256 );
sha256_8way_update( &ctx.sha256, vhash, 64 );
sha256_8way_close( &ctx.sha256, output );
sha256_8x32_init( &ctx.sha256 );
sha256_8x32_update( &ctx.sha256, vhash, 64 );
sha256_8x32_close( &ctx.sha256, output );
#endif
@@ -427,8 +427,6 @@ int scanhash_x22i_8way_sha( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x08ff;
InitializeSWIFFTX();
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
@@ -472,8 +470,6 @@ int scanhash_x22i_8way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x08ff;
InitializeSWIFFTX();
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
@@ -506,8 +502,8 @@ int scanhash_x22i_8way( struct work *work, uint32_t max_nonce,
union _x22i_4way_ctx_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
echo_2way_context echo;
@@ -516,22 +512,22 @@ union _x22i_4way_ctx_overlay
hashState_echo echo;
#endif
shavite512_2way_context shavite;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
#if !defined(X22I_4WAY_SHA)
sha256_4way_context sha256;
sha256_4x32_context sha256;
#endif
};
typedef union _x22i_4way_ctx_overlay x22i_ctx_overlay;
@@ -551,11 +547,11 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
unsigned char hashA3[64] __attribute__((aligned(32))) = {0};
x22i_ctx_overlay ctx;
blake512_4way_full( &ctx.blake, vhash, input, 80 );
blake512_4x64_full( &ctx.blake, vhash, input, 80 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
#if defined(__VAES__)
@@ -579,15 +575,15 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
if ( work_restart[thrid].restart ) return false;
@@ -632,9 +628,9 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
if ( work_restart[thrid].restart ) return false;
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
fugue512_full( &ctx.fugue, hash0, hash0, 64 );
@@ -644,9 +640,9 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( &hash0[8], &hash1[8], &hash2[8], &hash3[8], vhash );
sph_whirlpool_init( &ctx.whirlpool );
@@ -664,9 +660,9 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
intrlv_4x64_512( vhash, &hash0[16], &hash1[16], &hash2[16], &hash3[16] );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
dintrlv_4x64_512( &hash0[24], &hash1[24], &hash2[24], &hash3[24], vhash );
if ( work_restart[thrid].restart ) return false;
@@ -680,9 +676,9 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
memset( vhash, 0, 64*4 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashA, 64 );
haval256_5_4way_close( &ctx.haval, vhash );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashA, 64 );
haval256_4x32_close( &ctx.haval, vhash );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
memset( hashA0, 0, 64 );
@@ -743,9 +739,9 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
sha256_4way_init( &ctx.sha256 );
sha256_4way_update( &ctx.sha256, vhash, 64 );
sha256_4way_close( &ctx.sha256, output );
sha256_4x32_init( &ctx.sha256 );
sha256_4x32_update( &ctx.sha256, vhash, 64 );
sha256_4x32_close( &ctx.sha256, output );
#endif
@@ -770,8 +766,6 @@ int scanhash_x22i_4way_sha( struct work* work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x08ff;
InitializeSWIFFTX();
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
@@ -814,8 +808,6 @@ int scanhash_x22i_4way( struct work* work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x08ff;
InitializeSWIFFTX();
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
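The renames in the hunks above follow a lane-count x word-width convention: a _4x64 or _8x64 suffix means four or eight interleaved 64-bit lanes, while _4x32/_8x32 means 32-bit lanes (shabal, haval, sha256, panama, blake2s). A minimal usage sketch in C, using only helpers that already appear in this diff; the declarations (sha512_4x64_context, intrlv_4x64_512, and friends) are assumed to come from the project's own headers, so this is an illustrative fragment rather than a standalone program:

/* Hash four independent 512-bit states with one 4x64 SHA-512 pass. */
static void sha512_4lane_sketch( uint64_t h0[8], uint64_t h1[8],
                                 uint64_t h2[8], uint64_t h3[8] )
{
   uint64_t vhash[8*4] __attribute__((aligned(64)));   /* interleaved buffer */
   sha512_4x64_context ctx;

   intrlv_4x64_512( vhash, h0, h1, h2, h3 );           /* pack 4 scalar lanes */
   sha512_4x64_init( &ctx );
   sha512_4x64_update( &ctx, vhash, 64 );
   sha512_4x64_close( &ctx, vhash );
   dintrlv_4x64_512( h0, h1, h2, h3, vhash );          /* unpack back */
}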

View File

@@ -33,6 +33,7 @@ bool register_x22i_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA256_OPT
| AVX512_OPT | VAES_OPT | NEON_OPT;
InitializeSWIFFTX();
return true;
};

View File

@@ -63,29 +63,29 @@ void x25x_shuffle( void *hash )
union _x25x_8way_ctx_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
#if defined(X25X_8WAY_SHA)
sha256_context sha256;
#else
sha256_8way_context sha256;
sha256_8x32_context sha256;
#endif
panama_8way_context panama;
blake2s_8way_state blake2s;
panama_8x32_context panama;
blake2s_8x32_state blake2s;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -99,7 +99,7 @@ union _x25x_8way_ctx_overlay
typedef union _x25x_8way_ctx_overlay x25x_8way_ctx_overlay;
static __thread __m512i x25x_8way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64)));
static __thread blake512_8x64_context blake512_8x64_ctx __attribute__((aligned(64)));
int x25x_8way_hash( void *output, const void *input, int thrid )
{
@@ -117,15 +117,15 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
x25x_8way_ctx_overlay ctx __attribute__ ((aligned (64)));
blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ),
blake512_8x64_final_le( &blake512_8x64_ctx, vhash, casti_m512i( input, 9 ),
x25x_8way_midstate );
dintrlv_8x64_512( hash0[0], hash1[0], hash2[0], hash3[0],
hash4[0], hash5[0], hash6[0], hash7[0], vhash );
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
bmw512_8x64_init( &ctx.bmw );
bmw512_8x64_update( &ctx.bmw, vhash, 64 );
bmw512_8x64_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0[1], hash1[1], hash2[1], hash3[1],
hash4[1], hash5[1], hash6[1], hash7[1], vhash );
@@ -175,21 +175,19 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
#endif
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
skein512_8x64_full( &ctx.skein, vhash, vhash, 64 );
dintrlv_8x64_512( hash0[3], hash1[3], hash2[3], hash3[3],
hash4[3], hash5[3], hash6[3], hash7[3], vhash );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0[4], hash1[4], hash2[4], hash3[4],
hash4[4], hash5[4], hash6[4], hash7[4], vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0[5], hash1[5], hash2[5], hash3[5],
hash4[5], hash5[5], hash6[5], hash7[5], vhash );
@@ -303,9 +301,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
if ( work_restart[thrid].restart ) return 0;
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0[11], hash1[11], hash2[11], hash3[11],
hash4[11], hash5[11], hash6[11], hash7[11], vhash );
@@ -321,9 +319,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
intrlv_8x32_512( vhash, hash0[12], hash1[12], hash2[12], hash3[12],
hash4[12], hash5[12], hash6[12], hash7[12] );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhash, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0[13], hash1[13], hash2[13], hash3[13],
hash4[13], hash5[13], hash6[13], hash7[13], vhash );
@@ -354,9 +352,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
intrlv_8x64_512( vhash, hash0[14], hash1[14], hash2[14], hash3[14],
hash4[14], hash5[14], hash6[14], hash7[14] );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0[15], hash1[15], hash2[15], hash3[15],
hash4[15], hash5[15], hash6[15], hash7[15], vhash );
@@ -372,9 +370,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
hash4[16], hash5[16], hash6[16], hash7[16] );
memset( vhash, 0, 64*8 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhashA, 64 );
haval256_8x32_close( &ctx.haval, vhash );
dintrlv_8x32_512( hash0[17], hash1[17], hash2[17], hash3[17],
hash4[17], hash5[17], hash6[17], hash7[17], vhash );
@@ -462,17 +460,17 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
intrlv_8x32_512( vhashA, hash0[20], hash1[20], hash2[20], hash3[20],
hash4[20], hash5[20], hash6[20], hash7[20] );
sha256_8way_init( &ctx.sha256 );
sha256_8way_update( &ctx.sha256, vhashA, 64 );
sha256_8way_close( &ctx.sha256, vhash );
sha256_8x32_init( &ctx.sha256 );
sha256_8x32_update( &ctx.sha256, vhashA, 64 );
sha256_8x32_close( &ctx.sha256, vhash );
dintrlv_8x32_512( hash0[21], hash1[21], hash2[21], hash3[21],
hash4[21], hash5[21], hash6[21], hash7[21], vhash );
#endif
panama_8way_init( &ctx.panama );
panama_8way_update( &ctx.panama, vhash, 64 );
panama_8way_close( &ctx.panama, vhash );
panama_8x32_init( &ctx.panama );
panama_8x32_update( &ctx.panama, vhash, 64 );
panama_8x32_close( &ctx.panama, vhash );
dintrlv_8x32_512( hash0[22], hash1[22], hash2[22], hash3[22],
hash4[22], hash5[22], hash6[22], hash7[22], vhash );
@@ -545,8 +543,8 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
intrlv_8x32_512( vhashX[23], hash0[23], hash1[23], hash2[23], hash3[23],
hash4[23], hash5[23], hash6[23], hash7[23] );
blake2s_8way_init( &ctx.blake2s, 32 );
blake2s_8way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 );
blake2s_8x32_init( &ctx.blake2s, 32 );
blake2s_8x32_full_blocks( &ctx.blake2s, output, vhashX, 64*24 );
return 1;
}
@@ -578,14 +576,13 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
edata[4] = v128_swap64_32( casti_v128( pdata, 4 ) );
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0 ) );
blake512_8way_prehash_le( &blake512_8way_ctx, x25x_8way_midstate, vdata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi64(
7, 6, 5, 4, 3, 2, 1, 0 ) );
blake512_8x64_prehash_le( &blake512_8x64_ctx, x25x_8way_midstate, vdata );
do
{
if ( x25x_8way_hash( hash, vdata, thr_id ) );
if ( x25x_8way_hash( hash, vdata, thr_id ) )
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( ( hashd7[ lane ] <= targ32 ) && !bench ) )
{
@@ -608,8 +605,8 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
union _x25x_4way_ctx_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
echo_2way_context echo;
@@ -617,34 +614,34 @@ union _x25x_4way_ctx_overlay
hashState_groestl groestl;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
sph_tiger_context tiger;
sph_gost512_context gost;
#if defined(X25X_4WAY_SHA)
sha256_context sha256;
#else
sha256_4way_context sha256;
sha256_4x32_context sha256;
#endif
panama_4way_context panama;
blake2s_4way_state blake2s;
panama_4x32_context panama;
blake2s_4x32_state blake2s;
};
typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay;
static __thread __m256i x25x_4way_midstate[16] __attribute__((aligned(64)));
static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64)));
static __thread blake512_4x64_context blake512_4x64_ctx __attribute__((aligned(64)));
int x25x_4way_hash( void *output, const void *input, int thrid )
{
@@ -658,14 +655,14 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));
blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ),
blake512_4x64_final_le( &blake512_4x64_ctx, vhash, casti_m256i( input, 9 ),
x25x_4way_midstate );
dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhash );
dintrlv_4x64_512( hash0[1], hash1[1], hash2[1], hash3[1], vhash );
#if defined(__VAES__)
@@ -688,19 +685,19 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
#endif
intrlv_4x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2] );
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhash, vhash, 64 );
dintrlv_4x64_512( hash0[3], hash1[3], hash2[3], hash3[3], vhash );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
dintrlv_4x64_512( hash0[4], hash1[4], hash2[4], hash3[4], vhash );
if ( work_restart[thrid].restart ) return 0;
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64_512( hash0[5], hash1[5], hash2[5], hash3[5], vhash );
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -751,9 +748,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
if ( work_restart[thrid].restart ) return 0;
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64_512( hash0[11], hash1[11], hash2[11], hash3[11], vhash );
fugue512_full( &ctx.fugue, hash0[12], hash0[11], 64 );
@@ -763,9 +760,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
intrlv_4x32_512( vhash, hash0[12], hash1[12], hash2[12], hash3[12] );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32_512( hash0[13], hash1[13], hash2[13], hash3[13], vhash );
sph_whirlpool_init(&ctx.whirlpool);
@@ -783,9 +780,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
intrlv_4x64_512( vhash, hash0[14], hash1[14], hash2[14], hash3[14] );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
dintrlv_4x64_512( hash0[15], hash1[15], hash2[15], hash3[15], vhash );
ComputeSingleSWIFFTX((unsigned char*)hash0[12], (unsigned char*)hash0[16]);
@@ -797,9 +794,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
memset( vhash, 0, 64*4 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhashX[0], 64 );
haval256_5_4way_close( &ctx.haval, vhash );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhashX[0], 64 );
haval256_4x32_close( &ctx.haval, vhash );
dintrlv_4x32_512( hash0[17], hash1[17], hash2[17], hash3[17], vhash );
sph_tiger_init(&ctx.tiger);
@@ -853,16 +850,16 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
intrlv_4x32_512( vhashX[0], hash0[20], hash1[20], hash2[20], hash3[20] );
memset( vhash, 0, 64*4 );
sha256_4way_init( &ctx.sha256 );
sha256_4way_update( &ctx.sha256, vhashX[0], 64 );
sha256_4way_close( &ctx.sha256, vhash );
sha256_4x32_init( &ctx.sha256 );
sha256_4x32_update( &ctx.sha256, vhashX[0], 64 );
sha256_4x32_close( &ctx.sha256, vhash );
dintrlv_4x32_512( hash0[21], hash1[21], hash2[21], hash3[21], vhash );
#endif
panama_4way_init( &ctx.panama );
panama_4way_update( &ctx.panama, vhash, 64 );
panama_4way_close( &ctx.panama, vhash );
panama_4x32_init( &ctx.panama );
panama_4x32_update( &ctx.panama, vhash, 64 );
panama_4x32_close( &ctx.panama, vhash );
dintrlv_4x32_512( hash0[22], hash1[22], hash2[22], hash3[22], vhash );
laneHash(512, (const BitSequence*)hash0[22], 512, (BitSequence*)hash0[23]);
@@ -902,8 +899,8 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
intrlv_4x32_512( vhashX[22], hash0[22], hash1[22], hash2[22], hash3[22] );
intrlv_4x32_512( vhashX[23], hash0[23], hash1[23], hash2[23], hash3[23] );
blake2s_4way_init( &ctx.blake2s, 32 );
blake2s_4way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 );
blake2s_4x32_init( &ctx.blake2s, 32 );
blake2s_4x32_full_blocks( &ctx.blake2s, output, vhashX, 64*24 );
return 1;
}
@@ -936,9 +933,8 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
edata[4] = v128_swap64_32( casti_v128( pdata, 4 ) );
mm256_intrlv80_4x64( vdata, edata );
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32(
0, 3, 0, 2, 0, 1, 0, 0 ) );
blake512_4way_prehash_le( &blake512_4way_ctx, x25x_4way_midstate, vdata );
*noncev = _mm256_add_epi32( *noncev, _mm256_set_epi64x( 3, 2, 1, 0 ) );
blake512_4x64_prehash_le( &blake512_4x64_ctx, x25x_4way_midstate, vdata );
do
{
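The nonce-vector initialisation above changes from eight 32-bit elements to four 64-bit elements. The two forms are bitwise identical, since each 64-bit lane only carries a small offset in its low half; the 512-bit change to _mm512_set_epi64 earlier in this diff is analogous. A self-contained check (compile with -mavx or higher):

#include <immintrin.h>
#include <string.h>
#include <assert.h>

int main( void )
{
   /* old form: hi/lo 32-bit halves spelled out; new form: one 64-bit value per lane */
   __m256i a = _mm256_set_epi32( 0, 3, 0, 2, 0, 1, 0, 0 );
   __m256i b = _mm256_set_epi64x( 3, 2, 1, 0 );
   assert( memcmp( &a, &b, sizeof(a) ) == 0 );   /* same 256-bit pattern */
   return 0;
}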

View File

@@ -231,7 +231,7 @@ int scanhash_x25x( struct work *work, uint32_t max_nonce,
do
{
edata[19] = n;
if ( x25x_hash( hash64, edata, thr_id ) );
if ( x25x_hash( hash64, edata, thr_id ) )
if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );
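The one-character fix above removes a stray semicolon after the if condition; with the semicolon present the if has an empty body, so the validity check below it ran unconditionally even when hashing was aborted. A minimal self-contained illustration (do_hash and report are placeholders, not functions from this codebase):

#include <stdbool.h>
#include <stdio.h>

static bool do_hash( void ) { return false; }   /* pretend hashing was aborted */
static void report( void )  { puts( "submitting share" ); }

int main( void )
{
   if ( do_hash() );    /* bug: ';' is an empty statement, the test result is dropped */
       report();        /* runs anyway */

   if ( do_hash() )     /* fixed: report() is conditional again */
       report();
   return 0;
}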

View File

@@ -1,692 +0,0 @@
/*-
* Copyright 2009 Colin Percival
* Copyright 2013-2018 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*
* This is a proof-of-work focused fork of yescrypt, including reference and
* cut-down implementation of the obsolete yescrypt 0.5 (based off its first
* submission to PHC back in 2014) and a new proof-of-work specific variation
* known as yespower 1.0. The former is intended as an upgrade for
* cryptocurrencies that already use yescrypt 0.5 and the latter may be used
* as a further upgrade (hard fork) by those and other cryptocurrencies. The
* version of algorithm to use is requested through parameters, allowing for
* both algorithms to co-exist in client and miner implementations (such as in
* preparation for a hard-fork).
*
* This is the reference implementation. Its purpose is to provide a simple
* human- and machine-readable specification that implementations intended
* for actual use should be tested against. It is deliberately mostly not
* optimized, and it is not meant to be used in production. Instead, use
* yespower-opt.c.
*/
/*
#warning "This reference implementation is deliberately mostly not optimized. Use yespower-opt.c instead unless you're testing (against) the reference implementation on purpose."
*/
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "algo/sha/hmac-sha256-hash-4way.h"
//#include "sysendian.h"
#include "yespower.h"
#if defined(__AVX2__)
static void blkcpy_8way( __m256i *dst, const __m256i *src, size_t count )
{
do {
*dst++ = *src++;
} while (--count);
}
static void blkxor_8way( __m256i *dst, const __m256i *src, size_t count )
{
do {
*dst++ ^= *src++;
} while (--count);
}
/**
* salsa20(B):
* Apply the Salsa20 core to the provided block.
*/
static void salsa20_8way( __m256i B[16], uint32_t rounds )
{
__m256i x[16];
size_t i;
/* SIMD unshuffle */
for ( i = 0; i < 16; i++ )
x[i * 5 % 16] = B[i];
for ( i = 0; i < rounds; i += 2 )
{
#define R( a, b, c ) mm256_rol_32( _mm256_add_epi32( a, b ), c )
/* Operate on columns */
x[ 4] = _mm256_xor_si256( x[ 4], R( x[ 0], x[12], 7 ) );
x[ 8] = _mm256_xor_si256( x[ 8], R( x[ 4], x[ 0], 9 ) );
x[12] = _mm256_xor_si256( x[12], R( x[ 8], x[ 4], 13 ) );
x[ 0] = _mm256_xor_si256( x[ 0], R( x[12], x[ 8], 18 ) );
x[ 9] = _mm256_xor_si256( x[ 9], R( x[ 5], x[ 1], 7 ) );
x[13] = _mm256_xor_si256( x[13], R( x[ 9], x[ 5], 9 ) );
x[ 1] = _mm256_xor_si256( x[ 1], R( x[13], x[ 9], 13 ) );
x[ 5] = _mm256_xor_si256( x[ 5], R( x[ 1], x[13], 18 ) );
x[14] = _mm256_xor_si256( x[14], R( x[10], x[ 6], 7 ) );
x[ 2] = _mm256_xor_si256( x[ 2], R( x[14], x[10], 9 ) );
x[ 6] = _mm256_xor_si256( x[ 6], R( x[ 2], x[14], 13 ) );
x[10] = _mm256_xor_si256( x[10], R( x[ 6], x[ 2], 18 ) );
x[ 3] = _mm256_xor_si256( x[ 3], R( x[15], x[11], 7 ) );
x[ 7] = _mm256_xor_si256( x[ 7], R( x[ 3], x[15], 9 ) );
x[11] = _mm256_xor_si256( x[11], R( x[ 7], x[ 3], 13 ) );
x[15] = _mm256_xor_si256( x[15], R( x[11], x[ 7], 18 ) );
/* Operate on rows */
x[ 1] = _mm256_xor_si256( x[ 1], R( x[ 0], x[ 3], 7 ) );
x[ 2] = _mm256_xor_si256( x[ 2], R( x[ 1], x[ 0], 9 ) );
x[ 3] = _mm256_xor_si256( x[ 3], R( x[ 2], x[ 1], 13 ) );
x[ 0] = _mm256_xor_si256( x[ 0], R( x[ 3], x[ 2], 18 ) );
x[ 6] = _mm256_xor_si256( x[ 6], R( x[ 5], x[ 4], 7 ) );
x[ 7] = _mm256_xor_si256( x[ 7], R( x[ 6], x[ 5], 9 ) );
x[ 4] = _mm256_xor_si256( x[ 4], R( x[ 7], x[ 6], 13 ) );
x[ 5] = _mm256_xor_si256( x[ 5], R( x[ 4], x[ 7], 18 ) );
x[11] = _mm256_xor_si256( x[11], R( x[10], x[ 9], 7 ) );
x[ 8] = _mm256_xor_si256( x[ 8], R( x[11], x[10], 9 ) );
x[ 9] = _mm256_xor_si256( x[ 9], R( x[ 8], x[11], 13 ) );
x[10] = _mm256_xor_si256( x[10], R( x[ 9], x[ 8], 18 ) );
x[12] = _mm256_xor_si256( x[12], R( x[15], x[14], 7 ) );
x[13] = _mm256_xor_si256( x[13], R( x[12], x[15], 9 ) );
x[14] = _mm256_xor_si256( x[14], R( x[13], x[12], 13 ) );
x[15] = _mm256_xor_si256( x[15], R( x[14], x[13], 18 ) );
#undef R
}
/* SIMD shuffle */
for (i = 0; i < 16; i++)
B[i] = _mm256_add_epi32( B[i], x[i * 5 % 16] );
}
/**
* blockmix_salsa(B):
* Compute B = BlockMix_{salsa20, 1}(B). The input B must be 128 bytes in
* length.
*/
static void blockmix_salsa_8way( __m256i *B, uint32_t rounds )
{
__m256i X[16];
size_t i;
/* 1: X <-- B_{2r - 1} */
blkcpy_8way( X, &B[16], 16 );
/* 2: for i = 0 to 2r - 1 do */
for ( i = 0; i < 2; i++ )
{
/* 3: X <-- H(X xor B_i) */
blkxor_8way( X, &B[i * 16], 16 );
salsa20_8way( X, rounds );
/* 4: Y_i <-- X */
/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
blkcpy_8way( &B[i * 16], X, 16 );
}
}
/*
* These are tunable, but they must meet certain constraints and are part of
* what defines a yespower version.
*/
#define PWXsimple 2
#define PWXgather 4
/* Version 0.5 */
#define PWXrounds_0_5 6
#define Swidth_0_5 8
/* Version 1.0 */
#define PWXrounds_1_0 3
#define Swidth_1_0 11
/* Derived values. Not tunable on their own. */
#define PWXbytes (PWXgather * PWXsimple * 8)
#define PWXwords (PWXbytes / sizeof(uint32_t))
#define rmin ((PWXbytes + 127) / 128)
/* Runtime derived values. Not tunable on their own. */
#define Swidth_to_Sbytes1(Swidth) ((1 << Swidth) * PWXsimple * 8)
#define Swidth_to_Smask(Swidth) (((1 << Swidth) - 1) * PWXsimple * 8)
typedef struct {
__m256i (*S0)[2], (*S1)[2], (*S2)[2];
__m256i *S;
yespower_version_t version;
uint32_t salsa20_rounds;
uint32_t PWXrounds, Swidth, Sbytes, Smask;
size_t w;
} pwxform_8way_ctx_t __attribute__ ((aligned (128)));
/**
* pwxform(B):
* Transform the provided block using the provided S-boxes.
*/
static void pwxform_8way( __m256i *B, pwxform_8way_ctx_t *ctx )
{
__m256i (*X)[PWXsimple][2] = (__m256i (*)[PWXsimple][2])B;
__m256i (*S0)[2] = ctx->S0, (*S1)[2] = ctx->S1, (*S2)[2] = ctx->S2;
__m256i Smask = _mm256_set1_epi32( ctx->Smask );
size_t w = ctx->w;
size_t i, j, k;
/* 1: for i = 0 to PWXrounds - 1 do */
for ( i = 0; i < ctx->PWXrounds; i++ )
{
/* 2: for j = 0 to PWXgather - 1 do */
for ( j = 0; j < PWXgather; j++ )
{
// Are these pointers or data?
__m256i xl = X[j][0][0];
__m256i xh = X[j][0][1];
__m256i (*p0)[2], (*p1)[2];
// 3: p0 <-- (lo(B_{j,0}) & Smask) / (PWXsimple * 8)
// playing with pointers
/*
p0 = S0 + (xl & Smask) / sizeof(*S0);
// 4: p1 <-- (hi(B_{j,0}) & Smask) / (PWXsimple * 8)
p1 = S1 + (xh & Smask) / sizeof(*S1);
*/
/* 5: for k = 0 to PWXsimple - 1 do */
for ( k = 0; k < PWXsimple; k++ )
{
// shift from 32 bit data to 64 bit data
__m256i x0, x1, s00, s01, s10, s11;
__m128i *p0k = (__m128i*)p0[k];
__m128i *p1k = (__m128i*)p1[k];
s00 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p0k[0] ),
_mm256_slli_epi64( _mm256_cvtepu32_epi64( p0k[2] ), 32 ) );
s01 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p0k[1] ),
_mm256_slli_epi64( _mm256_cvtepu32_epi64( p0k[3] ), 32 ) );
s10 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p1k[0] ),
_mm256_slli_epi64( _mm256_cvtepu32_epi64( p1k[2] ), 32 ) );
s11 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p1k[1] ),
_mm256_slli_epi64( _mm256_cvtepu32_epi64( p1k[3] ), 32 ) );
__m128i *xx = (__m128i*)X[j][k];
x0 = _mm256_mul_epu32( _mm256_cvtepu32_epi64( xx[0] ),
_mm256_cvtepu32_epi64( xx[2] ) );
x1 = _mm256_mul_epu32( _mm256_cvtepu32_epi64( xx[1] ),
_mm256_cvtepu32_epi64( xx[3] ) );
x0 = _mm256_add_epi64( x0, s00 );
x1 = _mm256_add_epi64( x1, s01 );
x0 = _mm256_xor_si256( x0, s10 );
x1 = _mm256_xor_si256( x1, s11 );
X[j][k][0] = x0;
X[j][k][1] = x1;
}
if ( ctx->version != YESPOWER_0_5 &&
( i == 0 || j < PWXgather / 2 ) )
{
if ( j & 1 )
{
for ( k = 0; k < PWXsimple; k++ )
{
S1[w][0] = X[j][k][0];
S1[w][1] = X[j][k][1];
w++;
}
}
else
{
for ( k = 0; k < PWXsimple; k++ )
{
S0[w + k][0] = X[j][k][0];
S0[w + k][1] = X[j][k][1];
}
}
}
}
}
if ( ctx->version != YESPOWER_0_5 )
{
/* 14: (S0, S1, S2) <-- (S2, S0, S1) */
ctx->S0 = S2;
ctx->S1 = S0;
ctx->S2 = S1;
/* 15: w <-- w mod 2^Swidth */
ctx->w = w & ( ( 1 << ctx->Swidth ) * PWXsimple - 1 );
}
}
/**
* blockmix_pwxform(B, ctx, r):
* Compute B = BlockMix_pwxform{salsa20, ctx, r}(B). The input B must be
* 128r bytes in length.
*/
static void blockmix_pwxform_8way( uint32_t *B, pwxform_8way_ctx_t *ctx,
size_t r )
{
__m256i X[PWXwords];
size_t r1, i;
/* Convert 128-byte blocks to PWXbytes blocks */
/* 1: r_1 <-- 128r / PWXbytes */
r1 = 128 * r / PWXbytes;
/* 2: X <-- B'_{r_1 - 1} */
blkcpy_8way( X, &B[ (r1 - 1) * PWXwords ], PWXwords );
/* 3: for i = 0 to r_1 - 1 do */
for ( i = 0; i < r1; i++ )
{
/* 4: if r_1 > 1 */
if ( r1 > 1 )
{
/* 5: X <-- X xor B'_i */
blkxor_8way( X, &B[ i * PWXwords ], PWXwords );
}
/* 7: X <-- pwxform(X) */
pwxform_8way( X, ctx );
/* 8: B'_i <-- X */
blkcpy_8way( &B[ i * PWXwords ], X, PWXwords );
}
/* 10: i <-- floor((r_1 - 1) * PWXbytes / 64) */
i = ( r1 - 1 ) * PWXbytes / 64;
/* 11: B_i <-- H(B_i) */
salsa20_8way( &B[i * 16], ctx->salsa20_rounds );
#if 1 /* No-op with our current pwxform settings, but do it to make sure */
/* 12: for i = i + 1 to 2r - 1 do */
for ( i++; i < 2 * r; i++ )
{
/* 13: B_i <-- H(B_i xor B_{i-1}) */
blkxor_8way( &B[i * 16], &B[ (i - 1) * 16 ], 16 );
salsa20_8way( &B[i * 16], ctx->salsa20_rounds );
}
#endif
}
// This looks a lot like data dependent addressing
/**
* integerify(B, r):
* Return the result of parsing B_{2r-1} as a little-endian integer.
*/
static __m256i integerify8( const __m256i *B, size_t r )
{
/*
* Our 32-bit words are in host byte order. Also, they are SIMD-shuffled, but
* we only care about the least significant 32 bits anyway.
*/
const __m256i *X = &B[ (2 * r - 1) * 16 ];
return X[0];
}
/**
* p2floor(x):
* Largest power of 2 not greater than argument.
*/
static uint32_t p2floor8( uint32_t x )
{
uint32_t y;
while ( ( y = x & (x - 1) ) )
x = y;
return x;
}
/**
* wrap(x, i):
* Wrap x to the range 0 to i-1.
*/
static uint32_t wrap8( uint32_t x, uint32_t i )
{
uint32_t n = p2floor( i );
return ( x & (n - 1) ) + (i - n);
}
/**
* smix1(B, r, N, V, X, ctx):
* Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in
* length; the temporary storage V must be 128rN bytes in length; the temporary
* storage X must be 128r bytes in length.
*/
static void smix1_8way( __m256i *B, size_t r, uint32_t N,
__m256i *V, __m256i *X, pwxform_8way_ctx_t *ctx )
{
size_t s = 32 * r;
uint32_t i, j;
size_t k;
/* 1: X <-- B */
for ( k = 0; k < 2 * r; k++ )
for ( i = 0; i < 16; i++ )
X[ k * 16 + i ] = B[ k * 16 + ( i * 5 % 16 ) ];
if ( ctx->version != YESPOWER_0_5 )
{
for ( k = 1; k < r; k++ )
{
blkcpy_8way( &X[k * 32], &X[ (k - 1) * 32 ], 32 );
blockmix_pwxform_8way( &X[k * 32], ctx, 1 );
}
}
/* 2: for i = 0 to N - 1 do */
for ( i = 0; i < N; i++ )
{
/* 3: V_i <-- X */
blkcpy_8way( &V[i * s], X, s );
if ( i > 1 )
{
// is j int or vector? Integrify has data dependent addressing?
/* j <-- Wrap(Integerify(X), i) */
// j = wrap8( integerify8( X, r ), i );
/* X <-- X xor V_j */
blkxor_8way( X, &V[j * s], s );
}
/* 4: X <-- H(X) */
if ( V != ctx->S )
blockmix_pwxform_8way( X, ctx, r );
else
blockmix_salsa_8way( X, ctx->salsa20_rounds );
}
/* B' <-- X */
for ( k = 0; k < 2 * r; k++ )
for ( i = 0; i < 16; i++ )
B[ k * 16 + ( i * 5 % 16 ) ] = X[ k * 16 + i ];
}
/**
* smix2(B, r, N, Nloop, V, X, ctx):
* Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in
* length; the temporary storage V must be 128rN bytes in length; the temporary
* storage X must be 128r bytes in length. The value N must be a power of 2
* greater than 1.
*/
static void smix2_8way( __m256i *B, size_t r, uint32_t N, uint32_t Nloop,
__m256i *V, __m256i *X, pwxform_8way_ctx_t *ctx )
{
size_t s = 32 * r;
uint32_t i, j;
size_t k;
/* X <-- B */
for ( k = 0; k < 2 * r; k++ )
for ( i = 0; i < 16; i++ )
X[ k * 16 + i ] = B[ k * 16 + ( i * 5 % 16 ) ];
/* 6: for i = 0 to N - 1 do */
for ( i = 0; i < Nloop; i++ )
{
/* 7: j <-- Integerify(X) mod N */
// j = integerify8(X, r) & (N - 1);
/* 8.1: X <-- X xor V_j */
blkxor_8way( X, &V[j * s], s );
/* V_j <-- X */
if ( Nloop != 2 )
blkcpy_8way( &V[j * s], X, s );
/* 8.2: X <-- H(X) */
blockmix_pwxform_8way( X, ctx, r );
}
/* 10: B' <-- X */
for ( k = 0; k < 2 * r; k++ )
for ( i = 0; i < 16; i++ )
B[ k * 16 + ( i * 5 % 16 ) ] = X[ k * 16 + i ];
}
/**
* smix(B, r, N, p, t, V, X, ctx):
* Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the
* temporary storage V must be 128rN bytes in length; the temporary storage
* X must be 128r bytes in length. The value N must be a power of 2 and at
* least 16.
*/
static void smix_8way( __m256i *B, size_t r, uint32_t N,
__m256i *V, __m256i *X, pwxform_8way_ctx_t *ctx)
{
uint32_t Nloop_all = (N + 2) / 3; /* 1/3, round up */
uint32_t Nloop_rw = Nloop_all;
Nloop_all++; Nloop_all &= ~(uint32_t)1; /* round up to even */
if ( ctx->version == YESPOWER_0_5 )
Nloop_rw &= ~(uint32_t)1; /* round down to even */
else
Nloop_rw++; Nloop_rw &= ~(uint32_t)1; /* round up to even */
smix1_8way( B, 1, ctx->Sbytes / 128, ctx->S, X, ctx );
smix1_8way( B, r, N, V, X, ctx );
smix2_8way( B, r, N, Nloop_rw /* must be > 2 */, V, X, ctx );
smix2_8way( B, r, N, Nloop_all - Nloop_rw /* 0 or 2 */, V, X, ctx );
}
/**
* yespower(local, src, srclen, params, dst):
* Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target".
*
* Return 0 on success; or -1 on error.
*/
int yespower_8way( yespower_local_t *local, const __m256i *src, size_t srclen,
const yespower_params_t *params, yespower_8way_binary_t *dst,
int thrid )
{
yespower_version_t version = params->version;
uint32_t N = params->N;
uint32_t r = params->r;
const uint8_t *pers = params->pers;
size_t perslen = params->perslen;
int retval = -1;
size_t B_size, V_size;
uint32_t *B, *V, *X, *S;
pwxform_8way_ctx_t ctx;
__m256i sha256[8];
/* Sanity-check parameters */
if ( (version != YESPOWER_0_5 && version != YESPOWER_1_0 ) ||
N < 1024 || N > 512 * 1024 || r < 8 || r > 32 ||
(N & (N - 1)) != 0 || r < rmin ||
(!pers && perslen) )
{
errno = EINVAL;
return -1;
}
/* Allocate memory */
B_size = (size_t)128 * r;
V_size = B_size * N;
if ((V = malloc(V_size)) == NULL)
return -1;
if ((B = malloc(B_size)) == NULL)
goto free_V;
if ((X = malloc(B_size)) == NULL)
goto free_B;
ctx.version = version;
if (version == YESPOWER_0_5) {
ctx.salsa20_rounds = 8;
ctx.PWXrounds = PWXrounds_0_5;
ctx.Swidth = Swidth_0_5;
ctx.Sbytes = 2 * Swidth_to_Sbytes1(ctx.Swidth);
} else {
ctx.salsa20_rounds = 2;
ctx.PWXrounds = PWXrounds_1_0;
ctx.Swidth = Swidth_1_0;
ctx.Sbytes = 3 * Swidth_to_Sbytes1(ctx.Swidth);
}
if ((S = malloc(ctx.Sbytes)) == NULL)
goto free_X;
ctx.S = S;
ctx.S0 = (__m256i (*)[2])S;
ctx.S1 = ctx.S0 + (1 << ctx.Swidth) * PWXsimple;
ctx.S2 = ctx.S1 + (1 << ctx.Swidth) * PWXsimple;
ctx.Smask = Swidth_to_Smask(ctx.Swidth);
ctx.w = 0;
// do prehash
sha256_8way_full( sha256, src, srclen );
// need flexible size, use malloc;
__m256i vpers[128];
if ( version != YESPOWER_0_5 && perslen )
for ( int i = 0; i < perslen/4 + 1; i++ )
vpers[i] = _mm256_set1_epi32( pers[i] );
/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
pbkdf2_sha256_8way( B, B_size, sha256, sizeof(sha256), vpers, perslen, 1 );
blkcpy_8way( sha256, B, sizeof(sha256) / sizeof(sha256[0] ) );
/* 3: B_i <-- MF(B_i, N) */
smix_8way( B, r, N, V, X, &ctx );
if ( version == YESPOWER_0_5 )
{
/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
pbkdf2_sha256_8way( dst, sizeof(*dst), sha256, sizeof(sha256),
B, B_size, 1 );
if ( pers )
{
hmac_sha256_8way_full( dst, sizeof(*dst), vpers, perslen, sha256 );
sha256_8way_full( dst, sha256, sizeof(sha256) );
}
}
else
hmac_sha256_8way_full( dst, B + B_size - 64, 64, sha256, sizeof(sha256) );
/* Success! */
retval = 1;
/* Free memory */
free(S);
free_X:
free(X);
free_B:
free(B);
free_V:
free(V);
return retval;
}
int yespower_8way_tls( const __m256i *src, size_t srclen,
const yespower_params_t *params, yespower_8way_binary_t *dst, int trhid )
{
/* The reference implementation doesn't use thread-local storage */
return yespower_8way( NULL, src, srclen, params, dst, trhid );
}
int yespower_init_local8( yespower_local_t *local )
{
/* The reference implementation doesn't use the local structure */
local->base = local->aligned = NULL;
local->base_size = local->aligned_size = 0;
return 0;
}
int yespower_free_local8( yespower_local_t *local )
{
/* The reference implementation frees its memory in yespower() */
(void)local; /* unused */
return 0;
}
int yespower_8way_hash( const char *input, char *output, uint32_t len,
int thrid )
{
return yespower_8way_tls( input, len, &yespower_params,
(yespower_binary_t*)output, thrid );
}
int scanhash_yespower_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8*8];
uint32_t _ALIGN(128) vdata[20*8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
endiandata[19] = n;
// do sha256 prehash
SHA256_Init( &sha256_prehash_ctx );
SHA256_Update( &sha256_prehash_ctx, endiandata, 64 );
do {
if ( yespower_hash( vdata, hash, 80, thr_id ) )
if unlikely( valid_hash( hash, ptarget ) && !opt_benchmark )
{
be32enc( pdata+19, n );
submit_solution( work, hash, mythr );
}
endiandata[19] = ++n;
} while ( n < last_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
}
#endif // AVX2
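The file deleted above appears to have been an unfinished AVX2 8-way experiment on the yespower reference code (note the commented-out integerify calls and the open questions left in its comments). Two of its small helpers are easier to read in scalar form; the bodies below match the deleted p2floor8/wrap8 exactly, just without the vector wrappers:

#include <stdint.h>

/* Largest power of two not greater than x (scalar form of p2floor8 above). */
static uint32_t p2floor( uint32_t x )
{
   uint32_t y;
   while ( ( y = x & ( x - 1 ) ) )   /* clear the lowest set bit until one remains */
      x = y;
   return x;
}

/* Wrap x into the range [0, i) as yespower does (scalar form of wrap8 above). */
static uint32_t wrap( uint32_t x, uint32_t i )
{
   uint32_t n = p2floor( i );
   return ( x & ( n - 1 ) ) + ( i - n );
}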

View File

@@ -1,7 +1,7 @@
#!/bin/bash
#
# This script is not intended for users, it is only used for compile testing
# during develpment. However the information contained may provide compilation
# during development. However, the information contained may provide compilation
# tips to users.
rm cpuminer cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null

View File

@@ -10,9 +10,9 @@ rm cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=icelake-client -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=icelake-client -Wall" ./configure --with-curl
# Rocketlake needs gcc-11
#CFLAGS="-O3 -march=rocketlake -Wall" ./configure --with-curl
CFLAGS="-O3 -march=rocketlake -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx512-sha-vaes
@@ -21,7 +21,7 @@ mv cpuminer cpuminer-avx512-sha-vaes
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
#make -j 8
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-alderlake
@@ -30,35 +30,43 @@ mv cpuminer cpuminer-avx512-sha-vaes
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl
#make -j 8
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-arrowlake-s
# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14
# Apparently Graniterapids will not include AVX10, SHA512 or APX,
# Graniterapids does not build with AVX10, SHA512 or APX.
# wait for Diamondrapids & gcc-15.
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
#make -j 8
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-graniterapids
# Force AVX10-256
# SHA512 AVX10.1
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=arrowlake-s -mavx10.1-256 -Wall" ./configure --with-curl
#make -j 8
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-avx10-256
#mv cpuminer cpuminer-avx10_1
# Force SHA512 AVX10-512
# SHA512 AVX10.2
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1-512 -Wall" ./configure --with-curl
#make -j 8
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.2 -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-avx10-512
#mv cpuminer cpuminer-avx10_2
# Diamondrapids: AVX10.2, SHA512, APX; needs GCC-15 & CPU with APX to compile.
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=diamondrapids -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-diamondrapids
# Zen5: AVX512 SHA VAES, requires gcc-14.
#make clean || echo clean
@@ -71,11 +79,10 @@ mv cpuminer cpuminer-avx512-sha-vaes
# Zen4: AVX512 SHA VAES
make clean || echo clean
rm -f config.status
# znver3 needs gcc-11, znver4 needs gcc-12.3.
# Zen4: AVX512, SHA, VAES, needs gcc-12.3.
#CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl
# Incomplete list of Zen4 AVX512 extensions but includes all extensions used by cpuminer.
CFLAGS="-O3 -march=znver3 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-zen4
@@ -83,7 +90,6 @@ mv cpuminer cpuminer-zen4
# Zen3 AVX2 SHA VAES
make clean || echo clean
rm -f config.status
#CFLAGS="-O3 -march=znver2 -mvaes" ./configure --with-curl
CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
@@ -159,7 +165,7 @@ mv cpuminer cpuminer-ssse3
# SSE2
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -msse2 -Wall" ./configure --with-curl
CFLAGS="-O3 -march=x86-64 -msse2 -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-sse2
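The SSE2 target above now pins the architecture with -march=x86-64 instead of relying on the compiler's default plus -msse2. A quick compile-time way to confirm what a flag set actually enables is to test the compiler's feature macros (gcc/clang style); this check is only an illustration and is not part of the build scripts:

/* -march=x86-64 guarantees SSE2 as the baseline but nothing newer. */
#if !defined(__SSE2__)
#error "x86-64 baseline build should always have SSE2"
#endif
#if defined(__AVX__)
#error "AVX unexpectedly enabled in the SSE2 baseline build"
#endif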

View File

@@ -1,27 +1,9 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
#!/bin/sh
# Linux build
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
# Ubuntu 10.04 (gcc 4.4)
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
make distclean || echo clean
rm -f config.status

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/bin/sh
#
# make clean and rm all the targeted executables.

View File

@@ -3,7 +3,7 @@
#ifdef WIN32
#if _WIN32_WINNT==0x0601 // Windows 7
#if _WIN32_WINNT>=0x0601 // Windows 7
#define WINDOWS_CPU_GROUPS_ENABLED 1
#endif
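The comparison above changes from == to >=, so the processor-group code path is enabled on Windows 7 and everything newer rather than only when targeting exactly Windows 7. A hedged sketch of a consumer of that guard; GetActiveProcessorGroupCount is a standard Win32 call available from Windows 7 on, and its use here is illustrative rather than taken from this codebase:

#include <windows.h>

#if defined(WINDOWS_CPU_GROUPS_ENABLED)
/* Processor groups exist from Windows 7 (_WIN32_WINNT 0x0601) onward. */
static WORD count_cpu_groups( void )
{
   return GetActiveProcessorGroupCount();
}
#endif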

configure (vendored)
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.1.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.4.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.1'
PACKAGE_STRING='cpuminer-opt 25.1'
PACKAGE_VERSION='25.4'
PACKAGE_STRING='cpuminer-opt 25.4'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -657,10 +657,6 @@ JANSSON_LIBS
LIBCURL_CPPFLAGS
LIBCURL_CFLAGS
LIBCURL
X86_64_APPLE_FALSE
X86_64_APPLE_TRUE
ARM64_APPLE_FALSE
ARM64_APPLE_TRUE
HAVE_APPLE_FALSE
HAVE_APPLE_TRUE
MINGW_FALSE
@@ -669,8 +665,6 @@ ARCH_ARM64_FALSE
ARCH_ARM64_TRUE
ARCH_x86_64_FALSE
ARCH_x86_64_TRUE
ARCH_x86_FALSE
ARCH_x86_TRUE
USE_ASM_FALSE
USE_ASM_TRUE
HAVE_WINDOWS_FALSE
@@ -802,7 +796,6 @@ enable_maintainer_mode
enable_dependency_tracking
enable_assembly
with_curl
with_crypto
'
ac_precious_vars='build_alias
host_alias
@@ -1366,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 25.1 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 25.4 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1438,7 +1431,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.4:";;
esac
cat <<\_ACEOF
@@ -1461,7 +1454,6 @@ Optional Packages:
--with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
--without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
--with-curl=PATH prefix where curl is installed default=/usr
--with-crypto=PATH prefix where openssl crypto is installed default=/usr
Some influential environment variables:
CC C compiler command
@@ -1544,7 +1536,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 25.1
cpuminer-opt configure 25.4
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1991,7 +1983,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.1, which was
It was created by cpuminer-opt $as_me 25.4, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3599,7 +3591,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='25.1'
VERSION='25.4'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6509,22 +6501,11 @@ fi
case $target in
i*86-*-*)
have_x86=true
;;
aarch64-apple-*|arm64-apple-*)
have_arm64=true
have_arm64_apple=true
;;
x86_64-apple-*|amd64-apple-*)
have_x86_64=true
have_x86_64_apple=true
;;
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_arm4=true
have_arm64=true
;;
powerpc*-*-*)
have_ppc=true
@@ -6557,126 +6538,7 @@ printf "%s\n" "#define USE_ASM 1" >>confdefs.h
fi
if test x$enable_assembly != xno -a x$have_x86_64 = xtrue
then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX code" >&5
printf %s "checking whether we can compile AVX code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vmovdqa %ymm0, %ymm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile XOP code" >&5
printf %s "checking whether we can compile XOP code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vprotd \$7, %xmm0, %xmm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_XOP 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the XOP instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the XOP instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX2 code" >&5
printf %s "checking whether we can compile AVX2 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vpaddd %ymm0, %ymm1, %ymm2");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX2 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 code" >&5
printf %s "checking whether we can compile AVX512 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX512 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX512 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX512 instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX2 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX2 instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
else $as_nop
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
fi
# jansson test fails on Linux/Mingw, handled in Makefile.am.
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for json_loads in -ljansson" >&5
printf %s "checking for json_loads in -ljansson... " >&6; }
if test ${ac_cv_lib_jansson_json_loads+y}
@@ -6885,9 +6747,6 @@ fi
fi
#LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
# PTHREAD_LIBS="$PTHREAD_LIBS"
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether __uint128_t is supported" >&5
printf %s "checking whether __uint128_t is supported... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
@@ -6941,14 +6800,6 @@ else
USE_ASM_FALSE=
fi
if test x$have_x86 = xtrue; then
ARCH_x86_TRUE=
ARCH_x86_FALSE='#'
else
ARCH_x86_TRUE='#'
ARCH_x86_FALSE=
fi
if test x$have_x86_64 = xtrue; then
ARCH_x86_64_TRUE=
ARCH_x86_64_FALSE='#'
@@ -6981,28 +6832,6 @@ else
HAVE_APPLE_FALSE=
fi
if test x$have_arm64_apple = xtrue; then
ARM64_APPLE_TRUE=
ARM64_APPLE_FALSE='#'
else
ARM64_APPLE_TRUE='#'
ARM64_APPLE_FALSE=
fi
if test x$have_x86_64_apple = xtrue; then
X86_64_APPLE_TRUE=
X86_64_APPLE_FALSE='#'
else
X86_64_APPLE_TRUE='#'
X86_64_APPLE_FALSE=
fi
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
else
JANSSON_LIBS=-ljansson
fi
# libcurl install path (for mingw : --with-curl=/usr/local)
@@ -7020,30 +6849,10 @@ if test -n "$with_curl" ; then
LIBCURL="-lcurl -lz"
fi
# SSL install path (for mingw : --with-crypto=/usr/local/ssl)
# Check whether --with-crypto was given.
if test ${with_crypto+y}
then :
withval=$with_crypto;
fi
if test -n "$with_crypto" ; then
LIBCURL_CFLAGS="$LIBCURL_CFLAGS -I$with_crypto/include"
LIBCURL_CPPFLAGS="$LIBCURL_CPPFLAGS -I$with_crypto/include"
LIBCURL_LDFLAGS="-L$with_crypto/lib $LIBCURL_LDFLAGS"
LIBCURL="$LIBCURL -lssl -lcrypto"
fi
CFLAGS="$CFLAGS $LIBCURL_CFLAGS"
CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"
#AC_CHECK_LIB([z],[gzopen],[],[])
#AC_CHECK_LIB([crypto],[OPENSSL_init], crypto=yes, [AC_MSG_ERROR([OpenSSL crypto library required])])
#AC_CHECK_LIB([ssl],[SSL_new], ssl=yes, ssl=no)
# AC_CHECK_LIB([curl], [curl_multi_timeout],
# have_libcurl=yes,
# have_libcurl=no AC_MSG_ERROR([curl library required])
@@ -7220,10 +7029,6 @@ if test -z "${USE_ASM_TRUE}" && test -z "${USE_ASM_FALSE}"; then
as_fn_error $? "conditional \"USE_ASM\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_TRUE}" && test -z "${ARCH_x86_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_64_TRUE}" && test -z "${ARCH_x86_64_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86_64\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -7240,14 +7045,6 @@ if test -z "${HAVE_APPLE_TRUE}" && test -z "${HAVE_APPLE_FALSE}"; then
as_fn_error $? "conditional \"HAVE_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARM64_APPLE_TRUE}" && test -z "${ARM64_APPLE_FALSE}"; then
as_fn_error $? "conditional \"ARM64_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${X86_64_APPLE_TRUE}" && test -z "${X86_64_APPLE_FALSE}"; then
as_fn_error $? "conditional \"X86_64_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0
@@ -7638,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 25.1, which was
This file was extended by cpuminer-opt $as_me 25.4, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7706,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 25.1
cpuminer-opt config.status 25.4
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [25.1])
AC_INIT([cpuminer-opt], [25.4])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
@@ -42,22 +42,11 @@ AC_FUNC_ALLOCA
AC_CHECK_FUNCS([getopt_long])
case $target in
i*86-*-*)
have_x86=true
;;
aarch64-apple-*|arm64-apple-*)
have_arm64=true
have_arm64_apple=true
;;
x86_64-apple-*|amd64-apple-*)
have_x86_64=true
have_x86_64_apple=true
;;
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_arm4=true
have_arm64=true
;;
powerpc*-*-*)
have_ppc=true
@@ -84,42 +73,7 @@ if test x$enable_assembly != xno; then
AC_DEFINE([USE_ASM], [1], [Define to 1 if assembly routines are wanted.])
fi
if test x$enable_assembly != xno -a x$have_x86_64 = xtrue
then
AC_MSG_CHECKING(whether we can compile AVX code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vmovdqa %ymm0, %ymm1");])],
AC_DEFINE(USE_AVX, 1, [Define to 1 if AVX assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile XOP code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vprotd \$7, %xmm0, %xmm1");])],
AC_DEFINE(USE_XOP, 1, [Define to 1 if XOP assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the XOP instruction set.])
)
AC_MSG_CHECKING(whether we can compile AVX2 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile AVX512 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");])],
AC_DEFINE(USE_AVX512, 1, [Define to 1 if AVX512 assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX512 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX instruction set.])
)
fi
# jansson test fails on Linux/Mingw, handled in Makefile.am.
AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
@@ -128,9 +82,6 @@ AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
AC_CHECK_LIB([pthreadGC], [pthread_create], PTHREAD_LIBS="-lpthreadGC"
))))
#LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
# PTHREAD_LIBS="$PTHREAD_LIBS"
AC_MSG_CHECKING(whether __uint128_t is supported)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([static __uint128_t i = 100;])],
AC_DEFINE(USE_INT128, 1, [Define if __uint128_t is available])
@@ -143,19 +94,10 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([static __uint128_t i = 100;])],
AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
AM_CONDITIONAL([USE_ASM], [test x$enable_assembly != xno])
AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue])
AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue])
AM_CONDITIONAL([ARCH_ARM64], [test x$have_arm64 = xtrue])
AM_CONDITIONAL([MINGW], [test "x$OS" = "xWindows_NT"])
AM_CONDITIONAL([HAVE_APPLE], [test x$have_apple = xtrue])
AM_CONDITIONAL([ARM64_APPLE], [test x$have_arm64_apple = xtrue])
AM_CONDITIONAL([X86_64_APPLE], [test x$have_x86_64_apple = xtrue])
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
else
JANSSON_LIBS=-ljansson
fi
# libcurl install path (for mingw : --with-curl=/usr/local)
AC_ARG_WITH([curl],
@@ -168,25 +110,10 @@ if test -n "$with_curl" ; then
LIBCURL="-lcurl -lz"
fi
# SSL install path (for mingw : --with-crypto=/usr/local/ssl)
AC_ARG_WITH([crypto],
[ --with-crypto=PATH prefix where openssl crypto is installed [default=/usr]])
if test -n "$with_crypto" ; then
LIBCURL_CFLAGS="$LIBCURL_CFLAGS -I$with_crypto/include"
LIBCURL_CPPFLAGS="$LIBCURL_CPPFLAGS -I$with_crypto/include"
LIBCURL_LDFLAGS="-L$with_crypto/lib $LIBCURL_LDFLAGS"
LIBCURL="$LIBCURL -lssl -lcrypto"
fi
CFLAGS="$CFLAGS $LIBCURL_CFLAGS"
CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"
#AC_CHECK_LIB([z],[gzopen],[],[])
#AC_CHECK_LIB([crypto],[OPENSSL_init], crypto=yes, [AC_MSG_ERROR([OpenSSL crypto library required])])
#AC_CHECK_LIB([ssl],[SSL_new], ssl=yes, ssl=no)
# AC_CHECK_LIB([curl], [curl_multi_timeout],
# have_libcurl=yes,
# have_libcurl=no AC_MSG_ERROR([curl library required])


@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.1.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.4.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
@@ -601,8 +601,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.1'
PACKAGE_STRING='cpuminer-opt 25.1'
PACKAGE_VERSION='25.4'
PACKAGE_STRING='cpuminer-opt 25.4'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -650,10 +650,6 @@ JANSSON_LIBS
LIBCURL_CPPFLAGS
LIBCURL_CFLAGS
LIBCURL
X86_64_APPLE_FALSE
X86_64_APPLE_TRUE
ARM64_APPLE_FALSE
ARM64_APPLE_TRUE
HAVE_APPLE_FALSE
HAVE_APPLE_TRUE
MINGW_FALSE
@@ -662,8 +658,6 @@ ARCH_ARM64_FALSE
ARCH_ARM64_TRUE
ARCH_x86_64_FALSE
ARCH_x86_64_TRUE
ARCH_x86_FALSE
ARCH_x86_TRUE
USE_ASM_FALSE
USE_ASM_TRUE
HAVE_WINDOWS_FALSE
@@ -796,7 +790,6 @@ enable_maintainer_mode
enable_dependency_tracking
enable_assembly
with_curl
with_crypto
'
ac_precious_vars='build_alias
host_alias
@@ -1359,7 +1352,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
'configure' configures cpuminer-opt 25.1 to adapt to many kinds of systems.
'configure' configures cpuminer-opt 25.4 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1431,7 +1424,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.4:";;
esac
cat <<\_ACEOF
@@ -1454,7 +1447,6 @@ Optional Packages:
--with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
--without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
--with-curl=PATH prefix where curl is installed default=/usr
--with-crypto=PATH prefix where openssl crypto is installed default=/usr
Some influential environment variables:
CC C compiler command
@@ -1536,7 +1528,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 25.1
cpuminer-opt configure 25.4
generated by GNU Autoconf 2.72
Copyright (C) 2023 Free Software Foundation, Inc.
@@ -1957,7 +1949,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.1, which was
It was created by cpuminer-opt $as_me 25.4, which was
generated by GNU Autoconf 2.72. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3772,7 +3764,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='25.1'
VERSION='25.4'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6464,22 +6456,11 @@ fi
case $target in
i*86-*-*)
have_x86=true
;;
aarch64-apple-*|arm64-apple-*)
have_arm64=true
have_arm64_apple=true
;;
x86_64-apple-*|amd64-apple-*)
have_x86_64=true
have_x86_64_apple=true
;;
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_arm4=true
have_arm64=true
;;
powerpc*-*-*)
have_ppc=true
@@ -6512,130 +6493,7 @@ printf "%s\n" "#define USE_ASM 1" >>confdefs.h
fi
if test x$enable_assembly != xno -a x$have_x86_64 = xtrue
then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX code" >&5
printf %s "checking whether we can compile AVX code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vmovdqa %ymm0, %ymm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile XOP code" >&5
printf %s "checking whether we can compile XOP code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vprotd \$7, %xmm0, %xmm1");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_XOP 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else case e in #(
e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the XOP instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the XOP instruction set." >&2;}
;;
esac
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX2 code" >&5
printf %s "checking whether we can compile AVX2 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vpaddd %ymm0, %ymm1, %ymm2");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX2 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 code" >&5
printf %s "checking whether we can compile AVX512 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main (void)
{
asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"
then :
printf "%s\n" "#define USE_AVX512 1" >>confdefs.h
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else case e in #(
e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX512 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX512 instruction set." >&2;}
;;
esac
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
else case e in #(
e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX2 instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX2 instruction set." >&2;}
;;
esac
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
else case e in #(
e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX instruction set." >&5
printf "%s\n" "$as_me: WARNING: The assembler does not support the AVX instruction set." >&2;}
;;
esac
fi
rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
fi
# jansson test fails on Linux/Mingw, handled in Makefile.am.
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for json_loads in -ljansson" >&5
printf %s "checking for json_loads in -ljansson... " >&6; }
if test ${ac_cv_lib_jansson_json_loads+y}
@@ -6888,9 +6746,6 @@ esac
fi
#LDFLAGS="$PTHREAD_LDFLAGS $LDFLAGS"
# PTHREAD_LIBS="$PTHREAD_LIBS"
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether __uint128_t is supported" >&5
printf %s "checking whether __uint128_t is supported... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
@@ -6945,14 +6800,6 @@ else
USE_ASM_FALSE=
fi
if test x$have_x86 = xtrue; then
ARCH_x86_TRUE=
ARCH_x86_FALSE='#'
else
ARCH_x86_TRUE='#'
ARCH_x86_FALSE=
fi
if test x$have_x86_64 = xtrue; then
ARCH_x86_64_TRUE=
ARCH_x86_64_FALSE='#'
@@ -6985,28 +6832,6 @@ else
HAVE_APPLE_FALSE=
fi
if test x$have_arm64_apple = xtrue; then
ARM64_APPLE_TRUE=
ARM64_APPLE_FALSE='#'
else
ARM64_APPLE_TRUE='#'
ARM64_APPLE_FALSE=
fi
if test x$have_x86_64_apple = xtrue; then
X86_64_APPLE_TRUE=
X86_64_APPLE_FALSE='#'
else
X86_64_APPLE_TRUE='#'
X86_64_APPLE_FALSE=
fi
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
else
JANSSON_LIBS=-ljansson
fi
# libcurl install path (for mingw : --with-curl=/usr/local)
@@ -7024,30 +6849,10 @@ if test -n "$with_curl" ; then
LIBCURL="-lcurl -lz"
fi
# SSL install path (for mingw : --with-crypto=/usr/local/ssl)
# Check whether --with-crypto was given.
if test ${with_crypto+y}
then :
withval=$with_crypto;
fi
if test -n "$with_crypto" ; then
LIBCURL_CFLAGS="$LIBCURL_CFLAGS -I$with_crypto/include"
LIBCURL_CPPFLAGS="$LIBCURL_CPPFLAGS -I$with_crypto/include"
LIBCURL_LDFLAGS="-L$with_crypto/lib $LIBCURL_LDFLAGS"
LIBCURL="$LIBCURL -lssl -lcrypto"
fi
CFLAGS="$CFLAGS $LIBCURL_CFLAGS"
CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"
#AC_CHECK_LIB([z],[gzopen],[],[])
#AC_CHECK_LIB([crypto],[OPENSSL_init], crypto=yes, [AC_MSG_ERROR([OpenSSL crypto library required])])
#AC_CHECK_LIB([ssl],[SSL_new], ssl=yes, ssl=no)
# AC_CHECK_LIB([curl], [curl_multi_timeout],
# have_libcurl=yes,
# have_libcurl=no AC_MSG_ERROR([curl library required])
@@ -7236,10 +7041,6 @@ if test -z "${USE_ASM_TRUE}" && test -z "${USE_ASM_FALSE}"; then
as_fn_error $? "conditional \"USE_ASM\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_TRUE}" && test -z "${ARCH_x86_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARCH_x86_64_TRUE}" && test -z "${ARCH_x86_64_FALSE}"; then
as_fn_error $? "conditional \"ARCH_x86_64\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -7256,14 +7057,6 @@ if test -z "${HAVE_APPLE_TRUE}" && test -z "${HAVE_APPLE_FALSE}"; then
as_fn_error $? "conditional \"HAVE_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARM64_APPLE_TRUE}" && test -z "${ARM64_APPLE_FALSE}"; then
as_fn_error $? "conditional \"ARM64_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${X86_64_APPLE_TRUE}" && test -z "${X86_64_APPLE_FALSE}"; then
as_fn_error $? "conditional \"X86_64_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0
@@ -7657,7 +7450,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 25.1, which was
This file was extended by cpuminer-opt $as_me 25.4, which was
generated by GNU Autoconf 2.72. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7725,7 +7518,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 25.1
cpuminer-opt config.status 25.4
configured by $0, generated by GNU Autoconf 2.72,
with options \\"\$ac_cs_config\\"


@@ -152,7 +152,7 @@ int stratum_thr_id = -1;
int api_thr_id = -1;
bool stratum_need_reset = false;
struct work_restart *work_restart = NULL;
struct stratum_ctx stratum;
struct stratum_ctx stratum = {0};
double opt_diff_factor = 1.0;
double opt_target_factor = 1.0;
uint32_t zr5_pok = 0;
@@ -187,7 +187,7 @@ static bool opt_api_enabled = false;
char *opt_api_allow = NULL;
int opt_api_listen = 0;
int opt_api_remote = 0;
char *default_api_allow = "127.0.0.1";
const char *default_api_allow = "127.0.0.1";
int default_api_listen = 4048;
pthread_mutex_t applog_lock;
@@ -286,15 +286,15 @@ static inline void drop_policy(void) { }
static void affine_to_cpu( struct thr_info *thr )
{
int thread = thr->id;
unsigned long last_error;
bool ok;
unsigned long last_error = 0;
bool ok = true;
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
unsigned long group_size = GetActiveProcessorCount( 0 );
unsigned long group = thread / group_size;
unsigned long cpu = thread_affinity_map[ thread % group_size ];
GROUP_AFFINITY affinity;
GROUP_AFFINITY affinity = {0};
affinity.Group = group;
affinity.Mask = 1ULL << cpu;
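/* Worked illustration (hypothetical machine with 64 logical CPUs per group):
   thread 70 maps to group = 70 / 64 = 1 and cpu = thread_affinity_map[ 70 % 64 ],
   so the mask selects that single CPU within group 1. */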
@@ -320,8 +320,7 @@ static void affine_to_cpu( struct thr_info *thr )
{
last_error = GetLastError();
if ( !thread )
applog( LOG_WARNING, "Set affinity returned error 0x%x for thread %d",
last_error, thread );
applog( LOG_WARNING, "Set affinity returned error 0x%x", last_error );
}
}
@@ -871,9 +870,9 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
work->tx_count = tx_count;
/* assemble block header */
algo_gate.build_block_header( work, swab32( version ),
algo_gate.build_block_header( work, bswap_32( version ),
(uint32_t*) prevhash, (uint32_t*) merkle_tree,
swab32( curtime ), le32dec( &bits ),
bswap_32( curtime ), le32dec( &bits ),
final_sapling_hash );
if ( unlikely( !jobj_binary( val, "target", target, sizeof(target) ) ) )
@@ -1774,7 +1773,7 @@ static bool get_work(struct thr_info *thr, struct work *work)
// why 74? std cmp_size is 76, std data is 128
for ( int n = 0; n < 74; n++ ) ( (char*)work->data )[n] = n;
work->data[algo_gate.ntime_index] = swab32(ts); // ntime
work->data[algo_gate.ntime_index] = bswap_32(ts); // ntime
// this overwrites much of the for loop init
memset( work->data + algo_gate.nonce_index, 0x00, 52); // nonce..nonce+52
@@ -2010,36 +2009,37 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
{
bool new_job;
pthread_rwlock_wrlock( &g_work_lock );
pthread_mutex_lock( &sctx->work_lock );
new_job = sctx->new_job; // otherwise just increment extranonce2
sctx->new_job = false;
pthread_rwlock_wrlock( &g_work_lock );
free( g_work->job_id );
g_work->job_id = strdup( sctx->job.job_id );
g_work->xnonce2_len = sctx->xnonce2_size;
g_work->xnonce2 = (uchar*) realloc( g_work->xnonce2, sctx->xnonce2_size );
g_work->height = sctx->block_height;
g_work->targetdiff = sctx->job.diff
/ ( opt_target_factor * opt_diff_factor );
memcpy( g_work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size );
algo_gate.build_extraheader( g_work, sctx );
net_diff = nbits_to_diff( g_work->data[ algo_gate.nbits_index ] );
algo_gate.set_work_data_endian( g_work );
g_work->height = sctx->block_height;
g_work->targetdiff = sctx->job.diff
/ ( opt_target_factor * opt_diff_factor );
diff_to_hash( g_work->target, g_work->targetdiff );
g_work_time = time(NULL);
restart_threads();
pthread_rwlock_unlock( &g_work_lock );
// Pre increment extranonce2 in case of being called again before receiving
// a new job
for ( int t = 0;
t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] );
t++ );
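/* Illustration with a hypothetical 2-byte extranonce2 of { 0xff, 0x00 }:
   ++xnonce2[0] wraps to 0x00, so the loop carries into xnonce2[1], leaving
   { 0x00, 0x01 }; any non-zero increment result stops the carry. */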
g_work_time = time(NULL);
restart_threads();
pthread_mutex_unlock( &sctx->work_lock );
pthread_rwlock_unlock( &g_work_lock );
pthread_mutex_lock( &stats_lock );
@@ -2073,7 +2073,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
// Update data and calculate new estimates.
if ( ( stratum_diff != sctx->job.diff )
|| ( last_block_height != sctx->block_height ) )
|| ( last_block_height != sctx->block_height ) )
{
if ( unlikely( !session_first_block ) )
session_first_block = stratum.block_height;
@@ -2190,7 +2190,7 @@ static void *miner_thread( void *userdata )
}
// wait for stratum to send first job
if ( have_stratum ) while ( unlikely( stratum_down ) )
if ( have_stratum ) while ( unlikely( !stratum.job.job_id ) )
{
if ( opt_debug )
applog( LOG_INFO, "Thread %d waiting for first job", thr_id );
@@ -2204,7 +2204,6 @@ static void *miner_thread( void *userdata )
{
uint64_t hashes_done;
struct timeval tv_start, tv_end, diff;
// int64_t max64 = 1000;
int nonce_found = 0;
if ( have_stratum )
@@ -2230,13 +2229,6 @@ static void *miner_thread( void *userdata )
}
else if ( !opt_benchmark ) // GBT or getwork
{
// max64 is used to set end_nonce to match the scantime.
// It also factors the nonce range to end the scan when nonces are
// exhausted. In either case needing new work can be assumed.
// Only problem is every thread will call get_work.
// First thread resets scantime blocking all subsequent threads
// from fetching new work.
pthread_rwlock_wrlock( &g_work_lock );
const time_t now = time(NULL);
if ( ( ( now - g_work_time ) >= opt_scantime )
@@ -2873,12 +2865,12 @@ static bool cpu_capability( bool display_only )
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_avx512 = false;
bool sw_has_avx10_256 = false;
bool sw_has_avx10_512 = false;
bool sw_has_avx10 = false;
bool sw_has_aes = false;
bool sw_has_vaes = false;
bool sw_has_sha256 = false; // x86_64 or AArch64
bool sw_has_sha512 = false; // x86_64 or AArch64
/*
set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
@@ -2901,7 +2893,7 @@ static bool cpu_capability( bool display_only )
bool use_sha512;
bool use_neon;
bool use_none;
*/
#if defined(__x86_64__)
sw_has_x86_64 = true;
#elif defined(__aarch64__)
@@ -2913,6 +2905,7 @@ static bool cpu_capability( bool display_only )
sw_arm_arch = __ARM_ARCH;
#endif
#endif
// x86_64 only
#if defined(__SSE2__)
sw_has_sse2 = true;
@@ -2935,15 +2928,18 @@ static bool cpu_capability( bool display_only )
#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
sw_has_avx512 = true;
#endif
#if defined(__AVX10_1_256__)
sw_has_avx10_256 = true;
#endif
#if defined(__AVX10_1_512__)
sw_has_avx10_512 = true;
// AVX10 version is not significant as of AVX10.2. If that changes, use a better
// way to test the version than sequentially.
// #if defined(__AVX10_2__)
//
// #elif defined(__AVX10_1__)
#if defined(__AVX10_1__)
sw_has_avx10 = true;
#endif
// x86_64 or AArch64
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
sw_has_aes = true;
sw_has_aes = true;
#endif
#ifdef __VAES__
sw_has_vaes = true;
@@ -2954,6 +2950,7 @@ static bool cpu_capability( bool display_only )
#if defined(__SHA512__) || defined(__ARM_FEATURE_SHA512)
sw_has_sha512 = true;
#endif
// AArch64 only
#if defined(__ARM_NEON)
sw_has_neon = true;
@@ -2971,16 +2968,20 @@ static bool cpu_capability( bool display_only )
sw_has_sme2 = true;
#endif
// CPU
cpu_brand_string( cpu_brand );
printf( "CPU: %s\n", cpu_brand );
printf("SW built on " __DATE__
// Build
printf( "SW built on " __DATE__
#if defined(__clang__)
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__, __clang_patchlevel__);
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__,
__clang_patchlevel__ );
#elif defined(__GNUC__)
" with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
" with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ );
#endif
// OS
#if defined(__linux)
printf(" Linux\n");
#elif defined(WIN32)
@@ -3001,8 +3002,7 @@ static bool cpu_capability( bool display_only )
printf("CPU features: ");
if ( cpu_arch_x86_64() )
{
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d", avx10_version(),
avx10_vector_length() );
if ( cpu_has_avx10 ) printf( " AVX10.%d", avx10_version() );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
@@ -3027,8 +3027,7 @@ static bool cpu_capability( bool display_only )
printf("\nSW features: ");
if ( sw_has_x86_64 )
{
if ( sw_has_avx10_512 ) printf( " AVX10-512" );
else if ( sw_has_avx10_256 ) printf( " AVX10-256" );
if ( sw_has_avx10 ) printf( " AVX10 " );
else if ( sw_has_avx512 ) printf( " AVX512" );
else if ( sw_has_avx2 ) printf( " AVX2 " );
else if ( sw_has_avx ) printf( " AVX " );
@@ -3051,123 +3050,15 @@ static bool cpu_capability( bool display_only )
if ( sw_has_sha512 ) printf( " SHA512" );
else if ( sw_has_sha256 ) printf( " SHA256" );
if ( !display_only )
{
printf("\nAlgo features:");
if ( algo_features == EMPTY_SET ) printf( " None" );
else
{
if ( algo_has_avx512 ) printf( " AVX512" );
else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_neon ) printf( " NEON" );
if ( algo_has_vaes ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha512 ) printf( " SHA512" );
else if ( algo_has_sha256 ) printf( " SHA256" );
}
}
printf("\n");
if ( display_only ) return true;
// Determine mining options
use_sse2 = cpu_has_sse2 && sw_has_sse2 && algo_has_sse2;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
use_avx = cpu_has_avx && sw_has_avx && algo_has_avx;
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes;
use_sha256 = cpu_has_sha256 && sw_has_sha256 && algo_has_sha256;
use_sha512 = cpu_has_sha512 && sw_has_sha512 && algo_has_sha512;
use_neon = sw_has_aarch64 && sw_has_neon && algo_has_neon;
use_none = !( use_sse2 || use_sse42 || use_avx || use_aes || use_avx512
|| use_avx2 || use_sha256 || use_vaes || use_sha512 || use_neon );
// Display best options
if ( !use_none )
{
applog_nl( "Enabled optimizations:" );
if ( use_neon ) printf( " NEON" );
if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" );
else if ( use_avx ) printf( " AVX" );
else if ( use_sse42 ) printf( " SSE42" );
else if ( use_sse2 ) printf( " SSE2" );
if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" );
if ( use_sha512 ) printf( " SHA512" );
else if ( use_sha256 ) printf( " SHA256" );
printf( "\n" );
}
return true;
}
void show_version_and_exit(void)
{
printf("\n built on " __DATE__
#ifdef _MSC_VER
" with VC++ 2013\n");
#elif defined(__GNUC__)
" with GCC");
printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
#endif
printf(" features:"
#if defined(USE_ASM) && defined(__i386__)
" i386"
#endif
#if defined(USE_ASM) && defined(__x86_64__)
" x86_64"
#endif
#if defined(USE_ASM) && (defined(__i386__) || defined(__x86_64__))
" SSE2"
#endif
#if defined(__x86_64__) && defined(USE_AVX)
" AVX"
#endif
#if defined(__x86_64__) && defined(USE_AVX2)
" AVX2"
#endif
#if defined(__x86_64__) && defined(USE_XOP)
" XOP"
#endif
#if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__)
" ARM"
#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \
defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \
defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \
defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \
defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \
defined(__ARM_ARCH_7__) || \
defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \
defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__)
" ARMv5E"
#endif
#if defined(__ARM_NEON__)
" NEON"
#endif
#endif
"\n\n");
printf("%s\n", curl_version());
#ifdef JANSSON_VERSION
printf("jansson/%s ", JANSSON_VERSION);
#endif
#ifdef PTW32_VERSION
printf("pthreads/%d.%d.%d.%d ", PTW32_VERSION);
#endif
printf("\n");
exit(0);
}
void show_usage_and_exit(int status)
{
if (status)
fprintf(stderr, "Try `--help' for more information.\n");
// fprintf(stderr, "Try `" PACKAGE_NAME " --help' for more information.\n");
else
printf(usage);
exit(status);
@@ -3183,7 +3074,6 @@ void parse_arg(int key, char *arg )
{
char *p;
int v, i;
// uint64_t ul;
double d;
switch( key )
@@ -3242,7 +3132,7 @@ void parse_arg(int key, char *arg )
else if ( arg )
{
/* port or 0 to disable */
opt_api_allow = default_api_allow;
opt_api_allow = (char*)default_api_allow;
opt_api_listen = atoi(arg);
}
break;
@@ -3278,7 +3168,7 @@ void parse_arg(int key, char *arg )
// debug overrides quiet
case 'q': // quiet
if ( !( opt_debug || opt_protocol ) ) opt_quiet = true;
opt_quiet = !( opt_debug || opt_protocol );
break;
case 'D': // debug
opt_debug = true;
@@ -3327,7 +3217,8 @@ void parse_arg(int key, char *arg )
free(rpc_user);
rpc_user = strdup(arg);
break;
case 'o': // url
case 'o': // url
{
char *ap, *hp;
ap = strstr( arg, "://" );
@@ -3392,7 +3283,8 @@ void parse_arg(int key, char *arg )
have_stratum = !opt_benchmark && !strncasecmp( rpc_url, "stratum", 7 );
break;
}
case 'O': // userpass
case 'O': // userpass
p = strchr(arg, ':');
if (!p)
{
@@ -3552,10 +3444,10 @@ void parse_arg(int key, char *arg )
case 1029: // stratum-keepalive
opt_stratum_keepalive = true;
break;
case 'V':
case 'V': // version
display_cpu_capability();
exit(0);
case 'h':
case 'h': // help
show_usage_and_exit(0);
default:
@@ -3864,12 +3756,23 @@ int main(int argc, char *argv[])
}
#endif
#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#if defined(WIN32)
#if defined(_WIN32_WINNT)
if (opt_debug)
applog( LOG_INFO, "_WIN232_WINNT = 0x%04x", _WIN32_WINNT );
#else
if (opt_debug)
applog( LOG_INFO, "_WIN232_WINNT undefined." );
#endif
#if defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#endif
#endif
conditional_state = malloc( opt_n_threads * ((sizeof(bool)) ) );
memset( conditional_state, 0, opt_n_threads * ((sizeof(bool)) ) );
@@ -3890,7 +3793,7 @@ int main(int argc, char *argv[])
if ( cpu < num_cpus ) active_cpus++;
}
if ( opt_n_threads > active_cpus )
applog( LOG_WARNING, "Affinity: more threads (%d) than active CPUs (%d)", opt_n_threads, active_cpus );
applog( LOG_WARNING, "More miner threads (%d) than active CPUs in affinity mask (%d)", opt_n_threads, active_cpus );
if ( !opt_quiet )
{
char affinity_mask[64];

miner.h

@@ -8,8 +8,8 @@
#define USER_AGENT_ARCH "x64" // Intel, AMD x86_64
#elif defined(__aarch64__)
#define USER_AGENT_ARCH "arm" // AArch64
//#elif
// #define USER_AGENT_ARCH "r5" // RISC-V
#elif defined(__riscv)
#define USER_AGENT_ARCH "rv" // RISC-V
#else
#define USER_AGENT_ARCH
#endif
@@ -65,7 +65,7 @@
# endif
#endif
// no mm_maloc for Neon
// no mm_malloc for Neon
#if !defined(__ARM_NEON)
#include <mm_malloc.h>
@@ -173,6 +173,7 @@ static inline bool is_windows(void)
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#endif
#if 0
// deprecated, see simd-int.h
#if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
#define WANT_BUILTIN_BSWAP
@@ -183,6 +184,7 @@ static inline bool is_windows(void)
*/
#endif
/*
static inline uint32_t swab32(uint32_t x)
{
#ifdef WANT_BUILTIN_BSWAP
@@ -195,6 +197,8 @@ static inline uint32_t swab32(uint32_t x)
// return bswap_32(v);
#endif
}
*/
#endif
// Swap any two variables of the same type without using a temp
#define swap_vars(a,b) a^=b; b^=a; a^=b;
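A tiny usage sketch of the XOR swap the macro relies on (values are hypothetical; note the macro expands to three separate statements, so give it its own braces when used inside an unbraced if/else):

   uint32_t a = 0x11111111, b = 0x22222222;
   swap_vars( a, b );   // now a == 0x22222222 and b == 0x11111111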
@@ -289,26 +293,6 @@ static inline void le16enc(void *pp, uint16_t x)
json_t* json_load_url(char* cfg_url, json_error_t *err);
//void sha256_init(uint32_t *state);
//void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
//void sha256d(unsigned char *hash, const unsigned char *data, int len);
#ifdef USE_ASM
#if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__)
#define HAVE_SHA256_4WAY 1
int sha256_use_4way();
void sha256_init_4way(uint32_t *state);
void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
#endif
//#if defined(__x86_64__) && defined(USE_AVX2)
#if defined(__x86_64__) && defined(__AVX2__)
#define HAVE_SHA256_8WAY 1
int sha256_use_8way();
void sha256_init_8way(uint32_t *state);
void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap);
#endif
#endif
struct work;
void work_free(struct work *w);
@@ -851,10 +835,9 @@ Options:\n\
-a, --algo=ALGO specify the algorithm to use\n\
allium Garlicoin (GRLC)\n\
anime Animecoin (ANI)\n\
argon2 Argon2 Coin (AR2)\n\
argon2d250\n\
argon2d500 argon2d-dyn, Dynamic (DYN)\n\
argon2d4096 argon2d-uis, Unitus (UIS)\n\
argon2d500\n\
argon2d4096\n\
axiom Shabal-256 MemoHash\n\
blake blake256r14 (SFR)\n\
blake2b Blake2b 256\n\


@@ -140,28 +140,28 @@
#include <stdint.h>
#include <stddef.h>
// AVX512 macros are not a reliable indicator of 512 bit vector capability
// because they get defined with AVX10_1_256 which doesn't support 512 bit.
// EVEX512 is also unreliable as it can also be defined when 512b is not
// available.
// Use AVX10_1_512 for 512b & AVX10_1_256 for 256b whenever AVX10 is present.
// Use AVX512 macros only without AVX10.
/*
// Test for macros
#ifdef __AVX10_1__
// Test for AVX10 macros
// AVX10-256 was abandoned by Intel before any CPUs were built.
#ifdef __AVX10__ // does not exist
#warning "__AVX10__"
#endif
#ifdef __AVX10_1__ // GCC-14
#warning "__AVX10_1__"
#endif
#ifdef __AVX10_1_256__
#ifdef __AVX10_2__ // GCC-15
#warning "__AVX10_2__"
#endif
#ifdef __AVX10_1_256__ // obsolete
#warning "__AVX10_1_256__"
#endif
#ifdef __AVX10_1_512__
#warning "__AVX10_1_512__"
#ifdef __AVX10_1_512__
#warning "__AVX10_1_512__" // does not exist
#endif
#ifdef __EVEX256__
#warning "__EVEX256__"
#ifdef __EVEX256__ // likely obsolete
#warning "__EVEX256__"
#endif
#ifdef __EVEX512__
#ifdef __EVEX512__ // likely obsolete
#warning "__EVEX512__"
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -169,27 +169,14 @@
#endif
*/
// SIMD512: Use 512, 256 & 128 bit vectors, AVX512VBMI is not included and
// must be tested separately.
// VL256: Include AVX512VL instructions for 256 & 128 bit vectors.
// VBMI: Include AVX512VBMI instructions for supported vector lengths.
#if defined(__AVX10_1__)
#define VL256 1
#define VBMI 1
#if defined(__AVX10_1_512__)
#define SIMD512 1
#endif
#elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define VL256 1
// With Intel abandoning AVX10-256 the SIMD512 & VL256 macros are almost
// identical with the only difference being VBMI is included in VL256.
#if defined(__AVX10_1__) || ( defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) )
#define SIMD512 1
#if defined(__AVX512VBMI__)
#define VL256 1
#if defined(__AVX10_1__) || defined(__AVX512VBMI__)
#define VBMI 1
#endif
#endif
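A rough sketch of how these internal feature macros are meant to be consumed by the vectorized code; the consuming macro name below is hypothetical:

   // Pick the widest lane count the build supports; illustrative only.
   #if defined(SIMD512)
     #define HASH_LANES 16   // 512-bit vectors
   #elif defined(__AVX2__)
     #define HASH_LANES 8    // 256-bit vectors
   #else
     #define HASH_LANES 4    // 128-bit vectors
   #endif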
/*
@@ -204,13 +191,75 @@
#endif
*/
// targeted intrinsics
#if defined(__x86_64__)
#include <x86intrin.h>
#elif defined(__aarch64__) && defined(__ARM_NEON)
#include <arm_neon.h>
#elif defined(__riscv) && defined(__riscv_vector)
#include <riscv_vector.h>
#endif
#include <x86intrin.h>
// Single global definition for frequently used vector constants.
// The GCC optimizer can merge constants but merging different vector lengths
// might be beyond its scope.
#elif defined(__aarch64__)
// Frequently used SSE/AVX shuffle constants.
#if defined(SIMD512)
#include <arm_neon.h>
// When used with shuffle_epi8 it performs a standard bswap of all elements.
// When used with permutexvar_epi8 (requires AVX512VBMI or AVX10) it performs a
// bswap of the elements in the lower 128 bits of the source and broadcasts
// the result to all 128 bit lanes of the destination.
extern const __m512i V512_BSWAP64;
#define V256_BSWAP64 _mm512_castsi512_si256( V512_BSWAP64 )
#define V128_BSWAP64 _mm512_castsi512_si128( V512_BSWAP64 )
extern const __m512i V512_BSWAP32;
#define V256_BSWAP32 _mm512_castsi512_si256( V512_BSWAP32 )
#define V128_BSWAP32 _mm512_castsi512_si128( V512_BSWAP32 )
#elif defined(__AVX2__)
extern const __m256i V256_BSWAP64;
#define V128_BSWAP64 _mm256_castsi256_si128( V256_BSWAP64 )
extern const __m256i V256_BSWAP32;
#define V128_BSWAP32 _mm256_castsi256_si128( V256_BSWAP32 )
// These shuffles aren't needed with AVX512, which uses ror/rol instead.
extern const __m256i V256_SHUFLR64_8;
#define V128_SHUFLR64_8 _mm256_castsi256_si128( V256_SHUFLR64_8 )
extern const __m256i V256_SHUFLR64_24;
#define V128_SHUFLR64_24 _mm256_castsi256_si128( V256_SHUFLR64_24 )
extern const __m256i V256_SHUFLL64_8;
#define V128_SHUFLL64_8 _mm256_castsi256_si128( V256_SHUFLL64_8 )
extern const __m256i V256_SHUFLL64_24;
#define V128_SHUFLL64_24 _mm256_castsi256_si128( V256_SHUFLL64_24 )
extern const __m256i V256_SHUFLR32_8;
#define V128_SHUFLR32_8 _mm256_castsi256_si128( V256_SHUFLR32_8 )
extern const __m256i V256_SHUFLL32_8;
#define V128_SHUFLL32_8 _mm256_castsi256_si128( V256_SHUFLL32_8 )
#elif defined(__SSSE3__)
extern const __m128i V128_BSWAP64;
extern const __m128i V128_BSWAP32;
extern const __m128i V128_SHUFLR64_8;
extern const __m128i V128_SHUFLR64_24;
extern const __m128i V128_SHUFLL64_8;
extern const __m128i V128_SHUFLL64_24;
extern const __m128i V128_SHUFLR32_8;
extern const __m128i V128_SHUFLL32_8;
#endif
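A hedged sketch of the pattern these extern declarations imply: one non-inline definition compiled in a single translation unit (presumably the simd-utils/simd-constants.c source added in this release) that every including file references, instead of re-materializing the same _mm_set_epi64x literal at each call site. The file placement and GCC/Clang brace initialization are assumptions; the byte pattern matches the old inline literal for a 32-bit byte swap.

   #include <immintrin.h>

   // Defined once; element 0 is the low 64 bits, equivalent to
   // _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ).
   const __m128i V128_BSWAP32 = { 0x0405060700010203LL, 0x0c0d0e0f08090a0bLL };

   // Users then shuffle with the shared constant:
   //    x = _mm_shuffle_epi8( x, V128_BSWAP32 );   // bswap each 32-bit element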
@@ -225,7 +274,7 @@
// x86_64 AVX512 512 bit vectors
#include "simd-utils/simd-512.h"
// aarch64 neon 128 bit vectors
// aarch64 NEON 128 bit vectors
#include "simd-utils/simd-neon.h"
#include "simd-utils/intrlv.h"


@@ -589,20 +589,7 @@ static inline void extr_lane_4x32( void *d, const void *s,
((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+60 ];
}
#if defined(__SSSE3__)
static inline void v128_bswap32_80( void *d, void *s )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), bswap_shuf );
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), bswap_shuf );
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), bswap_shuf );
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), bswap_shuf );
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), bswap_shuf );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
#if defined(__SSSE3__) || defined(__ARM_NEON)
static inline void v128_bswap32_80( void *d, void *s )
{
@@ -641,6 +628,8 @@ static inline void v128_bswap32_80( void *d, void *s )
#endif
#if defined(__SSE2__) || defined(__ARM_NEON)
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128u32_t s0 = casti_v128u32( src,0 );
@@ -649,27 +638,12 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
v128u32_t s3 = casti_v128u32( src,3 );
v128u32_t s4 = casti_v128u32( src,4 );
#if defined(__SSSE3__)
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
#else
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
#endif
casti_v128u32( d, 0 ) = v128_duplane32( s0, 0 );
casti_v128u32( d, 1 ) = v128_duplane32( s0, 1 );
casti_v128u32( d, 2 ) = v128_duplane32( s0, 2 );
@@ -696,6 +670,8 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
casti_v128u32( d,19 ) = v128_duplane32( s2, 3 );
}
#endif // SSE2 || NEON
// 8x32
#if defined(__AVX2__)
@@ -1112,8 +1088,6 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m256i c1 = v256_32( 1 );
const __m256i c2 = _mm256_add_epi32( c1, c1 );
const __m256i c3 = _mm256_add_epi32( c2, c1 );
@@ -1124,11 +1098,11 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_m256i( d, 0 ) = _mm256_broadcastd_epi32( s0 );
casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32(
@@ -1617,8 +1591,6 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m512i c1 = v512_32( 1 );
const __m512i c2 = _mm512_add_epi32( c1, c1 );
const __m512i c3 = _mm512_add_epi32( c2, c1 );
@@ -1628,11 +1600,11 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_m512i( d, 0 ) = _mm512_broadcastd_epi32( s0 );
casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( c1,
@@ -1878,6 +1850,8 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
#endif
#if defined(__SSE2__) || defined(__ARM_NEON)
static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
{
v128u64_t s0 = casti_v128u64( src,0 );
@@ -1886,27 +1860,12 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
v128u64_t s3 = casti_v128u64( src,3 );
v128u64_t s4 = casti_v128u64( src,4 );
#if defined(__SSSE3__)
const v128u64_t bswap_shuf = v128_set64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
#else
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
#endif
casti_v128u64( d,0 ) = v128_duplane64( s0, 0 );
casti_v128u64( d,1 ) = v128_duplane64( s0, 1 );
@@ -1923,6 +1882,8 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
casti_v128u64( d,9 ) = v128_duplane64( s4, 1 );
}
#endif // SSE2 || NEON
static inline void extr_lane_2x64( void *dst, const void *src,
const int lane, const int bit_len )
{
@@ -2233,25 +2194,23 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
const __m256i bswap_shuf = mm256_bcast_m128(
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
__m256i s0 = casti_m256i( src,0 );
__m256i s1 = casti_m256i( src,1 );
__m256i s0 = casti_m256i( src,0 ); // s0, s1
__m256i s2 = casti_m256i( src,1 ); // s2, s3
v128_t s4 = casti_v128( src,4 );
s0 = _mm256_shuffle_epi8( s0, bswap_shuf );
s1 = _mm256_shuffle_epi8( s1, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, _mm256_castsi256_si128( bswap_shuf ) );
s0 = mm256_bswap_32( s0 );
s2 = mm256_bswap_32( s2 );
s4 = v128_bswap32( s4 );
casti_m256i( d, 0 ) = _mm256_permute4x64_epi64( s0, 0x00 );
casti_m256i( d, 1 ) = _mm256_permute4x64_epi64( s0, 0x55 );
casti_m256i( d, 2 ) = _mm256_permute4x64_epi64( s0, 0xaa );
casti_m256i( d, 3 ) = _mm256_permute4x64_epi64( s0, 0xff );
casti_m256i( d, 4 ) = _mm256_permute4x64_epi64( s1, 0x00 );
casti_m256i( d, 5 ) = _mm256_permute4x64_epi64( s1, 0x55 );
casti_m256i( d, 6 ) = _mm256_permute4x64_epi64( s1, 0xaa );
casti_m256i( d, 7 ) = _mm256_permute4x64_epi64( s1, 0xff );
casti_m256i( d, 4 ) = _mm256_permute4x64_epi64( s2, 0x00 );
casti_m256i( d, 5 ) = _mm256_permute4x64_epi64( s2, 0x55 );
casti_m256i( d, 6 ) = _mm256_permute4x64_epi64( s2, 0xaa );
casti_m256i( d, 7 ) = _mm256_permute4x64_epi64( s2, 0xff );
casti_m256i( d, 8 ) = _mm256_permute4x64_epi64(
_mm256_castsi128_si256( s4 ), 0x00 );
@@ -2648,8 +2607,6 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
const __m512i c1 = v512_64( 1 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
@@ -2657,11 +2614,11 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_m512i( d,0 ) = _mm512_broadcastq_epi64( s0 );
casti_m512i( d,1 ) = _mm512_permutexvar_epi64( c1,
@@ -2842,49 +2799,45 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const __m512i bswap_shuf = mm512_bcast_m128(
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
const v128_t s0 = casti_v128( src,0 );
const v128_t s1 = casti_v128( src,1 );
const v128_t s2 = casti_v128( src,2 );
const v128_t s3 = casti_v128( src,3 );
const v128_t s4 = casti_v128( src,4 );
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s0 ),
bswap_shuf );
casti_m512i( d,1 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s1 ),
bswap_shuf );
casti_m512i( d,2 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s2 ),
bswap_shuf );
casti_m512i( d,3 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s3 ),
bswap_shuf );
casti_m512i( d,4 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s4 ),
bswap_shuf );
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
_mm512_castsi128_si512( s0 ) );
casti_m512i( d,1 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
_mm512_castsi128_si512( s1 ) );
casti_m512i( d,2 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
_mm512_castsi128_si512( s2 ) );
casti_m512i( d,3 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
_mm512_castsi128_si512( s3 ) );
casti_m512i( d,4 ) = _mm512_permutexvar_epi8( V512_BSWAP32,
_mm512_castsi128_si512( s4 ) );
}
#else
static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_m512i( d,0 ) = mm512_bcast_m128( s0 );
casti_m512i( d,1 ) = mm512_bcast_m128( s1 );
casti_m512i( d,2 ) = mm512_bcast_m128( s2 );
casti_m512i( d,3 ) = mm512_bcast_m128( s3 );
casti_m512i( d,4 ) = mm512_bcast_m128( s4 );
casti_m512i( d,0 ) = mm512_bcast128( s0 );
casti_m512i( d,1 ) = mm512_bcast128( s1 );
casti_m512i( d,2 ) = mm512_bcast128( s2 );
casti_m512i( d,3 ) = mm512_bcast128( s3 );
casti_m512i( d,4 ) = mm512_bcast128( s4 );
}
#endif // AVX512VBMI ELSE


@@ -521,29 +521,12 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#elif defined(__SSSE3__)
// SSSE3: fastest 32 bit, very fast 16, fast 8
#define v128_shuflr64_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x080f0e0d0c0b0a09, 0x0007060504030201 ) )
#define v128_shufll64_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0e0d0c0b0a09080f, 0x0605040302010007 ) )
#define v128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#define v128_shufll64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0b0a09080f0e0d, 0x0403020100070605 ) )
#define v128_shuflr32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#define v128_shufll32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0e0d0c0f0a09080b, 0x0605040702010003 ) )
#define v128_shuflr64_8( v ) _mm_shuffle_epi8( v, V128_SHUFLR64_8 )
#define v128_shufll64_8( v ) _mm_shuffle_epi8( v, V128_SHUFLL64_8 )
#define v128_shuflr64_24(v ) _mm_shuffle_epi8( v, V128_SHUFLR64_24 )
#define v128_shufll64_24(v ) _mm_shuffle_epi8( v, V128_SHUFLL64_24 )
#define v128_shuflr32_8( v ) _mm_shuffle_epi8( v, V128_SHUFLR32_8 )
#define v128_shufll32_8( v ) _mm_shuffle_epi8( v, V128_SHUFLL32_8 )
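A small worked check (illustrative) of why a fixed byte shuffle can replace an 8-bit rotate: rotating a 64-bit lane right by 8 just moves its low byte to the top, which is exactly the byte order V128_SHUFLR64_8 encodes.

   // 0x0807060504030201 rotated right by 8  ->  0x0108070605040302
   // so v128_ror64( v, 8 ) can be implemented as v128_shuflr64_8( v ).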
#define v128_ror64( v, c ) \
( (c) == 8 ) ? v128_shuflr64_8( v ) \
@@ -612,74 +595,6 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
// (v1 ^ v0) >>> n, ARM NEON has optimized version
#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )
/* not used
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.
#if defined(VL256)
#define v128_2ror64( v1, v0, c ) \
_mm_ror_epi64( v0, c ); \
_mm_ror_epi64( v1, c )
#define v128_2rol64( v1, v0, c ) \
_mm_rol_epi64( v0, c ); \
_mm_rol_epi64( v1, c )
#define v128_2ror32( v1, v0, c ) \
_mm_ror_epi32( v0, c ); \
_mm_ror_epi32( v1, c )
#define v128_2rol32( v1, v0, c ) \
_mm_rol_epi32( v0, c ); \
_mm_rol_epi32( v1, c )
#else // SSE2
#define v128_2ror64( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi64( v0, c ); \
__m128i t1 = _mm_srli_epi64( v1, c ); \
v0 = _mm_slli_epi64( v0, 64-(c) ); \
v1 = _mm_slli_epi64( v1, 64-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#define v128_2rol64( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi64( v0, c ); \
__m128i t1 = _mm_slli_epi64( v1, c ); \
v0 = _mm_srli_epi64( v0, 64-(c) ); \
v1 = _mm_srli_epi64( v1, 64-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#define v128_2ror32( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi32( v0, c ); \
__m128i t1 = _mm_srli_epi32( v1, c ); \
v0 = _mm_slli_epi32( v0, 32-(c) ); \
v1 = _mm_slli_epi32( v1, 32-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#define v128_2rol32( v1, v0, c ) \
{ \
__m128i t0 = _mm_slli_epi32( v0, c ); \
__m128i t1 = _mm_slli_epi32( v1, c ); \
v0 = _mm_srli_epi32( v0, 32-(c) ); \
v1 = _mm_srli_epi32( v1, 32-(c) ); \
v0 = _mm_or_si256( v0, t0 ); \
v1 = _mm_or_si256( v1, t1 ); \
}
#endif // AVX512 else SSE2
*/
// Cross lane shuffles
// No NEON version
@@ -721,13 +636,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0001020304050607, \
0x08090a0b0c0d0e0f ) )
#define v128_bswap64( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
#define v128_bswap64( v ) _mm_shuffle_epi8( v, V128_BSWAP64 )
#define v128_bswap32( v ) _mm_shuffle_epi8( v, V128_BSWAP32 )
#define v128_bswap32( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
#define v128_bswap16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
0x0607040502030001 )
@@ -735,85 +647,30 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define v128_block_bswap64( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define v128_block_bswap64_512 v128_block_bswap64
#define v128_block_bswap64_1024( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
casti_v128( d,0 ) = v128_bswap64( casti_v128( s,0 ) ); \
casti_v128( d,1 ) = v128_bswap64( casti_v128( s,1 ) ); \
casti_v128( d,2 ) = v128_bswap64( casti_v128( s,2 ) ); \
casti_v128( d,3 ) = v128_bswap64( casti_v128( s,3 ) ); \
casti_v128( d,4 ) = v128_bswap64( casti_v128( s,4 ) ); \
casti_v128( d,5 ) = v128_bswap64( casti_v128( s,5 ) ); \
casti_v128( d,6 ) = v128_bswap64( casti_v128( s,6 ) ); \
casti_v128( d,7 ) = v128_bswap64( casti_v128( s,7 ) ); \
}
// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
#define v128_block_bswap32( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
casti_v128( d,0 ) = v128_bswap32( casti_v128( s,0 ) ); \
casti_v128( d,1 ) = v128_bswap32( casti_v128( s,1 ) ); \
casti_v128( d,2 ) = v128_bswap32( casti_v128( s,2 ) ); \
casti_v128( d,3 ) = v128_bswap32( casti_v128( s,3 ) ); \
casti_v128( d,4 ) = v128_bswap32( casti_v128( s,4 ) ); \
casti_v128( d,5 ) = v128_bswap32( casti_v128( s,5 ) ); \
casti_v128( d,6 ) = v128_bswap32( casti_v128( s,6 ) ); \
casti_v128( d,7 ) = v128_bswap32( casti_v128( s,7 ) ); \
}
#define v128_block_bswap32_256 v128_block_bswap32
#define v128_block_bswap32_128( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
}
#define v128_block_bswap32_512( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
}
#else // SSE2
static inline v128_t v128_bswap64( __m128i v )
@@ -835,7 +692,7 @@ static inline v128_t v128_bswap16( __m128i v )
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}
#define v128_bswap128( v ) v128_qrev32( v128_bswap64( v ) )
#define v128_bswap128( v ) v128_rev64( v128_bswap64( v ) )
static inline void v128_block_bswap64( __m128i *d, const __m128i *s )
{
@@ -849,26 +706,6 @@ static inline void v128_block_bswap64( __m128i *d, const __m128i *s )
d[7] = v128_bswap64( s[7] );
}
static inline void v128_block_bswap64_1024( __m128i *d, const __m128i *s )
{
d[ 0] = v128_bswap64( s[ 0] );
d[ 1] = v128_bswap64( s[ 1] );
d[ 2] = v128_bswap64( s[ 2] );
d[ 3] = v128_bswap64( s[ 3] );
d[ 4] = v128_bswap64( s[ 4] );
d[ 5] = v128_bswap64( s[ 5] );
d[ 6] = v128_bswap64( s[ 6] );
d[ 7] = v128_bswap64( s[ 7] );
d[ 8] = v128_bswap64( s[ 8] );
d[ 9] = v128_bswap64( s[ 9] );
d[10] = v128_bswap64( s[10] );
d[11] = v128_bswap64( s[11] );
d[14] = v128_bswap64( s[12] );
d[13] = v128_bswap64( s[13] );
d[14] = v128_bswap64( s[14] );
d[15] = v128_bswap64( s[15] );
}
static inline void v128_block_bswap32( __m128i *d, const __m128i *s )
{
d[0] = v128_bswap32( s[0] );
@@ -882,26 +719,6 @@ static inline void v128_block_bswap32( __m128i *d, const __m128i *s )
}
#define v128_block_bswap32_256 v128_block_bswap32
static inline void v128_block_bswap32_512( __m128i *d, const __m128i *s )
{
d[ 0] = v128_bswap32( s[ 0] );
d[ 1] = v128_bswap32( s[ 1] );
d[ 2] = v128_bswap32( s[ 2] );
d[ 3] = v128_bswap32( s[ 3] );
d[ 4] = v128_bswap32( s[ 4] );
d[ 5] = v128_bswap32( s[ 5] );
d[ 6] = v128_bswap32( s[ 6] );
d[ 7] = v128_bswap32( s[ 7] );
d[ 8] = v128_bswap32( s[ 8] );
d[ 9] = v128_bswap32( s[ 9] );
d[10] = v128_bswap32( s[10] );
d[11] = v128_bswap32( s[11] );
d[12] = v128_bswap32( s[12] );
d[13] = v128_bswap32( s[13] );
d[14] = v128_bswap32( s[14] );
d[15] = v128_bswap32( s[15] );
}
#endif // SSSE3 else SSE2
// alignr instruction for 32 & 64 bit elements is only available with AVX512


@@ -61,8 +61,10 @@ typedef union
#if defined(__AVX2__)
// Broadcast, ie set1, from 128 bit vector input.
#define mm256_bcast_m128( v ) \
#define mm256_bcast128( v ) \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
// deprecated
#define mm256_bcast_m128 mm256_bcast128
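A minimal usage sketch of the renamed macro (values hypothetical); the permute index 0x44 selects 64-bit lanes 0,1,0,1, so the low 128 bits are duplicated into both halves:

   __m128i lane = _mm_set_epi64x( 0x2222222222222222LL, 0x1111111111111111LL );
   __m256i both = mm256_bcast128( lane );   // 256-bit result = { lane, lane }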
// Set either the low or high 64 bit elements in 128 bit lanes, other elements
// are set to zero.
@@ -73,23 +75,23 @@ typedef union
#else
#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( v128_mov64( i64 ) )
#define mm256_bcast128lo_64( i64 ) mm256_bcast128( v128_mov64( i64 ) )
#define mm256_bcast128hi_64( i64 ) _mm256_permute4x64_epi64( \
_mm256_castsi128_si256( v128_mov64( i64 ) ), 0x11 )
#endif
#define mm256_set2_64( i1, i0 ) mm256_bcast_m128( _mm_set_epi64x( i1, i0 ) )
#define mm256_set2_64( i1, i0 ) mm256_bcast128( _mm_set_epi64x( i1, i0 ) )
#define mm256_set4_32( i3, i2, i1, i0 ) \
mm256_bcast_m128( _mm_set_epi32( i3, i2, i1, i0 ) )
mm256_bcast128( _mm_set_epi32( i3, i2, i1, i0 ) )
// All SIMD constant macros are actually functions containing executable
// code and therefore can't be used as compile time initializers.
#define m256_zero _mm256_setzero_si256()
#define m256_one_128 mm256_bcast_m128( v128_one )
#define m256_one_128 mm256_bcast128( v128_one )
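// A hedged illustration (names hypothetical, not part of this header) of the
// distinction: a brace initializer is a constant expression, while the macros
// above expand to intrinsic calls and may only appear where code executes.
static const __m256i k_bswap64_sketch = { 0x0001020304050607, 0x08090a0b0c0d0e0f,
                                          0x0001020304050607, 0x08090a0b0c0d0e0f };
// static const __m256i k_zero_sketch = m256_zero;   // would not compile: not a constant
static inline __m256i zero_rt_sketch( void ) { return m256_zero; }   // fine at run time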
static inline __m256i mm256_neg1_fn()
{
@@ -231,21 +233,8 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_swap64_32 mm256_qrev32 // grandfathered
#define mm256_qrev16(v) mm256_shuffle16( v, 0x1b )
#define mm256_qrev8(v) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )
#define mm256_lrev16(v) mm256_shuffle16( v, 0xb1 )
#define mm256_lrev8(v) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )
#define mm256_wrev8(v) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
v128_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
//
// Bit rotations.
@@ -268,50 +257,33 @@ static inline __m256i mm256_not( const __m256i v )
#if defined(VL256)
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
#define mm256_ror_32 _mm256_ror_epi32
#define mm256_rol_32 _mm256_rol_epi32
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
#define mm256_ror_32 _mm256_ror_epi32
#define mm256_rol_32 _mm256_rol_epi32
// Redundant but naming may be a better fit in some applications.
#define mm126_shuflr64_8( v) _mm256_ror_epi64( v, 8 )
#define mm156_shufll64_8( v) _mm256_rol_epi64( v, 8 )
#define mm256_shuflr64_16(v) _mm256_ror_epi64( v, 16 )
#define mm256_shufll64_16(v) _mm256_rol_epi64( v, 16 )
#define mm256_shuflr64_24(v) _mm256_ror_epi64( v, 24 )
#define mm256_shufll64_24(v) _mm256_rol_epi64( v, 24 )
#define mm256_shuflr32_8( v) _mm256_ror_epi32( v, 8 )
#define mm256_shufll32_8( v) _mm256_rol_epi32( v, 8 )
#define mm256_shuflr32_16(v) _mm256_ror_epi32( v, 16 )
#define mm256_shufll32_16(v) _mm256_rol_epi32( v, 16 )
#define mm256_shuflr64_8( v) _mm256_ror_epi64( v, 8 )
#define mm256_shufll64_8( v) _mm256_rol_epi64( v, 8 )
#define mm256_shuflr64_16(v) _mm256_ror_epi64( v, 16 )
#define mm256_shufll64_16(v) _mm256_rol_epi64( v, 16 )
#define mm256_shuflr64_24(v) _mm256_ror_epi64( v, 24 )
#define mm256_shufll64_24(v) _mm256_rol_epi64( v, 24 )
#define mm256_shuflr32_8( v) _mm256_ror_epi32( v, 8 )
#define mm256_shufll32_8( v) _mm256_rol_epi32( v, 8 )
#define mm256_shuflr32_16(v) _mm256_ror_epi32( v, 16 )
#define mm256_shufll32_16(v) _mm256_rol_epi32( v, 16 )
#else
// ROR & ROL will always find the fastest but these names may be a better fit
// in some applications.
#define mm256_shuflr64_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) )
#define mm256_shufll64_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) )
#define mm256_shuflr64_24( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) )
#define mm256_shufll64_24( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) )
#define mm256_shuflr32_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) )
#define mm256_shufll32_8( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( \
_mm_set_epi64x( 0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) )
#define mm256_shuflr64_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLR64_8 )
#define mm256_shufll64_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLL64_8 )
#define mm256_shuflr64_24(v ) _mm256_shuffle_epi8( v, V256_SHUFLR64_24 )
#define mm256_shufll64_24(v ) _mm256_shuffle_epi8( v, V256_SHUFLL64_24 )
#define mm256_shuflr32_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLR32_8 )
#define mm256_shufll32_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLL32_8 )
#define mm256_ror_64( v, c ) \
( (c) == 8 ) ? mm256_shuflr64_8( v ) \
@@ -347,96 +319,6 @@ static inline __m256i mm256_not( const __m256i v )
#endif
//
// x2 macros rotate elements in 2 individual vectors as a double buffered
// optimization for AVX2; with AVX512 a single ror/rol instruction per vector
// makes the pairing unnecessary, they are kept here for transparency
// (see the sketch after this block).
#if defined(VL256)
/*
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
#define mm256_ror_32 _mm256_ror_epi32
#define mm256_rol_32 _mm256_rol_epi32
*/
#define mm256_rorx2_64( v1, v0, c ) \
_mm256_ror_epi64( v0, c ); \
_mm256_ror_epi64( v1, c )
#define mm256_rolx2_64( v1, v0, c ) \
_mm256_rol_epi64( v0, c ); \
_mm256_rol_epi64( v1, c )
#define mm256_rorx2_32( v1, v0, c ) \
_mm256_ror_epi32( v0, c ); \
_mm256_ror_epi32( v1, c )
#define mm256_rolx2_32( v1, v0, c ) \
_mm256_rol_epi32( v0, c ); \
_mm256_rol_epi32( v1, c )
#else // AVX2
/*
// use shuflr64 shuflr32 below for optimized bit rotations of multiples of 8.
#define mm256_ror_64( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
#define mm256_rol_64( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
_mm256_srli_epi64( v, 64-(c) ) )
#define mm256_ror_32( v, c ) \
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
_mm256_slli_epi32( v, 32-(c) ) )
#define mm256_rol_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
*/
#define mm256_rorx2_64( v1, v0, c ) \
{ \
__m256i t0 = _mm256_srli_epi64( v0, c ); \
__m256i t1 = _mm256_srli_epi64( v1, c ); \
v0 = _mm256_slli_epi64( v0, 64-(c) ); \
v1 = _mm256_slli_epi64( v1, 64-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#define mm256_rolx2_64( v1, v0, c ) \
{ \
__m256i t0 = _mm256_slli_epi64( v0, c ); \
__m256i t1 = _mm256_slli_epi64( v1, c ); \
v0 = _mm256_srli_epi64( v0, 64-(c) ); \
v1 = _mm256_srli_epi64( v1, 64-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#define mm256_rorx2_32( v1, v0, c ) \
{ \
__m256i t0 = _mm256_srli_epi32( v0, c ); \
__m256i t1 = _mm256_srli_epi32( v1, c ); \
v0 = _mm256_slli_epi32( v0, 32-(c) ); \
v1 = _mm256_slli_epi32( v1, 32-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#define mm256_rolx2_32( v1, v0, c ) \
{ \
__m256i t0 = _mm256_slli_epi32( v0, c ); \
__m256i t1 = _mm256_slli_epi32( v1, c ); \
v0 = _mm256_srli_epi32( v0, 32-(c) ); \
v1 = _mm256_srli_epi32( v1, 32-(c) ); \
v0 = _mm256_or_si256( v0, t0 ); \
v1 = _mm256_or_si256( v1, t1 ); \
}
#endif // AVX512 else AVX2
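// A hedged sketch (function name hypothetical, assumes <immintrin.h> as already
// included by this header) of the double buffered idea above: issuing the
// shift/or sequences of two independent rotations back to back lets them
// execute in parallel on AVX2; with AVX512VL a single ror instruction per
// vector makes the pairing unnecessary.
static inline void ror64_pair_sketch( __m256i *pa, __m256i *pb )
{
   __m256i ta = _mm256_srli_epi64( *pa, 17 );                   // a >> 17
   __m256i tb = _mm256_srli_epi64( *pb, 17 );                   // b >> 17
   *pa = _mm256_or_si256( _mm256_slli_epi64( *pa, 47 ), ta );   // a = a >>> 17
   *pb = _mm256_or_si256( _mm256_slli_epi64( *pb, 47 ), tb );   // b = b >>> 17
}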
#if defined(__AVX2__)
// 128 bit version of unpack
@@ -453,20 +335,14 @@ static inline __m256i mm256_not( const __m256i v )
//
// Cross lane shuffles
//
// Rotate elements accross all lanes.
#define mm256_shuffle_16( v, c ) \
_mm256_or_si256( _mm256_shufflehi_epi16( v, c ), \
_mm256_shufflelo_epi16( v, c ) )
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
#define mm256_rev_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
#define mm256_swap_128 mm256_rev_128 // grandfathered
// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Reverse 64 bit elements
/* not used
// Reverse elements
#define mm256_rev_64( v ) _mm256_permute4x64_epi64( v, 0x1b )
#define mm256_rev_32( v ) \
@@ -474,7 +350,12 @@ static inline __m256i mm256_not( const __m256i v )
0x0000000400000005, 0x0000000600000007 )
#define mm256_rev_16( v ) \
_mm256_permute4x64_epi64( mm256_shuffle_16( v, 0x1b ), 0x4e )
_mm256_permute4x64_epi64( mm256_shuffle16( v, 0x1b ), 0x4e )
*/
// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
/* Not used
// Rotate 256 bit vector by one 32 bit element.
@@ -486,7 +367,7 @@ static inline __m256i mm256_shufll_32( const __m256i v )
#else
#define mm256_shuflr_32( v ) \
_mm256_permutevar8x32_epi32( v, \
_mm256_set_spi64x( 0x0000000000000007, 0x0000000600000005, \
_mm256_set_epi64x( 0x0000000000000007, 0x0000000600000005, \
0x0000000400000003, 0x0000000200000001 ) )
#define mm256_shufll_32( v ) \
_mm256_permutevar8x32_epi32( v, \
@@ -507,113 +388,64 @@ static inline __m256i mm256_shufll_32( const __m256i v )
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
_mm256_castsi256_ps( v2 ), c ) );
#define mm256_swap128_64(v) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_rev128_64(v) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_swap128_64 mm256_rev128_64 // grandfathered
/*not used
#define mm256_rev128_32(v) _mm256_shuffle_epi32( v, 0x1b )
#define mm256_rev128_16(v) mm256_shuffle_16( v, 0x1b )
#define mm256_rev128_16(v) mm256_shuffle16( v, 0x1b )
*/
#define mm256_shuflr128_32(v) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_shufll128_32(v) _mm256_shuffle_epi32( v, 0x93 )
#define mm256_shuflr128_16(v) mm256_shuffle_16( v, 0x39 )
#define mm256_shufll128_16(v) mm256_shuffle_16( v, 0x93 )
/* not used
#define mm256_shuflr128_16(v) mm256_shuffle16( v, 0x39 )
#define mm256_shufll128_16(v) mm256_shuffle16( v, 0x93 )
/* Not used
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
*/
// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )
#define mm256_bswap_64( v ) _mm256_shuffle_epi8( v, V256_BSWAP64 )
#define mm256_bswap_32( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )
#define mm256_bswap_32( v ) _mm256_shuffle_epi8( v, V256_BSWAP32 )
/* not used
#define mm256_bswap_16( v ) \
_mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \
_mm256_shuffle_epi8( v, mm256_bcast128( _mm_set_epi64x( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
//
*/
// Source and destination are pointers, may point to same memory.
// 8 byte qword * 8 qwords * 4 lanes = 256 bytes
#define mm256_block_bswap_64( d, s ) \
{ \
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
}
#define mm256_block_bswap64_512 mm256_block_bswap_64
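// Illustration only (function name hypothetical): a typical call byte swaps a
// 256 byte block of 64 bit words in place; per the comment above, source and
// destination may be the same pointer.
static inline void block_bswap64_inplace_sketch( void *block )
{   mm256_block_bswap_64( block, block );   }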
#define mm256_block_bswap64_1024( d, s ) \
{ \
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
casti_m256i( d, 8 ) = _mm256_shuffle_epi8( casti_m256i( s, 8 ), ctl ); \
casti_m256i( d, 9 ) = _mm256_shuffle_epi8( casti_m256i( s, 9 ), ctl ); \
casti_m256i( d,10 ) = _mm256_shuffle_epi8( casti_m256i( s,10 ), ctl ); \
casti_m256i( d,11 ) = _mm256_shuffle_epi8( casti_m256i( s,11 ), ctl ); \
casti_m256i( d,12 ) = _mm256_shuffle_epi8( casti_m256i( s,12 ), ctl ); \
casti_m256i( d,13 ) = _mm256_shuffle_epi8( casti_m256i( s,13 ), ctl ); \
casti_m256i( d,14 ) = _mm256_shuffle_epi8( casti_m256i( s,14 ), ctl ); \
casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \
casti_m256i( d,0 ) = mm256_bswap_64( casti_m256i( s,0 ) ); \
casti_m256i( d,1 ) = mm256_bswap_64( casti_m256i( s,1 ) ); \
casti_m256i( d,2 ) = mm256_bswap_64( casti_m256i( s,2 ) ); \
casti_m256i( d,3 ) = mm256_bswap_64( casti_m256i( s,3 ) ); \
casti_m256i( d,4 ) = mm256_bswap_64( casti_m256i( s,4 ) ); \
casti_m256i( d,5 ) = mm256_bswap_64( casti_m256i( s,5 ) ); \
casti_m256i( d,6 ) = mm256_bswap_64( casti_m256i( s,6 ) ); \
casti_m256i( d,7 ) = mm256_bswap_64( casti_m256i( s,7 ) ); \
}
// 4 byte dword * 8 dwords * 8 lanes = 256 bytes
#define mm256_block_bswap_32( d, s ) \
{ \
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
casti_m256i( d, 0 ) = mm256_bswap_32( casti_m256i( s, 0 ) ); \
casti_m256i( d, 1 ) = mm256_bswap_32( casti_m256i( s, 1 ) ); \
casti_m256i( d, 2 ) = mm256_bswap_32( casti_m256i( s, 2 ) ); \
casti_m256i( d, 3 ) = mm256_bswap_32( casti_m256i( s, 3 ) ); \
casti_m256i( d, 4 ) = mm256_bswap_32( casti_m256i( s, 4 ) ); \
casti_m256i( d, 5 ) = mm256_bswap_32( casti_m256i( s, 5 ) ); \
casti_m256i( d, 6 ) = mm256_bswap_32( casti_m256i( s, 6 ) ); \
casti_m256i( d, 7 ) = mm256_bswap_32( casti_m256i( s, 7 ) ); \
}
#define mm256_block_bswap32_256 mm256_block_bswap_32
#define mm256_block_bswap32_512( d, s ) \
{ \
__m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \
casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \
casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \
casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
casti_m256i( d, 8 ) = _mm256_shuffle_epi8( casti_m256i( s, 8 ), ctl ); \
casti_m256i( d, 9 ) = _mm256_shuffle_epi8( casti_m256i( s, 9 ), ctl ); \
casti_m256i( d,10 ) = _mm256_shuffle_epi8( casti_m256i( s,10 ), ctl ); \
casti_m256i( d,11 ) = _mm256_shuffle_epi8( casti_m256i( s,11 ), ctl ); \
casti_m256i( d,12 ) = _mm256_shuffle_epi8( casti_m256i( s,12 ), ctl ); \
casti_m256i( d,13 ) = _mm256_shuffle_epi8( casti_m256i( s,13 ), ctl ); \
casti_m256i( d,14 ) = _mm256_shuffle_epi8( casti_m256i( s,14 ), ctl ); \
casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \
}
#if defined(VL256)
#define mm256_alignr64 _mm256_alignr_epi64

View File

@@ -108,11 +108,13 @@ typedef union
// A simple 128 bit permute, using function instead of macro avoids
// problems if the v arg passed as an expression.
static inline __m512i mm512_perm_128( const __m512i v, const int c )
static inline __m512i mm512_perm128( const __m512i v, const int c )
{ return _mm512_shuffle_i64x2( v, v, c ); }
// Broadcast 128 bit vector to all lanes of 512 bit vector.
#define mm512_bcast_m128( v ) mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
#define mm512_bcast128( v ) mm512_perm128( _mm512_castsi128_si512( v ), 0 )
// deprecated
#define mm512_bcast_m128 mm512_bcast128
// Set either the low or high 64 bit elements in 128 bit lanes, other elements
// are set to zero.
@@ -120,7 +122,7 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
#define mm512_bcast128hi_64( i64 ) _mm512_maskz_set1_epi64( 0xaa, i64 )
#define mm512_set2_64( i1, i0 ) \
mm512_bcast_m128( _mm_set_epi64x( i1, i0 ) )
mm512_bcast128( _mm_set_epi64x( i1, i0 ) )
// Pseudo constants.
#define m512_zero _mm512_setzero_si512()
@@ -248,105 +250,57 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Reverse byte order of packed elements, vectorized endian conversion.
#define mm512_bswap_64( v ) \
_mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) )
#define mm512_bswap_64( v ) _mm512_shuffle_epi8( v, V512_BSWAP64 )
#define mm512_bswap_32( v ) \
_mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) )
#define mm512_bswap_32( v ) _mm512_shuffle_epi8( v, V512_BSWAP32 )
/* not used
#define mm512_bswap_16( v ) \
_mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
*/
#define mm512_bswap_16( v ) \
_mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
// Source and destination are pointers, may point to same memory.
// 8 lanes of 64 bytes each
#define mm512_block_bswap_64( d, s ) \
{ \
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
}
#define mm512_block_bswap64_512 mm512_block_bswap_64
#define mm512_block_bswap64_1024( d, s ) \
{ \
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
casti_m512i( d, 8 ) = _mm512_shuffle_epi8( casti_m512i( s, 8 ), ctl ); \
casti_m512i( d, 9 ) = _mm512_shuffle_epi8( casti_m512i( s, 9 ), ctl ); \
casti_m512i( d,10 ) = _mm512_shuffle_epi8( casti_m512i( s,10 ), ctl ); \
casti_m512i( d,11 ) = _mm512_shuffle_epi8( casti_m512i( s,11 ), ctl ); \
casti_m512i( d,12 ) = _mm512_shuffle_epi8( casti_m512i( s,12 ), ctl ); \
casti_m512i( d,13 ) = _mm512_shuffle_epi8( casti_m512i( s,13 ), ctl ); \
casti_m512i( d,14 ) = _mm512_shuffle_epi8( casti_m512i( s,14 ), ctl ); \
casti_m512i( d,15 ) = _mm512_shuffle_epi8( casti_m512i( s,15 ), ctl ); \
casti_m512i( d, 0 ) = mm512_bswap_64( casti_m512i( s, 0 ) ); \
casti_m512i( d, 1 ) = mm512_bswap_64( casti_m512i( s, 1 ) ); \
casti_m512i( d, 2 ) = mm512_bswap_64( casti_m512i( s, 2 ) ); \
casti_m512i( d, 3 ) = mm512_bswap_64( casti_m512i( s, 3 ) ); \
casti_m512i( d, 4 ) = mm512_bswap_64( casti_m512i( s, 4 ) ); \
casti_m512i( d, 5 ) = mm512_bswap_64( casti_m512i( s, 5 ) ); \
casti_m512i( d, 6 ) = mm512_bswap_64( casti_m512i( s, 6 ) ); \
casti_m512i( d, 7 ) = mm512_bswap_64( casti_m512i( s, 7 ) ); \
}
// 16 lanes of 32 bytes each
#define mm512_block_bswap_32( d, s ) \
{ \
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
casti_m512i( d, 0 ) = mm512_bswap_32( casti_m512i( s, 0 ) ); \
casti_m512i( d, 1 ) = mm512_bswap_32( casti_m512i( s, 1 ) ); \
casti_m512i( d, 2 ) = mm512_bswap_32( casti_m512i( s, 2 ) ); \
casti_m512i( d, 3 ) = mm512_bswap_32( casti_m512i( s, 3 ) ); \
casti_m512i( d, 4 ) = mm512_bswap_32( casti_m512i( s, 4 ) ); \
casti_m512i( d, 5 ) = mm512_bswap_32( casti_m512i( s, 5 ) ); \
casti_m512i( d, 6 ) = mm512_bswap_32( casti_m512i( s, 6 ) ); \
casti_m512i( d, 7 ) = mm512_bswap_32( casti_m512i( s, 7 ) ); \
}
#define mm512_block_bswap32_256 mm512_block_bswap_32
#define mm512_block_bswap32_512( d, s ) \
{ \
const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
casti_m512i( d, 8 ) = _mm512_shuffle_epi8( casti_m512i( s, 8 ), ctl ); \
casti_m512i( d, 9 ) = _mm512_shuffle_epi8( casti_m512i( s, 9 ), ctl ); \
casti_m512i( d,10 ) = _mm512_shuffle_epi8( casti_m512i( s,10 ), ctl ); \
casti_m512i( d,11 ) = _mm512_shuffle_epi8( casti_m512i( s,11 ), ctl ); \
casti_m512i( d,12 ) = _mm512_shuffle_epi8( casti_m512i( s,12 ), ctl ); \
casti_m512i( d,13 ) = _mm512_shuffle_epi8( casti_m512i( s,13 ), ctl ); \
casti_m512i( d,14 ) = _mm512_shuffle_epi8( casti_m512i( s,14 ), ctl ); \
casti_m512i( d,15 ) = _mm512_shuffle_epi8( casti_m512i( s,15 ), ctl ); \
}
// Cross-lane shuffles implementing rotation of packed elements.
//
// shuffle 16 bit elements within 64 bit lanes.
#define mm512_shuffle16( v, c ) \
_mm512_shufflehi_epi16( _mm512_shufflelo_epi16( v, c ), c )
// Rotate elements across entire vector.
static inline __m512i mm512_swap_256( const __m512i v )
static inline __m512i mm512_rev_256( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 4 ); }
#define mm512_shuflr_256 mm512_swap_256
#define mm512_shufll_256 mm512_swap_256
#define mm512_swap_256 mm512_rev_256 // grandfathered
static inline __m512i mm512_shuflr_128( const __m512i v )
{ return _mm512_alignr_epi64( v, v, 2 ); }
@@ -394,9 +348,8 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
// Rotate elements within 256 bit lanes of 512 bit vector.
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_shuflr256_128 mm512_swap256_128
#define mm512_shufll256_128 mm512_swap256_128
#define mm512_rev256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_swap256_128 mm512_rev256_128 // grandfathered
// Rotate 256 bit lanes by one 64 bit element
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
@@ -450,15 +403,23 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
#define mm512_shufll128_64 mm512_swap128_64
#define mm512_rev128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_swap128_64 mm512_rev128_64 // grandfathered
/*not used
#define mm512_rev128_32(v) _mm512_shuffle_epi32( v, 0x1b )
#define mm512_rev128_16(v) mm512_shuffle16( v, 0x1b )
*/
// Rotate 128 bit lanes by one 32 bit element
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
/* Not used
#define mm512_shuflr128_16(v) mm512_shuffle16( v, 0x39 )
#define mm512_shufll128_16(v) mm512_shuffle16( v, 0x93 )
// Rotate 128 bit lanes right by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
@@ -476,11 +437,10 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
_mm512_castsi512_ps( v2 ), c ) );
// 64 bit lanes
// Not really necessary with AVX512, included for consistency with AVX2/SSE.
// ROL, ROR not necessary with AVX512, included for consistency with AVX2/SSE.
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
#define mm512_qrev32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_swap64_32 mm512_qrev32 // grandfathered
#define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 )
#define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 )
@@ -494,9 +454,7 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
/* Not used
// 32 bit lanes
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16
#define mm512_lrev16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )

View File

@@ -0,0 +1,55 @@
#include "simd-utils.h"
#if defined(SIMD512)
const __m512i V512_BSWAP64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f,
0x0001020304050607, 0x08090a0b0c0d0e0f,
0x0001020304050607, 0x08090a0b0c0d0e0f,
0x0001020304050607, 0x08090a0b0c0d0e0f };
const __m512i V512_BSWAP32 = { 0x0405060700010203, 0x0c0d0e0f08090a0b,
0x0405060700010203, 0x0c0d0e0f08090a0b,
0x0405060700010203, 0x0c0d0e0f08090a0b,
0x0405060700010203, 0x0c0d0e0f08090a0b };
#elif defined(__AVX2__)
const __m256i V256_BSWAP64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f,
0x0001020304050607, 0x08090a0b0c0d0e0f };
const __m256i V256_BSWAP32 = { 0x0405060700010203, 0x0c0d0e0f08090a0b,
0x0405060700010203, 0x0c0d0e0f08090a0b };
const __m256i V256_SHUFLR64_8 = { 0x0007060504030201, 0x080f0e0d0c0b0a09,
0x0007060504030201, 0x080f0e0d0c0b0a09 };
const __m256i V256_SHUFLR64_24 = { 0x0201000706050403, 0x0a09080f0e0d0c0b,
0x0201000706050403, 0x0a09080f0e0d0c0b };
const __m256i V256_SHUFLL64_8 = { 0x0605040302010007, 0x0e0d0c0b0a09080f,
0x0605040302010007, 0x0e0d0c0b0a09080f };
const __m256i V256_SHUFLL64_24 = { 0x0403020100070605, 0x0c0b0a09080f0e0d,
0x0403020100070605, 0x0c0b0a09080f0e0d };
const __m256i V256_SHUFLR32_8 = { 0x0407060500030201, 0x0c0f0e0d080b0a09,
0x0407060500030201, 0x0c0f0e0d080b0a09 };
const __m256i V256_SHUFLL32_8 = { 0x0605040702010003, 0x0e0d0c0f0a09080b,
0x0605040702010003, 0x0e0d0c0f0a09080b };
#elif defined(__SSSE3__)
const v128_t V128_BSWAP64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f };
const v128_t V128_BSWAP32 = { 0x0405060700010203, 0x0c0d0e0f08090a0b };
const v128_t V128_SHUFLR64_8 = { 0x0007060504030201, 0x080f0e0d0c0b0a09 };
const v128_t V128_SHUFLR64_24 = { 0x0201000706050403, 0x0a09080f0e0d0c0b };
const v128_t V128_SHUFLL64_8 = { 0x0605040302010007, 0x0e0d0c0b0a09080f };
const v128_t V128_SHUFLL64_24 = { 0x0403020100070605, 0x0c0b0a09080f0e0d };
const v128_t V128_SHUFLR32_8 = { 0x0407060500030201, 0x0c0f0e0d080b0a09 };
const v128_t V128_SHUFLL32_8 = { 0x0605040702010003, 0x0e0d0c0f0a09080b };
#endif
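/* Illustration only: the extern declarations for these constants are assumed to
   live in the corresponding simd-utils headers (not shown in this hunk). With
   AVX2, for example, a 64 bit byte swap then reduces to a single shuffle:
      static inline __m256i bswap64_sketch( __m256i v )
      {  return _mm256_shuffle_epi8( v, V256_BSWAP64 );  }
*/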

View File

@@ -14,10 +14,10 @@
// veor3q( v2, v1, v0 ) xor3 v2 ^ v1 ^ v0
// vxarq_u64( v1, v0, n ) ror64xor ( ( v1 ^ v0 ) >>> n )
// vbcaxq_u{64,32,16,8}( v2, v1, v0 ) xorandnot v2 ^ ( v1 & ~v0 )
// vsraq_n_u{64,32,16,8}( v1, v0, n ) v1 + ( v0 >> n )
//
// not used anywhere yet
// vrax1q_u64( v1, v0 ) v1 ^ ( v0 <<< 1 )
// vsraq_n_u{64,32,16,8}( v1, v0, n ) v1 + ( v0 >> n )
// vrax1q_u64( v1, v0 ) v1 ^ ( v0 <<< 1 )
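// A hedged sketch (function names hypothetical) of the ror64xor identity above:
#if defined(__ARM_FEATURE_SHA3)
static inline uint64x2_t ror64xor_sha3_sketch( uint64x2_t v1, uint64x2_t v0 )
{  return vxarq_u64( v1, v0, 21 );  }                                // ( v1 ^ v0 ) >>> 21
#endif
static inline uint64x2_t ror64xor_generic_sketch( uint64x2_t v1, uint64x2_t v0 )
{  uint64x2_t x = veorq_u64( v1, v0 );                               // v1 ^ v0
   return vorrq_u64( vshrq_n_u64( x, 21 ), vshlq_n_u64( x, 43 ) );   // >>> 21
}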
#define v128_t uint32x4_t // default,
#define v128u64_t uint64x2_t
@@ -124,7 +124,7 @@
// ~v1 & v0
#define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 )
// ~( a ^ b ), same as (~a) ^ b
// ~( v1 ^ v0 ), same as (~v1) ^ v0
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
// ~v1 | v0, args reversed for consistency with x86_64
@@ -136,8 +136,11 @@
// known way to test arm minor version.
#if defined(__ARM_FEATURE_SHA3)
#define v128_xor3 veor3q_u32
#define v128_xor4( v3, v2, v1, v0 ) veorq_u32( v3, veor3q_u32( v2, v1, v0 ) )
#else
#define v128_xor3( v2, v1, v0 ) veorq_u32( veorq_u32( v2, v1 ), v0 )
#define v128_xor4( v3, v2, v1, v0 ) veorq_u32 ( veorq_u32( v3, v2 ), \
veorq_u32( v1, v0 ) )
#endif
// v2 & v1 & v0
@@ -153,13 +156,13 @@
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
#endif
// a ^ ( b & c )
// v2 ^ ( v1 & v0 )
#define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) )
// a & ( b ^ c )
// v2 & ( v1 ^ v0 )
#define v128_andxor( v2, v1, v0 ) v128_and( v2, v128_xor( v1, v0 ) )
// a ^ ( b | c )
// v2 ^ ( v1 | v0 )
#define v128_xoror( v2, v1, v0 ) v128_xor( v2, v128_or( v1, v0 ) )
// v2 | ( v1 & v0 )
@@ -240,7 +243,7 @@ typedef union
#define cast_v128u32( p ) (*((uint32x4_t*)(p)))
#define castp_v128u32( p ) ((uint32x4_t*)(p))
// set1
// set1, integer argument
#define v128_64 vmovq_n_u64
#define v128_32 vmovq_n_u32
#define v128_16 vmovq_n_u16
@@ -326,10 +329,59 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
}
// How to build a bitmask from vector elements efficiently? (see sketch below)
#define v128_movmask32
#define v128_movmask64
//#define v128_movmask32
//#define v128_movmask64
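// A hedged sketch (name hypothetical): one known AArch64 approach packs the
// four 32 bit sign bits into a scalar with a narrowing shift.
static inline unsigned v128_movmask32_sketch( uint32x4_t v )
{
   uint16x4_t hi = vshrn_n_u32( v, 16 );        // keep bits [31:16] of each element
   uint64_t packed = vget_lane_u64( vreinterpret_u64_u16( hi ), 0 );
   return ( (packed >> 15) & 1 ) | ( (packed >> 30) & 2 )
        | ( (packed >> 45) & 4 ) | ( (packed >> 60) & 8 );
}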
#define v128_shuffle8( v, vmask ) \
vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
// Bit rotation
/*
#define v128_shuflr64_8( v ) v128_shuffle8( v, V128_SHUFLR64_8 )
#define v128_shufll64_8( v ) v128_shuffle8( v, V128_SHUFLL64_8 )
#define v128_shuflr64_16(v ) v128_shuffle8( v, V128_SHUFLR64_16 )
#define v128_shufll64_16(v ) v128_shuffle8( v, V128_SHUFLL64_16 )
#define v128_shuflr64_24(v ) v128_shuffle8( v, V128_SHUFLR64_24 )
#define v128_shufll64_24(v ) v128_shuffle8( v, V128_SHUFLL64_24 )
#define v128_shuflr32_8( v ) v128_shuffle8( v, V128_SHUFLR32_8 )
#define v128_shufll32_8( v ) v128_shuffle8( v, V128_SHUFLL32_8 )
#define v128_ror64( v, c ) \
( (c) == 8 ) ? v128_shuflr64_8( v ) \
: ( (c) == 16 ) ? v128_shuflr64_16( v ) \
: ( (c) == 24 ) ? v128_shuflr64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shufll64_24( v ) \
: ( (c) == 48 ) ? v128_shufll64_16( v ) \
: ( (c) == 56 ) ? v128_shufll64_8( v ) \
: vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_rol64( v, c ) \
( (c) == 8 ) ? v128_shufll64_8( v ) \
: ( (c) == 16 ) ? v128_shufll64_16( v ) \
: ( (c) == 24 ) ? v128_shufll64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shuflr64_24( v ) \
: ( (c) == 48 ) ? v128_shuflr64_16( v ) \
: ( (c) == 56 ) ? v128_shuflr64_8( v ) \
: vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_ror32( v, c ) \
( (c) == 8 ) ? v128_shuflr32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shufll32_8( v ) \
: vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
#define v128_rol32( v, c ) \
( (c) == 8 ) ? v128_shufll32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shuflr32_8( v ) \
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
*/
#define v128_ror64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
@@ -351,6 +403,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
/* not used
#define v128_ror16( v, c ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
: vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
@@ -368,6 +421,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_rol8( v, c ) \
vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
((uint8x16_t)(v)), c )
*/
// ( v1 ^ v0 ) >>> c
#if defined(__ARM_FEATURE_SHA3)
@@ -376,57 +430,13 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_ror64xor( v1, v0, c ) v128_ror64( v128_xor( v1, v0 ), c )
#endif
#define v128_2ror64( v1, v0, c ) \
{ \
uint64x2_t t0 = vshrq_n_u64( v0, c ); \
uint64x2_t t1 = vshrq_n_u64( v1, c ); \
v0 = vsliq_n_u64( v0, 64-(c) ); \
v1 = vsliq_n_u64( v1, 64-(c) ); \
v0 = vorrq_u64( v0, t0 ); \
v1 = vorrq_u64( v1, t1 ); \
}
#define v128_2rol64_( v1, v0, c ) \
{ \
uint64x2_t t0 = vshlq_n_u64( v0, c ); \
uint64x2_t t1 = vshlq_n_u64( v1, c ); \
v0 = vsriq_n_u64( v0, 64-(c) ); \
v1 = vsriq_n_u64( v1, 64-(c) ); \
v0 = vorrq_u64( v0, t0 ); \
v1 = vorrq_u64( v1, t1 ); \
}
#define v128_2rorl32( v1, v0, c ) \
{ \
uint32x4_t t0 = vshrq_n_u32( v0, c ); \
uint32x4_t t1 = vshrq_n_u32( v1, c ); \
v0 = vsliq_n_u32( v0, 32-(c) ); \
v1 = vsliq_n_u32( v1, 32-(c) ); \
v0 = vorrq_u32( v0, t0 ); \
v1 = vorrq_u32( v1, t1 ); \
}
#define v128_2ror32( v1, v0, c ) \
{ \
uint32x4_t t0 = vshlq_n_u32( v0, c ); \
uint32x4_t t1 = vshlq_n_u32( v1, c ); \
v0 = vsriq_n_u32( v0, 32-(c) ); \
v1 = vsriq_n_u32( v1, 32-(c) ); \
v0 = vorrq_u32( v0, t0 ); \
v1 = vorrq_u32( v1, t1 ); \
}
/* not used anywhere and hopefully never will
// vector mask, use as last resort. prefer tbl, rev, alignr, etc
#define v128_shufflev32( v, vmask ) \
v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] )
*/
/* not used
// v1 + ( v0 >> c )
#define v128_addsr64( v1, v0, c ) vsraq_n_u64( v1, v0, c )
#define v128_addsr32( v1, v0, c ) vsraq_n_u32( v1, v0, c )
*/
#define v128_shuffle8( v, vmask ) \
vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
// Cross lane shuffle
// sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage is context sensitive.
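// A hedged sketch (names hypothetical) of that equivalence: rotating each
// 64 bit lane by 32 bits and reversing the 32 bit elements within each lane
// are the same permutation, and the rev form is a single instruction.
static inline uint64x2_t ror64_32_shift_sketch( uint64x2_t v )
{  return vorrq_u64( vshrq_n_u64( v, 32 ), vshlq_n_u64( v, 32 ) );  }
static inline uint64x2_t ror64_32_rev_sketch( uint64x2_t v )
{  return (uint64x2_t)vrev64q_u32( (uint32x4_t)v );  }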
@@ -438,19 +448,14 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
#define v128_qrev16 vrev64q_u16
#define v128_lrev16 vrev32q_u16
// aka bswap
// #define v128_qrev8 vrev64q_u8
// #define v128_lrev8 vrev32q_u8
// #define v128_wrev8 vrev16q_u8
// full vector rotation
// reverse elements in vector
static inline uint64x2_t v128_rev64( uint64x2_t v )
{ return vextq_u64( v, v, 1 ); }
#define v128_swap64 v128_rev64 // grandfathered
#define v128_swap64 v128_rev64 // grandfathered
#define v128_rev32(v) v128_rev64( v128_qrev32( v ) )
#define v128_rev32(v) v128_rev64( v128_qrev32( v ) )
// shuffle-rotate vector elements
static inline uint32x4_t v128_shuflr32( uint32x4_t v )
@@ -468,7 +473,6 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
#define v128_bswap64(v) (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
#define v128_bswap128(v) (uint32x4_t)v128_rev64( v128_bswap64(v) )
// Useful for x86_64 but does nothing for ARM
#define v128_block_bswap32( dst, src ) \
{ \
casti_v128u32( dst,0 ) = v128_bswap32( casti_v128u32( src,0 ) ); \
@@ -482,26 +486,6 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
}
#define v128_block_bswap32_256 v128_block_bswap32
#define v128_block_bswap32_512( dst, src ) \
{ \
casti_v128u32( dst, 0 ) = v128_bswap32( casti_v128u32( src, 0 ) ); \
casti_v128u32( dst, 1 ) = v128_bswap32( casti_v128u32( src, 1 ) ); \
casti_v128u32( dst, 2 ) = v128_bswap32( casti_v128u32( src, 2 ) ); \
casti_v128u32( dst, 3 ) = v128_bswap32( casti_v128u32( src, 3 ) ); \
casti_v128u32( dst, 4 ) = v128_bswap32( casti_v128u32( src, 4 ) ); \
casti_v128u32( dst, 5 ) = v128_bswap32( casti_v128u32( src, 5 ) ); \
casti_v128u32( dst, 6 ) = v128_bswap32( casti_v128u32( src, 6 ) ); \
casti_v128u32( dst, 7 ) = v128_bswap32( casti_v128u32( src, 7 ) ); \
casti_v128u32( dst, 8 ) = v128_bswap32( casti_v128u32( src, 8 ) ); \
casti_v128u32( dst, 9 ) = v128_bswap32( casti_v128u32( src, 9 ) ); \
casti_v128u32( dst,10 ) = v128_bswap32( casti_v128u32( src,10 ) ); \
casti_v128u32( dst,11 ) = v128_bswap32( casti_v128u32( src,11 ) ); \
casti_v128u32( dst,12 ) = v128_bswap32( casti_v128u32( src,12 ) ); \
casti_v128u32( dst,13 ) = v128_bswap32( casti_v128u32( src,13 ) ); \
casti_v128u32( dst,14 ) = v128_bswap32( casti_v128u32( src,14 ) ); \
casti_v128u32( dst,15 ) = v128_bswap32( casti_v128u32( src,15 ) ); \
}
#define v128_block_bswap64( dst, src ) \
{ \
casti_v128u64( dst,0 ) = v128_bswap64( casti_v128u64( src,0 ) ); \
@@ -513,27 +497,6 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
casti_v128u64( dst,6 ) = v128_bswap64( casti_v128u64( src,6 ) ); \
casti_v128u64( dst,7 ) = v128_bswap64( casti_v128u64( src,7 ) ); \
}
#define v128_block_bswap64_512 v128_block_bswap64
#define v128_block_bswap64_1024( dst, src ) \
{ \
casti_v128u64( dst, 0 ) = v128_bswap64( casti_v128u64( src, 0 ) ); \
casti_v128u64( dst, 1 ) = v128_bswap64( casti_v128u64( src, 1 ) ); \
casti_v128u64( dst, 2 ) = v128_bswap64( casti_v128u64( src, 2 ) ); \
casti_v128u64( dst, 3 ) = v128_bswap64( casti_v128u64( src, 3 ) ); \
casti_v128u64( dst, 4 ) = v128_bswap64( casti_v128u64( src, 4 ) ); \
casti_v128u64( dst, 5 ) = v128_bswap64( casti_v128u64( src, 5 ) ); \
casti_v128u64( dst, 6 ) = v128_bswap64( casti_v128u64( src, 6 ) ); \
casti_v128u64( dst, 7 ) = v128_bswap64( casti_v128u64( src, 7 ) ); \
casti_v128u64( dst, 8 ) = v128_bswap64( casti_v128u64( src, 8 ) ); \
casti_v128u64( dst, 9 ) = v128_bswap64( casti_v128u64( src, 9 ) ); \
casti_v128u64( dst,10 ) = v128_bswap64( casti_v128u64( src,10 ) ); \
casti_v128u64( dst,11 ) = v128_bswap64( casti_v128u64( src,11 ) ); \
casti_v128u64( dst,12 ) = v128_bswap64( casti_v128u64( src,12 ) ); \
casti_v128u64( dst,13 ) = v128_bswap64( casti_v128u64( src,13 ) ); \
casti_v128u64( dst,14 ) = v128_bswap64( casti_v128u64( src,14 ) ); \
casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
}
// Bitwise blend using vector mask, use only bytewise for compatibility
// with x86_64.
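// A hedged sketch (name hypothetical) of such a blend on NEON: with each mask
// byte set to 0x00 or 0xff the bitwise select behaves like x86_64 pblendvb.
static inline uint8x16_t v128_blendv_sketch( uint8x16_t mask, uint8x16_t v1, uint8x16_t v0 )
{  return vbslq_u8( mask, v1, v0 );  }   // mask byte 0xff selects v1, 0x00 selects v0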

View File

@@ -173,21 +173,13 @@ static inline int cpu_fanpercent()
return 0;
}
#if defined(__x86_64__)
// x86_64 CPUID
// This list is incomplete, it only contains features of interest to cpuminer.
// refer to http://en.wikipedia.org/wiki/CPUID for details.
// AVX10 compatibility notes
//
// Display format: AVX10.[version]-[vectorwidth]
// AVX10.1-512 is a rebranding of AVX512 and is effectively the AVX* superset
// with full 512 bit vector support.
// AVX10.2-256 is effectively AVX2 + AVX512_VL, all AVX512 instructions and
// features applied only to 256 bit and 128 bit vectors.
// Future AVX10 versions will add new instructions and features.
// Register array indexes
#define EAX_Reg (0)
#define EBX_Reg (1)
@@ -209,6 +201,7 @@ static inline int cpu_fanpercent()
// CPU_INFO: EAX=1, ECX=0
// ECX
#define SSE3_Flag 1
#define PCLMULQDQ_Flag (1<< 1)
#define SSSE3_Flag (1<< 9)
#define XOP_Flag (1<<11) // obsolete
#define FMA3_Flag (1<<12)
@@ -239,6 +232,7 @@ static inline int cpu_fanpercent()
#define AVX512_VBMI_Flag (1<< 1)
#define AVX512_VBMI2_Flag (1<< 6)
#define VAES_Flag (1<< 9)
#define VPCLMULQDQ_Flag (1<<10)
#define AVX512_VNNI_Flag (1<<11)
#define AVX512_BITALG_Flag (1<<12)
#define AVX512_VPOPCNTDQ_Flag (1<<14)
@@ -260,6 +254,8 @@ static inline int cpu_fanpercent()
#define AVX512_BF16_Flag (1<< 5)
#define AMX_FP16_Flag (1<<21)
#define AVX_IFMA_Flag (1<<23)
#define MOVRS_Flag (1<<31) // Both names are referenced in docs
#define AVX10_MOVRS_Flag (1<<31)
// EDX
#define AVX_VNNI_INT8_Flag (1<< 4)
#define AVX_NE_CONVERT_Flag (1<< 5)
@@ -271,17 +267,15 @@ static inline int cpu_fanpercent()
// AVX10_FEATURES: EAX=0x24, ECX=0
// EBX
#define AVX10_VERSION_mask 0xff // bits [7:0]
#define AVX10_128_Flag (1<<16)
#define AVX10_256_Flag (1<<17)
#define AVX10_512_Flag (1<<18)
//#define AVX10_128_Flag (1<<16)
//#define AVX10_256_Flag (1<<17)
//#define AVX10_512_Flag (1<<18)
// Use this to detect presence of feature
#define AVX_mask (AVX_Flag|XSAVE_Flag|OSXSAVE_Flag)
#define FMA3_mask (FMA3_Flag|AVX_mask)
#define AVX512_mask (AVX512_VL_Flag|AVX512_BW_Flag|AVX512_DQ_Flag|AVX512_F_Flag)
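// A hedged sketch (not part of this header, assumes GCC/Clang <cpuid.h> is
// acceptable here): testing one of the flags above, e.g. VAES in leaf 7,
// subleaf 0, ECX.
#include <cpuid.h>
static inline int has_vaes_sketch( void )
{
   unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
   if ( !__get_cpuid_count( 7, 0, &eax, &ebx, &ecx, &edx ) ) return 0;
   return ( ecx & VAES_Flag ) ? 1 : 0;
}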
#if defined(__x86_64__)
static inline void cpuid( unsigned int leaf, unsigned int subleaf,
unsigned int output[4] )
{
@@ -317,7 +311,7 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
#elif defined(ARM_AUXV)
// Always test if HWCAP variable is defined in the kernel before attempting
// to compile it. If not defined the feature can't be tested and won't be
// to compile this. If not defined the feature can't be tested and won't be
// included in the compile.
// This can occur if compiling with an old kernel and a new CPU and could
// result in a suboptimal build.
@@ -543,6 +537,15 @@ static inline bool cpu_arch_aarch64()
#endif
}
static inline bool cpu_arch_riscv64()
{
#if defined(__riscv) && ( __riscv_xlen == 64 )
return true;
#else
return false;
#endif
}
static inline bool has_sse()
{
#if defined(__x86_64__)
@@ -608,6 +611,16 @@ static inline bool has_neon()
#endif
}
// No apparent CPUID equivalent on riscv, returns SW build info.
static inline bool has_rvv()
{
#if defined(__riscv) && defined(__riscv_vector)
return true;
#else
return false;
#endif
}
static inline bool has_avx()
{
#if defined(__x86_64__)
@@ -897,7 +910,6 @@ static inline bool has_apx_f()
#endif
}
// Not much use on its own
static inline bool has_avx10()
{
#if defined(__x86_64__)
@@ -922,49 +934,6 @@ static inline unsigned int avx10_version()
return 0;
}
// also includes 256 & 128
static inline bool has_avx10_512()
{
#if defined(__x86_64__)
if ( has_avx10() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_512_Flag;
}
#endif
return false;
}
// Includes 128 but might not include 512
static inline bool has_avx10_256()
{
#if defined(__x86_64__)
if ( has_avx10() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_256_Flag;
}
#endif
return false;
}
// AVX10 vector register length
static inline unsigned int avx10_vector_length()
{
#if defined(__x86_64__)
if ( has_avx10() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_512_Flag ? 512
: ( cpu_info[ EBX_Reg ] & AVX10_256_Flag ? 256 : 0 );
}
#endif
return 0;
}
// ARM SVE vector register length, converted from bytes to bits.
static inline int sve_vector_length()
{
@@ -975,6 +944,33 @@ static inline int sve_vector_length()
return 0;
}
// Assume min_vlen refers to the register size
static inline int rvv_vector_length()
{
#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
return __riscv_v_min_vlen;
#endif
return 0;
}
// generic
static inline int vector_length()
{
#if defined(__x86_64__)
return has_avx10() || has_avx512() ? 512
: has_avx2() ? 256
: has_sse2() ? 128
: 0;
#elif defined(__aarch64__)
return has_sve() ? sve_vector_length()
: has_neon() ? 128
: 0;
#elif defined(__riscv) && defined(__riscv_vector)
return rvv_vector_length();
#endif
return 0;
}
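// Illustration only (name hypothetical): a caller can use vector_length() to
// choose how many parallel hash lanes to run.
static inline int default_lanes_sketch( void )
{
   const int vlen = vector_length();
   if ( vlen >= 512 ) return 8;    // 8x 64 bit lanes per vector
   if ( vlen >= 256 ) return 4;
   if ( vlen >= 128 ) return 2;
   return 1;
}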
static inline uint32_t cpuid_get_highest_function_number()
{
#if defined(__x86_64__)
@@ -1061,13 +1057,17 @@ static inline void cpu_brand_string( char* s )
memcpy( s + 32, cpu_info, sizeof(cpu_info) );
}
#elif defined(__arm__) || defined(__aarch64__)
#elif defined(__aarch64__)
sprintf( s, "ARM 64 bit CPU" );
#elif defined(__riscv) && (__riscv_xlen == 64)
sprintf( s, "RISC-V 64 bit CPU" );
#else
sprintf( s, "unknown CPU architecture" );
sprintf( s, "unknown/unsupported CPU architecture" );
#endif
}

View File

@@ -16,13 +16,8 @@ export MINGW_LIB="/usr/x86_64-w64-mingw32/lib"
export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32"
# used by GCC
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs"
# Support for Windows 7 CPU groups, AES sometimes not included in -march
# CPU groups disabled due to incompatibilities between Intel and AMD CPUs.
# CPU groups are enabled by default in Makefile, use -U to disable.
export DEFAULT_CFLAGS="-maes -O3 -Wall -U_WIN32_WINNT"
export DEFAULT_CFLAGS_OLD="-O3 -Wall -U_WIN32_WINNT"
#export DEFAULT_CFLAGS="-maes -O3 -Wall"
#export DEFAULT_CFLAGS_OLD="-O3 -Wall"
export DEFAULT_CFLAGS="-maes -O3 -Wall"
export DEFAULT_CFLAGS_OLD="-O3 -Wall"
# make link to local gmp header file.
ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h
@@ -125,12 +120,12 @@ CFLAGS="-msse2 $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-sse2.exe
make clean || echo clean
#make clean || echo clean
# Native with CPU groups enabled
make clean || echo clean
rm -f config.status
CFLAGS="-march=native $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
#make clean || echo clean
#rm -f config.status
#CFLAGS="-march=native $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
#make -j 8
#strip -s cpuminer.exe