Compare commits

...

9 Commits
v24.1 ... v25.2

Author SHA1 Message Date
Jay D Dee   1ed18bf22e   v25.2   2025-01-12 18:58:21 -05:00
Jay D Dee   1d9341ee92   v25.1   2024-12-30 21:33:04 -05:00
Jay D Dee   a45a333b40   v24.8   2024-12-25 23:12:29 -05:00
Jay D Dee   2b1037a7c7   v24.7   2024-12-16 19:17:19 -05:00
Jay D Dee   06624a0ff2   v24.6   2024-12-08 11:14:08 -05:00
Jay D Dee   8e91bfbe19   v24.5   2024-09-13 14:14:57 -04:00
Jay D Dee   47e24b50e8   v24.4   2024-07-01 00:33:19 -04:00
Jay D Dee   c47c4a8885   v24.3   2024-05-28 18:20:19 -04:00
Jay D Dee   042d13d1e1   v24.2   2024-05-20 23:08:50 -04:00
181 changed files with 3852 additions and 5589 deletions

View File

@@ -1,19 +1,39 @@
if WANT_JANSSON
JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson
if HAVE_APPLE
# MacOS uses Homebrew to install needed packages but they aren't linked for
# the jansson test in configure. Ignore the failed test & link them now,
# using a different path for each CPU arch.
if ARCH_ARM64
EXTRA_INCLUDES = -I/opt/homebrew/include
EXTRA_LIBS = -L/opt/homebrew/lib
else
JANSSON_INCLUDES=
EXTRA_INCLUDES = -I/usr/local/include
EXTRA_LIBS = -L/usr/local/lib
endif
EXTRA_DIST = example-cfg.json nomacro.pl
else
SUBDIRS = compat
if WANT_JANSSON
# Can't find jansson libraries, compile the included source code.
EXTRA_INCLUDES = -I$(top_srcdir)/compat/jansson
EXTRA_LIBS = -L$(top_srcdir)/compat/jansson
else
EXTRA_INCLUDES =
EXTRA_LIBS =
endif
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) -I.
endif
bin_PROGRAMS = cpuminer
EXTRA_DIST = example-cfg.json nomacro.pl
dist_man_MANS = cpuminer.1
SUBDIRS = compat
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(EXTRA_INCLUDES) -I.
bin_PROGRAMS = cpuminer
dist_man_MANS = cpuminer.1
cpuminer_SOURCES = \
dummy.cpp \
@@ -166,9 +186,6 @@ cpuminer_SOURCES = \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite-hash-2way.c \
algo/shavite/shavite-hash-4way.c \
algo/shavite/shavite.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/sph_simd.c \
algo/simd/simd-hash-2way.c \
algo/skein/sph_skein.c \
@@ -275,29 +292,29 @@ cpuminer_SOURCES = \
algo/yespower/yespower-opt.c \
algo/yespower/yespower-ref.c \
algo/yespower/yespower-blake2b-ref.c
disable_flags =
if USE_ASM
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags += -DNOASM
endif
if HAVE_WINDOWS
cpuminer_SOURCES += compat/winansi.c
endif
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
if USE_ASM
disable_flags =
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags += -DNOASM
endif
if HAVE_WINDOWS
cpuminer_CFLAGS += -Wl,--stack,10485760
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = $(EXTRA_LIBS) @LIBCURL@ -ljansson @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
if ARCH_ARM64
cpuminer_CFLAGS += -flax-vector-conversions
endif
if HAVE_WINDOWS
# use to profile an object
# gprof_cflags = -pg -g3
# cpuminer_LDFLAGS += -pg
@@ -311,5 +328,4 @@ cpuminer-neoscrypt.o: neoscrypt.c
@echo "CUSTOM ${@}: ${filter %.o,${^}} ${filter %.c,${^}}"
$(CC) $(common_ccflags) -g -O3 $(gprof_cflags) -MT $@ -MD -MP -c -o $@ $<
endif

View File

@@ -36,34 +36,18 @@ for compile instructions.
Requirements
------------
1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. Further optimizations are available
on some algorithms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
32 bit CPUs are not supported.
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc.,
are not supported.
1. A 64 bit CPU supporting x86_64 (Intel or AMD) or aarch64 (ARM).
x86_64 requires SSE2, aarch64 requires armv8 & NEON.
Mobile CPUs, such as those in laptops, are not recommended because they
aren't designed for the heat of operating at full load for extended periods
of time.
Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.
2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
including Mint and CentOS, are known to work and have all dependencies
in their repositories. Others may work but may require more effort. Older
versions such as CentOS 6 don't work due to missing features.
Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
binaries. Windows XP 64 bit is YMMV.
FreeBSD is not actively tested but should work, YMMV.
MacOS, OS X and Android are not supported.
2. 64 bit operating system including Linux, Windows, MacOS, or BSD.
Android, iOS and alt OSs like Haiku & ReactOS are not supported.
3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
RPC getwork using http:// or https://.
GBT is YMMV.
RPC getblocktemplate using http:// or https://.
Supported Algorithms
--------------------

View File

@@ -75,6 +75,69 @@ If not what makes it happen or not happen?
Change Log
----------
v25.2
ARM: Fixed regression from v25.1 that could cause build fail.
BSD: FreeBSD is now supported. Other BSDs may also work.
MacOS: build with the installed jansson library instead of compiling the included source code.
Windows: remove "_WIN32_WINNT=0x0601" which is a downgrade on Win11.
Changed build.sh shell from bash to sh.
v25.1
MacOS ARM64: m7m algo is now working.
MacOS ARM64: can now be compiled with GCC.
MacOS x86_64: now works when compiled with GCC.
Fixed some minor bugs & removed some obsolete code.
v24.8
ARM: Apple MacOS on M series CPU is now supported when compiled from source
code, see Wiki for details.
ARM: Fix incorrect compiler version display when using clang.
build.sh can now be used to compile all targets; arm_build.sh & build_msys2.sh
have been removed.
Windows: MSys2 build now enables CPU groups by default, prebuilt binaries
continue to be compiled with CPU groups disabled.
v24.7
ARM: compile works for Windows using MSys2 & MinGW, see wiki for details.
v24.6
ARM: Fixed scryptn2, x16*, broken in v24.2.
ARM: Small improvement to interleaving.
Eliminated some potential compile errors in code that was dependent on
compiler optimisations.
x86_64: improved support for AVX10 compilation, needs GCC-14 or higher.
v24.5
Fix MinGW compile error after MSys2 upgrade to GCC-14.2.
#427: GBT: Improved handling of new work.
Removed shavite3 algo.
v24.4
x86_64: fixed a bug in ornot macro for AVX2 which broke some algos in v24.2.
x86_64: fixed a bug in alignr macros for SSE2.
ARM: CPU feature reporting enhancements.
Some code cleanup.
v24.3
ARM: CPU feature detection and reporting is now working.
ARM: Verthash is now working.
ARM: Small speedup for yescrypt, yespower & argon2d.
Code cleanup.
v24.2
x86_64: Fixed blake2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4.
x86_64: Initial support for CPUs with AVX10, needs GCC-14.
ARM NEON: Various code optimisations.
v24.1
#414: fix bug in merkle error handling.

View File

@@ -184,7 +184,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
//int scanhash_8way_64_64( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr )
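Throughout this compare the long four-macro AVX512 feature test is collapsed
into a single SIMD512 guard. The definition itself isn't shown in these hunks;
a minimal sketch of what it presumably expands from, given that every replaced
line tested exactly these four features:

// Presumed central definition (assumption: lives in simd-utils).
#if defined(__AVX512F__) && defined(__AVX512VL__) && \
    defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define SIMD512 1
#endif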
@@ -263,8 +263,8 @@ static void init_algo_gate( algo_gate_t* gate )
gate->build_block_header = (void*)&std_build_block_header;
gate->build_extraheader = (void*)&std_build_extraheader;
gate->set_work_data_endian = (void*)&do_nothing;
gate->resync_threads = (void*)&do_nothing;
gate->do_this_thread = (void*)&return_true;
// gate->resync_threads = (void*)&do_nothing;
// gate->do_this_thread = (void*)&return_true;
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
gate->get_work_data_size = (void*)&std_get_work_data_size;
gate->optimizations = EMPTY_SET;
@@ -340,7 +340,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
case ALGO_SHA512256D: rc = register_sha512256d_algo ( gate ); break;
case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
case ALGO_SKUNK: rc = register_skunk_algo ( gate ); break;

View File

@@ -98,9 +98,11 @@ typedef uint32_t set_t;
#define AVX512_OPT 1 << 6 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
#define VAES_OPT 1 << 8 // Icelake, Zen3
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA256_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
#define NEON_OPT 1 << 11 // AArch64
#define AVX10_256 1 << 12
#define AVX10_512 1 << 13
// AVX10 does not have explicit algo features:
// AVX10_512 is compatible with AVX512 + VAES
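A minimal sketch of how the compatibility note above could be applied when
matching CPU features to an algo's requirement set; the helper name is
hypothetical, and only the AVX10_512 = AVX512 + VAES equivalence comes from
the comment:

// Hypothetical helper: fold AVX10 capability into the explicit algo
// feature bits before comparing against an algorithm's required set.
static inline set_t effective_set( set_t cpu_features )
{
   if ( cpu_features & AVX10_512 )
      cpu_features |= AVX512_OPT | VAES_OPT;   // per the comment above
   return cpu_features;
}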
@@ -163,10 +165,10 @@ char* ( *malloc_txs_request ) ( struct work* );
void ( *set_work_data_endian ) ( struct work* );
// Diverge mining threads
bool ( *do_this_thread ) ( int );
//bool ( *do_this_thread ) ( int );
// After do_this_thread
void ( *resync_threads ) ( int, struct work* );
//void ( *resync_threads ) ( int, struct work* );
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
@@ -246,7 +248,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
//int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -35,7 +35,7 @@
* @pre all block pointers must be valid
*/
#if defined(__AVX512F__)
#if defined(SIMD512)
static inline __m512i blamka( __m512i x, __m512i y )
{
@@ -237,7 +237,7 @@ void fill_segment(const argon2_instance_t *instance,
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index, i;
#if defined(__AVX512F__)
#if defined(SIMD512)
__m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
#elif defined(__AVX2__)
__m256i state[ARGON2_HWORDS_IN_BLOCK];

View File

@@ -21,7 +21,7 @@
#include "blake2-impl.h"
#include "simd-utils.h"
#if !defined(__AVX512F__)
#if !defined(SIMD512)
#if !defined(__AVX2__)

View File

@@ -1611,7 +1611,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
///////////////////////////////////////
//
@@ -2617,7 +2617,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
//Blake-256 16 way AVX512

View File

@@ -147,7 +147,7 @@ void blake256r8_8way_close(void *cc, void *dst);
#define blake256r8_8x32_update blake256r14_8way_update
#define blake256r8_8x32_close blake256r14_8way_close
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
///////////////////////////////////
//

View File

@@ -226,7 +226,7 @@ static const uint8_t sigma[12][16] =
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define B2B8W_G(a, b, c, d, x, y) \
{ \
@@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] =
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
}
static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last )
{
__m512i v[16], m[16];
@@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
}
int blake2b_8way_init( blake2b_8way_ctx *ctx )
int blake2b_8x64_init( blake2b_8x64_ctx *ctx )
{
size_t i;
@@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
}
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen )
{
__m512i* in =(__m512i*)input;
@@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_8way_compress( ctx, 0 );
blake2b_8x64_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
@@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
}
}
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
@@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
ctx->c += 8;
}
blake2b_8way_compress( ctx, 1 ); // final block flag = 1
blake2b_8x64_compress( ctx, 1 ); // final block flag = 1
casti_m512i( out, 0 ) = ctx->h[0];
casti_m512i( out, 1 ) = ctx->h[1];
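The casti_m512i stores above use the project's cast-and-index idiom; a
presumed one-line equivalent (the real macro is defined in simd-utils and may
differ in detail):

// Index pointer p as an array of __m512i; element i is an lvalue.
#define casti_m512i( p, i )   ( ((__m512i*)(p))[(i)] )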
@@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = {
};
*/
static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last )
{
__m256i v[16], m[16];
@@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
}
int blake2b_4way_init( blake2b_4way_ctx *ctx )
int blake2b_4x64_init( blake2b_4x64_ctx *ctx )
{
size_t i;
@@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
return 0;
}
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen )
{
__m256i* in =(__m256i*)input;
@@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_4way_compress( ctx, 0 );
blake2b_4x64_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
@@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
}
}
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
@@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
ctx->c += 8;
}
blake2b_4way_compress( ctx, 1 ); // final block flag = 1
blake2b_4x64_compress( ctx, 1 ); // final block flag = 1
casti_m256i( out, 0 ) = ctx->h[0];
casti_m256i( out, 1 ) = ctx->h[1];

View File

@@ -1,6 +1,6 @@
#pragma once
#ifndef __BLAKE2B_HASH_4WAY_H__
#define __BLAKE2B_HASH_4WAY_H__
#ifndef BLAKE2B_HASH_4WAY_H__
#define BLAKE2B_HASH_4WAY_H__
#include "simd-utils.h"
#include <stddef.h>
@@ -15,7 +15,7 @@
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct ALIGN( 64 ) {
__m512i b[16]; // input buffer
@@ -23,12 +23,17 @@ typedef struct ALIGN( 64 ) {
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_8way_ctx;
} blake2b_8x64_ctx;
int blake2b_8way_init( blake2b_8way_ctx *ctx );
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
int blake2b_8x64_init( blake2b_8x64_ctx *ctx );
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
#define blake2b_8way_ctx blake2b_8x64_ctx
#define blake2b_8way_init blake2b_8x64_init
#define blake2b_8way_update blake2b_8x64_update
#define blake2b_8way_final blake2b_8x64_final
#endif
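Because the old _8way names are retained as aliases for the new _8x64 names,
the two spellings can be mixed and existing call sites compile unchanged; a
sketch using the declarations above:

// Sketch only: both spellings resolve to the same functions.
static void blake2b_8x64_demo( void *out, const void *in8x64 )
{
   blake2b_8x64_ctx ctx;                     // same type as blake2b_8way_ctx
   blake2b_8way_init( &ctx );                // alias of blake2b_8x64_init
   blake2b_8x64_update( &ctx, in8x64, 80 );  // 80 bytes per lane, interleaved
   blake2b_8way_final( &ctx, out );          // alias of blake2b_8x64_final
}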
@@ -41,12 +46,17 @@ typedef struct ALIGN( 64 ) {
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_4way_ctx;
} blake2b_4x64_ctx;
int blake2b_4way_init( blake2b_4way_ctx *ctx );
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
int blake2b_4x64_init( blake2b_4x64_ctx *ctx );
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
#define blake2b_4way_ctx blake2b_4x64_ctx
#define blake2b_4way_init blake2b_4x64_init
#define blake2b_4way_update blake2b_4x64_update
#define blake2b_4way_final blake2b_4x64_final
#endif

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include "blake2b-hash.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKE2B_8WAY
#elif defined(__AVX2__)
#define BLAKE2B_4WAY

View File

@@ -497,7 +497,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Blake2s-256 16 way

View File

@@ -11,8 +11,8 @@
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
//#pragma once
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
#ifndef BLAKE2S_HASH_4WAY_H__
#define BLAKE2S_HASH_4WAY_H__ 1
#if defined(__SSE2__) || defined(__ARM_NEON)
@@ -29,20 +29,20 @@
#define ALIGN(x) __attribute__((aligned(x)))
#endif
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_nway_param;
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_nway_param;
typedef struct ALIGN( 64 ) __blake2s_4way_state
{
@@ -67,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
typedef struct ALIGN( 64 ) __blake2s_8way_state
{
__m256i h[8];
uint8_t buf[ 32 * 8 ];
uint8_t buf[ 64 * 8 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
@@ -83,12 +83,12 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct ALIGN( 64 ) __blake2s_16way_state
{
__m512i h[8];
uint8_t buf[ 32 * 16 ];
uint8_t buf[ 64 * 16 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;

View File

@@ -3,7 +3,7 @@
#include <string.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKE2S_16WAY
#elif defined(__AVX2__)
#define BLAKE2S_8WAY

View File

@@ -349,16 +349,16 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( CBx( r, Sd ) ^ Mx( r, Sc ), \
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
Vd = v128_ror64xor( Vd, Va, 32 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
Vb = v128_ror64xor( Vb, Vc, 25 ); \
\
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( CBx( r, Sc ) ^ Mx( r, Sd ), \
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
Vd = v128_ror64xor( Vd, Va, 16 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
Vb = v128_ror64xor( Vb, Vc, 11 ); \
}
#define BLAKE512_ROUND( R ) \
@@ -559,7 +559,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
#if defined(__AVX2__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
////////////////////////////////////
//
@@ -1887,13 +1887,13 @@ blake512_4x64_close(void *cc, void *dst)
#define GB_2X64( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = v128_add64( v128_add64( v128_xor( v128_64( c1 ), m0 ), b ), a ); \
d = v128_ror64( v128_xor( d, a ), 32 ); \
d = v128_ror64xor( d, a, 32 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 25 ); \
b = v128_ror64xor( b, c, 25 ); \
a = v128_add64( v128_add64( v128_xor( v128_64( c0 ), m1 ), b ), a ); \
d = v128_ror64( v128_xor( d, a ), 16 ); \
d = v128_ror64xor( d, a, 16 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 11 ); \
b = v128_ror64xor( b, c, 11 ); \
}
#define ROUND_B_2X64(r) \
@@ -2054,9 +2054,9 @@ void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc,
// G4 skip nonce
V0 = v128_add64( v128_add64( v128_xor( v128_64( CB9 ), sc->buf[ 8] ), V5 ),
V0 );
VF = v128_ror64( v128_xor( VF, V0 ), 32 );
VF = v128_ror64xor( VF, V0, 32 );
VA = v128_add64( VA, VF );
V5 = v128_ror64( v128_xor( V5, VA ), 25 );
V5 = v128_ror64xor( V5, VA, 25 );
V0 = v128_add64( V0, V5 );
GB_2X64( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
@@ -2137,9 +2137,9 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
// finish round 0, with the nonce now available
V0 = v128_add64( V0, v128_xor( v128_64( CB8 ), M9 ) );
VF = v128_ror64( v128_xor( VF, V0 ), 16 );
VF = v128_ror64xor( VF, V0, 16 );
VA = v128_add64( VA, VF );
V5 = v128_ror64( v128_xor( V5, VA ), 11 );
V5 = v128_ror64xor( V5, VA, 11 );
// Round 1
// G0
@@ -2147,34 +2147,34 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
// G1
V1 = v128_add64( V1, V5 );
VD = v128_ror64( v128_xor( VD, V1 ), 32 );
VD = v128_ror64xor( VD, V1, 32 );
V9 = v128_add64( V9, VD );
V5 = v128_ror64( v128_xor( V5, V9 ), 25 );
V5 = v128_ror64xor( V5, V9, 25 );
V1 = v128_add64( V1, v128_add64( v128_xor( v128_64( CBx(1,2) ), Mx(1,3) ),
V5 ) );
VD = v128_ror64( v128_xor( VD, V1 ), 16 );
VD = v128_ror64xor( VD, V1, 16 );
V9 = v128_add64( V9, VD );
V5 = v128_ror64( v128_xor( V5, V9 ), 11 );
V5 = v128_ror64xor( V5, V9, 11 );
// G2
V2 = v128_add64( V2, v128_xor( v128_64( CBF ), M9 ) );
VE = v128_ror64( v128_xor( VE, V2 ), 32 );
VE = v128_ror64xor( VE, V2, 32 );
VA = v128_add64( VA, VE );
V6 = v128_ror64( v128_xor( V6, VA ), 25 );
V6 = v128_ror64xor( V6, VA, 25 );
V2 = v128_add64( V2, v128_add64( v128_xor( v128_64( CB9 ), MF ), V6 ) );
VE = v128_ror64( v128_xor( VE, V2 ), 16 );
VE = v128_ror64xor( VE, V2, 16 );
VA = v128_add64( VA, VE );
V6 = v128_ror64( v128_xor( V6, VA ), 11 );
V6 = v128_ror64xor( V6, VA, 11 );
// G3
VF = v128_ror64( v128_xor( VF, V3 ), 32 );
VF = v128_ror64xor( VF, V3, 32 );
VB = v128_add64( VB, VF );
V7 = v128_ror64( v128_xor( V7, VB ), 25 );
V7 = v128_ror64xor( V7, VB, 25 );
V3 = v128_add64( V3, v128_add64( v128_xor( v128_64( CBx(1, 6) ), Mx(1, 7) ),
V7 ) );
VF = v128_ror64( v128_xor( VF, V3 ), 16 );
VF = v128_ror64xor( VF, V3, 16 );
VB = v128_add64( VB, VF );
V7 = v128_ror64( v128_xor( V7, VB ), 11 );
V7 = v128_ror64xor( V7, VB, 11 );
// G4, G5, G6, G7
GB_2X64(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
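Every v128_ror64xor call in this file replaces an explicit
v128_ror64( v128_xor( ... ) ) pair, so its meaning follows directly from the
substitution; a plausible generic fallback (platform-specific versions may
fuse the rotate and xor):

// Rotate-right the xor of a and b by c bits; pure composition of the
// two helpers it replaces in this diff.
#define v128_ror64xor( a, b, c )   v128_ror64( v128_xor( a, b ), c )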

View File

@@ -92,7 +92,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
#define blake512_4way_prehash_le blake512_4x64_prehash_le
#define blake512_4way_final_le blake512_4x64_final_le
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
////////////////////////////
//

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKECOIN_16WAY
#elif defined(__AVX2__)
#define BLAKECOIN_8WAY

View File

@@ -101,15 +101,15 @@
{ \
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
Vd = v128_ror64xor( Vd, Va, 32 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \
Vb = v128_ror64xor( Vb, Vc, 24 ); \
\
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
Vd = v128_ror64xor( Vd, Va, 16 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
Vb = v128_ror64xor( Vb, Vc, 63 ); \
}
#define BLAKE2B_ROUND( R ) \

View File

@@ -87,7 +87,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-256 16 way 32
@@ -157,7 +157,7 @@ void bmw512_4way_addbits_and_close(
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-512 64 bit 8 way
typedef struct

View File

@@ -1057,7 +1057,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-256 16 way 32

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BMW512_8WAY 1
#elif defined(__AVX2__)
#define BMW512_4WAY 1

View File

@@ -950,7 +950,7 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-512 8 WAY

View File

@@ -26,7 +26,7 @@ static const uint64_t IV512[] =
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// 4 way 128 is handy to avoid reinterleaving in many algos.
// If reinterleaving is necessary it may be more efficient to use

View File

@@ -6,7 +6,7 @@
#if defined(__AVX2__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
struct _cube_4way_context
{

View File

@@ -13,7 +13,7 @@ static void transform( cubehashParam *sp )
int r;
const int rounds = sp->rounds;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
register __m512i x0, x1;

View File

@@ -11,7 +11,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
};
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ECHO_SUBBYTES4(state, j) \
state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \

View File

@@ -5,7 +5,7 @@
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -696,7 +696,7 @@ static void AddModulo512(const void *a,const void *b,void *c)
static void AddXor512(const void *a,const void *b,void *c)
{
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
casti_m512i( b, 0 ) );
#elif defined(__AVX2__)

View File

@@ -103,7 +103,7 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#if defined(VL256)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
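Here and in several files below, a bare __AVX512VL__ test becomes VL256,
suggesting a single macro meaning "256-bit EVEX capable". A presumed
definition; extending it to AVX10/256 is an assumption based on the AVX10_256
TODO comments removed elsewhere in this compare:

// Presumed guard for 256-bit EVEX features (ternary logic etc.).
#if defined(__AVX512VL__)   // a future AVX10/256 test may also qualify
  #define VL256 1
#endif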

View File

@@ -95,7 +95,7 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#if defined(VL256)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#define GROESTL_4WAY_VAES 1
#endif

View File

@@ -17,7 +17,7 @@
#if defined(__AVX2__) && defined(__VAES__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )

View File

@@ -43,7 +43,7 @@
#define SIZE256 (SIZE_512/16)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE256];

View File

@@ -42,7 +42,7 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12,

View File

@@ -17,7 +17,7 @@
#if defined(__AVX2__) && defined(__VAES__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{

View File

@@ -33,7 +33,7 @@
#define SIZE512 (SIZE_1024/16)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE512];

View File

@@ -50,7 +50,7 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12,
@@ -239,7 +239,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
casti_m128i( round_const_p, round_counter ) ) ); \
casti_v128u32( round_const_p, round_counter ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK1 );\
@@ -254,7 +254,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
\
/* AddRoundConstant P1024 */\
xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
casti_m128i( round_const_p, round_counter+1 ) ) ); \
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
@@ -283,7 +283,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
casti_m128i( round_const_q, round_counter ) ) ); \
casti_v128u32( round_const_q, round_counter ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK3 );\
@@ -306,7 +306,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
casti_m128i( round_const_q, round_counter+1 ) ) ); \
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
@@ -812,7 +812,7 @@ static const __m256i SUBSH_MASK7_2WAY =
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
casti_m128i( round_const_p, round_counter ) ) ); \
casti_v128u32( round_const_p, round_counter ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\
@@ -827,7 +827,7 @@ static const __m256i SUBSH_MASK7_2WAY =
\
/* AddRoundConstant P1024 */\
xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
casti_m128i( round_const_p, round_counter+1 ) ) ); \
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
@@ -856,7 +856,7 @@ static const __m256i SUBSH_MASK7_2WAY =
xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
casti_m128i( round_const_q, round_counter ) ) ); \
casti_v128u32( round_const_q, round_counter ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\
@@ -879,7 +879,7 @@ static const __m256i SUBSH_MASK7_2WAY =
xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
casti_m128i( round_const_q, round_counter+1 ) ) ); \
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\

View File

@@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate )
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriad_hash;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT;
#endif
return true;
};

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#define MYRGR_8WAY 1
#elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
#define MYRGR_4WAY 1

View File

@@ -382,12 +382,12 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
#define S1F MF
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Hamsi 8 way AVX512
// Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have same
// timig. However, when tested hashing X13 on i9-9940x using cmplt with zero
// timing. However, testing hashing X13 on i9-9940x using cmplt with zero
// had a 3% faster overall hashrate than using movepi.
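Both intrinsics named in the comment extract each 64-bit lane's sign bit into
a mask register; a minimal illustration of the two equivalent forms being
compared:

#include <immintrin.h>

// Equivalent ways to build a __mmask8 from the lane sign bits.
static inline __mmask8 sign_mask_movepi( __m512i v )
{  return _mm512_movepi64_mask( v );  }

static inline __mmask8 sign_mask_cmplt( __m512i v )
{  return _mm512_cmplt_epi64_mask( v, _mm512_setzero_si512() );  }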
#define INPUT_BIG8 \
@@ -418,13 +418,11 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
tb = mm512_xoror( b, d, a ); \
a = _mm512_xor_si512( a, c ); \
b = mm512_xoror( td, tb, a ); \
td = mm512_xorand( a, td, tb ); \
d = _mm512_ternarylogic_epi64( a, td, tb, 0x87 );/* not( xorand( a, td, tb ) ); */ \
a = c; \
c = mm512_xor3( tb, b, td ); \
d = mm512_not( td ); \
c = _mm512_ternarylogic_epi64( tb, b, d, 0x69 ); /* not( xor3( tb, b, d ) ); */ \
}
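The ternary-logic immediates are 8-bit truth tables of the commented
expressions, evaluated over the conventional operand constants a = 0xF0,
b = 0xCC, c = 0xAA; a worked check of the two values used above:

// not( a ^ ( b & c ) ): ~(0xF0 ^ (0xCC & 0xAA)) & 0xFF
//                     = ~(0xF0 ^ 0x88) & 0xFF = ~0x78 & 0xFF = 0x87
// not( a ^ b ^ c ):     ~(0xF0 ^ 0xCC ^ 0xAA) & 0xFF
//                     = ~0x96 & 0xFF = 0x69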
/*
#define SBOX8( a, b, c, d ) \
do { \
@@ -1122,7 +1120,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
// Hamsi 4 way AVX2
#if defined(__AVX512VL__)
#if defined(VL256)
#define INPUT_BIG \
do { \
@@ -1155,11 +1153,99 @@ do { \
b = mm256_xoror( td, tb, a ); \
d = _mm256_ternarylogic_epi64( a, td, tb, 0x87 );/* mm256_not( mm256_xorand( a, td, tb ) ); */ \
a = c; \
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /*mm256_not( mm256_xor3( tb, b, d ) );*/ \
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /* mm256_not( mm256_xor3( tb, b, d ) ); */ \
}
#else
#define INPUT_BIG_sub( db_i ) \
{ \
const __m256i dm = _mm256_cmpgt_epi64( zero, db_i ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
tp += 8; \
}
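How the branchless selection in INPUT_BIG_sub works: each caller shifts a
different message bit into the 64-bit sign position, and the signed compare
broadcasts it across the lane:

// dm = cmpgt( zero, x ) is all-ones in every lane whose sign bit is
// set, so the 8-word row of T512 is XORed into m0..m7 only for lanes
// where the corresponding input bit is 1; tp then advances one row.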
#define INPUT_BIG \
{ \
const __m256i db = *buf; \
const __m256i zero = m256_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
INPUT_BIG_sub( _mm256_slli_epi64( db,63 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,62 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,61 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,60 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,59 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,58 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,57 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,56 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,55 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,54 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,53 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,52 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,51 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,50 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,49 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,48 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,47 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,46 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,45 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,44 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,43 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,42 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,41 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,40 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,39 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,38 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,37 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,36 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,35 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,34 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,33 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,32 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,31 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,30 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,29 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,28 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,27 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,26 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,25 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,24 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,23 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,22 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,21 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,20 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,19 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,18 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,17 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,16 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,15 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,14 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,13 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,12 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,11 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,10 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 9 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 8 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 7 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 6 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 5 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 4 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 3 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 2 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 1 ) ); \
INPUT_BIG_sub( db ); \
}
#if 0
// dependent on the compiler unrolling the loop
#define INPUT_BIG \
do { \
__m256i db = *buf; \
@@ -1180,6 +1266,7 @@ do { \
tp += 8; \
} \
} while (0)
#endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX( a, b, c, d ) \
@@ -1219,7 +1306,7 @@ do { \
do { \
a = mm256_rol_32( a, 13 ); \
c = mm256_rol_32( c, 3 ); \
b = mm256_xor3( a, b, c ); \
b = mm256_xor3( b, a, c ); \
d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
b = mm256_rol_32( b, 1 ); \
d = mm256_rol_32( d, 7 ); \
@@ -1501,7 +1588,7 @@ do { /* order is important */ \
sc->h[14] = CE; \
sc->h[15] = CF;
#if defined(__AVX512VL__)
#if defined(VL256)
#define INPUT_8X32 \
{ \
@@ -1961,6 +2048,94 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
sc->h[6] = c6; \
sc->h[7] = c7;
#define INPUT_2x64_sub( db_i ) \
{ \
const v128u64_t dm = v128_cmpgt64( zero, db_i ); \
m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
tp += 8; \
}
#define INPUT_2x64 \
{ \
const v128u64_t db = *buf; \
const v128u64_t zero = v128_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
INPUT_2x64_sub( v128_sl64( db,63 ) ); \
INPUT_2x64_sub( v128_sl64( db,62 ) ); \
INPUT_2x64_sub( v128_sl64( db,61 ) ); \
INPUT_2x64_sub( v128_sl64( db,60 ) ); \
INPUT_2x64_sub( v128_sl64( db,59 ) ); \
INPUT_2x64_sub( v128_sl64( db,58 ) ); \
INPUT_2x64_sub( v128_sl64( db,57 ) ); \
INPUT_2x64_sub( v128_sl64( db,56 ) ); \
INPUT_2x64_sub( v128_sl64( db,55 ) ); \
INPUT_2x64_sub( v128_sl64( db,54 ) ); \
INPUT_2x64_sub( v128_sl64( db,53 ) ); \
INPUT_2x64_sub( v128_sl64( db,52 ) ); \
INPUT_2x64_sub( v128_sl64( db,51 ) ); \
INPUT_2x64_sub( v128_sl64( db,50 ) ); \
INPUT_2x64_sub( v128_sl64( db,49 ) ); \
INPUT_2x64_sub( v128_sl64( db,48 ) ); \
INPUT_2x64_sub( v128_sl64( db,47 ) ); \
INPUT_2x64_sub( v128_sl64( db,46 ) ); \
INPUT_2x64_sub( v128_sl64( db,45 ) ); \
INPUT_2x64_sub( v128_sl64( db,44 ) ); \
INPUT_2x64_sub( v128_sl64( db,43 ) ); \
INPUT_2x64_sub( v128_sl64( db,42 ) ); \
INPUT_2x64_sub( v128_sl64( db,41 ) ); \
INPUT_2x64_sub( v128_sl64( db,40 ) ); \
INPUT_2x64_sub( v128_sl64( db,39 ) ); \
INPUT_2x64_sub( v128_sl64( db,38 ) ); \
INPUT_2x64_sub( v128_sl64( db,37 ) ); \
INPUT_2x64_sub( v128_sl64( db,36 ) ); \
INPUT_2x64_sub( v128_sl64( db,35 ) ); \
INPUT_2x64_sub( v128_sl64( db,34 ) ); \
INPUT_2x64_sub( v128_sl64( db,33 ) ); \
INPUT_2x64_sub( v128_sl64( db,32 ) ); \
INPUT_2x64_sub( v128_sl64( db,31 ) ); \
INPUT_2x64_sub( v128_sl64( db,30 ) ); \
INPUT_2x64_sub( v128_sl64( db,29 ) ); \
INPUT_2x64_sub( v128_sl64( db,28 ) ); \
INPUT_2x64_sub( v128_sl64( db,27 ) ); \
INPUT_2x64_sub( v128_sl64( db,26 ) ); \
INPUT_2x64_sub( v128_sl64( db,25 ) ); \
INPUT_2x64_sub( v128_sl64( db,24 ) ); \
INPUT_2x64_sub( v128_sl64( db,23 ) ); \
INPUT_2x64_sub( v128_sl64( db,22 ) ); \
INPUT_2x64_sub( v128_sl64( db,21 ) ); \
INPUT_2x64_sub( v128_sl64( db,20 ) ); \
INPUT_2x64_sub( v128_sl64( db,19 ) ); \
INPUT_2x64_sub( v128_sl64( db,18 ) ); \
INPUT_2x64_sub( v128_sl64( db,17 ) ); \
INPUT_2x64_sub( v128_sl64( db,16 ) ); \
INPUT_2x64_sub( v128_sl64( db,15 ) ); \
INPUT_2x64_sub( v128_sl64( db,14 ) ); \
INPUT_2x64_sub( v128_sl64( db,13 ) ); \
INPUT_2x64_sub( v128_sl64( db,12 ) ); \
INPUT_2x64_sub( v128_sl64( db,11 ) ); \
INPUT_2x64_sub( v128_sl64( db,10 ) ); \
INPUT_2x64_sub( v128_sl64( db, 9 ) ); \
INPUT_2x64_sub( v128_sl64( db, 8 ) ); \
INPUT_2x64_sub( v128_sl64( db, 7 ) ); \
INPUT_2x64_sub( v128_sl64( db, 6 ) ); \
INPUT_2x64_sub( v128_sl64( db, 5 ) ); \
INPUT_2x64_sub( v128_sl64( db, 4 ) ); \
INPUT_2x64_sub( v128_sl64( db, 3 ) ); \
INPUT_2x64_sub( v128_sl64( db, 2 ) ); \
INPUT_2x64_sub( v128_sl64( db, 1 ) ); \
INPUT_2x64_sub( db ); \
}
#if 0
// Dependent on the compiler unrolling the loop.
#define INPUT_2x64 \
{ \
v128u64_t db = *buf; \
@@ -1981,6 +2156,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
tp += 8; \
} \
}
#endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX_2x64( a, b, c, d ) \
@@ -2001,7 +2177,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
{ \
a = v128_rol32( a, 13 ); \
c = v128_rol32( c, 3 ); \
b = v128_xor3( a, b, c ); \
b = v128_xor3( c, a, b ); \
d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
b = v128_rol32( b, 1 ); \
d = v128_rol32( d, 7 ); \

View File

@@ -104,7 +104,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Hamsi-512 8x64

View File

@@ -53,7 +53,7 @@ extern "C"{
#define SPH_SMALL_FOOTPRINT_HAVAL 1
//#endif
#if defined(__AVX512VL__)
#if defined(VL256)
// ( ~( a ^ b ) ) & c
#define v128_andnotxor( a, b, c ) \
@@ -583,7 +583,7 @@ do { \
// Haval-256 8 way 32 bit avx2
#if defined (__AVX512VL__)
#if defined (VL256)
// ( ~( a ^ b ) ) & c
#define mm256_andnotxor( a, b, c ) \
@@ -882,7 +882,7 @@ do { \
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// ( ~( a ^ b ) ) & c
#define mm512_andnotxor( a, b, c ) \

View File

@@ -107,7 +107,7 @@ void haval256_5_8way_close( void *cc, void *dst );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__m512i buf[32];

View File

@@ -204,7 +204,7 @@ static const uint64_t IV512[] =
(state)->H[15] = h7l; \
} while (0)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define Sb_8W(x0, x1, x2, x3, c) \
{ \
@@ -364,8 +364,7 @@ static const uint64_t IV512[] =
#if defined(__AVX2__)
#if defined(__AVX512VL__)
//TODO enable for AVX10_256, not used with AVX512VL
#if defined(VL256)
#define notxorandnot( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x2d )
@@ -522,7 +521,7 @@ static const uint64_t IV512[] =
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
void jh256_8x64_init( jh_8x64_context *sc )
{

View File

@@ -55,7 +55,7 @@
* <code>memcpy()</code>).
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define KECCAK_8WAY 1
#elif defined(__AVX2__)
#define KECCAK_4WAY 1
@@ -12,7 +12,7 @@
#define KECCAK_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA3D_8WAY 1
#elif defined(__AVX2__)
#define SHA3D_4WAY 1

View File

@@ -57,7 +57,7 @@ static const uint64_t RC[] = {
#define DO(x) x
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define INPUT_BUF(size) do { \
size_t j; \
@@ -161,29 +161,25 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
size_t byte_len, size_t lim )
{
unsigned eb;
union {
__m512i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m512i tmp[lim + 1] __attribute__ ((aligned (64)));
size_t j;
size_t m512_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;
eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm512_set1_epi64( t );
tmp[0] = _mm512_set1_epi64( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
}
keccak64_8way_core( kc, u.tmp, j, lim );
keccak64_8way_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
@@ -361,29 +357,25 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
size_t lim )
{
unsigned eb;
union {
__m256i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m256i tmp[lim + 1] __attribute__ ((aligned (32)));
size_t j;
size_t m256_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;
eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm256_set1_epi64x( t );
tmp[0] = _mm256_set1_epi64x( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
}
keccak64_core( kc, u.tmp, j, lim );
keccak64_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
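Both keccak close routines drop the old union-with-VLA trick in favour of a
directly aligned variable-length array; the idiom in isolation (GCC/Clang
attribute syntax, as used here):

#include <stddef.h>
#include <string.h>

static void aligned_vla_demo( size_t lim )
{
   // 64-byte aligned VLA; replaces wrapping the array in a union
   // solely to influence its alignment.
   unsigned char tmp[lim + 1] __attribute__ ((aligned (64)));
   memset( tmp, 0, lim + 1 );
}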

View File

@@ -4,7 +4,7 @@
#include <stddef.h>
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -59,7 +59,7 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define cns4w(i) mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] )
@@ -524,8 +524,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
a = _mm256_xor_si256( a, c0 ); \
b = _mm256_xor_si256( b, c1 );
//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512
#if defined(__AVX512VL__)
#if defined(VL256)
#define MULT2( a0, a1 ) \
{ \

View File

@@ -51,7 +51,7 @@
#define LIMIT_512 128
/*********************************/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
uint32_t buffer[8*4];

View File

@@ -28,8 +28,7 @@
a = v128_xor( a, c0 ); \
b = v128_xor( b, c1 ); \
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#if defined(VL256)
#define MULT2( a0, a1 ) \
{ \
@@ -48,29 +47,22 @@
a1 = _mm_alignr_epi8( b, a1, 4 ); \
}
#elif defined(__ARM_NEON)
#elif defined(__ARM_NEON) || defined(__SSE2__)
// { a1_0, 0, a1_0, a1_0 }
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( vdupq_laneq_u32( a1, 0 ), MASK ) ); \
v128_t b = v128_xor( a0, v128_and( v128_bcast32( a1 ), MASK ) ); \
a0 = v128_alignr32( a1, b, 1 ); \
a1 = v128_alignr32( b, a1, 1 ); \
}
#else // assume SSE2
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( _mm_shuffle_epi32( a1, 0 ), MASK ) ); \
a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
}
#else
#warning __FILE__ ":" __LINE__ " Unknown or unsupported CPU architecture."
#endif
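The unified MULT2 leans on v128_bcast32 to express what was previously written
per architecture; presumably it broadcasts 32-bit lane 0, matching the two
forms it replaces:

// Presumed mapping of v128_bcast32( a1 ), lane 0 to all four lanes:
//   NEON:  vdupq_laneq_u32( a1, 0 )
//   SSE2:  _mm_shuffle_epi32( a1, 0 )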
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#if defined(VL256)
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \

View File

@@ -68,4 +68,4 @@ int update_and_final_luffa( hashState_luffa *state, void* output,
int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
const void* data, size_t inlen );
#endif // LUFFA_FOR_SSE2_H___
#endif // LUFFA_FOR_SSE2_H__

View File

@@ -15,7 +15,7 @@
#include "algo/groestl/sph_groestl.h"
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ALLIUM_16WAY 1
#elif defined(__AVX2__)
#define ALLIUM_8WAY 1

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2REV3_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV3_8WAY 1
@@ -49,7 +49,7 @@ bool init_lyra2rev3_ctx();
//////////////////////////////////
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2REV2_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV2_8WAY 1
@@ -108,7 +108,7 @@ bool lyra2h_thread_init();
/////////////////////////////////////////
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define PHI2_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define PHI2_4WAY 1

View File

@@ -41,7 +41,7 @@
// lyra2z330, lyra2h,
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
/**
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords

View File

@@ -59,7 +59,7 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );

View File

@@ -3,7 +3,7 @@
#include "lyra2.h"
#include "algo/blake/blake256-hash.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2Z_16WAY 1
#elif defined(__AVX2__)
#define LYRA2Z_8WAY 1

View File

@@ -4,7 +4,7 @@
#include "algo/gost/sph_gost.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "lyra2.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#include "algo/echo/echo-hash-4way.h"
#elif defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"

View File

@@ -27,7 +27,7 @@
#include "lyra2.h"
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
{

View File

@@ -43,7 +43,7 @@ static const uint64_t blake2b_IV[8] =
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define G2W_4X64(a,b,c,d) \
a = _mm512_add_epi64( a, b ); \
@@ -150,13 +150,13 @@ static const uint64_t blake2b_IV[8] =
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a), 32 ); \
d = v128_ror64xor( d, a, 32 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 24 ); \
b = v128_ror64xor( b, c, 24 ); \
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a ), 16 ); \
d = v128_ror64xor( d, a, 16 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 63 );
b = v128_ror64xor( b, c, 63 );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
@@ -195,10 +195,6 @@ static const uint64_t blake2b_IV[8] =
#endif // AVX2 else SSE2
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
#define G( r, i, a, b, c, d ) \
{ \
a = a + b; \
@@ -222,7 +218,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
G( r, 7, v[ 3], v[ 4], v[ 9], v[14] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
union _ovly_512
{

View File

@@ -1,8 +1,6 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#if !defined(__APPLE__)
#include <gmp.h>
#include <stdbool.h>
#include <stdlib.h>
@@ -33,6 +31,7 @@ static inline double exp_n( double xt )
return exp( xt );
}
/*
static inline double exp_n2( double x1, double x2 )
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
@@ -53,6 +52,7 @@ static inline double exp_n2( double x1, double x2 )
else if ( xt > p6 - 1.e-200 )
return 0.;
}
*/
double swit2_( double wvnmb )
{
@@ -298,15 +298,9 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
return 0;
}
#endif // not apple
bool register_m7m_algo( algo_gate_t *gate )
{
#if defined(__APPLE__)
applog( LOG_ERR, "M7M algo is not supported on MacOS");
return false;
#else
gate->optimizations = SHA_OPT;
gate->optimizations = SHA256_OPT;
init_m7m_ctx();
gate->scanhash = (void*)&scanhash_m7m_hash;
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
@@ -315,6 +309,5 @@ bool register_m7m_algo( algo_gate_t *gate )
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
opt_target_factor = 65536.0;
return true;
#endif
}

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define NIST5_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define NIST5_4WAY 1

View File

@@ -71,8 +71,7 @@ do { \
} while (0)
#define GAMMA_4W(n0, n1, n2, n4) \
(g ## n0 = v128_xor( a ## n0, \
v128_or( a ## n1, v128_not( a ## n2 ) ) ) )
(g ## n0 = v128_xor( a ## n0, v128_ornot( a ## n2, a ## n1 ) ) )
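
Both replacements imply the helper's argument order: v128_ornot( a, b ) computes b | ~a, with the NOT landing on the first operand. A scalar sketch for one lane (an assumption; on VL256 targets the whole xor/ornot expression instead collapses into a single ternary-logic instruction, as the 8-way variant below shows):

#include <stdint.h>
// ornot64( a, b ) = b | ~a, matching the rewritten GAMMA_4W, F3 and F5 forms.
static inline uint64_t ornot64( uint64_t a, uint64_t b )
{
   return b | ~a;
}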
#define PI_ALL_4W do { \
a0 = g0; \
@@ -312,7 +311,7 @@ do { \
BUPDATE1_8W( 7, 1 ); \
} while (0)
#if defined(__AVX512VL__)
#if defined(VL256)
#define GAMMA_8W(n0, n1, n2, n4) \
( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )
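
Sanity check of the 0x4b immediate: with operands (A,B,C) = (a0, a2, a1) the target function is A ^ (C | ~B). Building the truth-table byte over all eight input combinations reproduces the constant:

// f(A,B,C) = A ^ (C | ~B); bit i of the immediate is f evaluated at
// A = i>>2 & 1, B = i>>1 & 1, C = i & 1. The loop yields 0x4b.
int imm = 0;
for ( int i = 0; i < 8; i++ )
   imm |= ( ( ( (i>>2) & 1 ) ^ ( (i & 1) | !( (i>>1) & 1 ) ) ) & 1 ) << i;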

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ANIME_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define ANIME_4WAY 1

View File

@@ -11,7 +11,6 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
@@ -617,9 +616,9 @@ union _hmq1725_4way_context_overlay
cubehashParam cube;
cube_2way_context cube2;
sph_shavite512_context shavite;
hashState_sd sd;
simd512_context simd;
shavite512_2way_context shavite2;
simd_2way_context simd;
simd_2way_context simd_2way;
hashState_echo echo;
hamsi512_4way_context hamsi;
hashState_fugue fugue;
@@ -753,8 +752,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd_2way, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd_2way, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
@@ -869,41 +868,25 @@ extern void hmq1725_4way_hash(void *state, const void *input)
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
}
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
if ( hash1[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
}
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
if ( hash2[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
}
simd512_ctx( &ctx.simd, hash2, hash2, 64 );
if ( hash3[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
}
simd512_ctx( &ctx.simd, hash3, hash3, 64 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
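
The replaced init/update/final triple is collapsed into one-shot helpers. A plausible shape of the wrappers used above (assumed signatures; the real ones live in simd-hash-2way.c):

// One-shot SIMD-512 hash over datalen bytes; the context is initialized,
// updated and finalized internally.
void simd512_ctx( simd512_context *ctx, void *out, const void *in,
                  size_t datalen );
void simd512_2way_full( simd_2way_context *ctx, void *out, const void *in,
                        size_t datalen );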

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define HMQ1725_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define HMQ1725_4WAY 1

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define QUARK_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define QUARK_4WAY 1

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define QUBIT_4WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define QUBIT_2WAY 1

View File

@@ -104,14 +104,14 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
@@ -224,14 +224,14 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );
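
The byte-swap block above is a mechanical move from x86-only __m128i casts to the architecture-neutral v128 layer, so the same scanhash source compiles for both SSE2 and NEON. A plausible shape of the cast macro (an assumption; the real definition is in simd-utils):

// Index a byte buffer as an array of 128-bit vectors with u32 lanes.
#define casti_v128u32( p, i )  ( ( (v128u32*)(p) )[ (i) ] )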

View File

@@ -51,7 +51,6 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
bool register_lbry_algo( algo_gate_t* gate )
{
// gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#if defined (LBRY_16WAY)
gate->scanhash = (void*)&scanhash_lbry_16way;
gate->hash = (void*)&lbry_16way_hash;
@@ -67,7 +66,7 @@ bool register_lbry_algo( algo_gate_t* gate )
#else
gate->scanhash = (void*)&scanhash_lbry;
gate->hash = (void*)&lbry_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA256_OPT;
#endif
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
gate->build_extraheader = (void*)&lbry_build_extraheader;

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LBRY_16WAY 1
#elif defined(__AVX2__)
#define LBRY_8WAY 1

View File

@@ -35,13 +35,13 @@ static const uint32_t IV[5] =
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
#define F3(x, y, z) \
_mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z )
_mm_xor_si128( v128_ornot( y, x ), z )
#define F4(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
#define F5(x, y, z) \
_mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) )
_mm_xor_si128( x, v128_ornot( z, y ) )
#define RR(a, b, c, d, e, f, s, r, k) \
do{ \
@@ -319,7 +319,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
ripemd160_4way_round( sc );
for (u = 0; u < 5; u ++)
casti_m128i( dst, u ) = sc->val[u];
casti_v128u32( dst, u ) = sc->val[u];
}
#endif
@@ -335,13 +335,13 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( y, z ), x ), z )
#define F8W_3(x, y, z) \
_mm256_xor_si256( _mm256_or_si256( x, mm256_not( y ) ), z )
_mm256_xor_si256( mm256_ornot( y, x ), z )
#define F8W_4(x, y, z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( x, y ), z ), y )
#define F8W_5(x, y, z) \
_mm256_xor_si256( x, _mm256_or_si256( y, mm256_not( z ) ) )
_mm256_xor_si256( x, mm256_ornot( z, y ) )
#define RR_8W(a, b, c, d, e, f, s, r, k) \
do{ \
@@ -625,7 +625,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// RIPEMD-160 16 way

View File

@@ -33,7 +33,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
size_t len );
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -46,7 +46,7 @@
#endif
#ifdef __GNUC__
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__)
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__) || defined(__APPLE__)
#define ASM 0
#else
#define ASM 1

View File

@@ -745,7 +745,7 @@ do{ \
SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Tested OK but very slow
// 16 way parallel, requires 16x32 interleaving
@@ -2074,7 +2074,7 @@ void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N )
v128_ovly v;
for ( int l = 0; l < 4; l++ )
v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
X[i] = v128_xor( X[i], v.m128 );
X[i] = v128_xor( X[i], v.v128 );
}
xor_salsa8_4way( &X[ 0], &X[16] );
@@ -2211,10 +2211,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
// X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 }
// X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 }
y[0].m128 = X0;
y[1].m128 = X1;
y[2].m128 = X2;
y[3].m128 = X3;
y[0].v128 = X0;
y[1].v128 = X1;
y[2].v128 = X2;
y[3].v128 = X3;
z[0].u32[0] = y[0].u32[0];
z[0].u32[3] = y[1].u32[0];
@@ -2236,10 +2236,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
z[3].u32[1] = y[2].u32[3];
z[3].u32[0] = y[3].u32[3];
B[0] = v128_add32( B[0], z[0].m128 );
B[1] = v128_add32( B[1], z[1].m128 );
B[2] = v128_add32( B[2], z[2].m128 );
B[3] = v128_add32( B[3], z[3].m128 );
B[0] = v128_add32( B[0], z[0].v128 );
B[1] = v128_add32( B[1], z[1].v128 );
B[2] = v128_add32( B[2], z[2].v128 );
B[3] = v128_add32( B[3], z[3].v128 );
#endif
@@ -2404,14 +2404,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
/*
v128_ovly ya[4], za[4], yb[4], zb[4];
ya[0].m128 = XA[0];
yb[0].m128 = XB[0];
ya[1].m128 = XA[1];
yb[1].m128 = XB[1];
ya[2].m128 = XA[2];
yb[2].m128 = XB[2];
ya[3].m128 = XA[3];
yb[3].m128 = XB[3];
ya[0].v128 = XA[0];
yb[0].v128 = XB[0];
ya[1].v128 = XA[1];
yb[1].v128 = XB[1];
ya[2].v128 = XA[2];
yb[2].v128 = XB[2];
ya[3].v128 = XA[3];
yb[3].v128 = XB[3];
za[0].u32[0] = ya[0].u32[0];
zb[0].u32[0] = yb[0].u32[0];
@@ -2449,14 +2449,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
za[3].u32[3] = ya[0].u32[3];
zb[3].u32[3] = yb[0].u32[3];
XA[0] = za[0].m128;
XB[0] = zb[0].m128;
XA[1] = za[1].m128;
XB[1] = zb[1].m128;
XA[2] = za[2].m128;
XB[2] = zb[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XA[0] = za[0].v128;
XB[0] = zb[0].v128;
XA[1] = za[1].v128;
XB[1] = zb[1].v128;
XA[2] = za[2].v128;
XB[2] = zb[2].v128;
XA[3] = za[3].v128;
XB[3] = zb[3].v128;
*/
}
@@ -2487,7 +2487,7 @@ static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb,
XA3 = BA[3] = v128_xor( BA[3], CA[3] );
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
SALSA_8ROUNDS_SIMD128_2BUF;
@@ -2770,18 +2770,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
/*
v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];
ya[0].m128 = XA[0];
yb[0].m128 = XB[0];
yc[0].m128 = XC[0];
ya[1].m128 = XA[1];
yb[1].m128 = XB[1];
yc[1].m128 = XC[1];
ya[2].m128 = XA[2];
yb[2].m128 = XB[2];
yc[2].m128 = XC[2];
ya[3].m128 = XA[3];
yb[3].m128 = XB[3];
yc[3].m128 = XC[3];
ya[0].v128 = XA[0];
yb[0].v128 = XB[0];
yc[0].v128 = XC[0];
ya[1].v128 = XA[1];
yb[1].v128 = XB[1];
yc[1].v128 = XC[1];
ya[2].v128 = XA[2];
yb[2].v128 = XB[2];
yc[2].v128 = XC[2];
ya[3].v128 = XA[3];
yb[3].v128 = XB[3];
yc[3].v128 = XC[3];
za[0].u32[0] = ya[0].u32[0];
zb[0].u32[0] = yb[0].u32[0];
@@ -2835,18 +2835,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
zb[3].u32[3] = yb[0].u32[3];
zc[3].u32[3] = yc[0].u32[3];
XA[0] = za[0].m128;
XB[0] = zb[0].m128;
XC[0] = zc[0].m128;
XA[1] = za[1].m128;
XB[1] = zb[1].m128;
XC[1] = zc[1].m128;
XA[2] = za[2].m128;
XB[2] = zb[2].m128;
XC[2] = zc[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XC[3] = zc[3].m128;
XA[0] = za[0].v128;
XB[0] = zb[0].v128;
XC[0] = zc[0].v128;
XA[1] = za[1].v128;
XB[1] = zb[1].v128;
XC[1] = zc[1].v128;
XA[2] = za[2].v128;
XB[2] = zb[2].v128;
XC[2] = zc[2].v128;
XA[3] = za[3].v128;
XB[3] = zb[3].v128;
XC[3] = zc[3].v128;
*/
}
@@ -2886,7 +2886,7 @@ static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc,
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
XC3 = BC[3] = v128_xor( BC[3], CC[3] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
SALSA_8ROUNDS_SIMD128_3BUF;
@@ -3049,7 +3049,7 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
xf = (B[15] ^= C[15]);
#define ROL32( a, c ) ror32( a, c )
#define ROL32( a, c ) rol32( a, c )
#define ADD32( a, b ) ( (a)+(b) )
#define XOR( a, b ) ( (a)^(b) )
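
The one-line change above corrects the rotate direction: ROL32 now maps onto rol32, the left rotation Salsa's quarter-round expects. Scalar reference for the pair (standard definitions):

#include <stdint.h>
static inline uint32_t rol32( uint32_t x, unsigned c )
{  return ( x << c ) | ( x >> ( 32 - c ) );  }
static inline uint32_t ror32( uint32_t x, unsigned c )
{  return ( x >> c ) | ( x << ( 32 - c ) );  }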

View File

@@ -5,7 +5,7 @@
#include <stdlib.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );

View File

@@ -35,7 +35,7 @@
//#include <mm_malloc.h>
#include "malloc-huge.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SCRYPT_THROUGHPUT 16
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
#define SCRYPT_THROUGHPUT 2
@@ -592,7 +592,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
#endif /* HAVE_SHA256_8WAY */
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
static inline void sha256_16way_init_state( void *state )
{
@@ -1481,7 +1481,7 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
#else
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#endif
@@ -1494,7 +1494,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
// scrypt_throughput defined at compile time and used to replace
// MAX_WAYS to reduce memory usage.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// scrypt_throughput = 16;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
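
Worked example of the sizing rule above, assuming opt_param_n is scrypt's N and each V element costs 128 bytes per buffer: with N = 0x8000 the 3-buffer scratch space is

// 0x8000 * 3 * 128 = 12,582,912 bytes, i.e. 12 MiB per thread.
size_t scratchbuf_size = (size_t)0x8000 * 3 * 128;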

View File

@@ -74,8 +74,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
memset( pad, 0x36, 64*4 );
for ( i = 0; i < Klen; i++ )
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
casti_m128i( K, i ) );
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->ictx, pad, 64 );
@@ -83,8 +83,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
sha256_4way_init( &ctx->octx );
memset( pad, 0x5c, 64*4 );
for ( i = 0; i < Klen/4; i++ )
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
casti_m128i( K, i ) );
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->octx, pad, 64 );
}
@@ -158,8 +158,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
/* ... xor U_j ... */
for ( k = 0; k < 8; k++ )
casti_m128i( T, k ) = _mm_xor_si128( casti_m128i( T, k ),
casti_m128i( U, k ) );
casti_v128u32( T, k ) = _mm_xor_si128( casti_v128u32( T, k ),
casti_v128u32( U, k ) );
}
/* Copy as many bytes as necessary into buf. */
@@ -306,7 +306,7 @@ pbkdf2_sha256_8way( uint8_t *buf, size_t dkLen, const uint8_t *passwd,
}
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// HMAC 16-way AVX512

View File

@@ -84,7 +84,7 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t,
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct _hmac_sha256_16way_context
{

View File

@@ -580,7 +580,7 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
// to avoid recalculating it as Y^Z. This optimization is not applicable
// when MAJ is optimized with ternary logic.
#if defined(__AVX512VL__)
#if defined(VL256)
#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )
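
The cached-XOR trick the comment refers to: MAJ(x,y,z) can be rewritten as y ^ ((x^y) & (y^z)), and because the working variables rotate each round, this round's x^y is next round's y^z, leaving one fresh XOR per round. A scalar sketch (illustration only; with VL256 a single ternary-logic instruction computes MAJ outright, so the cache buys nothing):

#include <stdint.h>
// maj(x,y,z) = y ^ ((x^y) & (y^z)); callers carry x_xor_y from the
// previous round forward as the new y_xor_z.
static inline uint32_t maj32( uint32_t y, uint32_t x_xor_y, uint32_t y_xor_z )
{
   return y ^ ( x_xor_y & y_xor_z );
}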
@@ -788,7 +788,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
G = _mm256_load_si256( state_in + 6 );
H = _mm256_load_si256( state_in + 7 );
#if !defined(__AVX512VL__)
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
@@ -830,7 +830,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );
#if !defined(__AVX512VL__)
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
#endif
@@ -936,7 +936,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i IV7 = H;
const __m256i IV6 = G;
#if !defined(__AVX512VL__)
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
@@ -981,7 +981,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
W[11] = SHA256_8WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );
#if !defined(__AVX512VL__)
#if !defined(VL256)
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
@@ -1172,7 +1172,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
sha256_8way_close( &ctx, dst );
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// SHA-256 16 way

View File

@@ -10,6 +10,28 @@ static const uint32_t SHA256_IV[8] =
#if defined(__x86_64__) && defined(__SHA__)
/* common code used for rounds 12 through 51 */
#define sha256_generic_qround( s0, s1, m, a, b, c ) \
TMP = _mm_alignr_epi8( a, c, 4 ); \
s1 = _mm_sha256rnds2_epu32( s1, s0, m ); \
b = _mm_add_epi32( b, TMP ); \
b = _mm_sha256msg2_epu32( b, a ); \
m = _mm_shuffle_epi32( m, 0x0e ); \
s0 = _mm_sha256rnds2_epu32( s0, s1, m ); \
c = _mm_sha256msg1_epu32( c, a );
// r12-15
// sha256_generic_qround( s0, s1, m, t3, t0, t2 )
// r16-19
// sha256_generic_qround( s0, s1, m, t0, t1, t3 )
// r20-23
// sha256_generic_qround( s0, s1, m, t1, t2, t0 )
// r24-27
// sha256_generic_qround( s0, s1, m, t2, t3, t1 ) ...
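
Hypothetical expansion of the first quarter-round per the comments above (register and constant names assumed): each call retires four rounds with two sha256rnds2 instructions while advancing the message schedule one step.

// Rounds 12-15: W[12..15] + K[12..15], then rotate the schedule registers.
MSG = _mm_add_epi32( TMSG3, casti_m128i( K256, 3 ) );
sha256_generic_qround( STATE0, STATE1, MSG, TMSG3, TMSG0, TMSG2 );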
#define sha256_opt_rounds( state_out, input, state_in ) \
{ \
__m128i STATE0, STATE1; \
@@ -547,8 +569,8 @@ void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
__m128i STATE0, STATE1, MSG, TMP;
// Load initial values
TMP = casti_m128i( istate, 0 );
STATE1 = casti_m128i( istate, 1 );
TMP = casti_v128u32( istate, 0 );
STATE1 = casti_v128u32( istate, 1 );
TMP = _mm_shuffle_epi32( TMP, 0xB1 ); // CDAB
STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); // EFGH
@@ -556,17 +578,17 @@ void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH
// Save current hash
casti_m128i( sstate, 0 ) = STATE0;
casti_m128i( sstate, 1 ) = STATE1;
casti_v128u32( sstate, 0 ) = STATE0;
casti_v128u32( sstate, 1 ) = STATE1;
// Rounds 0 to 3
MSG = casti_m128i( msg, 0 );
MSG = casti_v128u32( msg, 0 );
TMP = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL );
MSG = _mm_add_epi32( MSG, TMP );
STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
MSG = _mm_shuffle_epi32( MSG, 0x0E );
casti_m128i( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
casti_m128i( ostate, 1 ) = STATE1;
casti_v128u32( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
casti_v128u32( ostate, 1 ) = STATE1;
}
void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
@@ -579,22 +601,22 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
STATE0_X = casti_m128i( state_mid_X, 0 );
STATE1_X = casti_m128i( state_mid_X, 1 );
STATE0_Y = casti_m128i( state_mid_Y, 0 );
STATE1_Y = casti_m128i( state_mid_Y, 1 );
STATE0_X = casti_v128u32( state_mid_X, 0 );
STATE1_X = casti_v128u32( state_mid_X, 1 );
STATE0_Y = casti_v128u32( state_mid_Y, 0 );
STATE1_Y = casti_v128u32( state_mid_Y, 1 );
// Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
TMSG0_X = casti_m128i( msg_X, 0 );
TMSG0_Y = casti_m128i( msg_Y, 0 );
TMSG0_X = casti_v128u32( msg_X, 0 );
TMSG0_Y = casti_v128u32( msg_Y, 0 );
TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );
// Rounds 4 to 7
TMSG1_X = casti_m128i( msg_X, 1 );
TMSG1_Y = casti_m128i( msg_Y, 1 );
TMSG1_X = casti_v128u32( msg_X, 1 );
TMSG1_Y = casti_v128u32( msg_Y, 1 );
TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL );
MSG_X = _mm_add_epi32( TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X );
@@ -616,8 +638,8 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_X );
// Rounds 12 to 15
TMSG3_X = casti_m128i( msg_X, 3 );
TMSG3_Y = casti_m128i( msg_Y, 3 );
TMSG3_X = casti_v128u32( msg_X, 3 );
TMSG3_Y = casti_v128u32( msg_Y, 3 );
TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL );
MSG_X = _mm_add_epi32( TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X );
@@ -845,20 +867,20 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
// Add saved state to new state
STATE0_X = _mm_add_epi32( STATE0_X, casti_m128i( state_save_X, 0 ) );
STATE1_X = _mm_add_epi32( STATE1_X, casti_m128i( state_save_X, 1 ) );
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_m128i( state_save_Y, 0 ) );
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_m128i( state_save_Y, 1 ) );
STATE0_X = _mm_add_epi32( STATE0_X, casti_v128u32( state_save_X, 0 ) );
STATE1_X = _mm_add_epi32( STATE1_X, casti_v128u32( state_save_X, 1 ) );
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_v128u32( state_save_Y, 0 ) );
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_v128u32( state_save_Y, 1 ) );
// Unshuffle & save state
TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); // FEBA
TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B );
STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); // DCHG
STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 );
casti_m128i( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
casti_m128i( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
casti_m128i( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
casti_m128i( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
casti_v128u32( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
casti_v128u32( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
casti_v128u32( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
casti_v128u32( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
}
#endif // SHA
@@ -887,14 +909,14 @@ static const uint32_t K256[64] =
#define sha256_neon_rounds( state_out, input, state_in ) \
{ \
uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; \
uint32x4_t STATE0, STATE1, ABCD_SAVE, EFGH_SAVE; \
uint32x4_t MSG0, MSG1, MSG2, MSG3; \
uint32x4_t TMP0, TMP1, TMP2; \
\
STATE0 = vld1q_u32( state_in ); \
STATE1 = vld1q_u32( state_in+4 ); \
ABEF_SAVE = STATE0; \
CDGH_SAVE = STATE1; \
ABCD_SAVE = STATE0; \
EFGH_SAVE = STATE1; \
\
MSG0 = load_msg( input, 0 ); \
MSG1 = load_msg( input, 1 ); \
@@ -1004,8 +1026,8 @@ static const uint32_t K256[64] =
TMP2 = STATE0; \
STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \
STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \
STATE0 = vaddq_u32( STATE0, ABEF_SAVE ); \
STATE1 = vaddq_u32( STATE1, CDGH_SAVE ); \
STATE0 = vaddq_u32( STATE0, ABCD_SAVE ); \
STATE1 = vaddq_u32( STATE1, EFGH_SAVE ); \
vst1q_u32( state_out , STATE0 ); \
vst1q_u32( state_out+4, STATE1 ); \
}
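
The renamed save registers reflect how the two ISAs pack state: x86 SHA-NI works on { ABEF, CDGH } halves, while NEON's vsha256 intrinsics keep the natural { ABCD, EFGH } split, so ABCD_SAVE/EFGH_SAVE is the accurate name here. Minimal illustration (assumes <arm_neon.h>):

#include <arm_neon.h>
// NEON SHA-256 state halves load in natural word order.
static inline void load_state( const uint32_t *state,
                               uint32x4_t *abcd, uint32x4_t *efgh )
{
   *abcd = vld1q_u32( state );     // A, B, C, D
   *efgh = vld1q_u32( state + 4 ); // E, F, G, H
}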
@@ -1029,8 +1051,8 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
#define sha256_neon_x2sha_rounds( state_out_X, state_out_Y, input_X, \
input_Y, state_in_X, state_in_Y ) \
{ \
uint32x4_t STATE0_X, STATE1_X, ABEF_SAVE_X, CDGH_SAVE_X; \
uint32x4_t STATE0_Y, STATE1_Y, ABEF_SAVE_Y, CDGH_SAVE_Y; \
uint32x4_t STATE0_X, STATE1_X, ABCD_SAVE_X, EFGH_SAVE_X; \
uint32x4_t STATE0_Y, STATE1_Y, ABCD_SAVE_Y, EFGH_SAVE_Y; \
uint32x4_t MSG0_X, MSG1_X, MSG2_X, MSG3_X; \
uint32x4_t MSG0_Y, MSG1_Y, MSG2_Y, MSG3_Y; \
uint32x4_t TMP0_X, TMP1_X, TMP2_X; \
@@ -1040,10 +1062,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
STATE0_Y = vld1q_u32( state_in_Y ); \
STATE1_X = vld1q_u32( state_in_X+4 ); \
STATE1_Y = vld1q_u32( state_in_Y+4 ); \
ABEF_SAVE_X = STATE0_X; \
ABEF_SAVE_Y = STATE0_Y; \
CDGH_SAVE_X = STATE1_X; \
CDGH_SAVE_Y = STATE1_Y; \
ABCD_SAVE_X = STATE0_X; \
ABCD_SAVE_Y = STATE0_Y; \
EFGH_SAVE_X = STATE1_X; \
EFGH_SAVE_Y = STATE1_Y; \
\
MSG0_X = load_msg( input_X, 0 ); \
MSG0_Y = load_msg( input_Y, 0 ); \
@@ -1245,10 +1267,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \
STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \
STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \
STATE0_X = vaddq_u32( STATE0_X, ABEF_SAVE_X ); \
STATE0_Y = vaddq_u32( STATE0_Y, ABEF_SAVE_Y ); \
STATE1_X = vaddq_u32( STATE1_X, CDGH_SAVE_X ); \
STATE1_Y = vaddq_u32( STATE1_Y, CDGH_SAVE_Y ); \
STATE0_X = vaddq_u32( STATE0_X, ABCD_SAVE_X ); \
STATE0_Y = vaddq_u32( STATE0_Y, ABCD_SAVE_Y ); \
STATE1_X = vaddq_u32( STATE1_X, EFGH_SAVE_X ); \
STATE1_Y = vaddq_u32( STATE1_Y, EFGH_SAVE_Y ); \
vst1q_u32( state_out_X , STATE0_X ); \
vst1q_u32( state_out_Y , STATE0_Y ); \
vst1q_u32( state_out_X+4, STATE1_X ); \

View File

@@ -113,7 +113,7 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// SHA-256 16 way x86_64

View File

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA256D_16WAY 1
#elif defined(__SHA__)
#define SHA256D_SHA 1

View File

@@ -8,14 +8,14 @@ void sha256d( void *hash, const void *data, int len )
}
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_SHA)
gate->optimizations = SHA_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256d_sha;
#elif defined(SHA256D_NEON_SHA2)
gate->optimizations = SHA_OPT;
gate->optimizations = NEON_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256d_neon_sha2;
#elif defined(SHA256D_8WAY)
gate->scanhash = (void*)&scanhash_sha256d_8way;

View File

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA256D_16WAY 1
#elif defined(__SHA__)
#define SHA256D_SHA 1

View File

@@ -6,7 +6,7 @@
#include "sha256-hash.h"
#include "sph_sha2.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA256DT_16X32 1
#elif defined(__x86_64__) && defined(__SHA__)
#define SHA256DT_X86_SHA256 1
@@ -500,10 +500,10 @@ bool register_sha256dt_algo( algo_gate_t* gate )
#if defined(SHA256DT_16X32)
gate->scanhash = (void*)&scanhash_sha256dt_16x32;
#elif defined(SHA256DT_X86_SHA256)
gate->optimizations = SHA_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_x86_x2sha;
#elif defined(SHA256DT_NEON_SHA256)
gate->optimizations = SHA_OPT;
gate->optimizations = NEON_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_neon_x2sha;
#elif defined(SHA256DT_8X32)
gate->scanhash = (void*)&scanhash_sha256dt_8x32;

View File

@@ -6,9 +6,10 @@ bool register_sha256t_algo( algo_gate_t* gate )
#if defined(SHA256T_16WAY)
gate->scanhash = (void*)&scanhash_sha256t_16way;
#elif defined(SHA256T_SHA)
gate->optimizations = SHA_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256t_sha;
#elif defined(SHA256T_NEON_SHA2)
gate->optimizations = NEON_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256t_neon_sha2;
#elif defined(SHA256T_8WAY)
gate->scanhash = (void*)&scanhash_sha256t_8way;
@@ -28,7 +29,7 @@ bool register_sha256q_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_sha256q_16way;
gate->hash = (void*)&sha256q_16way_hash;
//#elif defined(SHA256T_SHA)
// gate->optimizations = SHA_OPT;
// gate->optimizations = SHA256_OPT;
// gate->scanhash = (void*)&scanhash_sha256q;
// gate->hash = (void*)&sha256q_hash;
#elif defined(SHA256T_8WAY)

View File

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA256T_16WAY 1
#elif defined(__SHA__)
#define SHA256T_SHA 1

View File

@@ -71,31 +71,13 @@ static const uint64_t K512[80] =
// SHA-512 implemented using SHA512 CPU extension.
// Experimental. Not tested. Not reviewed. Compile tested only.
// Experimental. Not supported. Not tested. Not reviewed. Compile tested only.
// Modelled after noloader sha256 implementation, replacing 4x32 bit
// instructions with equivalent 4x64 bit instructions and increasing rounds
// to 80.
// Needs GCC-13 for compilation.
// Needs Intel Lunar lake or Arrow Lake CPU, or AMD Zen-{5,6}? for execution.
// Modelled after noloader sha256 implementation.
// It's not clear how SHA512 will be supported before AVX10 considering how
// dependent it is on _mm256_alignr_epi64, which is only available with AVX512VL
// until AVX10-256.
#if defined(__AVX512VL__)
#define mm256_alignr_1x64( v1, v0 ) _mm256_alignr_epi64( v1, v0, 1 )
#else
// Ugly workaround to make it work with AVX2
static const __m256i mask __attribute__ ((aligned (32)))
= { 0xffffffffffffffffull, 0ull, 0ull, 0ull };
#define mm256_alignr_1x64( v1, v0 ) \
_mm256_or_si256( _mm256_and_si256( mm256_shuflr_64( v1 ), mask ), \
_mm256_and_si256( mm256_shuflr_64( v0 ), mm256_not(mask) ) );
#endif
// Needs GCC-14 for compilation.
// Needs Intel Lunarlake or Arrowlake CPU, or AMD Zen-6? for execution.
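
The mm256_alignr64( v1, v0, 1 ) helper used below concatenates the two 4x64-bit vectors and shifts right one lane, yielding { v0[1], v0[2], v0[3], v1[0] }: four consecutive schedule words spanning two registers. A sketch with an AVX2 fallback (an assumption; the removed workaround above achieved the same lanes with a mask-and-or):

#include <immintrin.h>
static inline __m256i alignr64_1( __m256i v1, __m256i v0 )
{
#if defined(__AVX512VL__)
   return _mm256_alignr_epi64( v1, v0, 1 );
#else
   // { v[1], v[2], v[3], v[0] } for each source, then blend v1[0] into lane 3.
   __m256i r0 = _mm256_permute4x64_epi64( v0, 0x39 );
   __m256i r1 = _mm256_permute4x64_epi64( v1, 0x39 );
   return _mm256_blend_epi32( r0, r1, 0xC0 );
#endif
}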
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in )
@@ -109,7 +91,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f,
0x0001020304050607 ) )
0x0001020304050607 ) );
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
@@ -123,153 +105,233 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128 (MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_shuffle2_64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi32( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 16-19
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 20-23
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 24-27
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi32( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 28-31
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG) ;
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 32-35
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 36-39
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 40-43
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 44-47
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 48-51
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 52-55
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 56-59
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 ) ;
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 60-63
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
STATE1 = _mm256_sha512nds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 64-67
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 16 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 68-71
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 17 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
// Rounds 72-75
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 18 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
// Rounds 76-79
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 19 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
// Add initial state
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
@@ -289,7 +351,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
const uint64_t *state_in )
{
__m256i STATE0, STATE1;
__m256i MSG, TMP, BSWAP64;
__m256i MSG, TMP;
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
__m256i ABEF_SAVE, CDGH_SAVE;
@@ -308,141 +370,190 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
// Rounds 0-3
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_shuffle2_64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi32( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 16-19
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 20-23
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 24-27
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi32( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 28-31
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG) ;
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 32-35
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 36-39
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 40-43
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 44-47
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 48-51
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 52-55
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 56-59
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 ) ;
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 60-63
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
STATE1 = _mm256_sha512nds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 64-67
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 16 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 68-71
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 17 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
// Rounds 72-75
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 18 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
// Rounds 76-79
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 19 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
// Add initial state
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
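The quad-round groups above all share one shape; only the message registers rotate, and the expansion steps drop off in the final groups. A condensed sketch as a hypothetical macro (SHA512_4ROUNDS is illustrative, not a name from this source; W is the newest schedule block, Wn the expansion target, Wp the previous block, j the K512 index):
// Hypothetical condensation of the repeating 4-round group above.
#define SHA512_4ROUNDS( W, Wn, Wp, j ) do { \
    MSG = _mm256_add_epi64( W, casti_m256i( K512, j ) ); \
    STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); \
    TMP = mm256_alignr64( W, Wp, 1 ); \
    Wn = _mm256_add_epi64( Wn, TMP ); \
    Wn = _mm256_sha512msg2_epi64( Wn, W ); \
    MSG = _mm256_permute4x64_epi64( MSG, 0x0E ); \
    STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) ); \
    Wp = _mm256_sha512msg1_epi64( Wp, _mm256_castsi256_si128( W ) ); \
} while (0)
// Rounds 44-47 above would then read SHA512_4ROUNDS( TMSG3, TMSG0, TMSG2, 11 ).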
@@ -461,8 +572,22 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
#endif
/*
#if defined(__ARM_FEATURE_NEON) && defined(__ARM_FEATURE_SHA512)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
uint64x2_t sha512_compile_test( uint64x2_t test )
{
test = vsha512hq_u64( test, test, test );
test = vsha512h2q_u64( test, test, test );
test = vsha512su0q_u64( test, test );
test = vsha512su1q_u64( test, test, test );
return test;
}
#endif
*/
#if defined(SIMD512)
// SHA-512 8 way 64 bit
@@ -664,8 +789,7 @@ void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data,
mm256_ror_64( x, 61 ), \
_mm256_srli_epi64( x, 6 ) )
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#if defined(VL256)
// 4 way is not used with AVX512 but will be with AVX10_256 when it
// becomes available.
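A plausible definition of VL256 consistent with its use throughout this diff, assuming it lives in simd-utils.h (an assumption; the diff only shows the consumers):
// Hypothetical sketch: 256-bit EVEX encodings are available with AVX512VL
// today, and are expected on AVX10/256 targets later.
#if defined(__AVX512VL__) || defined(__AVX10_1__)
#define VL256 1
#endif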
@@ -717,7 +841,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] )
int i;
register __m256i A, B, C, D, E, F, G, H;
#if !defined(__AVX512VL__)
#if !defined(VL256)
// Disable for AVX10_256
__m256i X_xor_Y, Y_xor_Z;
#endif
@@ -754,7 +878,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] )
H = v256_64( 0x5BE0CD19137E2179 );
}
#if !defined(__AVX512VL__)
#if !defined(VL256)
// Disable for AVX10_256
Y_xor_Z = _mm256_xor_si256( B, C );
#endif

View File

@@ -25,7 +25,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// SHA-512 8 way

View File

@@ -4,7 +4,7 @@
#include <string.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA512256D_8WAY 1
#elif defined(__AVX2__)
#define SHA512256D_4WAY 1

View File

@@ -34,7 +34,7 @@
#include <string.h>
#include "shabal-hash-4way.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define DECL_STATE16 \
__m512i A0, A1, A2, A3, A4, A5, A6, A7, \
@@ -300,11 +300,12 @@ static inline __m512i v512_mult_x5( const __m512i x )
#define PERM_ELT16( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm512_xor3( xm, xb1, mm512_xorandnot( \
v512_mult_x3( mm512_xor3( xa0, xc, \
v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, mm512_rol_32( xb0, 1 ) ); \
xa0 = mm512_xor3( xa0, xc, \
v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ); \
xb0 = mm512_rol_32( xb0, 1 ); \
xa0 = mm512_xor3( xm, xb1, \
mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, xb0 ); \
} while (0)
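Both the 16-lane form above and the 8-lane form below compute the same Shabal permutation element, restructured so the multiply-by-3 input is built before the final xor chain. For reference, the scalar equivalent in the style of sph_shabal (uint32_t operands, ROTL32 the usual rotate; name is illustrative):
#define PERM_ELT_SCALAR( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) do { \
    xa0 = ( ( xa0 ^ ( ROTL32( xa1, 15 ) * 5u ) ^ xc ) * 3u ) \
          ^ xb1 ^ ( xb2 & ~xb3 ) ^ xm; \
    xb0 = ~( ROTL32( xb0, 1 ) ^ xa0 ); \
} while (0)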
#define PERM_STEP_0_16 do { \
@@ -905,11 +906,12 @@ static inline __m256i v256_mult_x5( const __m256i x )
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
v256_mult_x3( mm256_xor3( xa0, xc, \
v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ) ), \
xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
xa0 = mm256_xor3( xa0, xc, \
v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ); \
xb0 = mm256_rol_32( xb0, 1 ); \
xa0 = mm256_xor3( xm, xb1, \
mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, xb0 ); \
} while (0)
#define PERM_STEP_0_8 do { \

View File

@@ -8,7 +8,7 @@
#define SPH_SIZE_shabal512 512
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__m512i buf[16];

View File

@@ -30,8 +30,7 @@ static const uint32_t IV512[] =
#endif
#if defined (__AVX512VL__)
//TODO Enable for AVX10_256
#if defined (VL256)
#define DECL_m256i_count \
const __m256i count = \

View File

@@ -1,7 +1,7 @@
#include "shavite-hash-4way.h"
#include <stdint.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
static const uint32_t IV512[] =
{

View File

@@ -1,10 +1,10 @@
#ifndef SHAVITE_HASH_4WAY_H__
#define SHAVITE_HASH_4WAY_H__ 1
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#include "simd-utils.h"
#if defined(__VAES__) && defined(SIMD512)
typedef struct {
unsigned char buf[128<<2];
uint32_t h[16<<2];

View File

@@ -1,472 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "nist.h"
#include "simd_iv.h"
/* #define NO_PRECOMPUTED_IV */
#if defined(__SSE2__) // || defined(__ARM_NEON)
/*
* Increase the counter.
*/
void IncreaseCounter(hashState_sd *state, DataLength databitlen) {
#ifdef HAS_64
state->count += databitlen;
#else
uint32_t old_count = state->count_low;
state->count_low += databitlen;
if (state->count_low < old_count)
state->count_high++;
#endif
}
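On targets without a 64-bit type, the carry into the high word is detected by unsigned wraparound. A minimal standalone illustration of the idiom (hypothetical helper, not part of the source):
#include <stdint.h>
static void add_bits( uint32_t *lo, uint32_t *hi, uint32_t nbits )
{
    uint32_t old = *lo;
    *lo += nbits;               // may wrap modulo 2^32
    if ( *lo < old ) (*hi)++;   // wraparound implies a carry
}
// e.g. lo = 0xFFFFFF80, nbits = 0x100 leaves lo = 0x80 and increments hi.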
/*
* Initialize the hashState_sd with a given IV.
* If the IV is NULL, initialize with zeros.
*/
int InitIV(hashState_sd *state, int hashbitlen, const u32 *IV) {
int n = 8;
state->hashbitlen = hashbitlen;
state->n_feistels = n;
state->blocksize = 128*8;
#ifdef HAS_64
state->count = 0;
#else
state->count_low = 0;
state->count_high = 0;
#endif
// state->buffer = malloc(16*n + 16);
/*
* Align the buffer to a 128 bit boundary.
*/
// state->buffer += ((unsigned char*)NULL - state->buffer)&15;
// state->A = malloc((4*n+4)*sizeof(u32));
/*
* Align the buffer to a 128 bit boundary.
*/
// state->A += ((u32*)NULL - state->A)&3;
state->B = state->A+n;
state->C = state->B+n;
state->D = state->C+n;
if (IV)
memcpy(state->A, IV, 4*n*sizeof(u32));
else
memset(state->A, 0, 4*n*sizeof(u32));
// free(state->buffer);
// free(state->A);
return 0;
}
/*
* Initialize the hashState_sd.
*/
int init_sd(hashState_sd *state, int hashbitlen) {
int r;
char *init;
#ifndef NO_PRECOMPUTED_IV
// if (hashbitlen == 224)
// r=InitIV(state, hashbitlen, IV_224);
// else if (hashbitlen == 256)
// r=InitIV(state, hashbitlen, IV_256);
// else if (hashbitlen == 384)
// r=InitIV(state, hashbitlen, IV_384);
// else
if (hashbitlen == 512)
r = InitIV(state, hashbitlen, IV_512);
else
#endif
{
/*
* Nonstandard length: IV is not precomputed.
*/
r=InitIV(state, hashbitlen, NULL);
if (r != 0)
return r;
init = malloc(state->blocksize);
memset(init, 0, state->blocksize);
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
snprintf(init, state->blocksize, "SIMD-%i v1.1", hashbitlen);
#else
sprintf(init, "SIMD-%i v1.1", hashbitlen);
#endif
SIMD_Compress(state, (unsigned char*) init, 0);
free(init);
}
return r;
}
int update_sd( hashState_sd *state, const BitSequence *data,
DataLength databitlen )
{
unsigned current;
unsigned int bs = state->blocksize;
static int align = -1;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painful to implement and not required by the NIST API.
return 1;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
return 0;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
return 0;
}
int final_sd( hashState_sd *state, BitSequence *hashval )
{
#ifdef HAS_64
uint64_t l;
int current = state->count & (state->blocksize - 1);
#else
uint32_t l;
int current = state->count_low & (state->blocksize - 1);
#endif
unsigned int i;
BitSequence bs[64];
int isshort = 1;
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
#ifdef HAS_64
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
#else
l = state->count_low;
for ( i=0; i<4; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
l = state->count_high;
for ( i=0; i<4; i++ )
{
state->buffer[4+i] = l & 0xff;
l >>= 8;
}
if ( state->count_high == 0 && state->count_low < 16384 )
isshort = 2;
#endif
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
bs[4*i ] = x&0xff;
x >>= 8;
bs[4*i+1] = x&0xff;
x >>= 8;
bs[4*i+2] = x&0xff;
x >>= 8;
bs[4*i+3] = x&0xff;
}
memcpy( hashval, bs, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8] = bs[state->hashbitlen/8] & mask;
}
return 0;
}
int update_final_sd( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
int current, i;
unsigned int bs = state->blocksize;
static int align = -1;
BitSequence out[64];
int isshort = 1;
uint64_t l;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painful to implement and not required by the NIST API.
return 1;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
break;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
current = state->count & (state->blocksize - 1);
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
out[4*i ] = x & 0xff;
x >>= 8;
out[4*i+1] = x & 0xff;
x >>= 8;
out[4*i+2] = x & 0xff;
x >>= 8;
out[4*i+3] = x & 0xff;
}
memcpy( hashval, out, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8] = out[state->hashbitlen/8] & mask;
}
return 0;
}
int simd_full( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
InitIV( state, 512, IV_512 );
int current, i;
unsigned int bs = state->blocksize;
static int align = -1;
BitSequence out[64];
int isshort = 1;
uint64_t l;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painful to implement and not required by the NIST API.
return 1;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
break;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
current = state->count & (state->blocksize - 1);
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
out[4*i ] = x & 0xff;
x >>= 8;
out[4*i+1] = x & 0xff;
x >>= 8;
out[4*i+2] = x & 0xff;
x >>= 8;
out[4*i+3] = x & 0xff;
}
memcpy( hashval, out, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8] = out[state->hashbitlen/8] & mask;
}
return 0;
}
#endif

View File

@@ -1,64 +0,0 @@
#ifndef __NIST_H__
#define __NIST_H__
/*define data alignment for different C compilers*/
#if defined(__GNUC__)
#define DATA_ALIGN(x) x __attribute__((aligned(16)))
#else
#define DATA_ALIGN(x) __declspec(align(16)) x
#endif
#include "simd-compat.h"
#include "compat/sha3-defs.h"
/*
* NIST API Specific types.
*/
typedef struct {
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
#ifdef HAS_64
uint64_t count;
#else
uint32_t count_low;
uint32_t count_high;
#endif
DATA_ALIGN(uint32_t A[32]);
uint32_t *B;
uint32_t *C;
uint32_t *D;
DATA_ALIGN(unsigned char buffer[128]);
} hashState_sd;
/*
* NIST API
*/
int init_sd(hashState_sd *state, int hashbitlen);
int update_sd(hashState_sd *state, const BitSequence *data, DataLength databitlen);
int final_sd(hashState_sd *state, BitSequence *hashval);
int update_final_sd( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
int simd_full( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
/*
* Internal API
*/
//int SupportedLength(int hashbitlen);
int RequiredAlignment(void);
void SIMD_Compress(hashState_sd * state, const unsigned char *M, int final);
void fft128_natural(fft_t *a, unsigned char *x);
void fft256_natural(fft_t *a, unsigned char *x);
#endif
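The removed API followed the usual NIST init/update/final sequence; a minimal usage sketch (lengths are in bits per the declarations above; error handling elided, function name is illustrative):
// Hypothetical caller of the removed API, shown for reference only.
void simd512_example( const BitSequence *msg, DataLength msgbits,
                      BitSequence digest[64] )
{
    hashState_sd state;
    init_sd( &state, 512 );              // 512-bit output, precomputed IV
    update_sd( &state, msg, msgbits );   // byte-aligned lengths keep the buffer valid
    final_sd( &state, digest );          // pads, appends length, extracts hash
}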

View File

@@ -1,198 +0,0 @@
#ifndef __SIMD_COMPAT_H__
#define __SIMD_COMPAT_H__
#include <limits.h>
/*
* This file defines some helper functions for cross-platform compatibility.
*/
#if defined __GNUC_PREREQ && (! defined __STRICT_ANSI__)
#define GNU_EXT
#endif
/*
* First define some integer types.
*/
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
/*
* On C99 implementations, we can use <stdint.h> to get an exact 32-bit
* type, if any, or otherwise use a wider type.
*/
#include <stdint.h>
#include "compat/brg_types.h"
#define C32(x) ((u32)(x))
#define HAS_64 1
#else
/*
* On non-C99 systems, we use "unsigned int" if it is wide enough,
* "unsigned long" otherwise. This supports all "reasonable" architectures.
* We have to be cautious: pre-C99 preprocessors handle constants
* differently in '#if' expressions. Hence the shifts to test UINT_MAX.
*/
#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
typedef unsigned int u32;
#define C32(x) ((u32)(x ## U))
#else
typedef unsigned long u32;
#define C32(x) ((u32)(x ## UL))
#endif
/*
* We want a 64-bit type. We use "unsigned long" if it is wide enough (as
* is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
* "unsigned long long" otherwise, if available. We use ULLONG_MAX to
* test whether "unsigned long long" is available; we also know that
* gcc features this type, even if the libc headers do not know it.
*/
#if ((ULONG_MAX >> 31) >> 31) >= 3
typedef unsigned long u64;
#define HAS_64 1
#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__
typedef unsigned long long u64;
#define HAS_64 1
#else
/*
* No 64-bit type...
*/
#endif
#endif
/*
* fft_t should be at least 16 bits wide.
* using short int will require less memory, but int is faster...
*/
typedef int fft_t;
/*
* Implementation note: some processors have specific opcodes to perform
* a rotation. Recent versions of gcc recognize the expression below and
* use the relevant opcodes, when appropriate.
*/
#define T32(x) ((x) & C32(0xFFFFFFFF))
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
#define ROTR32(x, n) ROTL32(x, (32 - (n)))
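As a quick sanity check of the rotate idiom (and the constant folding compilers apply to it):
// ROTL32(0x80000001u, 1) == 0x00000003 and ROTR32(0x00000003u, 1) == 0x80000001u;
// gcc and clang typically emit a single rotate instruction for this pattern.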
/*
* The macro MAYBE_INLINE expands to an inline qualifier, if available.
*/
#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined GNU_EXT
#define MAYBE_INLINE static inline
#elif defined _MSC_VER
#define MAYBE_INLINE __inline
#else
#define MAYBE_INLINE
#endif
/* */
#if defined __GNUC__ && ( defined __i386__ || defined __x86_64__ )
#define rdtsc() \
({ \
u32 lo, hi; \
__asm__ __volatile__ ( /* serialize */ \
"xorl %%eax,%%eax \n cpuid" \
::: "%rax", "%rbx", "%rcx", "%rdx"); \
/* We cannot use "=A", since this would use %rax on x86_64 */ \
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); \
(u64)hi << 32 | lo; \
})
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
#define rdtsc __rdtsc
#endif
/*
* The IS_ALIGNED macro tests if a char* pointer is aligned to an
* n-byte boundary.
* It is defined as false on unknown architectures.
*/
#define CHECK_ALIGNED(p,n) ((((unsigned char *) (p) - (unsigned char *) NULL) & ((n)-1)) == 0)
#if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
/*
* Unaligned 32-bit accesses are not expensive on x86 so we don't care
*/
#define IS_ALIGNED(p,n) (n<=4 || CHECK_ALIGNED(p,n))
#elif defined __sparcv9 || defined __sparc || defined __arm || \
defined __ia64 || defined __ia64__ || \
defined __itanium__ || defined __M_IA64 || \
defined __powerpc__ || defined __powerpc
#define IS_ALIGNED(p,n) CHECK_ALIGNED(p,n)
#else
/*
* Unknown architecture: play safe
*/
#define IS_ALIGNED(p,n) 0
#endif
/* checks for endianness */
#if defined (__linux__) || defined (__GLIBC__)
# include <endian.h>
#elif defined (__FreeBSD__)
# include <machine/endian.h>
#elif defined (__OpenBSD__)
# include <sys/endian.h>
#endif
#ifdef __BYTE_ORDER
# if __BYTE_ORDER == __LITTLE_ENDIAN
# define SIMD_LITTLE_ENDIAN
# elif __BYTE_ORDER == __BIG_ENDIAN
# define SIMD_BIG_ENDIAN
# endif
#else
# if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
# define SIMD_LITTLE_ENDIAN
# endif
#endif
#endif

Some files were not shown because too many files have changed in this diff.