Mirror of https://github.com/JayDDee/cpuminer-opt.git (synced 2025-09-17 23:44:27 +00:00)

Compare commits: 9 commits
Commits:

- 1ed18bf22e
- 1d9341ee92
- a45a333b40
- 2b1037a7c7
- 06624a0ff2
- 8e91bfbe19
- 47e24b50e8
- c47c4a8885
- 042d13d1e1
Makefile.am (68 changed lines)
```diff
@@ -1,19 +1,39 @@
-if WANT_JANSSON
-JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson
-else
-JANSSON_INCLUDES=
-endif
-
-EXTRA_DIST = example-cfg.json nomacro.pl
-
-SUBDIRS = compat
-
-ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) -I.
-
-bin_PROGRAMS = cpuminer
-
-dist_man_MANS = cpuminer.1
+if HAVE_APPLE
+# MacOS uses Homebrew to install needed packages but they aren't linked for
+# the jansson test in configure. Ignore the failed test & link them now,
+# different path for different CPU arch.
+if ARCH_ARM64
+EXTRA_INCLUDES = -I/opt/homebrew/include
+EXTRA_LIBS = -L/opt/homebrew/lib
+else
+EXTRA_INCLUDES = -I/usr/local/include
+EXTRA_LIBS = -L/usr/local/lib
+endif
+else
+SUBDIRS = compat
+if WANT_JANSSON
+# Can't find jansson libraries, compile the included source code.
+EXTRA_INCLUDES = -I$(top_srcdir)/compat/jansson
+EXTRA_LIBS = -L$(top_srcdir)/compat/jansson
+else
+EXTRA_INCLUDES =
+EXTRA_LIBS =
+endif
+endif
+
+EXTRA_DIST = example-cfg.json nomacro.pl
+
+dist_man_MANS = cpuminer.1
+
+ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(EXTRA_INCLUDES) -I.
+
+bin_PROGRAMS = cpuminer
 
 cpuminer_SOURCES = \
   dummy.cpp \
@@ -166,9 +186,6 @@ cpuminer_SOURCES = \
 algo/shavite/sph-shavite-aesni.c \
 algo/shavite/shavite-hash-2way.c \
 algo/shavite/shavite-hash-4way.c \
 algo/shavite/shavite.c \
-algo/simd/nist.c \
-algo/simd/vector.c \
-algo/simd/sph_simd.c \
 algo/simd/simd-hash-2way.c \
 algo/skein/sph_skein.c \
@@ -275,29 +292,29 @@ cpuminer_SOURCES = \
 algo/yespower/yespower-opt.c \
 algo/yespower/yespower-ref.c \
 algo/yespower/yespower-blake2b-ref.c
 
-disable_flags =
-
-if USE_ASM
-cpuminer_SOURCES += asm/neoscrypt_asm.S
-else
-disable_flags += -DNOASM
-endif
 
 if HAVE_WINDOWS
 cpuminer_SOURCES += compat/winansi.c
 endif
 
-cpuminer_LDFLAGS = @LDFLAGS@
-cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
-cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
-cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
+if USE_ASM
+disable_flags =
+cpuminer_SOURCES += asm/neoscrypt_asm.S
+else
+disable_flags += -DNOASM
+endif
 
+if HAVE_WINDOWS
+cpuminer_CFLAGS += -Wl,--stack,10485760
+cpuminer_LDFLAGS = @LDFLAGS@
+cpuminer_LDADD = $(EXTRA_LIBS) @LIBCURL@ -ljansson @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
+cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
+cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
+
+if ARCH_ARM64
+cpuminer_CFLAGS += -flax-vector-conversions
+endif
 
 if HAVE_WINDOWS
 
 # use to profile an object
 # gprof_cflags = -pg -g3
 # cpuminer_LDFLAGS += -pg
@@ -311,5 +328,4 @@ cpuminer-neoscrypt.o: neoscrypt.c
 @echo "CUSTOM ${@}: ${filter %.o,${^}} ${filter %.c,${^}}"
 $(CC) $(common_ccflags) -g -O3 $(gprof_cflags) -MT $@ -MD -MP -c -o $@ $<
-
 
 endif
```
README.md (26 changed lines)
```diff
@@ -36,34 +36,18 @@ for compile instructions.
 Requirements
 ------------
 
-1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
-Intel Core2 and newer and AMD equivalents. Further optimizations are available
-on some algoritms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
-
-32 bit CPUs are not supported.
-Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
-are not supported.
+1. A 64 bit CPU supporting x86_64 (Intel or AMD) or aarch64 (ARM).
+x86_64 requires SSE2, aarch64 requires armv8 & NEON.
 
 Mobile CPUs like laptop computers are not recommended because they aren't
 designed for extreme heat of operating at full load for extended periods of
 time.
 
-Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.
-
-2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
-including Mint and Centos, are known to work and have all dependencies
-in their repositories. Others may work but may require more effort. Older
-versions such as Centos 6 don't work due to missing features.
-
-Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
-binaries. WindowsXP 64 bit is YMMV.
-
-FreeBSD is not actively tested but should work, YMMV.
-MacOS, OSx and Android are not supported.
+2. 64 bit operating system including Linux, Windows, MacOS, or BSD.
+Android, IOS and alt OSs like Haiku & ReactOS are not supported.
 
 3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
-RPC getwork using http:// or https://.
-GBT is YMMV.
+RPC getblocktemplate using http:// or https://.
 
 Supported Algorithms
 --------------------
```
```diff
@@ -75,6 +75,69 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v25.2
+
+ARM: Fixed regression from v25.1 that could cause build fail.
+BSD: FreeBSD is now supported. Other BSDs may also work.
+MacOS: build with installed jansson library instead of compiling the included source code.
+Windows: remove "_WIN32_WINNT=0x0601" which is a downgrade on Win11.
+Changed build.sh shell from bash to sh.
+
+v25.1
+
+MacOS ARM64: m7m algo is now working.
+MacOS ARM64: can now be compiled with GCC.
+MacOS x86_64: is now working compiled with GCC.
+Fixed some minor bugs & removed some obsolete code.
+
+v24.8
+
+ARM: Apple MacOS on M series CPU is now supported compiled from source
+code, see Wiki for details.
+ARM: Fix incorrect compiler version display when using clang.
+build.sh can now be used to compile all targets, arm_build.sh & build_msys2.sh
+have been removed.
+Windows: MSys2 build now enables CPU groups by default, prebuilt binaries
+continue to be compiled with CPU groups disabled.
+
+v24.7
+
+ARM: compile works for Windows using MSys2 & MingW, see wiki for details.
+
+v24.6
+
+ARM: Fixed scryptn2, x16*, broken in v24.2.
+ARM: Small improvement to interleaving.
+Eliminated some potential compile errors in code that was dependent on
+compiler optimisations.
+x86_64: improved support for AVX10 compilation, needs GCC-14 or higher.
+
+v24.5
+
+Fix MinGW compile error after MSys2 upgrade to GCC-14.2.
+#427: GBT: Improved handling of new work.
+Removed shavite3 algo.
+
+v24.4
+
+x86_64: fixed a bug in ornot macro for AVX2 which broke some algos in v24.2.
+x86_64: fixed a bug in alignr macros for SSE2.
+ARM: CPU feature reporting enhancements.
+Some code cleanup.
+
+v24.3
+
+ARM: CPU feature detection and reporting is now working.
+ARM: Verthash is now working.
+ARM: Small speedup for yescrypt, yespower & argon2d.
+Code cleanup.
+
+v24.2
+
+x86_64: Fixed blake2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4.
+x86_64: Initial support for CPUs with AVX10, needs GCC-14.
+ARM NEON: Various code optimisations.
+
 v24.1
 
 #414: fix bug in merkle error handling.
```
```diff
@@ -184,7 +184,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 //int scanhash_8way_64_64( struct work *work, uint32_t max_nonce,
 //                         uint64_t *hashes_done, struct thr_info *mythr )
@@ -263,8 +263,8 @@ static void init_algo_gate( algo_gate_t* gate )
 gate->build_block_header = (void*)&std_build_block_header;
 gate->build_extraheader = (void*)&std_build_extraheader;
 gate->set_work_data_endian = (void*)&do_nothing;
-gate->resync_threads = (void*)&do_nothing;
-gate->do_this_thread = (void*)&return_true;
+// gate->resync_threads = (void*)&do_nothing;
+// gate->do_this_thread = (void*)&return_true;
 gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
 gate->get_work_data_size = (void*)&std_get_work_data_size;
 gate->optimizations = EMPTY_SET;
@@ -340,7 +340,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
 case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
 case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
 case ALGO_SHA512256D: rc = register_sha512256d_algo ( gate ); break;
-case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
 case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
 case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
 case ALGO_SKUNK: rc = register_skunk_algo ( gate ); break;
```
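The recurring edit throughout this compare is mechanical: every long AVX-512 feature test collapses to a single `SIMD512` test. The macro's definition is not part of these hunks; a plausible centralized form (an assumption, presumably living in simd-utils) would be:

```c
/* Assumed centralized definition, not shown in this compare: SIMD512
 * names the full AVX-512 subset cpuminer-opt requires, and gives one
 * place to also enable these paths for AVX10/512 builds later. */
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define SIMD512 1
#endif
```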
```diff
@@ -98,9 +98,11 @@ typedef uint32_t set_t;
 #define AVX512_OPT 1 << 6   // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
 #define AES_OPT    1 << 7   // Intel Westmere, AArch64
 #define VAES_OPT   1 << 8   // Icelake, Zen3
-#define SHA_OPT    1 << 9   // Zen1, Icelake, AArch64
+#define SHA256_OPT 1 << 9   // Zen1, Icelake, AArch64
+#define SHA512_OPT 1 << 10  // Intel Arrow Lake, AArch64
 #define NEON_OPT   1 << 11  // AArch64
 #define AVX10_256  1 << 12
 #define AVX10_512  1 << 13
 
 // AVX10 does not have explicit algo features:
 // AVX10_512 is compatible with AVX512 + VAES
@@ -163,10 +165,10 @@ char* ( *malloc_txs_request ) ( struct work* );
 void ( *set_work_data_endian ) ( struct work* );
 
 // Diverge mining threads
-bool ( *do_this_thread ) ( int );
+//bool ( *do_this_thread ) ( int );
 
 // After do_this_thread
-void ( *resync_threads ) ( int, struct work* );
+//void ( *resync_threads ) ( int, struct work* );
 
 json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
 
@@ -246,7 +248,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 //int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce,
 //                              uint64_t *hashes_done, struct thr_info *mythr );
```
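One sharp edge worth noting in the flag block above: the `*_OPT` macros expand without parentheses. OR-combining them, as the gates do, parses correctly because `<<` binds tighter than `|`, but mixing them with `+` or other higher-precedence operators would not. A tiny sketch of the trap (the names here are illustrative, not from the patch):

```c
#include <stdio.h>

#define FLAG_RAW   1 << 10     /* unparenthesized, like the defines above */
#define FLAG_SAFE  (1 << 10)   /* hypothetical parenthesized form */

int main( void )
{
    printf( "%d\n", FLAG_RAW  + 1 );   /* parses as 1 << (10 + 1) == 2048 */
    printf( "%d\n", FLAG_SAFE + 1 );   /* 1025, as intended */
    return 0;
}
```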
```diff
@@ -35,7 +35,7 @@
  * @pre all block pointers must be valid
  */
 
-#if defined(__AVX512F__)
+#if defined(SIMD512)
 
 static inline __m512i blamka( __m512i x, __m512i y )
 {
@@ -237,7 +237,7 @@ void fill_segment(const argon2_instance_t *instance,
 uint64_t pseudo_rand, ref_index, ref_lane;
 uint32_t prev_offset, curr_offset;
 uint32_t starting_index, i;
-#if defined(__AVX512F__)
+#if defined(SIMD512)
 __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
 #elif defined(__AVX2__)
 __m256i state[ARGON2_HWORDS_IN_BLOCK];
```
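For orientation, `blamka` is Argon2's multiply-hardened mixing primitive. The `__m512i` version gated above vectorizes the scalar function from the Argon2 specification, which computes x + y + 2 * lo32(x) * lo32(y):

```c
#include <stdint.h>

/* Scalar reference form of fBlaMka from the Argon2 spec; the SIMD512
 * variant above applies this lane-wise to eight 64-bit words at once. */
static inline uint64_t blamka_ref( uint64_t x, uint64_t y )
{
    const uint64_t m = (uint64_t)(uint32_t)x * (uint64_t)(uint32_t)y;
    return x + y + 2 * m;
}
```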
```diff
@@ -21,7 +21,7 @@
 #include "blake2-impl.h"
 #include "simd-utils.h"
 
-#if !defined(__AVX512F__)
+#if !defined(SIMD512)
 
 #if !defined(__AVX2__)
```
```diff
@@ -1611,7 +1611,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 ///////////////////////////////////////
 //
@@ -2617,7 +2617,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
 #endif
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 //Blake-256 16 way AVX512
 
@@ -147,7 +147,7 @@ void blake256r8_8way_close(void *cc, void *dst);
 #define blake256r8_8x32_update blake256r14_8way_update
 #define blake256r8_8x32_close blake256r14_8way_close
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 ///////////////////////////////////
 //
@@ -226,7 +226,7 @@ static const uint8_t sigma[12][16] =
 #define Mx_(n) Mx__(n)
 #define Mx__(n) M ## n
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 #define B2B8W_G(a, b, c, d, x, y) \
 { \
@@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] =
 v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
 }
 
-static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
+static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last )
 {
 __m512i v[16], m[16];
 
@@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
 ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
 }
 
-int blake2b_8way_init( blake2b_8way_ctx *ctx )
+int blake2b_8x64_init( blake2b_8x64_ctx *ctx )
 {
 size_t i;
 
@@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
 }
 
 
-void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
+void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
                           size_t inlen )
 {
 __m512i* in =(__m512i*)input;
@@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
 ctx->t[0] += ctx->c;
 if ( ctx->t[0] < ctx->c )
 ctx->t[1]++;
-blake2b_8way_compress( ctx, 0 );
+blake2b_8x64_compress( ctx, 0 );
 ctx->c = 0;
 }
 ctx->b[ c++ ] = in[i];
@@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
 }
 }
 
-void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
+void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out )
 {
 size_t c;
 c = ctx->c >> 3;
@@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
 ctx->c += 8;
 }
 
-blake2b_8way_compress( ctx, 1 ); // final block flag = 1
+blake2b_8x64_compress( ctx, 1 ); // final block flag = 1
 
 casti_m512i( out, 0 ) = ctx->h[0];
 casti_m512i( out, 1 ) = ctx->h[1];
@@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = {
 };
 */
 
-static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
+static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last )
 {
 __m256i v[16], m[16];
 
@@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
 ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
 }
 
-int blake2b_4way_init( blake2b_4way_ctx *ctx )
+int blake2b_4x64_init( blake2b_4x64_ctx *ctx )
 {
 size_t i;
 
@@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
 return 0;
 }
 
-void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
                           size_t inlen )
 {
 __m256i* in =(__m256i*)input;
@@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
 ctx->t[0] += ctx->c;
 if ( ctx->t[0] < ctx->c )
 ctx->t[1]++;
-blake2b_4way_compress( ctx, 0 );
+blake2b_4x64_compress( ctx, 0 );
 ctx->c = 0;
 }
 ctx->b[ c++ ] = in[i];
@@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
 }
 }
 
-void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
+void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out )
 {
 size_t c;
 c = ctx->c >> 3;
@@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
 ctx->c += 8;
 }
 
-blake2b_4way_compress( ctx, 1 ); // final block flag = 1
+blake2b_4x64_compress( ctx, 1 ); // final block flag = 1
 
 casti_m256i( out, 0 ) = ctx->h[0];
 casti_m256i( out, 1 ) = ctx->h[1];
```
```diff
@@ -1,6 +1,6 @@
 #pragma once
-#ifndef __BLAKE2B_HASH_4WAY_H__
-#define __BLAKE2B_HASH_4WAY_H__
+#ifndef BLAKE2B_HASH_4WAY_H__
+#define BLAKE2B_HASH_4WAY_H__
 
 #include "simd-utils.h"
 #include <stddef.h>
@@ -15,7 +15,7 @@
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct ALIGN( 64 ) {
 __m512i b[16]; // input buffer
@@ -23,12 +23,17 @@ typedef struct ALIGN( 64 ) {
 uint64_t t[2]; // total number of bytes
 size_t c;      // pointer for b[]
 size_t outlen; // digest size
-} blake2b_8way_ctx;
+} blake2b_8x64_ctx;
 
-int blake2b_8way_init( blake2b_8way_ctx *ctx );
-void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
+int blake2b_8x64_init( blake2b_8x64_ctx *ctx );
+void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
                           size_t inlen );
-void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
+void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
+
+#define blake2b_8way_ctx    blake2b_8x64_ctx
+#define blake2b_8way_init   blake2b_8x64_init
+#define blake2b_8way_update blake2b_8x64_update
+#define blake2b_8way_final  blake2b_8x64_final
 
 #endif
 
@@ -41,12 +46,17 @@ typedef struct ALIGN( 64 ) {
 uint64_t t[2]; // total number of bytes
 size_t c;      // pointer for b[]
 size_t outlen; // digest size
-} blake2b_4way_ctx;
+} blake2b_4x64_ctx;
 
-int blake2b_4way_init( blake2b_4way_ctx *ctx );
-void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+int blake2b_4x64_init( blake2b_4x64_ctx *ctx );
+void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
                           size_t inlen );
-void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
+void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
+
+#define blake2b_4way_ctx    blake2b_4x64_ctx
+#define blake2b_4way_init   blake2b_4x64_init
+#define blake2b_4way_update blake2b_4x64_update
+#define blake2b_4way_final  blake2b_4x64_final
 
 #endif
```
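Two things happen in this header: the include guard drops its `__` prefix (identifiers beginning with a double underscore are reserved in C), and the `*_8way`/`*_4way` names become `*_8x64`/`*_4x64` (lane count by element width), with the old names kept as `#define` aliases. Existing call sites therefore compile unchanged; a minimal sketch of equivalent usage (the 80-byte length is an arbitrary example):

```c
#include "blake2b-hash.h"   /* the header included by the gate file below */

void hash_sketch( void *out, const void *in )
{
    blake2b_8x64_ctx ctx;                 /* new name */
    blake2b_8x64_init( &ctx );
    blake2b_8way_update( &ctx, in, 80 );  /* old name, now an alias */
    blake2b_8x64_final( &ctx, out );
}
```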
```diff
@@ -3,7 +3,7 @@
 #include <stdint.h>
 #include "blake2b-hash.h"
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define BLAKE2B_8WAY
 #elif defined(__AVX2__)
 #define BLAKE2B_4WAY
@@ -497,7 +497,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
 
 #endif // __AVX2__
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // Blake2s-256 16 way
```
```diff
@@ -11,8 +11,8 @@
  * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
  */
 //#pragma once
-#ifndef __BLAKE2S_HASH_4WAY_H__
-#define __BLAKE2S_HASH_4WAY_H__ 1
+#ifndef BLAKE2S_HASH_4WAY_H__
+#define BLAKE2S_HASH_4WAY_H__ 1
 
 #if defined(__SSE2__) || defined(__ARM_NEON)
 
@@ -29,20 +29,20 @@
 #define ALIGN(x) __attribute__((aligned(x)))
 #endif
 
 typedef struct __blake2s_nway_param
 {
 uint8_t digest_length; // 1
 uint8_t key_length;    // 2
 uint8_t fanout;        // 3
 uint8_t depth;         // 4
 uint32_t leaf_length;  // 8
 uint8_t node_offset[6];// 14
 uint8_t node_depth;    // 15
 uint8_t inner_length;  // 16
 // uint8_t reserved[0];
 uint8_t salt[8];       // 24
 uint8_t personal[8];   // 32
 } blake2s_nway_param;
 
 typedef struct ALIGN( 64 ) __blake2s_4way_state
 {
@@ -67,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
 typedef struct ALIGN( 64 ) __blake2s_8way_state
 {
 __m256i h[8];
-uint8_t buf[ 32 * 8 ];
+uint8_t buf[ 64 * 8 ];
 uint32_t t[2];
 uint32_t f[2];
 size_t buflen;
@@ -83,12 +83,12 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct ALIGN( 64 ) __blake2s_16way_state
 {
 __m512i h[8];
-uint8_t buf[ 32 * 16 ];
+uint8_t buf[ 64 * 16 ];
 uint32_t t[2];
 uint32_t f[2];
 size_t buflen;
@@ -3,7 +3,7 @@
 #include <string.h>
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define BLAKE2S_16WAY
 #elif defined(__AVX2__)
 #define BLAKE2S_8WAY
```
```diff
@@ -349,16 +349,16 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
 Va = v128_add64( Va, v128_add64( Vb, \
      v128_set64( CBx( r, Sd ) ^ Mx( r, Sc ), \
                  CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
-Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
+Vd = v128_ror64xor( Vd, Va, 32 ); \
 Vc = v128_add64( Vc, Vd ); \
-Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
+Vb = v128_ror64xor( Vb, Vc, 25 ); \
 \
 Va = v128_add64( Va, v128_add64( Vb, \
      v128_set64( CBx( r, Sc ) ^ Mx( r, Sd ), \
                  CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
-Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
+Vd = v128_ror64xor( Vd, Va, 16 ); \
 Vc = v128_add64( Vc, Vd ); \
-Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
+Vb = v128_ror64xor( Vb, Vc, 11 ); \
 }
 
 #define BLAKE512_ROUND( R ) \
@@ -559,7 +559,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
 
 #if defined(__AVX2__)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 ////////////////////////////////////
 //
@@ -1887,13 +1887,13 @@ blake512_4x64_close(void *cc, void *dst)
 #define GB_2X64( m0, m1, c0, c1, a, b, c, d ) \
 { \
 a = v128_add64( v128_add64( v128_xor( v128_64( c1 ), m0 ), b ), a ); \
-d = v128_ror64( v128_xor( d, a ), 32 ); \
+d = v128_ror64xor( d, a, 32 ); \
 c = v128_add64( c, d ); \
-b = v128_ror64( v128_xor( b, c ), 25 ); \
+b = v128_ror64xor( b, c, 25 ); \
 a = v128_add64( v128_add64( v128_xor( v128_64( c0 ), m1 ), b ), a ); \
-d = v128_ror64( v128_xor( d, a ), 16 ); \
+d = v128_ror64xor( d, a, 16 ); \
 c = v128_add64( c, d ); \
-b = v128_ror64( v128_xor( b, c ), 11 ); \
+b = v128_ror64xor( b, c, 11 ); \
 }
 
 #define ROUND_B_2X64(r) \
@@ -2054,9 +2054,9 @@ void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc,
 // G4 skip nonce
 V0 = v128_add64( v128_add64( v128_xor( v128_64( CB9 ), sc->buf[ 8] ), V5 ),
                  V0 );
-VF = v128_ror64( v128_xor( VF, V0 ), 32 );
+VF = v128_ror64xor( VF, V0, 32 );
 VA = v128_add64( VA, VF );
-V5 = v128_ror64( v128_xor( V5, VA ), 25 );
+V5 = v128_ror64xor( V5, VA, 25 );
 V0 = v128_add64( V0, V5 );
 
 GB_2X64( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
@@ -2137,9 +2137,9 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
 
 // finish round 0, with the nonce now available
 V0 = v128_add64( V0, v128_xor( v128_64( CB8 ), M9 ) );
-VF = v128_ror64( v128_xor( VF, V0 ), 16 );
+VF = v128_ror64xor( VF, V0, 16 );
 VA = v128_add64( VA, VF );
-V5 = v128_ror64( v128_xor( V5, VA ), 11 );
+V5 = v128_ror64xor( V5, VA, 11 );
 
 // Round 1
 // G0
@@ -2147,34 +2147,34 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
 
 // G1
 V1 = v128_add64( V1, V5 );
-VD = v128_ror64( v128_xor( VD, V1 ), 32 );
+VD = v128_ror64xor( VD, V1, 32 );
 V9 = v128_add64( V9, VD );
-V5 = v128_ror64( v128_xor( V5, V9 ), 25 );
+V5 = v128_ror64xor( V5, V9, 25 );
 V1 = v128_add64( V1, v128_add64( v128_xor( v128_64( CBx(1,2) ), Mx(1,3) ),
                  V5 ) );
-VD = v128_ror64( v128_xor( VD, V1 ), 16 );
+VD = v128_ror64xor( VD, V1, 16 );
 V9 = v128_add64( V9, VD );
-V5 = v128_ror64( v128_xor( V5, V9 ), 11 );
+V5 = v128_ror64xor( V5, V9, 11 );
 
 // G2
 V2 = v128_add64( V2, v128_xor( v128_64( CBF ), M9 ) );
-VE = v128_ror64( v128_xor( VE, V2 ), 32 );
+VE = v128_ror64xor( VE, V2, 32 );
 VA = v128_add64( VA, VE );
-V6 = v128_ror64( v128_xor( V6, VA ), 25 );
+V6 = v128_ror64xor( V6, VA, 25 );
 V2 = v128_add64( V2, v128_add64( v128_xor( v128_64( CB9 ), MF ), V6 ) );
-VE = v128_ror64( v128_xor( VE, V2 ), 16 );
+VE = v128_ror64xor( VE, V2, 16 );
 VA = v128_add64( VA, VE );
-V6 = v128_ror64( v128_xor( V6, VA ), 11 );
+V6 = v128_ror64xor( V6, VA, 11 );
 
 // G3
-VF = v128_ror64( v128_xor( VF, V3 ), 32 );
+VF = v128_ror64xor( VF, V3, 32 );
 VB = v128_add64( VB, VF );
-V7 = v128_ror64( v128_xor( V7, VB ), 25 );
+V7 = v128_ror64xor( V7, VB, 25 );
 V3 = v128_add64( V3, v128_add64( v128_xor( v128_64( CBx(1, 6) ), Mx(1, 7) ),
                  V7 ) );
-VF = v128_ror64( v128_xor( VF, V3 ), 16 );
+VF = v128_ror64xor( VF, V3, 16 );
 VB = v128_add64( VB, VF );
-V7 = v128_ror64( v128_xor( V7, VB ), 11 );
+V7 = v128_ror64xor( V7, VB, 11 );
 
 // G4, G5, G6, G7
 GB_2X64(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
```
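Every xor-then-rotate pair in these BLAKE-512 rounds is folded into one `v128_ror64xor` helper. Its definition is outside this compare; by inference it behaves exactly like the code it replaces, and the point of naming the fused operation is that some targets have a single instruction for it (AArch64's SHA3 extension `XAR` is precisely xor-and-rotate-right). A sketch of the generic fallback, stated as an assumption:

```c
/* Assumed generic definition: rotate right the xor of two vectors.
 * v128_ror64xor( v, w, c ) must equal v128_ror64( v128_xor( v, w ), c ),
 * which is what every call site in this hunk previously spelled out. */
#define v128_ror64xor( v, w, c )  v128_ror64( v128_xor( v, w ), c )
```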
```diff
@@ -92,7 +92,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
 #define blake512_4way_prehash_le blake512_4x64_prehash_le
 #define blake512_4way_final_le blake512_4x64_final_le
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 ////////////////////////////
 //
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define BLAKECOIN_16WAY
 #elif defined(__AVX2__)
 #define BLAKECOIN_8WAY
```
```diff
@@ -101,15 +101,15 @@
 { \
 Va = v128_add64( Va, v128_add64( Vb, \
      v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
-Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
+Vd = v128_ror64xor( Vd, Va, 32 ); \
 Vc = v128_add64( Vc, Vd ); \
-Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \
+Vb = v128_ror64xor( Vb, Vc, 24 ); \
 \
 Va = v128_add64( Va, v128_add64( Vb, \
      v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
-Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
+Vd = v128_ror64xor( Vd, Va, 16 ); \
 Vc = v128_add64( Vc, Vd ); \
-Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
+Vb = v128_ror64xor( Vb, Vc, 63 ); \
 }
 
 #define BLAKE2B_ROUND( R ) \
```
```diff
@@ -87,7 +87,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // BMW-256 16 way 32
 
@@ -157,7 +157,7 @@ void bmw512_4way_addbits_and_close(
 
 #endif // __AVX2__
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // BMW-512 64 bit 8 way
 typedef struct
@@ -1057,7 +1057,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
 
 #endif // __AVX2__
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // BMW-256 16 way 32
 
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define BMW512_8WAY 1
 #elif defined(__AVX2__)
 #define BMW512_4WAY 1
@@ -950,7 +950,7 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 
 #endif // __AVX2__
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // BMW-512 8 WAY
```
```diff
@@ -26,7 +26,7 @@ static const uint64_t IV512[] =
 0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
 };
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // 4 way 128 is handy to avoid reinterleaving in many algos.
 // If reinterleaving is necessary it may be more efficient to use
@@ -6,7 +6,7 @@
 
 #if defined(__AVX2__)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 struct _cube_4way_context
 {
@@ -13,7 +13,7 @@ static void transform( cubehashParam *sp )
 int r;
 const int rounds = sp->rounds;
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 register __m512i x0, x1;
 
@@ -11,7 +11,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
 };
 */
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 #define ECHO_SUBBYTES4(state, j) \
 state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \
@@ -5,7 +5,7 @@
 
 #include "simd-utils.h"
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct
 {
@@ -696,7 +696,7 @@ static void AddModulo512(const void *a,const void *b,void *c)
 
 static void AddXor512(const void *a,const void *b,void *c)
 {
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
                                         casti_m512i( b, 0 ) );
 #elif defined(__AVX2__)
```
```diff
@@ -103,7 +103,7 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
    This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
    K. Matusiewicz, 2011/05/29 */
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
 /* t_i = a_i + a_{i+1} */\
@@ -95,7 +95,7 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
    This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
    K. Matusiewicz, 2011/05/29 */
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
 /* t_i = a_i + a_{i+1} */\
```
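`VL256` is the 256-bit sibling of `SIMD512`: bare `__AVX512VL__` tests become a named capability. Its definition is also outside this compare; judging by the `TODO enable for AVX10_256` comments removed elsewhere in this changeset, it presumably exists so that AVX10/256 targets can later light up the same code paths. A plausible sketch, stated as an assumption:

```c
/* Assumed centralized definition: AVX-512 instructions available on
 * 256-bit vectors today, with room to add AVX10/256 detection later. */
#if defined(__AVX512VL__)   /* later possibly also an AVX10.1 test */
  #define VL256 1
#endif
```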
```diff
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(__VAES__) && defined(SIMD512)
 #define GROESTL_4WAY_VAES 1
 #endif
 
@@ -17,7 +17,7 @@
 
 #if defined(__AVX2__) && defined(__VAES__)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 
 int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
@@ -43,7 +43,7 @@
 
 #define SIZE256 (SIZE_512/16)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct {
 __attribute__ ((aligned (128))) __m512i chaining[SIZE256];
@@ -42,7 +42,7 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
 { 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
 };
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
                                      0x1d1519111c141810, 0x1f171b131e161a12,
@@ -17,7 +17,7 @@
 
 #if defined(__AVX2__) && defined(__VAES__)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
 {
@@ -33,7 +33,7 @@
 
 #define SIZE512 (SIZE_1024/16)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct {
 __attribute__ ((aligned (128))) __m512i chaining[SIZE512];
@@ -50,7 +50,7 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
 { 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
 };
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
                                      0x1d1519111c141810, 0x1f171b131e161a12,
@@ -239,7 +239,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
 { \
 /* AddRoundConstant P1024 */\
 xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
-           casti_m128i( round_const_p, round_counter ) ) ); \
+           casti_v128u32( round_const_p, round_counter ) ) ); \
 /* ShiftBytes P1024 + pre-AESENCLAST */\
 xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
 xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK1 );\
@@ -254,7 +254,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
 \
 /* AddRoundConstant P1024 */\
 xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
-           casti_m128i( round_const_p, round_counter+1 ) ) ); \
+           casti_v128u32( round_const_p, round_counter+1 ) ) ); \
 /* ShiftBytes P1024 + pre-AESENCLAST */\
 xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
 xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
@@ -283,7 +283,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
 xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
 xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
 xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
-           casti_m128i( round_const_q, round_counter ) ) ); \
+           casti_v128u32( round_const_q, round_counter ) ) ); \
 /* ShiftBytes Q1024 + pre-AESENCLAST */\
 xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
 xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK3 );\
@@ -306,7 +306,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
 xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
 xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
 xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
-           casti_m128i( round_const_q, round_counter+1 ) ) ); \
+           casti_v128u32( round_const_q, round_counter+1 ) ) ); \
 /* ShiftBytes Q1024 + pre-AESENCLAST */\
 xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
 xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
@@ -812,7 +812,7 @@ static const __m256i SUBSH_MASK7_2WAY =
 { \
 /* AddRoundConstant P1024 */\
 xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
-           casti_m128i( round_const_p, round_counter ) ) ); \
+           casti_v128u32( round_const_p, round_counter ) ) ); \
 /* ShiftBytes P1024 + pre-AESENCLAST */\
 xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
 xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\
@@ -827,7 +827,7 @@ static const __m256i SUBSH_MASK7_2WAY =
 \
 /* AddRoundConstant P1024 */\
 xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
-           casti_m128i( round_const_p, round_counter+1 ) ) ); \
+           casti_v128u32( round_const_p, round_counter+1 ) ) ); \
 /* ShiftBytes P1024 + pre-AESENCLAST */\
 xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
 xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
@@ -856,7 +856,7 @@ static const __m256i SUBSH_MASK7_2WAY =
 xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
 xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
 xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
-           casti_m128i( round_const_q, round_counter ) ) ); \
+           casti_v128u32( round_const_q, round_counter ) ) ); \
 /* ShiftBytes Q1024 + pre-AESENCLAST */\
 xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
 xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\
@@ -879,7 +879,7 @@ static const __m256i SUBSH_MASK7_2WAY =
 xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
 xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
 xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
-           casti_m128i( round_const_q, round_counter+1 ) ) ); \
+           casti_v128u32( round_const_q, round_counter+1 ) ) ); \
 /* ShiftBytes Q1024 + pre-AESENCLAST */\
 xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
 xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\
```
```diff
@@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate )
 init_myrgr_ctx();
 gate->scanhash = (void*)&scanhash_myriad;
 gate->hash = (void*)&myriad_hash;
-gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
+gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT;
 #endif
 return true;
 };
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(__VAES__) && defined(SIMD512)
 #define MYRGR_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
 #define MYRGR_4WAY 1
```
```diff
@@ -382,12 +382,12 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
 #define S1F MF
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // Hamsi 8 way AVX512
 
 // Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have same
-// timig. However, when tested hashing X13 on i9-9940x using cmplt with zero
+// timing. However, testing hashing X13 on i9-9940x using cmplt with zero
 // had a 3% faster overall hashrate than using movepi.
 
 #define INPUT_BIG8 \
@@ -418,13 +418,11 @@
 tb = mm512_xoror( b, d, a ); \
 a = _mm512_xor_si512( a, c ); \
 b = mm512_xoror( td, tb, a ); \
-td = mm512_xorand( a, td, tb ); \
+d = _mm512_ternarylogic_epi64( a, td, tb, 0x87 );/* not( xorand( a, td, tb ) ); */ \
 a = c; \
-c = mm512_xor3( tb, b, td ); \
-d = mm512_not( td ); \
+c = _mm512_ternarylogic_epi64( tb, b, d, 0x69 ); /* not( xor3( tb, b, d ) ); */ \
 }
 
 
 /*
 #define SBOX8( a, b, c, d ) \
 do { \
```
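The rewritten S-box replaces a NOT plus XOR chain with single `_mm512_ternarylogic_epi64` instructions. The immediate operand is just the truth table of the desired three-input boolean, and can be derived by evaluating that boolean on the canonical constants A=0xF0, B=0xCC, C=0xAA. A self-contained check, assuming (per the code comments) that `xorand(a,b,c)` means `a ^ (b & c)` and `xor3` is a three-way XOR:

```c
#include <stdint.h>
#include <stdio.h>

int main( void )
{
    const uint8_t A = 0xF0, B = 0xCC, C = 0xAA;  /* ternlog truth-table inputs */
    uint8_t imm_d = (uint8_t)~( A ^ ( B & C ) ); /* not( xorand( a, td, tb ) ) */
    uint8_t imm_c = (uint8_t)~( A ^ B ^ C );     /* not( xor3( tb, b, d ) )    */
    printf( "0x%02x 0x%02x\n", imm_d, imm_c );   /* prints 0x87 0x69 */
    return 0;
}
```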
```diff
@@ -1122,7 +1120,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
 
 // Hamsi 4 way AVX2
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define INPUT_BIG \
 do { \
@@ -1155,11 +1153,99 @@ do { \
 b = mm256_xoror( td, tb, a ); \
 d = _mm256_ternarylogic_epi64( a, td, tb, 0x87 );/* mm256_not( mm256_xorand( a, td, tb ) ); */ \
 a = c; \
-c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /*mm256_not( mm256_xor3( tb, b, d ) );*/ \
+c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /* mm256_not( mm256_xor3( tb, b, d ) ); */ \
 }
 
 #else
 
+#define INPUT_BIG_sub( db_i ) \
+{ \
+  const __m256i dm = _mm256_cmpgt_epi64( zero, db_i ); \
+  m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
+  m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
+  m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
+  m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
+  m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
+  m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
+  m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
+  m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
+  tp += 8; \
+}
+
+#define INPUT_BIG \
+{ \
+  const __m256i db = *buf; \
+  const __m256i zero = m256_zero; \
+  const uint64_t *tp = (const uint64_t*)T512; \
+  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,63 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,62 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,61 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,60 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,59 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,58 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,57 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,56 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,55 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,54 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,53 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,52 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,51 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,50 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,49 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,48 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,47 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,46 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,45 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,44 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,43 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,42 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,41 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,40 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,39 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,38 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,37 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,36 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,35 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,34 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,33 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,32 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,31 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,30 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,29 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,28 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,27 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,26 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,25 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,24 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,23 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,22 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,21 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,20 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,19 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,18 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,17 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,16 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,15 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,14 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,13 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,12 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,11 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db,10 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 9 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 8 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 7 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 6 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 5 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 4 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 3 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 2 ) ); \
+  INPUT_BIG_sub( _mm256_slli_epi64( db, 1 ) ); \
+  INPUT_BIG_sub( db ); \
+}
+
+#if 0
 // dependent on the compiler unrolling the loop
 #define INPUT_BIG \
 do { \
 __m256i db = *buf; \
@@ -1180,6 +1266,7 @@ do { \
 tp += 8; \
 } \
 } while (0)
+#endif
 
 // v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
 #define SBOX( a, b, c, d ) \
```
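The fully unrolled `INPUT_BIG` above replaces the loop version that is now fenced off with `#if 0` because it depended on the compiler unrolling it. Both compute the same thing: each of the 64 message bits conditionally XORs one row of the `T512` expansion table into the accumulators, and the `_mm256_cmpgt_epi64( zero, x )` sign test turns the current bit, pre-shifted into bit 63, into an all-ones mask. A scalar model of the computation, treating each `T512` row as eight 64-bit words (the real table is declared `uint32_t [64][16]` and read through a `uint64_t` pointer):

```c
#include <stdint.h>

/* Scalar model of INPUT_BIG: bit b of the 64-bit input block selects
 * whether row b of the expansion table is XORed into m[0..7]. */
static void input_big_ref( uint64_t m[8], uint64_t db,
                           const uint64_t T512w[64][8] )
{
    for ( int i = 0; i < 8; i++ ) m[i] = 0;
    for ( int b = 0; b < 64; b++ )     /* shifting left by (63-b) puts  */
        if ( ( db >> b ) & 1 )         /* bit b in the sign position    */
            for ( int i = 0; i < 8; i++ )
                m[i] ^= T512w[b][i];
}
```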
```diff
@@ -1219,7 +1306,7 @@
 do { \
 a = mm256_rol_32( a, 13 ); \
 c = mm256_rol_32( c, 3 ); \
-b = mm256_xor3( a, b, c ); \
+b = mm256_xor3( b, a, c ); \
 d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
 b = mm256_rol_32( b, 1 ); \
 d = mm256_rol_32( d, 7 ); \
```
```diff
@@ -1501,7 +1588,7 @@ do { /* order is important */ \
 sc->h[14] = CE; \
 sc->h[15] = CF;
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define INPUT_8X32 \
 { \
```
```diff
@@ -1961,6 +2048,94 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
 sc->h[6] = c6; \
 sc->h[7] = c7;
 
+#define INPUT_2x64_sub( db_i ) \
+{ \
+  const v128u64_t dm = v128_cmpgt64( zero, db_i ); \
+  m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
+  m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
+  m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
+  m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
+  m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
+  m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
+  m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
+  m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
+  tp += 8; \
+}
+
+#define INPUT_2x64 \
+{ \
+  const v128u64_t db = *buf; \
+  const v128u64_t zero = v128_zero; \
+  const uint64_t *tp = (const uint64_t*)T512; \
+  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
+  INPUT_2x64_sub( v128_sl64( db,63 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,62 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,61 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,60 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,59 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,58 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,57 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,56 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,55 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,54 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,53 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,52 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,51 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,50 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,49 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,48 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,47 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,46 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,45 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,44 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,43 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,42 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,41 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,40 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,39 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,38 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,37 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,36 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,35 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,34 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,33 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,32 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,31 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,30 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,29 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,28 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,27 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,26 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,25 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,24 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,23 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,22 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,21 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,20 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,19 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,18 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,17 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,16 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,15 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,14 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,13 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,12 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,11 ) ); \
+  INPUT_2x64_sub( v128_sl64( db,10 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 9 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 8 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 7 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 6 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 5 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 4 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 3 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 2 ) ); \
+  INPUT_2x64_sub( v128_sl64( db, 1 ) ); \
+  INPUT_2x64_sub( db ); \
+}
+
+#if 0
 // Dependent on the compiler unrolling the loop.
 #define INPUT_2x64 \
 { \
 v128u64_t db = *buf; \
@@ -1981,6 +2156,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
 tp += 8; \
 } \
 }
+#endif
 
 // v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
 #define SBOX_2x64( a, b, c, d ) \
@@ -2001,7 +2177,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
 { \
 a = v128_rol32( a, 13 ); \
 c = v128_rol32( c, 3 ); \
-b = v128_xor3( a, b, c ); \
+b = v128_xor3( c, a, b ); \
 d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
 b = v128_rol32( b, 1 ); \
 d = v128_rol32( d, 7 ); \
```
```diff
@@ -104,7 +104,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // Hamsi-512 8x64
 
```
```diff
@@ -53,7 +53,7 @@ extern "C"{
 #define SPH_SMALL_FOOTPRINT_HAVAL 1
 //#endif
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 // ( ~( a ^ b ) ) & c
 #define v128_andnotxor( a, b, c ) \
@@ -583,7 +583,7 @@ do { \
 
 // Haval-256 8 way 32 bit avx2
 
-#if defined (__AVX512VL__)
+#if defined (VL256)
 
 // ( ~( a ^ b ) ) & c
 #define mm256_andnotxor( a, b, c ) \
@@ -882,7 +882,7 @@ do { \
 
 #endif // AVX2
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // ( ~( a ^ b ) ) & c
 #define mm512_andnotxor( a, b, c ) \
```
```diff
@@ -107,7 +107,7 @@ void haval256_5_8way_close( void *cc, void *dst );
 
 #endif // AVX2
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct {
 __m512i buf[32];
```
```diff
@@ -204,7 +204,7 @@ static const uint64_t IV512[] =
 (state)->H[15] = h7l; \
 } while (0)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 #define Sb_8W(x0, x1, x2, x3, c) \
 { \
@@ -364,8 +364,7 @@ static const uint64_t IV512[] =
 
 #if defined(__AVX2__)
 
-#if defined(__AVX512VL__)
-//TODO enable for AVX10_256, not used with AVX512VL
+#if defined(VL256)
 
 #define notxorandnot( a, b, c ) \
    _mm256_ternarylogic_epi64( a, b, c, 0x2d )
@@ -522,7 +521,7 @@ static const uint64_t IV512[] =
 
 #endif // AVX2
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 void jh256_8x64_init( jh_8x64_context *sc )
 {
```
```diff
@@ -55,7 +55,7 @@
  * <code>memcpy()</code>).
  */
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct
 {
```
```diff
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define KECCAK_8WAY 1
 #elif defined(__AVX2__)
 #define KECCAK_4WAY 1
@@ -12,7 +12,7 @@
 #define KECCAK_2WAY 1
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define SHA3D_8WAY 1
 #elif defined(__AVX2__)
 #define SHA3D_4WAY 1
```
@@ -57,7 +57,7 @@ static const uint64_t RC[] = {
|
||||
|
||||
#define DO(x) x
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if defined(SIMD512)
|
||||
|
||||
#define INPUT_BUF(size) do { \
|
||||
size_t j; \
|
||||
@@ -161,29 +161,25 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
|
||||
static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
|
||||
size_t byte_len, size_t lim )
|
||||
{
|
||||
unsigned eb;
|
||||
union {
|
||||
__m512i tmp[lim + 1];
|
||||
uint64_t dummy; /* for alignment */
|
||||
} u;
|
||||
__m512i tmp[lim + 1] __attribute__ ((aligned (64)));
|
||||
size_t j;
|
||||
size_t m512_len = byte_len >> 3;
|
||||
const unsigned eb = hard_coded_eb;
|
||||
|
||||
eb = hard_coded_eb;
|
||||
if ( kc->ptr == (lim - 8) )
|
||||
{
|
||||
const uint64_t t = eb | 0x8000000000000000;
|
||||
u.tmp[0] = _mm512_set1_epi64( t );
|
||||
tmp[0] = _mm512_set1_epi64( t );
|
||||
j = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
j = lim - kc->ptr;
|
||||
u.tmp[0] = _mm512_set1_epi64( eb );
|
||||
memset_zero_512( u.tmp + 1, (j>>3) - 2 );
|
||||
u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
|
||||
tmp[0] = _mm512_set1_epi64( eb );
|
||||
memset_zero_512( tmp + 1, (j>>3) - 2 );
|
||||
tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
|
||||
}
|
||||
keccak64_8way_core( kc, u.tmp, j, lim );
|
||||
keccak64_8way_core( kc, tmp, j, lim );
|
||||
/* Finalize the "lane complement" */
|
||||
NOT64( kc->w[ 1], kc->w[ 1] );
|
||||
NOT64( kc->w[ 2], kc->w[ 2] );
|
||||
@@ -361,29 +357,25 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
|
||||
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
|
||||
size_t lim )
|
||||
{
|
||||
unsigned eb;
|
||||
union {
|
||||
__m256i tmp[lim + 1];
|
||||
uint64_t dummy; /* for alignment */
|
||||
} u;
|
||||
__m256i tmp[lim + 1] __attribute__ ((aligned (32)));
|
||||
size_t j;
|
||||
size_t m256_len = byte_len >> 3;
|
||||
const unsigned eb = hard_coded_eb;
|
||||
|
||||
eb = hard_coded_eb;
|
||||
if ( kc->ptr == (lim - 8) )
|
||||
{
|
||||
const uint64_t t = eb | 0x8000000000000000;
|
||||
u.tmp[0] = _mm256_set1_epi64x( t );
|
||||
tmp[0] = _mm256_set1_epi64x( t );
|
||||
j = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
j = lim - kc->ptr;
|
||||
u.tmp[0] = _mm256_set1_epi64x( eb );
|
||||
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
|
||||
u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
|
||||
tmp[0] = _mm256_set1_epi64x( eb );
|
||||
memset_zero_256( tmp + 1, (j>>3) - 2 );
|
||||
tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
|
||||
}
|
||||
keccak64_core( kc, u.tmp, j, lim );
|
||||
keccak64_core( kc, tmp, j, lim );
|
||||
/* Finalize the "lane complement" */
|
||||
NOT64( kc->w[ 1], kc->w[ 1] );
|
||||
NOT64( kc->w[ 2], kc->w[ 2] );
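A note on the two close functions above: a union member may not have variably modified type in C (ISO C11 6.7.2.1), so the union-with-VLA alignment trick is replaced by a VLA carrying an explicit alignment attribute. A scalar sketch of the same pad10*1 close logic, assuming eb is the domain byte (0x01 for Keccak, 0x06 for SHA-3) and lim is the rate in bytes:

static void keccak_pad_model( uint64_t *pad, size_t ptr, size_t lim,
                              unsigned eb )
{
   size_t n = ( lim - ptr ) >> 3;          /* 64-bit lanes left in block */
   pad[0] = eb;
   for ( size_t i = 1; i < n; i++ ) pad[i] = 0;
   pad[ n - 1 ] |= 0x8000000000000000ULL;  /* final '1' bit, merged with
                                              eb when only one lane left */
}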
@@ -4,7 +4,7 @@
#include <stddef.h>
#include "simd-utils.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

typedef struct
{

@@ -59,7 +59,7 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
};

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

#define cns4w(i) mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] )

@@ -524,8 +524,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
a = _mm256_xor_si256( a, c0 ); \
b = _mm256_xor_si256( b, c1 );

//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512
#if defined(__AVX512VL__)
#if defined(VL256)

#define MULT2( a0, a1 ) \
{ \

@@ -51,7 +51,7 @@
#define LIMIT_512 128
/*********************************/

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

typedef struct {
uint32_t buffer[8*4];

@@ -28,8 +28,7 @@
a = v128_xor( a, c0 ); \
b = v128_xor( b, c1 ); \

#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#if defined(VL256)

#define MULT2( a0, a1 ) \
{ \

@@ -48,29 +47,22 @@
a1 = _mm_alignr_epi8( b, a1, 4 ); \
}

#elif defined(__ARM_NEON)
#elif defined(__ARM_NEON) || defined(__SSE2__)

// { a1_0, 0, a1_0, a1_0 }
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( vdupq_laneq_u32( a1, 0 ), MASK ) ); \
v128_t b = v128_xor( a0, v128_and( v128_bcast32( a1 ), MASK ) ); \
a0 = v128_alignr32( a1, b, 1 ); \
a1 = v128_alignr32( b, a1, 1 ); \
}

#else // assume SSE2

#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( _mm_shuffle_epi32( a1, 0 ), MASK ) ); \
a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
}

#else
#warning __FILE__ ":" __LINE__ " Unknown or unsupported CPU architecture."
#endif
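A plain-C model of one application of the unified MULT2 branch above, assuming from the "{ a1_0, 0, a1_0, a1_0 }" comment that MASK selects lanes 0, 2 and 3 (a sketch, not the project's implementation):

#include <stdint.h>
/* x[0..3] model a0's 32-bit lanes, x[4..7] model a1's. */
static void mult2_model( uint32_t x[8] )
{
   uint32_t a1_0 = x[4];
   uint32_t b[4] = { x[0] ^ a1_0, x[1], x[2] ^ a1_0, x[3] ^ a1_0 };
   /* a0 = v128_alignr32( a1, b, 1 )  ->  { b1, b2, b3, a1_0 } */
   x[0] = b[1];  x[1] = b[2];  x[2] = b[3];  x[3] = a1_0;
   /* a1 = v128_alignr32( b, a1, 1 )  ->  { a1_1, a1_2, a1_3, b0 } */
   x[4] = x[5];  x[5] = x[6];  x[6] = x[7];  x[7] = b[0];
}

The lane moves match the SSE2 branch being removed: srli/slli by 4 and 12 bytes compose exactly the two alignr results.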
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#if defined(VL256)

#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \

@@ -68,4 +68,4 @@ int update_and_final_luffa( hashState_luffa *state, void* output,

int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
const void* data, size_t inlen );
#endif // LUFFA_FOR_SSE2_H___
#endif // LUFFA_FOR_SSE2_H__

@@ -15,7 +15,7 @@
#include "algo/groestl/sph_groestl.h"
#endif

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ALLIUM_16WAY 1
#elif defined(__AVX2__)
#define ALLIUM_8WAY 1

@@ -5,7 +5,7 @@
#include <stdint.h>
#include "lyra2.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2REV3_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV3_8WAY 1

@@ -49,7 +49,7 @@ bool init_lyra2rev3_ctx();

//////////////////////////////////

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2REV2_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV2_8WAY 1

@@ -108,7 +108,7 @@ bool lyra2h_thread_init();

/////////////////////////////////////////

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define PHI2_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define PHI2_4WAY 1

@@ -41,7 +41,7 @@
// lyra2z330, lyra2h,

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

/**
 * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords

@@ -59,7 +59,7 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,

int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );

@@ -3,7 +3,7 @@
#include "lyra2.h"
#include "algo/blake/blake256-hash.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2Z_16WAY 1
#elif defined(__AVX2__)
#define LYRA2Z_8WAY 1

@@ -4,7 +4,7 @@
#include "algo/gost/sph_gost.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "lyra2.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#include "algo/echo/echo-hash-4way.h"
#elif defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"

@@ -27,7 +27,7 @@
#include "lyra2.h"
#include "simd-utils.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
{

@@ -43,7 +43,7 @@ static const uint64_t blake2b_IV[8] =
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

#define G2W_4X64(a,b,c,d) \
a = _mm512_add_epi64( a, b ); \

@@ -150,13 +150,13 @@ static const uint64_t blake2b_IV[8] =
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a), 32 ); \
d = v128_ror64xor( d, a, 32 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 24 ); \
b = v128_ror64xor( b, c, 24 ); \
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a ), 16 ); \
d = v128_ror64xor( d, a, 16 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 63 );
b = v128_ror64xor( b, c, 63 );
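v128_ror64xor is presumably the fused form of the rotate-right-of-XOR pattern it replaces, which several ISAs can issue in fewer instructions. A scalar model under that assumption:

#include <stdint.h>
static inline uint64_t ror64xor( uint64_t x, uint64_t y, unsigned c )
{
   const uint64_t t = x ^ y;                  /* v128_xor( x, y ) */
   return ( t >> c ) | ( t << ( 64 - c ) );   /* v128_ror64( t, c ) */
}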
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \

@@ -195,10 +195,6 @@ static const uint64_t blake2b_IV[8] =

#endif // AVX2 else SSE2

static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}

#define G( r, i, a, b, c, d ) \
{ \
a = a + b; \

@@ -222,7 +218,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
G( r, 7, v[ 3], v[ 4], v[ 9], v[14] );

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

union _ovly_512
{

@@ -1,8 +1,6 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"

#if !defined(__APPLE__)

#include <gmp.h>
#include <stdbool.h>
#include <stdlib.h>

@@ -33,6 +31,7 @@ static inline double exp_n( double xt )
return exp( xt );
}

/*
static inline double exp_n2( double x1, double x2 )
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,

@@ -53,6 +52,7 @@ static inline double exp_n2( double x1, double x2 )
else if ( xt > p6 - 1.e-200 )
return 0.;
}
*/

double swit2_( double wvnmb )
{

@@ -298,15 +298,9 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
return 0;
}

#endif // not apple

bool register_m7m_algo( algo_gate_t *gate )
{
#if defined(__APPLE__)
applog( LOG_ERR, "M7M algo is not supported on MacOS");
return false;
#else
gate->optimizations = SHA_OPT;
gate->optimizations = SHA256_OPT;
init_m7m_ctx();
gate->scanhash = (void*)&scanhash_m7m_hash;
gate->build_stratum_request = (void*)&std_be_build_stratum_request;

@@ -315,6 +309,5 @@ bool register_m7m_algo( algo_gate_t *gate )
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
opt_target_factor = 65536.0;
return true;
#endif
}

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define NIST5_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define NIST5_4WAY 1

@@ -71,8 +71,7 @@ do { \
} while (0)

#define GAMMA_4W(n0, n1, n2, n4) \
(g ## n0 = v128_xor( a ## n0, \
v128_or( a ## n1, v128_not( a ## n2 ) ) ) )
(g ## n0 = v128_xor( a ## n0, v128_ornot( a ## n2, a ## n1 ) ) )

#define PI_ALL_4W do { \
a0 = g0; \

@@ -312,7 +311,7 @@ do { \
BUPDATE1_8W( 7, 1 ); \
} while (0)

#if defined(__AVX512VL__)
#if defined(VL256)

#define GAMMA_8W(n0, n1, n2, n4) \
( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ANIME_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define ANIME_4WAY 1

@@ -11,7 +11,6 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"

@@ -617,9 +616,9 @@ union _hmq1725_4way_context_overlay
cubehashParam cube;
cube_2way_context cube2;
sph_shavite512_context shavite;
hashState_sd sd;
simd512_context simd;
shavite512_2way_context shavite2;
simd_2way_context simd;
simd_2way_context simd_2way;
hashState_echo echo;
hamsi512_4way_context hamsi;
hashState_fugue fugue;

@@ -753,8 +752,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );

simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd_2way, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd_2way, vhashB, vhashB, 64 );

rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );

@@ -869,41 +868,25 @@ extern void hmq1725_4way_hash(void *state, const void *input)
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
}
simd512_ctx( &ctx.simd, hash0, hash0, 64 );

if ( hash1[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
}
simd512_ctx( &ctx.simd, hash1, hash1, 64 );

if ( hash2[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
}
simd512_ctx( &ctx.simd, hash2, hash2, 64 );

if ( hash3[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
}
simd512_ctx( &ctx.simd, hash3, hash3, 64 );

intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
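The one-shot call replacing the init/update/final pair above is presumably equivalent along these lines (inferred from the call sites, not confirmed against the simd512_context API):

/* hedged equivalence, 64 bytes in == 512 bits:
   simd512_ctx( &ctx.simd, hash, hash, 64 );
   ~  init_sd( &ctx.sd, 512 );
      update_final_sd( &ctx.sd, (BitSequence*)hash,
                       (const BitSequence*)hash, 512 );
   one call performs init, absorb and finalize. */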
@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define HMQ1725_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define HMQ1725_4WAY 1

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define QUARK_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define QUARK_4WAY 1

@@ -5,7 +5,7 @@
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define QUBIT_4WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define QUBIT_2WAY 1

@@ -104,14 +104,14 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated

// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );

@@ -224,14 +224,14 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated

// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );

@@ -51,7 +51,6 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }

bool register_lbry_algo( algo_gate_t* gate )
{
// gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#if defined (LBRY_16WAY)
gate->scanhash = (void*)&scanhash_lbry_16way;
gate->hash = (void*)&lbry_16way_hash;

@@ -67,7 +66,7 @@ bool register_lbry_algo( algo_gate_t* gate )
#else
gate->scanhash = (void*)&scanhash_lbry;
gate->hash = (void*)&lbry_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA256_OPT;
#endif
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
gate->build_extraheader = (void*)&lbry_build_extraheader;

@@ -5,7 +5,7 @@
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LBRY_16WAY 1
#elif defined(__AVX2__)
#define LBRY_8WAY 1

@@ -35,13 +35,13 @@ static const uint32_t IV[5] =
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )

#define F3(x, y, z) \
_mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z )
_mm_xor_si128( v128_ornot( y, x ), z )

#define F4(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )

#define F5(x, y, z) \
_mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) )
_mm_xor_si128( x, v128_ornot( z, y ) )
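From the two substitutions above, v128_ornot( a, b ) evidently computes ~a | b, with the NOT applied to the first operand. A scalar model plus a check against the original F3, under that inferred convention:

#include <stdint.h>
static inline uint32_t ornot32( uint32_t a, uint32_t b ) { return ~a | b; }
/* original F3: ( x | ~y ) ^ z;  rewritten: ornot32( y, x ) ^ z; identical */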
#define RR(a, b, c, d, e, f, s, r, k) \
do{ \

@@ -319,7 +319,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
ripemd160_4way_round( sc );
for (u = 0; u < 5; u ++)
casti_m128i( dst, u ) = sc->val[u];
casti_v128u32( dst, u ) = sc->val[u];
}

#endif

@@ -335,13 +335,13 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( y, z ), x ), z )

#define F8W_3(x, y, z) \
_mm256_xor_si256( _mm256_or_si256( x, mm256_not( y ) ), z )
_mm256_xor_si256( mm256_ornot( y, x ), z )

#define F8W_4(x, y, z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( x, y ), z ), y )

#define F8W_5(x, y, z) \
_mm256_xor_si256( x, _mm256_or_si256( y, mm256_not( z ) ) )
_mm256_xor_si256( x, mm256_ornot( z, y ) )

#define RR_8W(a, b, c, d, e, f, s, r, k) \
do{ \

@@ -625,7 +625,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )

#endif // __AVX2__

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

// RIPEMD-160 16 way

@@ -33,7 +33,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
size_t len );
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

typedef struct
{

@@ -46,7 +46,7 @@
#endif

#ifdef __GNUC__
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__)
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__) || defined(__APPLE__)
#define ASM 0
#else
#define ASM 1

@@ -745,7 +745,7 @@ do{ \
SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS;

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

// Tested OK but very slow
// 16 way parallel, requires 16x32 interleaving

@@ -2074,7 +2074,7 @@ void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N )
v128_ovly v;
for ( int l = 0; l < 4; l++ )
v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
X[i] = v128_xor( X[i], v.m128 );
X[i] = v128_xor( X[i], v.v128 );
}

xor_salsa8_4way( &X[ 0], &X[16] );

@@ -2211,10 +2211,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
// X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 }
// X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 }

y[0].m128 = X0;
y[1].m128 = X1;
y[2].m128 = X2;
y[3].m128 = X3;
y[0].v128 = X0;
y[1].v128 = X1;
y[2].v128 = X2;
y[3].v128 = X3;

z[0].u32[0] = y[0].u32[0];
z[0].u32[3] = y[1].u32[0];

@@ -2236,10 +2236,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
z[3].u32[1] = y[2].u32[3];
z[3].u32[0] = y[3].u32[3];

B[0] = v128_add32( B[0], z[0].m128 );
B[1] = v128_add32( B[1], z[1].m128 );
B[2] = v128_add32( B[2], z[2].m128 );
B[3] = v128_add32( B[3], z[3].m128 );
B[0] = v128_add32( B[0], z[0].v128 );
B[1] = v128_add32( B[1], z[1].v128 );
B[2] = v128_add32( B[2], z[2].v128 );
B[3] = v128_add32( B[3], z[3].v128 );

#endif

@@ -2404,14 +2404,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
/*
v128_ovly ya[4], za[4], yb[4], zb[4];

ya[0].m128 = XA[0];
yb[0].m128 = XB[0];
ya[1].m128 = XA[1];
yb[1].m128 = XB[1];
ya[2].m128 = XA[2];
yb[2].m128 = XB[2];
ya[3].m128 = XA[3];
yb[3].m128 = XB[3];
ya[0].v128 = XA[0];
yb[0].v128 = XB[0];
ya[1].v128 = XA[1];
yb[1].v128 = XB[1];
ya[2].v128 = XA[2];
yb[2].v128 = XB[2];
ya[3].v128 = XA[3];
yb[3].v128 = XB[3];

za[0].u32[0] = ya[0].u32[0];
zb[0].u32[0] = yb[0].u32[0];

@@ -2449,14 +2449,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
za[3].u32[3] = ya[0].u32[3];
zb[3].u32[3] = yb[0].u32[3];

XA[0] = za[0].m128;
XB[0] = zb[0].m128;
XA[1] = za[1].m128;
XB[1] = zb[1].m128;
XA[2] = za[2].m128;
XB[2] = zb[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XA[0] = za[0].v128;
XB[0] = zb[0].v128;
XA[1] = za[1].v128;
XB[1] = zb[1].v128;
XA[2] = za[2].v128;
XB[2] = zb[2].v128;
XA[3] = za[3].v128;
XB[3] = zb[3].v128;
*/
}

@@ -2487,7 +2487,7 @@ static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb,
XA3 = BA[3] = v128_xor( BA[3], CA[3] );
XB3 = BB[3] = v128_xor( BB[3], CB[3] );

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

SALSA_8ROUNDS_SIMD128_2BUF;

@@ -2770,18 +2770,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
/*
v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];

ya[0].m128 = XA[0];
yb[0].m128 = XB[0];
yc[0].m128 = XC[0];
ya[1].m128 = XA[1];
yb[1].m128 = XB[1];
yc[1].m128 = XC[1];
ya[2].m128 = XA[2];
yb[2].m128 = XB[2];
yc[2].m128 = XC[2];
ya[3].m128 = XA[3];
yb[3].m128 = XB[3];
yc[3].m128 = XC[3];
ya[0].v128 = XA[0];
yb[0].v128 = XB[0];
yc[0].v128 = XC[0];
ya[1].v128 = XA[1];
yb[1].v128 = XB[1];
yc[1].v128 = XC[1];
ya[2].v128 = XA[2];
yb[2].v128 = XB[2];
yc[2].v128 = XC[2];
ya[3].v128 = XA[3];
yb[3].v128 = XB[3];
yc[3].v128 = XC[3];

za[0].u32[0] = ya[0].u32[0];
zb[0].u32[0] = yb[0].u32[0];

@@ -2835,18 +2835,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
zb[3].u32[3] = yb[0].u32[3];
zc[3].u32[3] = yc[0].u32[3];

XA[0] = za[0].m128;
XB[0] = zb[0].m128;
XC[0] = zc[0].m128;
XA[1] = za[1].m128;
XB[1] = zb[1].m128;
XC[1] = zc[1].m128;
XA[2] = za[2].m128;
XB[2] = zb[2].m128;
XC[2] = zc[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XC[3] = zc[3].m128;
XA[0] = za[0].v128;
XB[0] = zb[0].v128;
XC[0] = zc[0].v128;
XA[1] = za[1].v128;
XB[1] = zb[1].v128;
XC[1] = zc[1].v128;
XA[2] = za[2].v128;
XB[2] = zb[2].v128;
XC[2] = zc[2].v128;
XA[3] = za[3].v128;
XB[3] = zb[3].v128;
XC[3] = zc[3].v128;
*/
}

@@ -2886,7 +2886,7 @@ static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc,
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
XC3 = BC[3] = v128_xor( BC[3], CC[3] );

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

SALSA_8ROUNDS_SIMD128_3BUF;

@@ -3049,7 +3049,7 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
xf = (B[15] ^= C[15]);

#define ROL32( a, c ) ror32( a, c )
#define ROL32( a, c ) rol32( a, c )
#define ADD32( a, b ) ( (a)+(b) )
#define XOR( a, b ) ( (a)^(b) )
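The ROL32 hunk above is a rotation-direction fix: Salsa20's quarter round uses left rotations (e.g. x4 ^= R(x0 + xc, 7) in the reference spec), so mapping ROL32 to a right rotate breaks the hash. Reference forms of the two rotates, for comparison:

#include <stdint.h>
static inline uint32_t rol32_ref( uint32_t x, unsigned c )
{  return ( x << c ) | ( x >> ( 32 - c ) );  }
static inline uint32_t ror32_ref( uint32_t x, unsigned c )
{  return ( x >> c ) | ( x << ( 32 - c ) );  }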
@@ -5,7 +5,7 @@
#include <stdlib.h>
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );

@@ -35,7 +35,7 @@
//#include <mm_malloc.h>
#include "malloc-huge.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SCRYPT_THROUGHPUT 16
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
#define SCRYPT_THROUGHPUT 2

@@ -592,7 +592,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,

#endif /* HAVE_SHA256_8WAY */

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

static inline void sha256_16way_init_state( void *state )
{

@@ -1481,7 +1481,7 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
#else
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#endif

@@ -1494,7 +1494,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
// scrypt_throughput defined at compile time and used to replace
// MAX_WAYS to reduce memory usage.

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// scrypt_throughput = 16;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf

@@ -74,8 +74,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
memset( pad, 0x36, 64*4 );

for ( i = 0; i < Klen; i++ )
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
casti_m128i( K, i ) );
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );

sha256_4way_update( &ctx->ictx, pad, 64 );

@@ -83,8 +83,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
sha256_4way_init( &ctx->octx );
memset( pad, 0x5c, 64*4 );
for ( i = 0; i < Klen/4; i++ )
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
casti_m128i( K, i ) );
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->octx, pad, 64 );
}
@@ -158,8 +158,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,

/* ... xor U_j ... */
for ( k = 0; k < 8; k++ )
casti_m128i( T, k ) = _mm_xor_si128( casti_m128i( T, k ),
casti_m128i( U, k ) );
casti_v128u32( T, k ) = _mm_xor_si128( casti_v128u32( T, k ),
casti_v128u32( U, k ) );
}

/* Copy as many bytes as necessary into buf. */
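The xor loop above is the heart of PBKDF2's per-block function F (RFC 2898). A scalar sketch; hmac_sha256() is a stand-in name, not this file's API:

#include <stdint.h>
#include <string.h>
/* F( P, S, c, i ) = U_1 ^ U_2 ^ ... ^ U_c */
static void pbkdf2_F( uint8_t T[32], const uint8_t *P, size_t Plen,
                      const uint8_t *S_i, size_t Slen, uint64_t c )
{
   uint8_t U[32];
   hmac_sha256( U, P, Plen, S_i, Slen );  /* U_1 = HMAC( P, S || INT(i) ) */
   memcpy( T, U, 32 );
   for ( uint64_t j = 2; j <= c; j++ )
   {
      hmac_sha256( U, P, Plen, U, 32 );   /* U_j = HMAC( P, U_{j-1} ) */
      for ( int k = 0; k < 32; k++ )
         T[k] ^= U[k];                    /* ... xor U_j ... */
   }
}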
@@ -306,7 +306,7 @@ pbkdf2_sha256_8way( uint8_t *buf, size_t dkLen, const uint8_t *passwd,
}
}

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

// HMAC 16-way AVX512

@@ -84,7 +84,7 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t,

#endif // AVX2

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

typedef struct _hmac_sha256_16way_context
{

@@ -580,7 +580,7 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
// to avoid recalculating it as Y^Z. This optimization is not applicable
// when MAJ is optimized with ternary logic.

#if defined(__AVX512VL__)
#if defined(VL256)

#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )
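The trick the comment above refers to, in scalar form (a sketch; the vector code presumably follows the same algebra):

/* MAJ(X,Y,Z) = (X & Y) ^ (X & Z) ^ (Y & Z)
             = Y ^ ( (X ^ Y) & (Y ^ Z) )
   The SHA-256 working variables rotate every round, so this round's X^Y
   becomes the next round's Y^Z and can be carried forward; that is what
   the X_xor_Y / Y_xor_Z variables in the hunks below implement.  With
   ternary logic MAJ is one instruction, so the trick buys nothing there.
   For CHx: CH(X,Y,Z) = X ? Y : Z; tabulated at immediate bit
   (X<<2 | Y<<1 | Z) this sets bits {1,3,6,7}, i.e. 0xca as used above. */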
@@ -788,7 +788,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
G = _mm256_load_si256( state_in + 6 );
H = _mm256_load_si256( state_in + 7 );

#if !defined(__AVX512VL__)
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif

@@ -830,7 +830,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );

#if !defined(__AVX512VL__)
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
#endif

@@ -936,7 +936,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i IV7 = H;
const __m256i IV6 = G;

#if !defined(__AVX512VL__)
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif

@@ -981,7 +981,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
W[11] = SHA256_8WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );

#if !defined(__AVX512VL__)
#if !defined(VL256)
Y_xor_Z = _mm256_xor_si256( B, C );
#endif

@@ -1172,7 +1172,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
sha256_8way_close( &ctx, dst );
}

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

// SHA-256 16 way

@@ -10,6 +10,28 @@ static const uint32_t SHA256_IV[8] =

#if defined(__x86_64__) && defined(__SHA__)

/* common code used for rounds 12 through 51 */

#define sha256_generic_qround( s0, s1, m, a, b, c ) \
TMP = _mm_alignr_epi8( a, c, 4 ); \
s1 = _mm_sha256rnds2_epu32( s1, s0, m ); \
b = _mm_add_epi32( b, TMP ); \
b = _mm_sha256msg2_epu32( b, a ); \
m = _mm_shuffle_epi32( m, 0x0e ); \
s0 = _mm_sha256rnds2_epu32( s0, s1, m ); \
c = _mm_sha256msg1_epu32( c, a );

// r12-15
// sha256_generic_qround( s0, s1, m, t3, t0, t2 )
// r16-19
// sha256_generic_qround( s0, s1, m, t0, t1, t3 )
// r20-23
// sha256_generic_qround( s0, s1, m, t1, t2, t0 )
// r24-27
// sha256_generic_qround( s0, s1, m, t2, t3, t1 ) ...

#define sha256_opt_rounds( state_out, input, state_in ) \
{ \
__m128i STATE0, STATE1; \

@@ -547,8 +569,8 @@ void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
__m128i STATE0, STATE1, MSG, TMP;

// Load initial values
TMP = casti_m128i( istate, 0 );
STATE1 = casti_m128i( istate, 1 );
TMP = casti_v128u32( istate, 0 );
STATE1 = casti_v128u32( istate, 1 );

TMP = _mm_shuffle_epi32( TMP, 0xB1 ); // CDAB
STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); // EFGH

@@ -556,17 +578,17 @@ void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH

// Save current hash
casti_m128i( sstate, 0 ) = STATE0;
casti_m128i( sstate, 1 ) = STATE1;
casti_v128u32( sstate, 0 ) = STATE0;
casti_v128u32( sstate, 1 ) = STATE1;

// Rounds 0 to 3
MSG = casti_m128i( msg, 0 );
MSG = casti_v128u32( msg, 0 );
TMP = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL );
MSG = _mm_add_epi32( MSG, TMP );
STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
MSG = _mm_shuffle_epi32( MSG, 0x0E );
casti_m128i( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
casti_m128i( ostate, 1 ) = STATE1;
casti_v128u32( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
casti_v128u32( ostate, 1 ) = STATE1;
}

void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,

@@ -579,22 +601,22 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;

STATE0_X = casti_m128i( state_mid_X, 0 );
STATE1_X = casti_m128i( state_mid_X, 1 );
STATE0_Y = casti_m128i( state_mid_Y, 0 );
STATE1_Y = casti_m128i( state_mid_Y, 1 );
STATE0_X = casti_v128u32( state_mid_X, 0 );
STATE1_X = casti_v128u32( state_mid_X, 1 );
STATE0_Y = casti_v128u32( state_mid_Y, 0 );
STATE1_Y = casti_v128u32( state_mid_Y, 1 );

// Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
TMSG0_X = casti_m128i( msg_X, 0 );
TMSG0_Y = casti_m128i( msg_Y, 0 );
TMSG0_X = casti_v128u32( msg_X, 0 );
TMSG0_Y = casti_v128u32( msg_Y, 0 );
TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );

// Rounds 4 to 7
TMSG1_X = casti_m128i( msg_X, 1 );
TMSG1_Y = casti_m128i( msg_Y, 1 );
TMSG1_X = casti_v128u32( msg_X, 1 );
TMSG1_Y = casti_v128u32( msg_Y, 1 );
TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL );
MSG_X = _mm_add_epi32( TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X );

@@ -616,8 +638,8 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_X );

// Rounds 12 to 15
TMSG3_X = casti_m128i( msg_X, 3 );
TMSG3_Y = casti_m128i( msg_Y, 3 );
TMSG3_X = casti_v128u32( msg_X, 3 );
TMSG3_Y = casti_v128u32( msg_Y, 3 );
TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL );
MSG_X = _mm_add_epi32( TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X );

@@ -845,20 +867,20 @@ void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );

// Add saved state to new state
STATE0_X = _mm_add_epi32( STATE0_X, casti_m128i( state_save_X, 0 ) );
STATE1_X = _mm_add_epi32( STATE1_X, casti_m128i( state_save_X, 1 ) );
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_m128i( state_save_Y, 0 ) );
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_m128i( state_save_Y, 1 ) );
STATE0_X = _mm_add_epi32( STATE0_X, casti_v128u32( state_save_X, 0 ) );
STATE1_X = _mm_add_epi32( STATE1_X, casti_v128u32( state_save_X, 1 ) );
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_v128u32( state_save_Y, 0 ) );
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_v128u32( state_save_Y, 1 ) );

// Unshuffle & save state
TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); // FEBA
TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B );
STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); // DCHG
STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 );
casti_m128i( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
casti_m128i( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
casti_m128i( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
casti_m128i( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
casti_v128u32( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
casti_v128u32( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
casti_v128u32( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
casti_v128u32( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
}

#endif // SHA

@@ -887,14 +909,14 @@ static const uint32_t K256[64] =

#define sha256_neon_rounds( state_out, input, state_in ) \
{ \
uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; \
uint32x4_t STATE0, STATE1, ABCD_SAVE, EFGH_SAVE; \
uint32x4_t MSG0, MSG1, MSG2, MSG3; \
uint32x4_t TMP0, TMP1, TMP2; \
\
STATE0 = vld1q_u32( state_in ); \
STATE1 = vld1q_u32( state_in+4 ); \
ABEF_SAVE = STATE0; \
CDGH_SAVE = STATE1; \
ABCD_SAVE = STATE0; \
EFGH_SAVE = STATE1; \
\
MSG0 = load_msg( input, 0 ); \
MSG1 = load_msg( input, 1 ); \

@@ -1004,8 +1026,8 @@ static const uint32_t K256[64] =
TMP2 = STATE0; \
STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \
STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \
STATE0 = vaddq_u32( STATE0, ABEF_SAVE ); \
STATE1 = vaddq_u32( STATE1, CDGH_SAVE ); \
STATE0 = vaddq_u32( STATE0, ABCD_SAVE ); \
STATE1 = vaddq_u32( STATE1, EFGH_SAVE ); \
vst1q_u32( state_out , STATE0 ); \
vst1q_u32( state_out+4, STATE1 ); \
}

@@ -1029,8 +1051,8 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
#define sha256_neon_x2sha_rounds( state_out_X, state_out_Y, input_X, \
input_Y, state_in_X, state_in_Y ) \
{ \
uint32x4_t STATE0_X, STATE1_X, ABEF_SAVE_X, CDGH_SAVE_X; \
uint32x4_t STATE0_Y, STATE1_Y, ABEF_SAVE_Y, CDGH_SAVE_Y; \
uint32x4_t STATE0_X, STATE1_X, ABCD_SAVE_X, EFGH_SAVE_X; \
uint32x4_t STATE0_Y, STATE1_Y, ABCD_SAVE_Y, EFGH_SAVE_Y; \
uint32x4_t MSG0_X, MSG1_X, MSG2_X, MSG3_X; \
uint32x4_t MSG0_Y, MSG1_Y, MSG2_Y, MSG3_Y; \
uint32x4_t TMP0_X, TMP1_X, TMP2_X; \

@@ -1040,10 +1062,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
STATE0_Y = vld1q_u32( state_in_Y ); \
STATE1_X = vld1q_u32( state_in_X+4 ); \
STATE1_Y = vld1q_u32( state_in_Y+4 ); \
ABEF_SAVE_X = STATE0_X; \
ABEF_SAVE_Y = STATE0_Y; \
CDGH_SAVE_X = STATE1_X; \
CDGH_SAVE_Y = STATE1_Y; \
ABCD_SAVE_X = STATE0_X; \
ABCD_SAVE_Y = STATE0_Y; \
EFGH_SAVE_X = STATE1_X; \
EFGH_SAVE_Y = STATE1_Y; \
\
MSG0_X = load_msg( input_X, 0 ); \
MSG0_Y = load_msg( input_Y, 0 ); \

@@ -1245,10 +1267,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \
STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \
STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \
STATE0_X = vaddq_u32( STATE0_X, ABEF_SAVE_X ); \
STATE0_Y = vaddq_u32( STATE0_Y, ABEF_SAVE_Y ); \
STATE1_X = vaddq_u32( STATE1_X, CDGH_SAVE_X ); \
STATE1_Y = vaddq_u32( STATE1_Y, CDGH_SAVE_Y ); \
STATE0_X = vaddq_u32( STATE0_X, ABCD_SAVE_X ); \
STATE0_Y = vaddq_u32( STATE0_Y, ABCD_SAVE_Y ); \
STATE1_X = vaddq_u32( STATE1_X, EFGH_SAVE_X ); \
STATE1_Y = vaddq_u32( STATE1_Y, EFGH_SAVE_Y ); \
vst1q_u32( state_out_X , STATE0_X ); \
vst1q_u32( state_out_Y , STATE0_Y ); \
vst1q_u32( state_out_X+4, STATE1_X ); \

@@ -113,7 +113,7 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,

#endif

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

// SHA-256 16 way x86_64

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "algo-gate-api.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA256D_16WAY 1
#elif defined(__SHA__)
#define SHA256D_SHA 1

@@ -8,14 +8,14 @@ void sha256d( void *hash, const void *data, int len )
}
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_SHA)
gate->optimizations = SHA_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256d_sha;
#elif defined(SHA256D_NEON_SHA2)
gate->optimizations = SHA_OPT;
gate->optimizations = NEON_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256d_neon_sha2;
#elif defined(SHA256D_8WAY)
gate->scanhash = (void*)&scanhash_sha256d_8way;

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "algo-gate-api.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA256D_16WAY 1
#elif defined(__SHA__)
#define SHA256D_SHA 1

@@ -6,7 +6,7 @@
#include "sha256-hash.h"
#include "sph_sha2.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA256DT_16X32 1
#elif defined(__x86_64__) && defined(__SHA__)
#define SHA256DT_X86_SHA256 1

@@ -500,10 +500,10 @@ bool register_sha256dt_algo( algo_gate_t* gate )
#if defined(SHA256DT_16X32)
gate->scanhash = (void*)&scanhash_sha256dt_16x32;
#elif defined(SHA256DT_X86_SHA256)
gate->optimizations = SHA_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_x86_x2sha;
#elif defined(SHA256DT_NEON_SHA256)
gate->optimizations = SHA_OPT;
gate->optimizations = NEON_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_neon_x2sha;
#elif defined(SHA256DT_8X32)
gate->scanhash = (void*)&scanhash_sha256dt_8x32;

@@ -6,9 +6,10 @@ bool register_sha256t_algo( algo_gate_t* gate )
#if defined(SHA256T_16WAY)
gate->scanhash = (void*)&scanhash_sha256t_16way;
#elif defined(SHA256T_SHA)
gate->optimizations = SHA_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256t_sha;
#elif defined(SHA256T_NEON_SHA2)
gate->optimizations = NEON_OPT | SHA256_OPT;
gate->scanhash = (void*)&scanhash_sha256t_neon_sha2;
#elif defined(SHA256T_8WAY)
gate->scanhash = (void*)&scanhash_sha256t_8way;

@@ -28,7 +29,7 @@ bool register_sha256q_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_sha256q_16way;
gate->hash = (void*)&sha256q_16way_hash;
//#elif defined(SHA256T_SHA)
// gate->optimizations = SHA_OPT;
// gate->optimizations = SHA256_OPT;
// gate->scanhash = (void*)&scanhash_sha256q;
// gate->hash = (void*)&sha256q_hash;
#elif defined(SHA256T_8WAY)

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "algo-gate-api.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA256T_16WAY 1
#elif defined(__SHA__)
#define SHA256T_SHA 1

@@ -71,31 +71,13 @@ static const uint64_t K512[80] =

// SHA-512 implemented using SHA512 CPU extension.

// Experimental. Not tested. Not reviewed. Compile tested only.
// Experimental. Not supported. Not tested. Not reviewed. Compile tested only.
// Modelled after noloader sha256 implementation, replacing 4x32 bit
// instructions with equivalent 4x64 bit instructions and increasing rounds
// to 80.

// Needs GCC-13 for compilation.
// Needs Intel Lunar lake or Arrow Lake CPU, or AMD Zen-{5,6}? for execution.
// Modelled after noloader sha256 implementation.

// It's not clear how SHA512 will be supported before AVX10 considering how
// dependant it is on _mm256_alignr_epi64 which is only available with AVX512VL
// until AVX10-256.

#if defined(__AVX512VL__)

#define mm256_alignr_1x64( v1, v0 ) _mm256_alignr_epi64( v1, v0, 1 )

#else
// Ugly workaround to make it work with AVX2

static const __m256i mask __attribute__ ((aligned (32)))
= { 0xffffffffffffffffull, 0ull, 0ull, 0ull };

#define mm256_alignr_1x64( v1, v0 ) \
_mm256_or_si256( _mm256_and_si256( mm256_shuflr_64( v1 ), mask ), \
_mm256_and_si256( mm256_shuflr_64( v0 ), mm256_not(mask) ) );

#endif
// Needs GCC-14 for compilation.
// Needs Intel Lunarlake or Arrowlake CPU, or AMD Zen-6? for execution.
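For reference, the AVX512VL form being kept computes a one-lane right shift across a 512-bit pair. A scalar model of _mm256_alignr_epi64( v1, v0, 1 ), using uint64_t lanes:

#include <stdint.h>
/* r = ( v1 : v0 ) >> 64 bits, low 256 kept:
   r = { v0[1], v0[2], v0[3], v1[0] } */
static void alignr_1x64_model( uint64_t r[4],
                               const uint64_t v1[4], const uint64_t v0[4] )
{
   r[0] = v0[1];  r[1] = v0[2];  r[2] = v0[3];  r[3] = v1[0];
}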

void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in )

@@ -109,7 +91,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f,
0x0001020304050607 ) )
0x0001020304050607 ) );
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF

@@ -123,153 +105,233 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128 (MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
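The cast repeated through these rounds reflects the intrinsic's prototype: the round function takes its pre-added message words as a 128-bit third operand, two 64-bit words (two rounds) per call, so the full 256-bit MSG did not type-check. The pattern, as used above:

/* __m256i _mm256_sha512rnds2_epi64( __m256i a, __m256i b, __m128i k ); */
/* low two words first: */
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                   _mm256_castsi256_si128( MSG ) );
/* then the high two words are moved down for the matching STATE0 call: */
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );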
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
|
||||
TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 );
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
|
||||
_mm256_castsi256_si128( MSG ) );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
|
||||
_mm256_castsi256_si128( MSG ) );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
|
||||
|
||||
// Rounds 8-11
|
||||
TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
|
||||
TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 );
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
|
||||
_mm256_castsi256_si128( MSG ) );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
|
||||
_mm256_castsi256_si128( MSG ) );
|
||||
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
|
||||
TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 );
|
||||
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
|
||||
_mm256_castsi256_si128( MSG ) );
|
||||
TMP = mm256_shuffle2_64( TMSG3, TMSG2, 1 );
|
||||
TMSG0 = _mm256_add_epi32( TMSG0, TMP );
|
||||
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
|
||||
_mm256_castsi256_si128( MSG ) );
|
||||
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
|
||||
|
||||
// Rounds 16-19
|
||||
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
|
||||
_mm256_castsi256_si128( MSG ) );
|
||||
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
|
||||
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
|
||||
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
|
||||
_mm256_castsi256_si128( MSG ) );
|
||||
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
|
||||
|
||||
// Rounds 20-23
|
||||
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
|
||||
_mm256_castsi256_si128( MSG ) );
|
||||
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
|
||||
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
|
||||
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
|
||||
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
|
||||
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
|
||||
_mm256_castsi256_si128( MSG ) );
|
||||
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
|
||||
|
||||
// Rounds 24-27
|
||||
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
|
||||
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
|
||||
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
|
||||
_mm256_castsi256_si128( MSG ) );
|
||||
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
|
||||
TMSG3 = _mm256_add_epi32( TMSG3, TMP );
   TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
                                      _mm256_castsi256_si128( MSG ) );
   TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );

   // Rounds 28-31
   MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                      _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
   TMSG0 = _mm256_add_epi64( TMSG0, TMP );
   TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
                                      _mm256_castsi256_si128( MSG ) );
   TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );

   // Rounds 32-35
   MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                      _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
   TMSG1 = _mm256_add_epi64( TMSG1, TMP );
   TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
                                      _mm256_castsi256_si128( MSG ) );
   TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );

   // Rounds 36-39
   MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                      _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
   TMSG2 = _mm256_add_epi64( TMSG2, TMP );
   TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
                                      _mm256_castsi256_si128( MSG ) );
   TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );

   // Rounds 40-43
   MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                      _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
   TMSG3 = _mm256_add_epi64( TMSG3, TMP );
   TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
                                      _mm256_castsi256_si128( MSG ) );
   TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );

   // Rounds 44-47
   MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                      _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
   TMSG0 = _mm256_add_epi64( TMSG0, TMP );
   TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
                                      _mm256_castsi256_si128( MSG ) );
   TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );

   // Rounds 48-51
   MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                      _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
   TMSG1 = _mm256_add_epi64( TMSG1, TMP );
   TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
                                      _mm256_castsi256_si128( MSG ) );
   TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );

   // Rounds 52-55
   MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                      _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
   TMSG2 = _mm256_add_epi64( TMSG2, TMP );
   TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
                                      _mm256_castsi256_si128( MSG ) );
   TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );

   // Rounds 56-59
   MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                      _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
   TMSG3 = _mm256_add_epi64( TMSG3, TMP );
   TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
                                      _mm256_castsi256_si128( MSG ) );
   TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );

   // Rounds 60-63
   MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
   STATE1 = _mm256_sha512nds2_epi64( STATE1, STATE0, MSG );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                      _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
   TMSG0 = _mm256_add_epi64( TMSG0, TMP );
   TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
                                      _mm256_castsi256_si128( MSG ) );
   TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );

   // Rounds 64-67
   MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 16 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                      _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
   TMSG1 = _mm256_add_epi64( TMSG1, TMP );
   TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
                                      _mm256_castsi256_si128( MSG ) );
   TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );

   // Rounds 68-71
   MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 17 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                      _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
   TMSG2 = _mm256_add_epi64( TMSG2, TMP );
   TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
                                      _mm256_castsi256_si128( MSG ) );

   // Rounds 72-75
   MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 18 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                      _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
   TMSG3 = _mm256_add_epi64( TMSG3, TMP );
   TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
                                      _mm256_castsi256_si128( MSG ) );

   // Rounds 76-79
   MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 19 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                      _mm256_castsi256_si128( MSG ) );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
                                      _mm256_castsi256_si128( MSG ) );

   // Add initial state
   STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
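Each four-round group above follows the same template; only the TMSG registers rotate through the message schedule. As a minimal sketch (an illustration, not code from this commit), assuming <immintrin.h> and Intel's AVX2 SHA-512 intrinsics (-msha512), with a hypothetical helper name:

/* One 4-round group: _mm256_sha512rnds2_epi64 performs two rounds per
   call, consuming the two pre-added round constants in the low 128 bits
   of msg; the permute moves the upper constants down for rounds 3-4. */
static inline void sha512ni_quad( __m256i *state0, __m256i *state1,
                                  const __m256i w, const __m256i k )
{
   __m256i msg = _mm256_add_epi64( w, k );
   *state1 = _mm256_sha512rnds2_epi64( *state1, *state0,
                                       _mm256_castsi256_si128( msg ) );
   msg = _mm256_permute4x64_epi64( msg, 0x0E );
   *state0 = _mm256_sha512rnds2_epi64( *state0, *state1,
                                       _mm256_castsi256_si128( msg ) );
}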
@@ -289,7 +351,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
                              const uint64_t *state_in )
{
   __m256i STATE0, STATE1;
   __m256i MSG, TMP, BSWAP64;
   __m256i MSG, TMP;
   __m256i TMSG0, TMSG1, TMSG2, TMSG3;
   __m256i ABEF_SAVE, CDGH_SAVE;

@@ -308,141 +370,190 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
   // Rounds 0-3
   TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
   MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );

   // Rounds 4-7
   TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
   MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );

   // Rounds 8-11
   TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
   MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );

   // Rounds 12-15
   TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
   MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_shuffle2_64( TMSG3, TMSG2, 1 );
   TMSG0 = _mm256_add_epi64( TMSG0, TMP );
   TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );

   // Rounds 16-19
   MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
   TMSG1 = _mm256_add_epi64( TMSG1, TMP );
   TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );

   // Rounds 20-23
   MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
   TMSG2 = _mm256_add_epi64( TMSG2, TMP );
   TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );

   // Rounds 24-27
   MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
   TMSG3 = _mm256_add_epi64( TMSG3, TMP );
   TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );

   // Rounds 28-31
   MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
   TMSG0 = _mm256_add_epi64( TMSG0, TMP );
   TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );

   // Rounds 32-35
   MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
   TMSG1 = _mm256_add_epi64( TMSG1, TMP );
   TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );

   // Rounds 36-39
   MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
   TMSG2 = _mm256_add_epi64( TMSG2, TMP );
   TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );

   // Rounds 40-43
   MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
   TMSG3 = _mm256_add_epi64( TMSG3, TMP );
   TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );

   // Rounds 44-47
   MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
   TMSG0 = _mm256_add_epi64( TMSG0, TMP );
   TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );

   // Rounds 48-51
   MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
   TMSG1 = _mm256_add_epi64( TMSG1, TMP );
   TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );

   // Rounds 52-55
   MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
   TMSG2 = _mm256_add_epi64( TMSG2, TMP );
   TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );

   // Rounds 56-59
   MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
   TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
   TMSG3 = _mm256_add_epi64( TMSG3, TMP );
   TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );

   // Rounds 60-63
   MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
   STATE1 = _mm256_sha512nds2_epi64( STATE1, STATE0, MSG );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
   TMSG0 = _mm256_add_epi64( TMSG0, TMP );
   TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );

   // Rounds 64-67
   MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 16 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
   TMSG1 = _mm256_add_epi64( TMSG1, TMP );
   TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
   TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );

   // Rounds 68-71
   MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 17 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
   TMSG2 = _mm256_add_epi64( TMSG2, TMP );
   TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );

   // Rounds 72-75
   MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 18 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
   TMSG3 = _mm256_add_epi64( TMSG3, TMP );
   TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );

   // Rounds 76-79
   MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 19 ) );
   STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
   MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
   STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );

   // Add initial state
   STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
@@ -461,8 +572,22 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,

#endif

/*
#if defined(__ARM_FEATURE_NEON) && defined(__ARM_FEATURE_SHA512)

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
uint64x2_t sha512_compile_test( uint64x2_t test )
{
   test = vsha512hq_u64( test, test, test );
   test = vsha512h2q_u64( test, test, test );
   test = vsha512su0q_u64( test, test );
   test = vsha512su1q_u64( test, test, test );
   return test;
}

#endif
*/

#if defined(SIMD512)

// SHA-512 8 way 64 bit

@@ -664,8 +789,7 @@ void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data,
                mm256_ror_64( x, 61 ), \
                _mm256_srli_epi64( x, 6 ) )
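The two lines above are the tail of the vectorized small-sigma-1 macro. For reference, the scalar function it implements is fixed by FIPS 180-4; a sketch (function name hypothetical, not from the repo):

/* Scalar sigma1 for SHA-512: ROTR19(x) ^ ROTR61(x) ^ (x >> 6),
   matching the ror-61 and srl-6 terms visible in the macro above. */
static inline uint64_t sha512_ssg1( const uint64_t x )
{
   return ( ( x >> 19 ) | ( x << 45 ) )
        ^ ( ( x >> 61 ) | ( x <<  3 ) )
        ^ ( x >> 6 );
}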
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#if defined(VL256)
// 4 way is not used with AVX512 but will be with AVX10_256 when it
// becomes available.
@@ -717,7 +841,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] )
   int i;
   register __m256i A, B, C, D, E, F, G, H;

#if !defined(__AVX512VL__)
#if !defined(VL256)
// Disable for AVX10_256
   __m256i X_xor_Y, Y_xor_Z;
#endif
@@ -754,7 +878,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] )
      H = v256_64( 0x5BE0CD19137E2179 );
   }

#if !defined(__AVX512VL__)
#if !defined(VL256)
// Disable for AVX10_256
   Y_xor_Z = _mm256_xor_si256( B, C );
#endif

@@ -25,7 +25,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,

#endif

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

// SHA-512 8 way

@@ -4,7 +4,7 @@
#include <string.h>
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA512256D_8WAY 1
#elif defined(__AVX2__)
#define SHA512256D_4WAY 1

@@ -34,7 +34,7 @@
#include <string.h>
#include "shabal-hash-4way.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

#define DECL_STATE16 \
   __m512i A0, A1, A2, A3, A4, A5, A6, A7, \
@@ -300,11 +300,12 @@ static inline __m512i v512_mult_x5( const __m512i x )

#define PERM_ELT16( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
   xa0 = mm512_xor3( xm, xb1, mm512_xorandnot( \
                     v512_mult_x3( mm512_xor3( xa0, xc, \
                     v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ) ), \
                     xb3, xb2 ) ); \
   xb0 = mm512_xnor( xa0, mm512_rol_32( xb0, 1 ) ); \
   xa0 = mm512_xor3( xa0, xc, \
                     v512_mult_x5( mm512_rol_32( xa1, 15 ) ) ); \
   xb0 = mm512_rol_32( xb0, 1 ); \
   xa0 = mm512_xor3( xm, xb1, \
                     mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \
   xb0 = mm512_xnor( xa0, xb0 ); \
} while (0)
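Both the fused (old) and split (new) forms of PERM_ELT16 compute Shabal's scalar recurrence; a scalar model for comparison (macro names hypothetical, operands assumed uint32_t so the mod-2^32 truncation is implicit):

/* xa0 <- 3*( xa0 ^ 5*ROTL(xa1,15) ^ xc ) ^ xb1 ^ (xb2 & ~xb3) ^ xm
   xb0 <- ~( ROTL(xb0,1) ^ xa0 )                                    */
#define ROTL32_( x, n ) ( ( (x) << (n) ) | ( (x) >> ( 32 - (n) ) ) )
#define PERM_ELT_SCALAR( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
   (xa0) = ( ( (xa0) ^ ( ROTL32_( (xa1), 15 ) * 5U ) ^ (xc) ) * 3U ) \
           ^ (xb1) ^ ( (xb2) & ~(xb3) ) ^ (xm); \
   (xb0) = ~( ROTL32_( (xb0), 1 ) ^ (xa0) ); \
} while (0)

The split form breaks the xa0 expression at the v512_mult_x3 boundary, presumably so the rotate of xb0 can issue while the multiply chain is still in flight.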
#define PERM_STEP_0_16 do { \
@@ -905,11 +906,12 @@ static inline __m256i v256_mult_x5( const __m256i x )

#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
   xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
                     v256_mult_x3( mm256_xor3( xa0, xc, \
                     v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ) ), \
                     xb3, xb2 ) ); \
   xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
   xa0 = mm256_xor3( xa0, xc, \
                     v256_mult_x5( mm256_rol_32( xa1, 15 ) ) ); \
   xb0 = mm256_rol_32( xb0, 1 ); \
   xa0 = mm256_xor3( xm, xb1, \
                     mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \
   xb0 = mm256_xnor( xa0, xb0 ); \
} while (0)

#define PERM_STEP_0_8 do { \

@@ -8,7 +8,7 @@

#define SPH_SIZE_shabal512 512

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

typedef struct {
   __m512i buf[16];

@@ -30,8 +30,7 @@ static const uint32_t IV512[] =

#endif

#if defined (__AVX512VL__)
//TODO Enable for AVX10_256
#if defined (VL256)

#define DECL_m256i_count \
   const __m256i count = \
@@ -1,7 +1,7 @@
#include "shavite-hash-4way.h"
#include <stdint.h>

#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)

static const uint32_t IV512[] =
{

@@ -1,10 +1,10 @@
#ifndef SHAVITE_HASH_4WAY_H__
#define SHAVITE_HASH_4WAY_H__ 1

#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

#include "simd-utils.h"

#if defined(__VAES__) && defined(SIMD512)

typedef struct {
   unsigned char buf[128<<2];
   uint32_t h[16<<2];

algo/simd/nist.c
@@ -1,472 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "nist.h"
#include "simd_iv.h"


/* #define NO_PRECOMPUTED_IV */
#if defined(__SSE2__) // || defined(__ARM_NEON)

/*
 * Increase the counter.
 */
void IncreaseCounter(hashState_sd *state, DataLength databitlen) {
#ifdef HAS_64
   state->count += databitlen;
#else
   uint32_t old_count = state->count_low;
   state->count_low += databitlen;
   if (state->count_low < old_count)
      state->count_high++;
#endif
}


/*
 * Initialize the hashState_sd with a given IV.
 * If the IV is NULL, initialize with zeros.
 */
int InitIV(hashState_sd *state, int hashbitlen, const u32 *IV) {

   int n = 8;

   state->hashbitlen = hashbitlen;
   state->n_feistels = n;
   state->blocksize = 128*8;

#ifdef HAS_64
   state->count = 0;
#else
   state->count_low = 0;
   state->count_high = 0;
#endif

// state->buffer = malloc(16*n + 16);
   /*
    * Align the buffer to a 128 bit boundary.
    */
// state->buffer += ((unsigned char*)NULL - state->buffer)&15;

// state->A = malloc((4*n+4)*sizeof(u32));
   /*
    * Align the buffer to a 128 bit boundary.
    */
// state->A += ((u32*)NULL - state->A)&3;

   state->B = state->A+n;
   state->C = state->B+n;
   state->D = state->C+n;

   if (IV)
      memcpy(state->A, IV, 4*n*sizeof(u32));
   else
      memset(state->A, 0, 4*n*sizeof(u32));

// free(state->buffer);
// free(state->A);
   return 0;

}

/*
 * Initialize the hashState_sd.
 */
int init_sd(hashState_sd *state, int hashbitlen) {
   int r;
   char *init;

#ifndef NO_PRECOMPUTED_IV
// if (hashbitlen == 224)
//    r=InitIV(state, hashbitlen, IV_224);
// else if (hashbitlen == 256)
//    r=InitIV(state, hashbitlen, IV_256);
// else if (hashbitlen == 384)
//    r=InitIV(state, hashbitlen, IV_384);
// else
   if (hashbitlen == 512)
      r = InitIV(state, hashbitlen, IV_512);
   else
#endif
   {
      /*
       * Nonstandard length: IV is not precomputed.
       */
      r=InitIV(state, hashbitlen, NULL);
      if (r != 0)
         return r;

      init = malloc(state->blocksize);
      memset(init, 0, state->blocksize);
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
      snprintf(init, state->blocksize, "SIMD-%i v1.1", hashbitlen);
#else
      sprintf(init, "SIMD-%i v1.1", hashbitlen);
#endif
      SIMD_Compress(state, (unsigned char*) init, 0);
      free(init);
   }
   return r;
}

int update_sd( hashState_sd *state, const BitSequence *data,
               DataLength databitlen )
{
   unsigned current;
   unsigned int bs = state->blocksize;
   static int align = -1;

   if (align == -1)
      align = RequiredAlignment();

#ifdef HAS_64
   current = state->count & (bs - 1);
#else
   current = state->count_low & (bs - 1);
#endif

   if ( current & 7 )
   {
      // The number of hashed bits is not a multiple of 8.
      // Very painful to implement and not required by the NIST API.
      return 1;
   }

   while ( databitlen > 0 )
   {
      if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
      {
         // We can hash the data directly from the input buffer.
         SIMD_Compress(state, data, 0);
         databitlen -= bs;
         data += bs/8;
         IncreaseCounter(state, bs);
      }
      else
      {
         // Copy a chunk of data to the buffer
         unsigned int len = bs - current;
         if ( databitlen < len )
         {
            memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
            IncreaseCounter( state, databitlen );
            return 0;
         }
         else
         {
            memcpy( state->buffer+current/8, data, len/8 );
            IncreaseCounter( state,len );
            databitlen -= len;
            data += len/8;
            current = 0;
            SIMD_Compress( state, state->buffer, 0 );
         }
      }
   }
   return 0;
}

int final_sd( hashState_sd *state, BitSequence *hashval )
{
#ifdef HAS_64
   uint64_t l;
   int current = state->count & (state->blocksize - 1);
#else
   uint32_t l;
   int current = state->count_low & (state->blocksize - 1);
#endif
   unsigned int i;
   BitSequence bs[64];
   int isshort = 1;

   // If there is still some data in the buffer, hash it
   if ( current )
   {
      // We first need to zero out the end of the buffer.
      if ( current & 7 )
      {
         BitSequence mask = 0xff >> ( current & 7 );
         state->buffer[current/8] &= ~mask;
      }
      current = ( current+7 ) / 8;
      memset( state->buffer+current, 0, state->blocksize/8 - current );
      SIMD_Compress( state, state->buffer, 0 );
   }

   //* Input the message length as the last block
   memset( state->buffer, 0, state->blocksize / 8 );
#ifdef HAS_64
   l = state->count;
   for ( i=0; i<8; i++ )
   {
      state->buffer[i] = l & 0xff;
      l >>= 8;
   }
   if ( state->count < 16384 )
      isshort = 2;
#else
   l = state->count_low;
   for ( i=0; i<4; i++ )
   {
      state->buffer[i] = l & 0xff;
      l >>= 8;
   }
   l = state->count_high;
   for ( i=0; i<4; i++ )
   {
      state->buffer[4+i] = l & 0xff;
      l >>= 8;
   }
   if ( state->count_high == 0 && state->count_low < 16384 )
      isshort = 2;
#endif

   SIMD_Compress( state, state->buffer, isshort );

   // Decode the 32-bit words into a BitSequence
   for ( i=0; i < 2*state->n_feistels; i++ )
   {
      u32 x = state->A[i];
      bs[4*i  ] = x&0xff;
      x >>= 8;
      bs[4*i+1] = x&0xff;
      x >>= 8;
      bs[4*i+2] = x&0xff;
      x >>= 8;
      bs[4*i+3] = x&0xff;
   }

   memcpy( hashval, bs, state->hashbitlen / 8 );
   if ( state->hashbitlen % 8 )
   {
      BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
      hashval[state->hashbitlen/8 + 1] = bs[state->hashbitlen/8 + 1] & mask;
   }
   return 0;
}

int update_final_sd( hashState_sd *state, BitSequence *hashval,
                     const BitSequence *data, DataLength databitlen )
{
   int current, i;
   unsigned int bs = state->blocksize;
   static int align = -1;
   BitSequence out[64];
   int isshort = 1;
   uint64_t l;

   if (align == -1)
      align = RequiredAlignment();

#ifdef HAS_64
   current = state->count & (bs - 1);
#else
   current = state->count_low & (bs - 1);
#endif

   if ( current & 7 )
   {
      // The number of hashed bits is not a multiple of 8.
      // Very painful to implement and not required by the NIST API.
      return 1;
   }

   while ( databitlen > 0 )
   {
      if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
      {
         // We can hash the data directly from the input buffer.
         SIMD_Compress(state, data, 0);
         databitlen -= bs;
         data += bs/8;
         IncreaseCounter(state, bs);
      }
      else
      {
         // Copy a chunk of data to the buffer
         unsigned int len = bs - current;
         if ( databitlen < len )
         {
            memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
            IncreaseCounter( state, databitlen );
            break;
         }
         else
         {
            memcpy( state->buffer+current/8, data, len/8 );
            IncreaseCounter( state,len );
            databitlen -= len;
            data += len/8;
            current = 0;
            SIMD_Compress( state, state->buffer, 0 );
         }
      }
   }

   current = state->count & (state->blocksize - 1);

   // If there is still some data in the buffer, hash it
   if ( current )
   {
      // We first need to zero out the end of the buffer.
      if ( current & 7 )
      {
         BitSequence mask = 0xff >> ( current & 7 );
         state->buffer[current/8] &= ~mask;
      }
      current = ( current+7 ) / 8;
      memset( state->buffer+current, 0, state->blocksize/8 - current );
      SIMD_Compress( state, state->buffer, 0 );
   }

   //* Input the message length as the last block
   memset( state->buffer, 0, state->blocksize / 8 );
   l = state->count;
   for ( i=0; i<8; i++ )
   {
      state->buffer[i] = l & 0xff;
      l >>= 8;
   }
   if ( state->count < 16384 )
      isshort = 2;

   SIMD_Compress( state, state->buffer, isshort );

   // Decode the 32-bit words into a BitSequence
   for ( i=0; i < 2*state->n_feistels; i++ )
   {
      u32 x = state->A[i];
      out[4*i  ] = x & 0xff;
      x >>= 8;
      out[4*i+1] = x & 0xff;
      x >>= 8;
      out[4*i+2] = x & 0xff;
      x >>= 8;
      out[4*i+3] = x & 0xff;
   }

   memcpy( hashval, out, state->hashbitlen / 8 );
   if ( state->hashbitlen % 8 )
   {
      BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
      hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
   }
   return 0;
}

int simd_full( hashState_sd *state, BitSequence *hashval,
               const BitSequence *data, DataLength databitlen )
{

   InitIV( state, 512, IV_512 );

   int current, i;
   unsigned int bs = state->blocksize;
   static int align = -1;
   BitSequence out[64];
   int isshort = 1;
   uint64_t l;

   if (align == -1)
      align = RequiredAlignment();

#ifdef HAS_64
   current = state->count & (bs - 1);
#else
   current = state->count_low & (bs - 1);
#endif

   if ( current & 7 )
   {
      // The number of hashed bits is not a multiple of 8.
      // Very painful to implement and not required by the NIST API.
      return 1;
   }

   while ( databitlen > 0 )
   {
      if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
      {
         // We can hash the data directly from the input buffer.
         SIMD_Compress(state, data, 0);
         databitlen -= bs;
         data += bs/8;
         IncreaseCounter(state, bs);
      }
      else
      {
         // Copy a chunk of data to the buffer
         unsigned int len = bs - current;
         if ( databitlen < len )
         {
            memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
            IncreaseCounter( state, databitlen );
            break;
         }
         else
         {
            memcpy( state->buffer+current/8, data, len/8 );
            IncreaseCounter( state,len );
            databitlen -= len;
            data += len/8;
            current = 0;
            SIMD_Compress( state, state->buffer, 0 );
         }
      }
   }

   current = state->count & (state->blocksize - 1);

   // If there is still some data in the buffer, hash it
   if ( current )
   {
      // We first need to zero out the end of the buffer.
      if ( current & 7 )
      {
         BitSequence mask = 0xff >> ( current & 7 );
         state->buffer[current/8] &= ~mask;
      }
      current = ( current+7 ) / 8;
      memset( state->buffer+current, 0, state->blocksize/8 - current );
      SIMD_Compress( state, state->buffer, 0 );
   }

   //* Input the message length as the last block
   memset( state->buffer, 0, state->blocksize / 8 );
   l = state->count;
   for ( i=0; i<8; i++ )
   {
      state->buffer[i] = l & 0xff;
      l >>= 8;
   }
   if ( state->count < 16384 )
      isshort = 2;

   SIMD_Compress( state, state->buffer, isshort );

   // Decode the 32-bit words into a BitSequence
   for ( i=0; i < 2*state->n_feistels; i++ )
   {
      u32 x = state->A[i];
      out[4*i  ] = x & 0xff;
      x >>= 8;
      out[4*i+1] = x & 0xff;
      x >>= 8;
      out[4*i+2] = x & 0xff;
      x >>= 8;
      out[4*i+3] = x & 0xff;
   }

   memcpy( hashval, out, state->hashbitlen / 8 );
   if ( state->hashbitlen % 8 )
   {
      BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
      hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
   }
   return 0;
}

#endif

algo/simd/nist.h
@@ -1,64 +0,0 @@
#ifndef __NIST_H__
#define __NIST_H__

/*define data alignment for different C compilers*/
#if defined(__GNUC__)
#define DATA_ALIGN(x) x __attribute__((aligned(16)))
#else
#define DATA_ALIGN(x) __declspec(align(16)) x
#endif

#include "simd-compat.h"
#include "compat/sha3-defs.h"
/*
 * NIST API Specific types.
 */

typedef struct {
   unsigned int hashbitlen;
   unsigned int blocksize;
   unsigned int n_feistels;

#ifdef HAS_64
   uint64_t count;
#else
   uint32_t count_low;
   uint32_t count_high;
#endif

   DATA_ALIGN(uint32_t A[32]);
   uint32_t *B;
   uint32_t *C;
   uint32_t *D;
   DATA_ALIGN(unsigned char buffer[128]);

} hashState_sd;

/*
 * NIST API
 */

int init_sd(hashState_sd *state, int hashbitlen);

int update_sd(hashState_sd *state, const BitSequence *data, DataLength databitlen);

int final_sd(hashState_sd *state, BitSequence *hashval);

int update_final_sd( hashState_sd *state, BitSequence *hashval,
                     const BitSequence *data, DataLength databitlen );

int simd_full( hashState_sd *state, BitSequence *hashval,
               const BitSequence *data, DataLength databitlen );

/*
 * Internal API
 */

//int SupportedLength(int hashbitlen);
int RequiredAlignment(void);
void SIMD_Compress(hashState_sd * state, const unsigned char *M, int final);

void fft128_natural(fft_t *a, unsigned char *x);
void fft256_natural(fft_t *a, unsigned char *x);

#endif
algo/simd/simd-compat.h
@@ -1,198 +0,0 @@
#ifndef __SIMD_COMPAT_H__
#define __SIMD_COMPAT_H__

#include <limits.h>


/*
 * This file defines some helper functions for cross-platform compatibility.
 */
#if defined __GNUC_PREREQ && (! defined __STRICT_ANSI__)
#define GNU_EXT
#endif

/*
 * First define some integer types.
 */

#if defined __STDC__ && __STDC_VERSION__ >= 199901L

/*
 * On C99 implementations, we can use <stdint.h> to get an exact 32-bit
 * type, if any, or otherwise use a wider type.
 */

#include <stdint.h>
#include "compat/brg_types.h"

#define C32(x) ((u32)(x))

#define HAS_64 1

#else

/*
 * On non-C99 systems, we use "unsigned int" if it is wide enough,
 * "unsigned long" otherwise. This supports all "reasonable" architectures.
 * We have to be cautious: pre-C99 preprocessors handle constants
 * differently in '#if' expressions. Hence the shifts to test UINT_MAX.
 */

#if ((UINT_MAX >> 11) >> 11) >= 0x3FF

typedef unsigned int u32;

#define C32(x) ((u32)(x ## U))

#else

typedef unsigned long u32;

#define C32(x) ((u32)(x ## UL))

#endif

/*
 * We want a 64-bit type. We use "unsigned long" if it is wide enough (as
 * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
 * "unsigned long long" otherwise, if available. We use ULLONG_MAX to
 * test whether "unsigned long long" is available; we also know that
 * gcc features this type, even if the libc headers do not know it.
 */

#if ((ULONG_MAX >> 31) >> 31) >= 3

typedef unsigned long u64;

#define HAS_64 1

#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__

typedef unsigned long long u64;

#define HAS_64 1

#else

/*
 * No 64-bit type...
 */

#endif

#endif


/*
 * fft_t should be at least 16 bits wide.
 * using short int will require less memory, but int is faster...
 */

typedef int fft_t;


/*
 * Implementation note: some processors have specific opcodes to perform
 * a rotation. Recent versions of gcc recognize the expression above and
 * use the relevant opcodes, when appropriate.
 */

#define T32(x) ((x) & C32(0xFFFFFFFF))
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
#define ROTR32(x, n) ROTL32(x, (32 - (n)))
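A tiny self-check of the rotation macros (an illustration, not part of the original header) shows the wrap-around behaviour; recent gcc/clang compile this shift-or pattern to a single rotate instruction on x86:

#include <assert.h>
static void rotl32_selftest( void )
{
   /* the top bit wraps into bit 0: 0x80000001 rol 1 == 0x00000003 */
   assert( ROTL32( C32(0x80000001), 1 ) == C32(0x00000003) );
   assert( ROTR32( C32(0x00000003), 1 ) == C32(0x80000001) );
}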
/*
 * The macro MAYBE_INLINE expands to an inline qualifier, if available.
 */

#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined GNU_EXT
#define MAYBE_INLINE static inline
#elif defined _MSC_VER
#define MAYBE_INLINE __inline
#else
#define MAYBE_INLINE
#endif


/* */

#if defined __GNUC__ && ( defined __i386__ || defined __x86_64__ )

#define rdtsc() \
({ \
   u32 lo, hi; \
   __asm__ __volatile__ ( /* serialize */ \
   "xorl %%eax,%%eax \n cpuid" \
   ::: "%rax", "%rbx", "%rcx", "%rdx"); \
   /* We cannot use "=A", since this would use %rax on x86_64 */ \
   __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); \
   (u64)hi << 32 | lo; \
}) \

#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)

#define rdtsc __rdtsc

#endif

/*
 * The IS_ALIGNED macro tests if a char* pointer is aligned to an
 * n-bit boundary.
 * It is defined as false on unknown architectures.
 */

#define CHECK_ALIGNED(p,n) ((((unsigned char *) (p) - (unsigned char *) NULL) & ((n)-1)) == 0)

#if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
/*
 * Unaligned 32-bit accesses are not expensive on x86 so we don't care
 */
#define IS_ALIGNED(p,n) (n<=4 || CHECK_ALIGNED(p,n))

#elif defined __sparcv9 || defined __sparc || defined __arm || \
      defined __ia64 || defined __ia64__ || \
      defined __itanium__ || defined __M_IA64 || \
      defined __powerpc__ || defined __powerpc
#define IS_ALIGNED(p,n) CHECK_ALIGNED(p,n)

#else
/*
 * Unknown architecture: play safe
 */
#define IS_ALIGNED(p,n) 0
#endif
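IS_ALIGNED is what lets update_sd and friends hash straight from the caller's buffer instead of staging through state->buffer. A hedged usage sketch (the 16-byte constant stands in for RequiredAlignment(), which the real code caches; variable names as in update_sd above):

   /* Fast path only when the pointer is aligned for SSE loads. */
   if ( IS_ALIGNED( data, 16 ) && databitlen >= bs )
      SIMD_Compress( state, data, 0 );         /* zero-copy, aligned loads */
   else
   {
      memcpy( state->buffer, data, bs / 8 );   /* realign through the buffer */
      SIMD_Compress( state, state->buffer, 0 );
   }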
/* checks for endianness */

#if defined (__linux__) || defined (__GLIBC__)
#  include <endian.h>
#elif defined (__FreeBSD__)
#  include <machine/endian.h>
#elif defined (__OpenBSD__)
#  include <sys/endian.h>
#endif

#ifdef __BYTE_ORDER

#  if __BYTE_ORDER == __LITTLE_ENDIAN
#    define SIMD_LITTLE_ENDIAN
#  elif __BYTE_ORDER == __BIG_ENDIAN
#    define SIMD_BIG_ENDIAN
#  endif

#else

#  if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
#    define SIMD_LITTLE_ENDIAN
#  endif

#endif


#endif