mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2026-02-23 17:03:08 +00:00
Compare commits
15 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8f2f9ec3e9 | ||
|
|
12480a3ea5 | ||
|
|
aa47e880d5 | ||
|
|
66191db93c | ||
|
|
dd99580a4c | ||
|
|
1ed18bf22e | ||
|
|
1d9341ee92 | ||
|
|
a45a333b40 | ||
|
|
2b1037a7c7 | ||
|
|
06624a0ff2 | ||
|
|
8e91bfbe19 | ||
|
|
47e24b50e8 | ||
|
|
c47c4a8885 | ||
|
|
042d13d1e1 | ||
|
|
4f930574cc |
55
Makefile.am
55
Makefile.am
@@ -1,27 +1,49 @@
|
|||||||
|
|
||||||
|
if HAVE_APPLE
|
||||||
|
# MacOS uses Homebrew to install needed packages but they aren't linked for
|
||||||
|
# the jansson test in configure. Ignore the failed test & link them now,
|
||||||
|
# different path for different CPU arch.
|
||||||
|
|
||||||
|
if ARCH_ARM64
|
||||||
|
EXTRA_INCLUDES = -I/opt/homebrew/include
|
||||||
|
EXTRA_LIBS = -L/opt/homebrew/lib
|
||||||
|
else
|
||||||
|
EXTRA_INCLUDES = -I/usr/local/include
|
||||||
|
EXTRA_LIBS = -L/usr/local/lib
|
||||||
|
endif
|
||||||
|
|
||||||
|
else
|
||||||
|
|
||||||
if WANT_JANSSON
|
if WANT_JANSSON
|
||||||
JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson
|
# Can't find jansson libraries, compile the included source code.
|
||||||
|
EXTRA_INCLUDES = -I$(top_srcdir)/compat/jansson
|
||||||
|
EXTRA_LIBS = -L$(top_srcdir)/compat/jansson
|
||||||
else
|
else
|
||||||
JANSSON_INCLUDES=
|
EXTRA_INCLUDES =
|
||||||
|
EXTRA_LIBS =
|
||||||
|
endif
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
EXTRA_DIST = example-cfg.json nomacro.pl
|
EXTRA_DIST = example-cfg.json nomacro.pl
|
||||||
|
|
||||||
SUBDIRS = compat
|
SUBDIRS = compat
|
||||||
|
|
||||||
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) -I.
|
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(EXTRA_INCLUDES) -I.
|
||||||
|
|
||||||
bin_PROGRAMS = cpuminer
|
bin_PROGRAMS = cpuminer
|
||||||
|
|
||||||
dist_man_MANS = cpuminer.1
|
dist_man_MANS = cpuminer.1
|
||||||
|
|
||||||
cpuminer_SOURCES = \
|
cpuminer_SOURCES = \
|
||||||
|
dummy.cpp \
|
||||||
cpu-miner.c \
|
cpu-miner.c \
|
||||||
util.c \
|
util.c \
|
||||||
api.c \
|
api.c \
|
||||||
sysinfos.c \
|
sysinfos.c \
|
||||||
algo-gate-api.c\
|
algo-gate-api.c\
|
||||||
malloc-huge.c \
|
malloc-huge.c \
|
||||||
|
simd-utils/simd-constants.c \
|
||||||
algo/argon2d/argon2d-gate.c \
|
algo/argon2d/argon2d-gate.c \
|
||||||
algo/argon2d/blake2/blake2b.c \
|
algo/argon2d/blake2/blake2b.c \
|
||||||
algo/argon2d/argon2d/argon2.c \
|
algo/argon2d/argon2d/argon2.c \
|
||||||
@@ -113,7 +135,6 @@ cpuminer_SOURCES = \
|
|||||||
algo/lyra2/phi2-4way.c \
|
algo/lyra2/phi2-4way.c \
|
||||||
algo/lyra2/phi2.c \
|
algo/lyra2/phi2.c \
|
||||||
algo/m7m/m7m.c \
|
algo/m7m/m7m.c \
|
||||||
algo/m7m/magimath.cpp \
|
|
||||||
algo/nist5/nist5-gate.c \
|
algo/nist5/nist5-gate.c \
|
||||||
algo/nist5/nist5-4way.c \
|
algo/nist5/nist5-4way.c \
|
||||||
algo/nist5/nist5.c \
|
algo/nist5/nist5.c \
|
||||||
@@ -166,9 +187,6 @@ cpuminer_SOURCES = \
|
|||||||
algo/shavite/sph-shavite-aesni.c \
|
algo/shavite/sph-shavite-aesni.c \
|
||||||
algo/shavite/shavite-hash-2way.c \
|
algo/shavite/shavite-hash-2way.c \
|
||||||
algo/shavite/shavite-hash-4way.c \
|
algo/shavite/shavite-hash-4way.c \
|
||||||
algo/shavite/shavite.c \
|
|
||||||
algo/simd/nist.c \
|
|
||||||
algo/simd/vector.c \
|
|
||||||
algo/simd/sph_simd.c \
|
algo/simd/sph_simd.c \
|
||||||
algo/simd/simd-hash-2way.c \
|
algo/simd/simd-hash-2way.c \
|
||||||
algo/skein/sph_skein.c \
|
algo/skein/sph_skein.c \
|
||||||
@@ -276,28 +294,28 @@ cpuminer_SOURCES = \
|
|||||||
algo/yespower/yespower-ref.c \
|
algo/yespower/yespower-ref.c \
|
||||||
algo/yespower/yespower-blake2b-ref.c
|
algo/yespower/yespower-blake2b-ref.c
|
||||||
|
|
||||||
disable_flags =
|
|
||||||
|
|
||||||
if USE_ASM
|
|
||||||
cpuminer_SOURCES += asm/neoscrypt_asm.S
|
|
||||||
else
|
|
||||||
disable_flags += -DNOASM
|
|
||||||
endif
|
|
||||||
|
|
||||||
if HAVE_WINDOWS
|
if HAVE_WINDOWS
|
||||||
cpuminer_SOURCES += compat/winansi.c
|
cpuminer_SOURCES += compat/winansi.c
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
if USE_ASM
|
||||||
|
disable_flags =
|
||||||
|
cpuminer_SOURCES += asm/neoscrypt_asm.S
|
||||||
|
else
|
||||||
|
disable_flags = -DNOASM
|
||||||
|
endif
|
||||||
|
|
||||||
cpuminer_LDFLAGS = @LDFLAGS@
|
cpuminer_LDFLAGS = @LDFLAGS@
|
||||||
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
|
cpuminer_LDADD = $(EXTRA_LIBS) @LIBCURL@ -ljansson @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
|
||||||
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
|
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
|
||||||
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
|
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
|
||||||
|
|
||||||
if HAVE_WINDOWS
|
if ARCH_ARM64
|
||||||
cpuminer_CFLAGS += -Wl,--stack,10485760
|
cpuminer_CFLAGS += -flax-vector-conversions
|
||||||
endif
|
endif
|
||||||
|
|
||||||
if HAVE_WINDOWS
|
if HAVE_WINDOWS
|
||||||
|
|
||||||
# use to profile an object
|
# use to profile an object
|
||||||
# gprof_cflags = -pg -g3
|
# gprof_cflags = -pg -g3
|
||||||
# cpuminer_LDFLAGS += -pg
|
# cpuminer_LDFLAGS += -pg
|
||||||
@@ -311,5 +329,4 @@ cpuminer-neoscrypt.o: neoscrypt.c
|
|||||||
@echo "CUSTOM ${@}: ${filter %.o,${^}} ${filter %.c,${^}}"
|
@echo "CUSTOM ${@}: ${filter %.o,${^}} ${filter %.c,${^}}"
|
||||||
$(CC) $(common_ccflags) -g -O3 $(gprof_cflags) -MT $@ -MD -MP -c -o $@ $<
|
$(CC) $(common_ccflags) -g -O3 $(gprof_cflags) -MT $@ -MD -MP -c -o $@ $<
|
||||||
|
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|||||||
34
README.md
34
README.md
@@ -36,44 +36,28 @@ for compile instructions.
|
|||||||
Requirements
|
Requirements
|
||||||
------------
|
------------
|
||||||
|
|
||||||
1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
|
1. A 64 bit CPU supporting x86_64 (Intel or AMD) or aarch64 (ARM).
|
||||||
Intel Core2 and newer and AMD equivalents. Further optimizations are available
|
x86_64 requires SSE2, aarch64 requires armv8 & NEON.
|
||||||
on some algoritms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
|
|
||||||
|
|
||||||
32 bit CPUs are not supported.
|
|
||||||
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
|
|
||||||
are not supported.
|
|
||||||
|
|
||||||
Mobile CPUs like laptop computers are not recommended because they aren't
|
Mobile CPUs like laptop computers are not recommended because they aren't
|
||||||
designed for extreme heat of operating at full load for extended periods of
|
designed for extreme heat of operating at full load for extended periods of
|
||||||
time.
|
time.
|
||||||
|
|
||||||
Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.
|
2. 64 bit operating system including Linux, Windows, MacOS, or BSD.
|
||||||
|
Android, iOS and alt OSs like Haiku & ReactOS are not supported.
|
||||||
2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
|
|
||||||
including Mint and Centos, are known to work and have all dependencies
|
|
||||||
in their repositories. Others may work but may require more effort. Older
|
|
||||||
versions such as Centos 6 don't work due to missing features.
|
|
||||||
|
|
||||||
Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
|
|
||||||
binaries. WindowsXP 64 bit is YMMV.
|
|
||||||
|
|
||||||
FreeBSD is not actively tested but should work, YMMV.
|
|
||||||
MacOS, OSx and Android are not supported.
|
|
||||||
|
|
||||||
3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
|
3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
|
||||||
RPC getwork using http:// or https://.
|
RPC getblocktemplate using http:// or https://.
|
||||||
GBT is YMMV.
|
|
||||||
|
|
||||||
Supported Algorithms
|
Supported Algorithms
|
||||||
--------------------
|
--------------------
|
||||||
|
|
||||||
allium Garlicoin
|
allium Garlicoin
|
||||||
anime Animecoin
|
anime Animecoin
|
||||||
argon2 Argon2 coin (AR2)
|
argon2d250
|
||||||
argon2d250 argon2d-crds, Credits (CRDS)
|
argon2d500
|
||||||
argon2d500 argon2d-dyn, Dynamic (DYN)
|
argon2d1000
|
||||||
argon2d4096 argon2d-uis, Unitus, (UIS)
|
argon2d4096
|
||||||
blake Blake-256
|
blake Blake-256
|
||||||
blake2b Blake2-512
|
blake2b Blake2-512
|
||||||
blake2s Blake2-256
|
blake2s Blake2-256
|
||||||
|
|||||||
104
RELEASE_NOTES
104
RELEASE_NOTES
@@ -75,6 +75,110 @@ If not what makes it happen or not happen?
|
|||||||
Change Log
|
Change Log
|
||||||
----------
|
----------
|
||||||
|
|
||||||
|
v25.7
|
||||||
|
|
||||||
|
Fixed a bug calculating TTF longer than 1 year.
|
||||||
|
Faster argon2d.
|
||||||
|
Faster hamsi AVX512.
|
||||||
|
Faster swifftx AVX2.
|
||||||
|
Other small fixes and improvements.
|
||||||
|
|
||||||
|
v25.6
|
||||||
|
|
||||||
|
Added argon2d1000, argon2d16000 algos.
|
||||||
|
Target specific AES optimizations improve shavite for ARM64 & x86_64.
|
||||||
|
|
||||||
|
v25.5
|
||||||
|
|
||||||
|
x86_64: Fixed an insidious bug in sha256 early rejection optimization for AVX2 & AVX512.
|
||||||
|
x86_64: Faster sha256d, sha256dt for AVX2 & AVX512.
|
||||||
|
Other small bug fixes.
|
||||||
|
|
||||||
|
v25.4
|
||||||
|
|
||||||
|
x86_64: improved handling of vector constants used for byte permutations.
|
||||||
|
x86_64: removed hooks for cancelled AVX10-256.
|
||||||
|
Minor bug fixes & improvements.
|
||||||
|
More code cleanup.
|
||||||
|
|
||||||
|
v25.3
|
||||||
|
|
||||||
|
#442, #443: Fixed a regression in Makefile.am.
|
||||||
|
Removed algo features log display.
|
||||||
|
Some code cleanup.
|
||||||
|
|
||||||
|
v25.2
|
||||||
|
|
||||||
|
ARM: Fixed regression from v25.1 that could cause build fail.
|
||||||
|
BSD: FreeBSD is now supported. Other BSDs may also work.
|
||||||
|
MacOS: build with installed jansson library instead of compiling the included source code.
|
||||||
|
Windows: remove "_WIN32_WINNT=0x0601" which was a downgrade on Win11.
|
||||||
|
Changed build.sh shell from bash to sh.
|
||||||
|
|
||||||
|
v25.1
|
||||||
|
|
||||||
|
MacOS ARM64: m7m algo is now working.
|
||||||
|
MacOS ARM64: can now be compiled with GCC.
|
||||||
|
MacOS x86_64: is now working compiled with GCC.
|
||||||
|
Fixed some minor bugs & removed some obsolete code.
|
||||||
|
|
||||||
|
v24.8
|
||||||
|
|
||||||
|
ARM: Apple MacOS on M series CPU is now supported compiled from source
|
||||||
|
code, see Wiki for details.
|
||||||
|
ARM: Fix incorrect compiler version display when using clang.
|
||||||
|
build.sh can now be used to compile all targets, arm_build.sh & build_msys2.sh
|
||||||
|
have been removed.
|
||||||
|
Windows: MSys2 build now enables CPU groups by default, prebuilt binaries
|
||||||
|
continue to be compiled with CPU groups disabled.
|
||||||
|
|
||||||
|
v24.7
|
||||||
|
|
||||||
|
ARM: compile works for Windows using MSys2 & MingW, see wiki for details.
|
||||||
|
|
||||||
|
v24.6
|
||||||
|
|
||||||
|
ARM: Fixed scryptn2, x16*, broken in v24.2.
|
||||||
|
ARM: Small improvement to interleaving.
|
||||||
|
Eliminated some potential compile errors in code that was dependent on
|
||||||
|
compiler optimisations.
|
||||||
|
x86_64: improved support for AVX10 compilation, needs GCC-14 or higher.
|
||||||
|
|
||||||
|
v24.5
|
||||||
|
|
||||||
|
Fix MinGW compile error after MSys2 upgrade to GCC-14.2.
|
||||||
|
#427: GBT: Improved handling of new work.
|
||||||
|
Removed shavite3 algo.
|
||||||
|
|
||||||
|
v24.4
|
||||||
|
|
||||||
|
x86_64: fixed a bug in ornot macro for AVX2 which broke some algos in v24.2.
|
||||||
|
x86_64: fixed a bug in alignr macros for SSE2.
|
||||||
|
ARM: CPU feature reporting enhancements.
|
||||||
|
Some code cleanup.
|
||||||
|
|
||||||
|
v24.3
|
||||||
|
|
||||||
|
ARM: CPU feature detection and reporting is now working.
|
||||||
|
ARM: Verthash is now working.
|
||||||
|
ARM: Small speedup for yescrypt, yespower & argon2d.
|
||||||
|
Code cleanup.
|
||||||
|
|
||||||
|
v24.2
|
||||||
|
|
||||||
|
x86_64: Fixed blake2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4.
|
||||||
|
x86_64: Initial support for CPUs with AVX10, needs GCC-14.
|
||||||
|
ARM NEON: Various code optimisations.
|
||||||
|
|
||||||
|
v24.1
|
||||||
|
|
||||||
|
#414: fix bug in merkle error handling.
|
||||||
|
#416: change $nproc to $(nproc) in build scripts.
|
||||||
|
#420: change some inline function definitions to static inline.
|
||||||
|
#413: Fix formatting error for share result log when using no-color.
|
||||||
|
Faster 2 way interleaving.
|
||||||
|
Cleanup sha256 architecture targeting.
|
||||||
|
|
||||||
v23.15
|
v23.15
|
||||||
|
|
||||||
Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
|
Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
|
||||||
|
|||||||
@@ -184,7 +184,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
//int scanhash_8way_64_64( struct work *work, uint32_t max_nonce,
|
//int scanhash_8way_64_64( struct work *work, uint32_t max_nonce,
|
||||||
// uint64_t *hashes_done, struct thr_info *mythr )
|
// uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
@@ -263,8 +263,8 @@ static void init_algo_gate( algo_gate_t* gate )
|
|||||||
gate->build_block_header = (void*)&std_build_block_header;
|
gate->build_block_header = (void*)&std_build_block_header;
|
||||||
gate->build_extraheader = (void*)&std_build_extraheader;
|
gate->build_extraheader = (void*)&std_build_extraheader;
|
||||||
gate->set_work_data_endian = (void*)&do_nothing;
|
gate->set_work_data_endian = (void*)&do_nothing;
|
||||||
gate->resync_threads = (void*)&do_nothing;
|
// gate->resync_threads = (void*)&do_nothing;
|
||||||
gate->do_this_thread = (void*)&return_true;
|
// gate->do_this_thread = (void*)&return_true;
|
||||||
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
|
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
|
||||||
gate->get_work_data_size = (void*)&std_get_work_data_size;
|
gate->get_work_data_size = (void*)&std_get_work_data_size;
|
||||||
gate->optimizations = EMPTY_SET;
|
gate->optimizations = EMPTY_SET;
|
||||||
@@ -295,8 +295,10 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
|||||||
{
|
{
|
||||||
case ALGO_ALLIUM: rc = register_allium_algo ( gate ); break;
|
case ALGO_ALLIUM: rc = register_allium_algo ( gate ); break;
|
||||||
case ALGO_ANIME: rc = register_anime_algo ( gate ); break;
|
case ALGO_ANIME: rc = register_anime_algo ( gate ); break;
|
||||||
case ALGO_ARGON2D250: rc = register_argon2d_crds_algo ( gate ); break;
|
case ALGO_ARGON2D250: rc = register_argon2d250_algo ( gate ); break;
|
||||||
case ALGO_ARGON2D500: rc = register_argon2d_dyn_algo ( gate ); break;
|
case ALGO_ARGON2D500: rc = register_argon2d500_algo ( gate ); break;
|
||||||
|
case ALGO_ARGON2D1000: rc = register_argon2d1000_algo ( gate ); break;
|
||||||
|
case ALGO_ARGON2D16000: rc = register_argon2d16000_algo ( gate ); break;
|
||||||
case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break;
|
case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break;
|
||||||
case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break;
|
case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break;
|
||||||
case ALGO_BLAKE: rc = register_blake_algo ( gate ); break;
|
case ALGO_BLAKE: rc = register_blake_algo ( gate ); break;
|
||||||
@@ -340,7 +342,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
|||||||
case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
|
case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
|
||||||
case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
|
case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
|
||||||
case ALGO_SHA512256D: rc = register_sha512256d_algo ( gate ); break;
|
case ALGO_SHA512256D: rc = register_sha512256d_algo ( gate ); break;
|
||||||
case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
|
|
||||||
case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
|
case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
|
||||||
case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
|
case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
|
||||||
case ALGO_SKUNK: rc = register_skunk_algo ( gate ); break;
|
case ALGO_SKUNK: rc = register_skunk_algo ( gate ); break;
|
||||||
@@ -417,8 +418,6 @@ void exec_hash_function( int algo, void *output, const void *pdata )
|
|||||||
const char* const algo_alias_map[][2] =
|
const char* const algo_alias_map[][2] =
|
||||||
{
|
{
|
||||||
// alias proper
|
// alias proper
|
||||||
{ "argon2d-dyn", "argon2d500" },
|
|
||||||
{ "argon2d-uis", "argon2d4096" },
|
|
||||||
{ "bcd", "x13bcd" },
|
{ "bcd", "x13bcd" },
|
||||||
{ "bitcore", "timetravel10" },
|
{ "bitcore", "timetravel10" },
|
||||||
{ "bitzeny", "yescryptr8" },
|
{ "bitzeny", "yescryptr8" },
|
||||||
|
|||||||
@@ -98,25 +98,27 @@ typedef uint32_t set_t;
|
|||||||
#define AVX512_OPT 1 << 6 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
|
#define AVX512_OPT 1 << 6 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
|
||||||
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
|
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
|
||||||
#define VAES_OPT 1 << 8 // Icelake, Zen3
|
#define VAES_OPT 1 << 8 // Icelake, Zen3
|
||||||
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
|
#define SHA256_OPT 1 << 9 // Zen1, Icelake, AArch64
|
||||||
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
|
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
|
||||||
#define NEON_OPT 1 << 11 // AArch64
|
#define NEON_OPT 1 << 11 // AArch64
|
||||||
|
#define AVX10_256 1 << 12
|
||||||
|
#define AVX10_512 1 << 13
|
||||||
|
|
||||||
// AVX10 does not have explicit algo features:
|
// AVX10 does not have explicit algo features:
|
||||||
// AVX10_512 is compatible with AVX512 + VAES
|
// AVX10_512 is compatible with AVX512 + VAES
|
||||||
// AVX10_256 is compatible with AVX2 + VAES
|
// AVX10_256 is compatible with AVX2 + VAES
|
||||||
|
|
||||||
// return set containing all elements from sets a & b
|
// return set containing all elements from sets a & b
|
||||||
inline set_t set_union ( set_t a, set_t b ) { return a | b; }
|
static inline set_t set_union ( set_t a, set_t b ) { return a | b; }
|
||||||
|
|
||||||
// return set contained common elements from sets a & b
|
// return set contained common elements from sets a & b
|
||||||
inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
|
static inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
|
||||||
|
|
||||||
// all elements in set a are included in set b
|
// all elements in set a are included in set b
|
||||||
inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
|
static inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
|
||||||
|
|
||||||
// no elements in set a are included in set b
|
// no elements in set a are included in set b
|
||||||
inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
|
static inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
@@ -163,15 +165,18 @@ char* ( *malloc_txs_request ) ( struct work* );
|
|||||||
void ( *set_work_data_endian ) ( struct work* );
|
void ( *set_work_data_endian ) ( struct work* );
|
||||||
|
|
||||||
// Diverge mining threads
|
// Diverge mining threads
|
||||||
bool ( *do_this_thread ) ( int );
|
//bool ( *do_this_thread ) ( int );
|
||||||
|
|
||||||
// After do_this_thread
|
// After do_this_thread
|
||||||
void ( *resync_threads ) ( int, struct work* );
|
//void ( *resync_threads ) ( int, struct work* );
|
||||||
|
|
||||||
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
|
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
|
||||||
|
|
||||||
|
// Deprecated
|
||||||
set_t optimizations;
|
set_t optimizations;
|
||||||
|
|
||||||
int ( *get_work_data_size ) ();
|
int ( *get_work_data_size ) ();
|
||||||
|
|
||||||
int ntime_index;
|
int ntime_index;
|
||||||
int nbits_index;
|
int nbits_index;
|
||||||
int nonce_index; // use with caution, see warning below
|
int nonce_index; // use with caution, see warning below
|
||||||
@@ -246,7 +251,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
//int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce,
|
//int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce,
|
||||||
// uint64_t *hashes_done, struct thr_info *mythr );
|
// uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
@@ -272,8 +277,6 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
|
|||||||
|
|
||||||
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
|
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
|
||||||
void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
|
void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
|
||||||
// OpenSSL sha256 deprecated
|
|
||||||
//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
|
|
||||||
|
|
||||||
bool std_le_work_decode( struct work *work );
|
bool std_le_work_decode( struct work *work );
|
||||||
bool std_be_work_decode( struct work *work );
|
bool std_be_work_decode( struct work *work );
|
||||||
|
|||||||
@@ -6,9 +6,39 @@ static const size_t INPUT_BYTES = 80; // Lenth of a block header in bytes. Inpu
|
|||||||
static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
|
static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
|
||||||
static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS
|
static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS
|
||||||
|
|
||||||
// Credits
|
// generic, works with most variations of argon2d
|
||||||
|
int scanhash_argon2d( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
uint32_t _ALIGN(64) edata[20];
|
||||||
|
uint32_t _ALIGN(64) hash[8];
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
const int thr_id = mythr->id;
|
||||||
|
const uint32_t first_nonce = (const uint32_t)pdata[19];
|
||||||
|
const uint32_t last_nonce = (const uint32_t)max_nonce;
|
||||||
|
uint32_t nonce = first_nonce;
|
||||||
|
const bool bench = opt_benchmark;
|
||||||
|
|
||||||
void argon2d_crds_hash( void *output, const void *input )
|
v128_bswap32_80( edata, pdata );
|
||||||
|
do
|
||||||
|
{
|
||||||
|
edata[19] = nonce;
|
||||||
|
algo_gate.hash( hash, edata, thr_id );
|
||||||
|
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
|
||||||
|
{
|
||||||
|
pdata[19] = bswap_32( nonce );
|
||||||
|
submit_solution( work, hash, mythr );
|
||||||
|
}
|
||||||
|
nonce++;
|
||||||
|
} while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );
|
||||||
|
|
||||||
|
pdata[19] = nonce;
|
||||||
|
*hashes_done = pdata[19] - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void argon2d250_hash( void *output, const void *input )
|
||||||
{
|
{
|
||||||
argon2_context context;
|
argon2_context context;
|
||||||
context.out = (uint8_t *)output;
|
context.out = (uint8_t *)output;
|
||||||
@@ -34,48 +64,15 @@ void argon2d_crds_hash( void *output, const void *input )
|
|||||||
argon2_ctx( &context, Argon2_d );
|
argon2_ctx( &context, Argon2_d );
|
||||||
}
|
}
|
||||||
|
|
||||||
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
|
bool register_argon2d250_algo( algo_gate_t* gate )
|
||||||
uint64_t *hashes_done, struct thr_info *mythr )
|
|
||||||
{
|
{
|
||||||
uint32_t _ALIGN(64) edata[20];
|
gate->scanhash = (void*)&scanhash_argon2d;
|
||||||
uint32_t _ALIGN(64) hash[8];
|
gate->hash = (void*)&argon2d250_hash;
|
||||||
uint32_t *pdata = work->data;
|
|
||||||
uint32_t *ptarget = work->target;
|
|
||||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
|
||||||
const uint32_t first_nonce = pdata[19];
|
|
||||||
const uint32_t Htarg = ptarget[7];
|
|
||||||
uint32_t nonce = first_nonce;
|
|
||||||
|
|
||||||
swab32_array( edata, pdata, 20 );
|
|
||||||
|
|
||||||
do {
|
|
||||||
be32enc(&edata[19], nonce);
|
|
||||||
argon2d_crds_hash( hash, edata );
|
|
||||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
|
|
||||||
{
|
|
||||||
pdata[19] = nonce;
|
|
||||||
submit_solution( work, hash, mythr );
|
|
||||||
}
|
|
||||||
nonce++;
|
|
||||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
|
||||||
|
|
||||||
pdata[19] = nonce;
|
|
||||||
*hashes_done = pdata[19] - first_nonce + 1;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool register_argon2d_crds_algo( algo_gate_t* gate )
|
|
||||||
{
|
|
||||||
gate->scanhash = (void*)&scanhash_argon2d_crds;
|
|
||||||
gate->hash = (void*)&argon2d_crds_hash;
|
|
||||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
|
||||||
opt_target_factor = 65536.0;
|
opt_target_factor = 65536.0;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Dynamic
|
void argon2d500_hash( void *output, const void *input )
|
||||||
|
|
||||||
void argon2d_dyn_hash( void *output, const void *input )
|
|
||||||
{
|
{
|
||||||
argon2_context context;
|
argon2_context context;
|
||||||
context.out = (uint8_t *)output;
|
context.out = (uint8_t *)output;
|
||||||
@@ -101,48 +98,81 @@ void argon2d_dyn_hash( void *output, const void *input )
|
|||||||
argon2_ctx( &context, Argon2_d );
|
argon2_ctx( &context, Argon2_d );
|
||||||
}
|
}
|
||||||
|
|
||||||
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
|
bool register_argon2d500_algo( algo_gate_t* gate )
|
||||||
uint64_t *hashes_done, struct thr_info *mythr )
|
|
||||||
{
|
{
|
||||||
uint32_t _ALIGN(64) edata[20];
|
gate->scanhash = (void*)&scanhash_argon2d;
|
||||||
uint32_t _ALIGN(64) hash[8];
|
gate->hash = (void*)&argon2d500_hash;
|
||||||
uint32_t *pdata = work->data;
|
|
||||||
uint32_t *ptarget = work->target;
|
|
||||||
const int thr_id = mythr->id;
|
|
||||||
const uint32_t first_nonce = (const uint32_t)pdata[19];
|
|
||||||
const uint32_t last_nonce = (const uint32_t)max_nonce;
|
|
||||||
uint32_t nonce = first_nonce;
|
|
||||||
const bool bench = opt_benchmark;
|
|
||||||
|
|
||||||
v128_bswap32_80( edata, pdata );
|
|
||||||
do
|
|
||||||
{
|
|
||||||
edata[19] = nonce;
|
|
||||||
argon2d_dyn_hash( hash, edata );
|
|
||||||
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
|
|
||||||
&& !bench ) )
|
|
||||||
{
|
|
||||||
pdata[19] = bswap_32( nonce );;
|
|
||||||
submit_solution( work, hash, mythr );
|
|
||||||
}
|
|
||||||
nonce++;
|
|
||||||
} while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );
|
|
||||||
|
|
||||||
pdata[19] = nonce;
|
|
||||||
*hashes_done = pdata[19] - first_nonce;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool register_argon2d_dyn_algo( algo_gate_t* gate )
|
|
||||||
{
|
|
||||||
gate->scanhash = (void*)&scanhash_argon2d_dyn;
|
|
||||||
gate->hash = (void*)&argon2d_dyn_hash;
|
|
||||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
|
||||||
opt_target_factor = 65536.0;
|
opt_target_factor = 65536.0;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Unitus
|
void argon2d1000_hash( void *output, const void *input )
|
||||||
|
{
|
||||||
|
argon2_context context;
|
||||||
|
context.out = (uint8_t *)output;
|
||||||
|
context.outlen = (uint32_t)OUTPUT_BYTES;
|
||||||
|
context.pwd = (uint8_t *)input;
|
||||||
|
context.pwdlen = (uint32_t)INPUT_BYTES;
|
||||||
|
context.salt = (uint8_t *)input; //salt = input
|
||||||
|
context.saltlen = (uint32_t)INPUT_BYTES;
|
||||||
|
context.secret = NULL;
|
||||||
|
context.secretlen = 0;
|
||||||
|
context.ad = NULL;
|
||||||
|
context.adlen = 0;
|
||||||
|
context.allocate_cbk = NULL;
|
||||||
|
context.free_cbk = NULL;
|
||||||
|
context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
|
||||||
|
// main configurable Argon2 hash parameters
|
||||||
|
context.m_cost = 1000; // Memory in KiB (1MB)
|
||||||
|
context.lanes = 8; // Degree of Parallelism
|
||||||
|
context.threads = 1; // Threads
|
||||||
|
context.t_cost = 2; // Iterations
|
||||||
|
context.version = ARGON2_VERSION_10;
|
||||||
|
|
||||||
|
argon2_ctx( &context, Argon2_d );
|
||||||
|
}
|
||||||
|
|
||||||
|
bool register_argon2d1000_algo( algo_gate_t* gate )
|
||||||
|
{
|
||||||
|
gate->scanhash = (void*)&scanhash_argon2d;
|
||||||
|
gate->hash = (void*)&argon2d1000_hash;
|
||||||
|
opt_target_factor = 65536.0;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void argon2d16000_hash( void *output, const void *input )
|
||||||
|
{
|
||||||
|
argon2_context context;
|
||||||
|
context.out = (uint8_t *)output;
|
||||||
|
context.outlen = (uint32_t)OUTPUT_BYTES;
|
||||||
|
context.pwd = (uint8_t *)input;
|
||||||
|
context.pwdlen = (uint32_t)INPUT_BYTES;
|
||||||
|
context.salt = (uint8_t *)input; //salt = input
|
||||||
|
context.saltlen = (uint32_t)INPUT_BYTES;
|
||||||
|
context.secret = NULL;
|
||||||
|
context.secretlen = 0;
|
||||||
|
context.ad = NULL;
|
||||||
|
context.adlen = 0;
|
||||||
|
context.allocate_cbk = NULL;
|
||||||
|
context.free_cbk = NULL;
|
||||||
|
context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
|
||||||
|
// main configurable Argon2 hash parameters
|
||||||
|
context.m_cost = 16000; // Memory in KiB (~16384KB)
|
||||||
|
context.lanes = 1; // Degree of Parallelism
|
||||||
|
context.threads = 1; // Threads
|
||||||
|
context.t_cost = 1; // Iterations
|
||||||
|
context.version = ARGON2_VERSION_10;
|
||||||
|
|
||||||
|
argon2_ctx( &context, Argon2_d );
|
||||||
|
}
|
||||||
|
|
||||||
|
bool register_argon2d16000_algo( algo_gate_t* gate )
|
||||||
|
{
|
||||||
|
gate->scanhash = (void*)&scanhash_argon2d;
|
||||||
|
gate->hash = (void*)&argon2d16000_hash;
|
||||||
|
opt_target_factor = 65536.0;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
|
int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr )
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
@@ -154,7 +184,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
|
|||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
const uint32_t last_nonce = (const uint32_t)max_nonce;
|
const uint32_t last_nonce = (const uint32_t)max_nonce;
|
||||||
uint32_t n = first_nonce;
|
uint32_t n = first_nonce;
|
||||||
const int thr_id = mythr->id; // thr_id arg is deprecated
|
const int thr_id = mythr->id;
|
||||||
uint32_t t_cost = 1; // 1 iteration
|
uint32_t t_cost = 1; // 1 iteration
|
||||||
uint32_t m_cost = 4096; // use 4MB
|
uint32_t m_cost = 4096; // use 4MB
|
||||||
uint32_t parallelism = 1; // 1 thread, 2 lanes
|
uint32_t parallelism = 1; // 1 thread, 2 lanes
|
||||||
@@ -182,7 +212,6 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
|
|||||||
bool register_argon2d4096_algo( algo_gate_t* gate )
|
bool register_argon2d4096_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
gate->scanhash = (void*)&scanhash_argon2d4096;
|
gate->scanhash = (void*)&scanhash_argon2d4096;
|
||||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT |NEON_OPT;
|
|
||||||
opt_target_factor = 65536.0;
|
opt_target_factor = 65536.0;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,22 +4,27 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
// Credits: version = 0x10, m_cost = 250.
|
int scanhash_argon2d( struct work *work, uint32_t max_nonce,
|
||||||
bool register_argon2d_crds_algo( algo_gate_t* gate );
|
|
||||||
|
|
||||||
void argon2d_crds_hash( void *state, const void *input );
|
|
||||||
|
|
||||||
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
|
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
|
// Credits: version = 0x10, m_cost = 250.
|
||||||
|
bool register_argon2d250_algo( algo_gate_t* gate );
|
||||||
|
|
||||||
|
void argon2d250_hash( void *state, const void *input );
|
||||||
|
|
||||||
// Dynamic: version = 0x10, m_cost = 500.
|
// Dynamic: version = 0x10, m_cost = 500.
|
||||||
bool register_argon2d_dyn_algo( algo_gate_t* gate );
|
bool register_argon2d500_algo( algo_gate_t* gate );
|
||||||
|
|
||||||
void argon2d_dyn_hash( void *state, const void *input );
|
void argon2d500_hash( void *state, const void *input );
|
||||||
|
|
||||||
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
|
// Zero Dynamics Cash: version = 0x10, m_cost = 1000.
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
bool register_argon2d1000_algo( algo_gate_t* gate );
|
||||||
|
|
||||||
|
void argon2d1000_hash( void *state, const void *input );
|
||||||
|
|
||||||
|
bool register_argon2d16000_algo( algo_gate_t* gate );
|
||||||
|
|
||||||
|
void argon2d16000_hash( void *state, const void *input );
|
||||||
|
|
||||||
// Unitus: version = 0x13, m_cost = 4096.
|
// Unitus: version = 0x13, m_cost = 4096.
|
||||||
bool register_argon2d4096_algo( algo_gate_t* gate );
|
bool register_argon2d4096_algo( algo_gate_t* gate );
|
||||||
|
|||||||
@@ -35,7 +35,7 @@
|
|||||||
* @pre all block pointers must be valid
|
* @pre all block pointers must be valid
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#if defined(__AVX512F__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
static inline __m512i blamka( __m512i x, __m512i y )
|
static inline __m512i blamka( __m512i x, __m512i y )
|
||||||
{
|
{
|
||||||
@@ -237,7 +237,7 @@ void fill_segment(const argon2_instance_t *instance,
|
|||||||
uint64_t pseudo_rand, ref_index, ref_lane;
|
uint64_t pseudo_rand, ref_index, ref_lane;
|
||||||
uint32_t prev_offset, curr_offset;
|
uint32_t prev_offset, curr_offset;
|
||||||
uint32_t starting_index, i;
|
uint32_t starting_index, i;
|
||||||
#if defined(__AVX512F__)
|
#if defined(SIMD512)
|
||||||
__m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
|
__m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
__m256i state[ARGON2_HWORDS_IN_BLOCK];
|
__m256i state[ARGON2_HWORDS_IN_BLOCK];
|
||||||
|
|||||||
@@ -21,7 +21,7 @@
|
|||||||
#include "blake2-impl.h"
|
#include "blake2-impl.h"
|
||||||
#include "simd-utils.h"
|
#include "simd-utils.h"
|
||||||
|
|
||||||
#if !defined(__AVX512F__)
|
#if !defined(SIMD512)
|
||||||
|
|
||||||
#if !defined(__AVX2__)
|
#if !defined(__AVX2__)
|
||||||
|
|
||||||
@@ -67,81 +67,59 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
|
|||||||
#if defined(__SSSE3__) || defined(__ARM_NEON)
|
#if defined(__SSSE3__) || defined(__ARM_NEON)
|
||||||
|
|
||||||
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
||||||
do { \
|
{ \
|
||||||
v128_t t0 = v128_alignr8(B1, B0, 8); \
|
v128_t t = v128_alignr8( B1, B0, 8 ); \
|
||||||
v128_t t1 = v128_alignr8(B0, B1, 8); \
|
B1 = v128_alignr8( B0, B1, 8 ); \
|
||||||
B0 = t0; \
|
B0 = t; \
|
||||||
B1 = t1; \
|
t = v128_alignr8( D1, D0, 8 ); \
|
||||||
\
|
D0 = v128_alignr8( D0, D1, 8 ); \
|
||||||
t0 = C0; \
|
D1 = t; \
|
||||||
C0 = C1; \
|
}
|
||||||
C1 = t0; \
|
|
||||||
\
|
|
||||||
t0 = v128_alignr8(D1, D0, 8); \
|
|
||||||
t1 = v128_alignr8(D0, D1, 8); \
|
|
||||||
D0 = t1; \
|
|
||||||
D1 = t0; \
|
|
||||||
} while ((void)0, 0)
|
|
||||||
|
|
||||||
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
||||||
do { \
|
{ \
|
||||||
v128_t t0 = v128_alignr8(B0, B1, 8); \
|
v128_t t = v128_alignr8( B0, B1, 8 ); \
|
||||||
v128_t t1 = v128_alignr8(B1, B0, 8); \
|
B1 = v128_alignr8( B1, B0, 8 ); \
|
||||||
B0 = t0; \
|
B0 = t; \
|
||||||
B1 = t1; \
|
t = v128_alignr8( D0, D1, 8 ); \
|
||||||
\
|
D0 = v128_alignr8( D1, D0, 8 ); \
|
||||||
t0 = C0; \
|
D1 = t; \
|
||||||
C0 = C1; \
|
}
|
||||||
C1 = t0; \
|
|
||||||
\
|
|
||||||
t0 = v128_alignr8(D0, D1, 8); \
|
|
||||||
t1 = v128_alignr8(D1, D0, 8); \
|
|
||||||
D0 = t1; \
|
|
||||||
D1 = t0; \
|
|
||||||
} while ((void)0, 0)
|
|
||||||
|
|
||||||
#else /* SSE2 */
|
#else /* SSE2 */
|
||||||
|
|
||||||
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
||||||
do { \
|
{ \
|
||||||
v128_t t0 = D0; \
|
v128_t t = D0; \
|
||||||
v128_t t1 = B0; \
|
D0 = v128_unpackhi64( D1, v128_unpacklo64( D0, D0 ) ); \
|
||||||
D0 = C0; \
|
D1 = v128_unpackhi64( t, v128_unpacklo64( D1, D1 ) ); \
|
||||||
C0 = C1; \
|
t = B0; \
|
||||||
C1 = D0; \
|
|
||||||
D0 = v128_unpackhi64(D1, v128_unpacklo64(t0, t0)); \
|
|
||||||
D1 = v128_unpackhi64(t0, v128_unpacklo64(D1, D1)); \
|
|
||||||
B0 = v128_unpackhi64( B0, v128_unpacklo64( B1, B1 ) ); \
|
B0 = v128_unpackhi64( B0, v128_unpacklo64( B1, B1 ) ); \
|
||||||
B1 = v128_unpackhi64(B1, v128_unpacklo64(t1, t1)); \
|
B1 = v128_unpackhi64( B1, v128_unpacklo64( t, t ) ); \
|
||||||
} while ((void)0, 0)
|
}
|
||||||
|
|
||||||
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
||||||
do { \
|
{ \
|
||||||
v128_t t0, t1; \
|
v128_t t = B0; \
|
||||||
t0 = C0; \
|
|
||||||
C0 = C1; \
|
|
||||||
C1 = t0; \
|
|
||||||
t0 = B0; \
|
|
||||||
t1 = D0; \
|
|
||||||
B0 = v128_unpackhi64( B1, v128_unpacklo64( B0, B0 ) ); \
|
B0 = v128_unpackhi64( B1, v128_unpacklo64( B0, B0 ) ); \
|
||||||
B1 = v128_unpackhi64(t0, v128_unpacklo64(B1, B1)); \
|
B1 = v128_unpackhi64( t, v128_unpacklo64( B1, B1 ) ); \
|
||||||
|
t = D0; \
|
||||||
D0 = v128_unpackhi64( D0, v128_unpacklo64( D1, D1 ) ); \
|
D0 = v128_unpackhi64( D0, v128_unpacklo64( D1, D1 ) ); \
|
||||||
D1 = v128_unpackhi64(D1, v128_unpacklo64(t1, t1)); \
|
D1 = v128_unpackhi64( D1, v128_unpacklo64( t, t ) ); \
|
||||||
} while ((void)0, 0)
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define BLAKE2_ROUND( A0, A1, B0, B1, C0, C1, D0, D1 ) \
|
#define BLAKE2_ROUND( A0, A1, B0, B1, C0, C1, D0, D1 ) \
|
||||||
do { \
|
{ \
|
||||||
G1( A0, B0, C0, D0, A1, B1, C1, D1 ); \
|
G1( A0, B0, C0, D0, A1, B1, C1, D1 ); \
|
||||||
G2( A0, B0, C0, D0, A1, B1, C1, D1 ); \
|
G2( A0, B0, C0, D0, A1, B1, C1, D1 ); \
|
||||||
\
|
|
||||||
DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
|
DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
|
||||||
\
|
G1( A0, B0, C1, D0, A1, B1, C0, D1 ); \
|
||||||
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
|
G2( A0, B0, C1, D0, A1, B1, C0, D1 ); \
|
||||||
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
|
|
||||||
\
|
|
||||||
UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
|
UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
|
||||||
} while ((void)0, 0)
|
}
|
||||||
|
|
||||||
#else /* __AVX2__ */
|
#else /* __AVX2__ */
|
||||||
|
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
@@ -211,7 +189,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
|
|||||||
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||||
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||||
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||||
\
|
|
||||||
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||||
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||||
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||||
@@ -219,17 +196,14 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
|
|||||||
|
|
||||||
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||||
do { \
|
do { \
|
||||||
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
|
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0x33); \
|
||||||
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
|
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0xCC); \
|
||||||
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
|
||||||
tmp1 = C0; \
|
B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
|
||||||
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
|
||||||
C0 = C1; \
|
|
||||||
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
|
|
||||||
C1 = tmp1; \
|
|
||||||
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
|
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
|
||||||
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
|
||||||
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
|
||||||
|
D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
|
||||||
} while(0);
|
} while(0);
|
||||||
|
|
||||||
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||||
@@ -237,7 +211,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
|
|||||||
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||||
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||||
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||||
\
|
|
||||||
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||||
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||||
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||||
@@ -247,27 +220,21 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
|
|||||||
do { \
|
do { \
|
||||||
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
|
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
|
||||||
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
|
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
|
||||||
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
|
||||||
tmp1 = C0; \
|
B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
|
||||||
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
|
||||||
C0 = C1; \
|
|
||||||
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
|
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
|
||||||
C1 = tmp1; \
|
|
||||||
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
|
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
|
||||||
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
|
||||||
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
|
||||||
} while((void)0, 0);
|
} while((void)0, 0);
|
||||||
|
|
||||||
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
|
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||||
do{ \
|
do{ \
|
||||||
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||||
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||||
\
|
|
||||||
DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||||
\
|
|
||||||
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||||
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||||
\
|
|
||||||
UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||||
} while((void)0, 0);
|
} while((void)0, 0);
|
||||||
|
|
||||||
@@ -275,12 +242,9 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
|
|||||||
do{ \
|
do{ \
|
||||||
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||||
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||||
\
|
|
||||||
DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||||
\
|
G1_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
|
||||||
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
G2_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
|
||||||
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
|
||||||
\
|
|
||||||
UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||||
} while((void)0, 0);
|
} while((void)0, 0);
|
||||||
|
|
||||||
@@ -290,12 +254,73 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
|
|||||||
|
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
/*
|
||||||
static inline __m512i muladd(__m512i x, __m512i y)
|
static inline __m512i muladd(__m512i x, __m512i y)
|
||||||
{
|
{
|
||||||
__m512i z = _mm512_mul_epu32(x, y);
|
__m512i z = _mm512_mul_epu32(x, y);
|
||||||
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
|
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define G1( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
||||||
|
{ \
|
||||||
|
__m512i z0, z1; \
|
||||||
|
z0 = _mm512_mul_epu32( A0, B0 ); \
|
||||||
|
z1 = _mm512_mul_epu32( A1, B1 ); \
|
||||||
|
A0 = _mm512_add_epi64( A0, B0 ); \
|
||||||
|
A1 = _mm512_add_epi64( A1, B1 ); \
|
||||||
|
z0 = _mm512_add_epi64( z0, z0 ); \
|
||||||
|
z1 = _mm512_add_epi64( z1, z1 ); \
|
||||||
|
A0 = _mm512_add_epi64( A0, z0 ); \
|
||||||
|
A1 = _mm512_add_epi64( A1, z1 ); \
|
||||||
|
D0 = _mm512_xor_si512(D0, A0); \
|
||||||
|
D1 = _mm512_xor_si512(D1, A1); \
|
||||||
|
D0 = _mm512_ror_epi64(D0, 32); \
|
||||||
|
D1 = _mm512_ror_epi64(D1, 32); \
|
||||||
|
z0 = _mm512_mul_epu32( C0, D0 ); \
|
||||||
|
z1 = _mm512_mul_epu32( C1, D1 ); \
|
||||||
|
C0 = _mm512_add_epi64( C0, D0 ); \
|
||||||
|
C1 = _mm512_add_epi64( C1, D1 ); \
|
||||||
|
z0 = _mm512_add_epi64( z0, z0 ); \
|
||||||
|
z1 = _mm512_add_epi64( z1, z1 ); \
|
||||||
|
C0 = _mm512_add_epi64( C0, z0 ); \
|
||||||
|
C1 = _mm512_add_epi64( C1, z1 ); \
|
||||||
|
B0 = _mm512_xor_si512(B0, C0); \
|
||||||
|
B1 = _mm512_xor_si512(B1, C1); \
|
||||||
|
B0 = _mm512_ror_epi64(B0, 24); \
|
||||||
|
B1 = _mm512_ror_epi64(B1, 24); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define G2( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
||||||
|
{ \
|
||||||
|
__m512i z0, z1; \
|
||||||
|
z0 = _mm512_mul_epu32( A0, B0 ); \
|
||||||
|
z1 = _mm512_mul_epu32( A1, B1 ); \
|
||||||
|
A0 = _mm512_add_epi64( A0, B0 ); \
|
||||||
|
A1 = _mm512_add_epi64( A1, B1 ); \
|
||||||
|
z0 = _mm512_add_epi64( z0, z0 ); \
|
||||||
|
z1 = _mm512_add_epi64( z1, z1 ); \
|
||||||
|
A0 = _mm512_add_epi64( A0, z0 ); \
|
||||||
|
A1 = _mm512_add_epi64( A1, z1 ); \
|
||||||
|
D0 = _mm512_xor_si512(D0, A0); \
|
||||||
|
D1 = _mm512_xor_si512(D1, A1); \
|
||||||
|
D0 = _mm512_ror_epi64(D0, 16); \
|
||||||
|
D1 = _mm512_ror_epi64(D1, 16); \
|
||||||
|
z0 = _mm512_mul_epu32( C0, D0 ); \
|
||||||
|
z1 = _mm512_mul_epu32( C1, D1 ); \
|
||||||
|
C0 = _mm512_add_epi64( C0, D0 ); \
|
||||||
|
C1 = _mm512_add_epi64( C1, D1 ); \
|
||||||
|
z0 = _mm512_add_epi64( z0, z0 ); \
|
||||||
|
z1 = _mm512_add_epi64( z1, z1 ); \
|
||||||
|
C0 = _mm512_add_epi64( C0, z0 ); \
|
||||||
|
C1 = _mm512_add_epi64( C1, z1 ); \
|
||||||
|
B0 = _mm512_xor_si512(B0, C0); \
|
||||||
|
B1 = _mm512_xor_si512(B1, C1); \
|
||||||
|
B0 = _mm512_ror_epi64(B0, 63); \
|
||||||
|
B1 = _mm512_ror_epi64(B1, 63); \
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||||
do { \
|
do { \
|
||||||
A0 = muladd(A0, B0); \
|
A0 = muladd(A0, B0); \
|
||||||
@@ -316,7 +341,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
|
|||||||
B0 = _mm512_ror_epi64(B0, 24); \
|
B0 = _mm512_ror_epi64(B0, 24); \
|
||||||
B1 = _mm512_ror_epi64(B1, 24); \
|
B1 = _mm512_ror_epi64(B1, 24); \
|
||||||
} while ((void)0, 0)
|
} while ((void)0, 0)
|
||||||
|
*/
|
||||||
|
/*
|
||||||
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
|
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||||
do { \
|
do { \
|
||||||
A0 = muladd(A0, B0); \
|
A0 = muladd(A0, B0); \
|
||||||
@@ -337,15 +363,14 @@ static inline __m512i muladd(__m512i x, __m512i y)
|
|||||||
B0 = _mm512_ror_epi64(B0, 63); \
|
B0 = _mm512_ror_epi64(B0, 63); \
|
||||||
B1 = _mm512_ror_epi64(B1, 63); \
|
B1 = _mm512_ror_epi64(B1, 63); \
|
||||||
} while ((void)0, 0)
|
} while ((void)0, 0)
|
||||||
|
*/
|
||||||
|
|
||||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||||
do { \
|
do { \
|
||||||
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||||
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||||
\
|
|
||||||
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||||
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||||
\
|
|
||||||
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||||
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||||
} while ((void)0, 0)
|
} while ((void)0, 0)
|
||||||
@@ -354,10 +379,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
|
|||||||
do { \
|
do { \
|
||||||
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||||
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||||
\
|
|
||||||
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||||
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||||
\
|
|
||||||
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||||
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||||
} while ((void)0, 0)
|
} while ((void)0, 0)
|
||||||
@@ -366,15 +389,17 @@ static inline __m512i muladd(__m512i x, __m512i y)
|
|||||||
do { \
|
do { \
|
||||||
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
|
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||||
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
|
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||||
\
|
|
||||||
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
|
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||||
\
|
|
||||||
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
|
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||||
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
|
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||||
\
|
|
||||||
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
|
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||||
} while ((void)0, 0)
|
} while ((void)0, 0)
|
||||||
|
|
||||||
|
static const __m512i swap_q0 = { 0,1, 8,9, 2,3, 10,11 };
|
||||||
|
static const __m512i swap_q1 = { 4,5, 12,13, 6,7, 14,15 };
|
||||||
|
static const __m512i uswap_q0 = { 0,1, 4,5, 8,9, 12,13 };
|
||||||
|
static const __m512i uswap_q1 = { 2,3, 6,7, 10,11, 14,15 };
|
||||||
|
|
||||||
#define SWAP_HALVES(A0, A1) \
|
#define SWAP_HALVES(A0, A1) \
|
||||||
do { \
|
do { \
|
||||||
__m512i t; \
|
__m512i t; \
|
||||||
@@ -383,19 +408,36 @@ static inline __m512i muladd(__m512i x, __m512i y)
|
|||||||
A0 = t; \
|
A0 = t; \
|
||||||
} while((void)0, 0)
|
} while((void)0, 0)
|
||||||
|
|
||||||
|
#define SWAP_QUARTERS(A0, A1) \
|
||||||
|
{ \
|
||||||
|
__m512i t = _mm512_permutex2var_epi64( A0, swap_q0, A1 ); \
|
||||||
|
A1 = _mm512_permutex2var_epi64( A0, swap_q1, A1 ); \
|
||||||
|
A0 = t; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define UNSWAP_QUARTERS(A0, A1) \
|
||||||
|
{ \
|
||||||
|
__m512i t = _mm512_permutex2var_epi64( A0, uswap_q0, A1 ); \
|
||||||
|
A1 = _mm512_permutex2var_epi64( A0, uswap_q1, A1 ); \
|
||||||
|
A0 = t; \
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
#define SWAP_QUARTERS(A0, A1) \
|
#define SWAP_QUARTERS(A0, A1) \
|
||||||
do { \
|
do { \
|
||||||
SWAP_HALVES(A0, A1); \
|
SWAP_HALVES(A0, A1); \
|
||||||
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
|
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
|
||||||
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
|
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
|
||||||
} while((void)0, 0)
|
} while((void)0, 0)
|
||||||
|
*/
|
||||||
|
/*
|
||||||
#define UNSWAP_QUARTERS(A0, A1) \
|
#define UNSWAP_QUARTERS(A0, A1) \
|
||||||
do { \
|
do { \
|
||||||
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
|
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
|
||||||
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
|
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
|
||||||
SWAP_HALVES(A0, A1); \
|
SWAP_HALVES(A0, A1); \
|
||||||
} while((void)0, 0)
|
} while((void)0, 0)
|
||||||
|
*/
|
||||||
|
|
||||||
#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
|
#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
|
||||||
do { \
|
do { \
|
||||||
|
|||||||
@@ -6,15 +6,15 @@
|
|||||||
|
|
||||||
#if defined (BLAKE_4WAY)
|
#if defined (BLAKE_4WAY)
|
||||||
|
|
||||||
blake256r14_4way_context blake_4w_ctx;
|
blake256r14_4x32_context blake_4w_ctx;
|
||||||
|
|
||||||
void blakehash_4way(void *state, const void *input)
|
void blakehash_4way(void *state, const void *input)
|
||||||
{
|
{
|
||||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||||
blake256r14_4way_context ctx;
|
blake256r14_4x32_context ctx;
|
||||||
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
|
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
|
||||||
blake256r14_4way_update( &ctx, input + (64<<2), 16 );
|
blake256r14_4x32_update( &ctx, input + (64<<2), 16 );
|
||||||
blake256r14_4way_close( &ctx, vhash );
|
blake256r14_4x32_close( &ctx, vhash );
|
||||||
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -35,8 +35,8 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
|
|||||||
HTarget = 0x7f;
|
HTarget = 0x7f;
|
||||||
|
|
||||||
v128_bswap32_intrlv80_4x32( vdata, pdata );
|
v128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||||
blake256r14_4way_init( &blake_4w_ctx );
|
blake256r14_4x32_init( &blake_4w_ctx );
|
||||||
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
|
blake256r14_4x32_update( &blake_4w_ctx, vdata, 64 );
|
||||||
|
|
||||||
do {
|
do {
|
||||||
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||||
@@ -61,15 +61,15 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
|
|||||||
|
|
||||||
#if defined(BLAKE_8WAY)
|
#if defined(BLAKE_8WAY)
|
||||||
|
|
||||||
blake256r14_8way_context blake_8w_ctx;
|
blake256r14_8x32_context blake_8w_ctx;
|
||||||
|
|
||||||
void blakehash_8way( void *state, const void *input )
|
void blakehash_8way( void *state, const void *input )
|
||||||
{
|
{
|
||||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||||
blake256r14_8way_context ctx;
|
blake256r14_8x32_context ctx;
|
||||||
memcpy( &ctx, &blake_8w_ctx, sizeof ctx );
|
memcpy( &ctx, &blake_8w_ctx, sizeof ctx );
|
||||||
blake256r14_8way( &ctx, input + (64<<3), 16 );
|
blake256r14_8x32( &ctx, input + (64<<3), 16 );
|
||||||
blake256r14_8way_close( &ctx, vhash );
|
blake256r14_8x32_close( &ctx, vhash );
|
||||||
_dintrlv_8x32( state, state+ 32, state+ 64, state+ 96,
|
_dintrlv_8x32( state, state+ 32, state+ 64, state+ 96,
|
||||||
state+128, state+160, state+192, state+224,
|
state+128, state+160, state+192, state+224,
|
||||||
vhash, 256 );
|
vhash, 256 );
|
||||||
@@ -93,8 +93,8 @@ int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
|
|||||||
|
|
||||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||||
|
|
||||||
blake256r14_8way_init( &blake_8w_ctx );
|
blake256r14_8x32_init( &blake_8w_ctx );
|
||||||
blake256r14_8way( &blake_8w_ctx, vdata, 64 );
|
blake256r14_8x32( &blake_8w_ctx, vdata, 64 );
|
||||||
|
|
||||||
do {
|
do {
|
||||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||||
|
|||||||
@@ -423,33 +423,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
|||||||
(state)->T1 = T1; \
|
(state)->T1 = T1; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
#if defined(__SSSE3__)
|
|
||||||
|
|
||||||
#define BLAKE256_4X32_BLOCK_BSWAP32 \
|
|
||||||
{ \
|
|
||||||
v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
|
|
||||||
0x0405060700010203 ); \
|
|
||||||
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
|
|
||||||
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
|
|
||||||
M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
|
|
||||||
M3 = _mm_shuffle_epi8( buf[ 3], shuf_bswap32 ); \
|
|
||||||
M4 = _mm_shuffle_epi8( buf[ 4], shuf_bswap32 ); \
|
|
||||||
M5 = _mm_shuffle_epi8( buf[ 5], shuf_bswap32 ); \
|
|
||||||
M6 = _mm_shuffle_epi8( buf[ 6], shuf_bswap32 ); \
|
|
||||||
M7 = _mm_shuffle_epi8( buf[ 7], shuf_bswap32 ); \
|
|
||||||
M8 = _mm_shuffle_epi8( buf[ 8], shuf_bswap32 ); \
|
|
||||||
M9 = _mm_shuffle_epi8( buf[ 9], shuf_bswap32 ); \
|
|
||||||
MA = _mm_shuffle_epi8( buf[10], shuf_bswap32 ); \
|
|
||||||
MB = _mm_shuffle_epi8( buf[11], shuf_bswap32 ); \
|
|
||||||
MC = _mm_shuffle_epi8( buf[12], shuf_bswap32 ); \
|
|
||||||
MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
|
|
||||||
ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
|
|
||||||
MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
|
|
||||||
}
|
|
||||||
|
|
||||||
#else // SSE2
|
|
||||||
|
|
||||||
#define BLAKE256_4X32_BLOCK_BSWAP32 \
|
#define BLAKE256_4X32_BLOCK_BSWAP32 \
|
||||||
{ \
|
{ \
|
||||||
M0 = v128_bswap32( buf[0] ); \
|
M0 = v128_bswap32( buf[0] ); \
|
||||||
@@ -470,8 +443,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
|||||||
MF = v128_bswap32( buf[15] ); \
|
MF = v128_bswap32( buf[15] ); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // SSSE3 else SSE2
|
|
||||||
|
|
||||||
#define COMPRESS32_4X32( rounds ) \
|
#define COMPRESS32_4X32( rounds ) \
|
||||||
{ \
|
{ \
|
||||||
v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
|
v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||||
@@ -926,22 +897,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
|
|||||||
ROUND_S_4X32_3;
|
ROUND_S_4X32_3;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__SSSE3__)
|
|
||||||
|
|
||||||
const v128_t shuf_bswap32 =
|
|
||||||
v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
|
||||||
|
|
||||||
H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
|
|
||||||
H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
|
|
||||||
H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
|
|
||||||
H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
|
|
||||||
H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
|
|
||||||
H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
|
|
||||||
H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
|
|
||||||
H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
H[0] = v128_bswap32( v128_xor3( V8, V0, h[0] ) );
|
H[0] = v128_bswap32( v128_xor3( V8, V0, h[0] ) );
|
||||||
H[1] = v128_bswap32( v128_xor3( V9, V1, h[1] ) );
|
H[1] = v128_bswap32( v128_xor3( V9, V1, h[1] ) );
|
||||||
H[2] = v128_bswap32( v128_xor3( VA, V2, h[2] ) );
|
H[2] = v128_bswap32( v128_xor3( VA, V2, h[2] ) );
|
||||||
@@ -950,8 +905,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
|
|||||||
H[5] = v128_bswap32( v128_xor3( VD, V5, h[5] ) );
|
H[5] = v128_bswap32( v128_xor3( VD, V5, h[5] ) );
|
||||||
H[6] = v128_bswap32( v128_xor3( VE, V6, h[6] ) );
|
H[6] = v128_bswap32( v128_xor3( VE, V6, h[6] ) );
|
||||||
H[7] = v128_bswap32( v128_xor3( VF, V7, h[7] ) );
|
H[7] = v128_bswap32( v128_xor3( VF, V7, h[7] ) );
|
||||||
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined (__AVX2__)
|
#if defined (__AVX2__)
|
||||||
@@ -1291,24 +1244,22 @@ do { \
|
|||||||
VD = v256_32( T0 ^ 0x299F31D0 ); \
|
VD = v256_32( T0 ^ 0x299F31D0 ); \
|
||||||
VE = v256_32( T1 ^ 0x082EFA98 ); \
|
VE = v256_32( T1 ^ 0x082EFA98 ); \
|
||||||
VF = v256_32( T1 ^ 0xEC4E6C89 ); \
|
VF = v256_32( T1 ^ 0xEC4E6C89 ); \
|
||||||
const __m256i shuf_bswap32 = mm256_set2_64( \
|
M0 = mm256_bswap_32( * buf ); \
|
||||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
M1 = mm256_bswap_32( *(buf+ 1) ); \
|
||||||
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
|
M2 = mm256_bswap_32( *(buf+ 2) ); \
|
||||||
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
|
M3 = mm256_bswap_32( *(buf+ 3) ); \
|
||||||
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
|
M4 = mm256_bswap_32( *(buf+ 4) ); \
|
||||||
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
|
M5 = mm256_bswap_32( *(buf+ 5) ); \
|
||||||
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
|
M6 = mm256_bswap_32( *(buf+ 6) ); \
|
||||||
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
|
M7 = mm256_bswap_32( *(buf+ 7) ); \
|
||||||
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
|
M8 = mm256_bswap_32( *(buf+ 8) ); \
|
||||||
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
|
M9 = mm256_bswap_32( *(buf+ 9) ); \
|
||||||
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
|
MA = mm256_bswap_32( *(buf+10) ); \
|
||||||
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
|
MB = mm256_bswap_32( *(buf+11) ); \
|
||||||
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
|
MC = mm256_bswap_32( *(buf+12) ); \
|
||||||
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
|
MD = mm256_bswap_32( *(buf+13) ); \
|
||||||
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
|
ME = mm256_bswap_32( *(buf+14) ); \
|
||||||
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
|
MF = mm256_bswap_32( *(buf+15) ); \
|
||||||
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
|
|
||||||
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
|
|
||||||
ROUND_S_8WAY(0); \
|
ROUND_S_8WAY(0); \
|
||||||
ROUND_S_8WAY(1); \
|
ROUND_S_8WAY(1); \
|
||||||
ROUND_S_8WAY(2); \
|
ROUND_S_8WAY(2); \
|
||||||
@@ -1401,7 +1352,7 @@ do { \
|
|||||||
H7 = mm256_xor3( VF, V7, H7 ); \
|
H7 = mm256_xor3( VF, V7, H7 ); \
|
||||||
}
|
}
|
||||||
|
|
||||||
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
|
||||||
void *data )
|
void *data )
|
||||||
{
|
{
|
||||||
__m256i *M = (__m256i*)data;
|
__m256i *M = (__m256i*)data;
|
||||||
@@ -1491,7 +1442,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
|||||||
_mm256_xor_si256( v256_32( CSE ), M[15] ) );
|
_mm256_xor_si256( v256_32( CSE ), M[15] ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
|
||||||
const void *midhash, const void *data, const int rounds )
|
const void *midhash, const void *data, const int rounds )
|
||||||
{
|
{
|
||||||
__m256i *H = (__m256i*)final_hash;
|
__m256i *H = (__m256i*)final_hash;
|
||||||
@@ -1596,22 +1547,19 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
|||||||
ROUND256_8WAY_3;
|
ROUND256_8WAY_3;
|
||||||
}
|
}
|
||||||
|
|
||||||
const __m256i shuf_bswap32 =
|
H[0] = mm256_bswap_32( mm256_xor3( V8, V0, h[0] ) );
|
||||||
mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
H[1] = mm256_bswap_32( mm256_xor3( V9, V1, h[1] ) );
|
||||||
|
H[2] = mm256_bswap_32( mm256_xor3( VA, V2, h[2] ) );
|
||||||
H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
|
H[3] = mm256_bswap_32( mm256_xor3( VB, V3, h[3] ) );
|
||||||
H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
|
H[4] = mm256_bswap_32( mm256_xor3( VC, V4, h[4] ) );
|
||||||
H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 );
|
H[5] = mm256_bswap_32( mm256_xor3( VD, V5, h[5] ) );
|
||||||
H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 );
|
H[6] = mm256_bswap_32( mm256_xor3( VE, V6, h[6] ) );
|
||||||
H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 );
|
H[7] = mm256_bswap_32( mm256_xor3( VF, V7, h[7] ) );
|
||||||
H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 );
|
|
||||||
H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 );
|
|
||||||
H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
///////////////////////////////////////
|
///////////////////////////////////////
|
||||||
//
|
//
|
||||||
@@ -1933,8 +1881,6 @@ do { \
|
|||||||
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
|
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||||
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
|
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||||
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
|
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||||
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64( \
|
|
||||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
|
|
||||||
V0 = H0; \
|
V0 = H0; \
|
||||||
V1 = H1; \
|
V1 = H1; \
|
||||||
V2 = H2; \
|
V2 = H2; \
|
||||||
@@ -1951,22 +1897,22 @@ do { \
|
|||||||
VD = v512_32( T0 ^ 0x299F31D0 ); \
|
VD = v512_32( T0 ^ 0x299F31D0 ); \
|
||||||
VE = v512_32( T1 ^ 0x082EFA98 ); \
|
VE = v512_32( T1 ^ 0x082EFA98 ); \
|
||||||
VF = v512_32( T1 ^ 0xEC4E6C89 ); \
|
VF = v512_32( T1 ^ 0xEC4E6C89 ); \
|
||||||
M0 = _mm512_shuffle_epi8( * buf , shuf_bswap32 ); \
|
M0 = mm512_bswap_32( * buf ); \
|
||||||
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
|
M1 = mm512_bswap_32( *(buf+ 1) ); \
|
||||||
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
|
M2 = mm512_bswap_32( *(buf+ 2) ); \
|
||||||
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
|
M3 = mm512_bswap_32( *(buf+ 3) ); \
|
||||||
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
|
M4 = mm512_bswap_32( *(buf+ 4) ); \
|
||||||
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
|
M5 = mm512_bswap_32( *(buf+ 5) ); \
|
||||||
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
|
M6 = mm512_bswap_32( *(buf+ 6) ); \
|
||||||
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
|
M7 = mm512_bswap_32( *(buf+ 7) ); \
|
||||||
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
|
M8 = mm512_bswap_32( *(buf+ 8) ); \
|
||||||
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
|
M9 = mm512_bswap_32( *(buf+ 9) ); \
|
||||||
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
|
MA = mm512_bswap_32( *(buf+10) ); \
|
||||||
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
|
MB = mm512_bswap_32( *(buf+11) ); \
|
||||||
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
|
MC = mm512_bswap_32( *(buf+12) ); \
|
||||||
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
|
MD = mm512_bswap_32( *(buf+13) ); \
|
||||||
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
|
ME = mm512_bswap_32( *(buf+14) ); \
|
||||||
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
|
MF = mm512_bswap_32( *(buf+15) ); \
|
||||||
ROUND_S_16WAY(0); \
|
ROUND_S_16WAY(0); \
|
||||||
ROUND_S_16WAY(1); \
|
ROUND_S_16WAY(1); \
|
||||||
ROUND_S_16WAY(2); \
|
ROUND_S_16WAY(2); \
|
||||||
@@ -2063,7 +2009,7 @@ do { \
|
|||||||
// is constant for every nonce and only needs to be run once per job. The
|
// is constant for every nonce and only needs to be run once per job. The
|
||||||
// second part is run for each nonce using the precalculated midstate and the
|
// second part is run for each nonce using the precalculated midstate and the
|
||||||
// hash from the first block.
|
// hash from the first block.
|
||||||
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
|
void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
|
||||||
void *data )
|
void *data )
|
||||||
{
|
{
|
||||||
__m512i *M = (__m512i*)data;
|
__m512i *M = (__m512i*)data;
|
||||||
@@ -2157,7 +2103,7 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Dfault is 14 rounds, blakecoin & vanilla are 8.
|
// Dfault is 14 rounds, blakecoin & vanilla are 8.
|
||||||
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
|
||||||
const void *midhash, const void *data, const int rounds )
|
const void *midhash, const void *data, const int rounds )
|
||||||
{
|
{
|
||||||
__m512i *H = (__m512i*)final_hash;
|
__m512i *H = (__m512i*)final_hash;
|
||||||
@@ -2274,27 +2220,23 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Byte swap final hash
|
// Byte swap final hash
|
||||||
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64(
|
H[0] = mm512_bswap_32( mm512_xor3( V8, V0, h[0] ) );
|
||||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
H[1] = mm512_bswap_32( mm512_xor3( V9, V1, h[1] ) );
|
||||||
H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
|
H[2] = mm512_bswap_32( mm512_xor3( VA, V2, h[2] ) );
|
||||||
H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
|
H[3] = mm512_bswap_32( mm512_xor3( VB, V3, h[3] ) );
|
||||||
H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 );
|
H[4] = mm512_bswap_32( mm512_xor3( VC, V4, h[4] ) );
|
||||||
H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 );
|
H[5] = mm512_bswap_32( mm512_xor3( VD, V5, h[5] ) );
|
||||||
H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 );
|
H[6] = mm512_bswap_32( mm512_xor3( VE, V6, h[6] ) );
|
||||||
H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 );
|
H[7] = mm512_bswap_32( mm512_xor3( VF, V7, h[7] ) );
|
||||||
H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 );
|
|
||||||
H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Blake-256 4 way
|
// Blake-256 4 way
|
||||||
|
|
||||||
static const uint32_t salt_zero_4x32_small[4] = { 0, 0, 0, 0 };
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
blake32_4x32_init( blake_4x32_small_context *ctx, const uint32_t *iv,
|
blake32_4x32_init( blake_4x32_small_context *ctx, const uint32_t *iv,
|
||||||
const uint32_t *salt, int rounds )
|
int rounds )
|
||||||
{
|
{
|
||||||
casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
|
casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
|
||||||
casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
|
casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
|
||||||
@@ -2404,11 +2346,10 @@ blake32_4x32_close( blake_4x32_small_context *ctx, unsigned ub, unsigned n,
|
|||||||
|
|
||||||
// Blake-256 8 way
|
// Blake-256 8 way
|
||||||
|
|
||||||
static const uint32_t salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
|
blake32_8way_init( blake256_8x32_context *sc, const uint32_t *iv,
|
||||||
const uint32_t *salt, int rounds )
|
int rounds )
|
||||||
{
|
{
|
||||||
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E6676A09E667 );
|
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E6676A09E667 );
|
||||||
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE85BB67AE85 );
|
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE85BB67AE85 );
|
||||||
@@ -2424,7 +2365,7 @@ blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
|
blake32_8way( blake256_8x32_context *sc, const void *data, size_t len )
|
||||||
{
|
{
|
||||||
__m256i *vdata = (__m256i*)data;
|
__m256i *vdata = (__m256i*)data;
|
||||||
__m256i *buf;
|
__m256i *buf;
|
||||||
@@ -2466,7 +2407,7 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
blake32_8way_close( blake256_8x32_context *sc, unsigned ub, unsigned n,
|
||||||
void *dst, size_t out_size_w32 )
|
void *dst, size_t out_size_w32 )
|
||||||
{
|
{
|
||||||
__m256i buf[16];
|
__m256i buf[16];
|
||||||
@@ -2520,7 +2461,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
|
blake32_8way_le( blake256_8x32_context *sc, const void *data, size_t len )
|
||||||
{
|
{
|
||||||
__m256i *vdata = (__m256i*)data;
|
__m256i *vdata = (__m256i*)data;
|
||||||
__m256i *buf;
|
__m256i *buf;
|
||||||
@@ -2562,7 +2503,7 @@ blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
blake32_8way_close_le( blake256_8x32_context *sc, unsigned ub, unsigned n,
|
||||||
void *dst, size_t out_size_w32 )
|
void *dst, size_t out_size_w32 )
|
||||||
{
|
{
|
||||||
__m256i buf[16];
|
__m256i buf[16];
|
||||||
@@ -2617,13 +2558,13 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
//Blake-256 16 way AVX512
|
//Blake-256 16 way AVX512
|
||||||
|
|
||||||
static void
|
static void
|
||||||
blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
|
blake32_16way_init( blake256_16x32_context *sc, const uint32_t *iv,
|
||||||
const uint32_t *salt, int rounds )
|
int rounds )
|
||||||
{
|
{
|
||||||
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E6676A09E667 );
|
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E6676A09E667 );
|
||||||
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE85BB67AE85 );
|
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE85BB67AE85 );
|
||||||
@@ -2639,7 +2580,7 @@ blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
|
blake32_16way( blake256_16x32_context *sc, const void *data, size_t len )
|
||||||
{
|
{
|
||||||
__m512i *vdata = (__m512i*)data;
|
__m512i *vdata = (__m512i*)data;
|
||||||
__m512i *buf;
|
__m512i *buf;
|
||||||
@@ -2679,7 +2620,7 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
|
|||||||
sc->ptr = ptr;
|
sc->ptr = ptr;
|
||||||
}
|
}
|
||||||
static void
|
static void
|
||||||
blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
blake32_16way_close( blake256_16x32_context *sc, unsigned ub, unsigned n,
|
||||||
void *dst, size_t out_size_w32 )
|
void *dst, size_t out_size_w32 )
|
||||||
{
|
{
|
||||||
__m512i buf[16];
|
__m512i buf[16];
|
||||||
@@ -2733,7 +2674,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
|
blake32_16way_le( blake256_16x32_context *sc, const void *data, size_t len )
|
||||||
{
|
{
|
||||||
__m512i *vdata = (__m512i*)data;
|
__m512i *vdata = (__m512i*)data;
|
||||||
__m512i *buf;
|
__m512i *buf;
|
||||||
@@ -2776,7 +2717,7 @@ blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
blake32_16way_close_le( blake256_16x32_context *sc, unsigned ub, unsigned n,
|
||||||
void *dst, size_t out_size_w32 )
|
void *dst, size_t out_size_w32 )
|
||||||
{
|
{
|
||||||
__m512i buf[16];
|
__m512i buf[16];
|
||||||
@@ -2827,65 +2768,65 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256_16way_init(void *cc)
|
blake256_16x32_init(void *cc)
|
||||||
{
|
{
|
||||||
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
|
blake32_16way_init( cc, IV256, 14 );
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256_16way_update(void *cc, const void *data, size_t len)
|
blake256_16x32_update(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
blake32_16way(cc, data, len);
|
blake32_16way(cc, data, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256_16way_close(void *cc, void *dst)
|
blake256_16x32_close(void *cc, void *dst)
|
||||||
{
|
{
|
||||||
blake32_16way_close(cc, 0, 0, dst, 8);
|
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256_16way_update_le(void *cc, const void *data, size_t len)
|
blake256_16x32_update_le(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
blake32_16way_le(cc, data, len);
|
blake32_16way_le(cc, data, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256_16way_close_le(void *cc, void *dst)
|
blake256_16x32_close_le(void *cc, void *dst)
|
||||||
{
|
{
|
||||||
blake32_16way_close_le(cc, 0, 0, dst, 8);
|
blake32_16way_close_le(cc, 0, 0, dst, 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
void blake256r14_16way_init(void *cc)
|
void blake256r14_16way_init(void *cc)
|
||||||
{
|
{
|
||||||
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
|
blake32_16way_init( cc, IV256, 14 );
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256r14_16way_update(void *cc, const void *data, size_t len)
|
blake256r14_16x32_update(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
blake32_16way(cc, data, len);
|
blake32_16way(cc, data, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256r14_16way_close(void *cc, void *dst)
|
blake256r14_16x32_close(void *cc, void *dst)
|
||||||
{
|
{
|
||||||
blake32_16way_close(cc, 0, 0, dst, 8);
|
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
void blake256r8_16way_init(void *cc)
|
void blake256r8_16way_init(void *cc)
|
||||||
{
|
{
|
||||||
blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 );
|
blake32_16way_init( cc, IV256, 8 );
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256r8_16way_update(void *cc, const void *data, size_t len)
|
blake256r8_16x32_update(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
blake32_16way(cc, data, len);
|
blake32_16way(cc, data, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256r8_16way_close(void *cc, void *dst)
|
blake256r8_16x32_close(void *cc, void *dst)
|
||||||
{
|
{
|
||||||
blake32_16way_close(cc, 0, 0, dst, 8);
|
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||||
}
|
}
|
||||||
@@ -2898,7 +2839,7 @@ blake256r8_16way_close(void *cc, void *dst)
|
|||||||
void
|
void
|
||||||
blake256_4x32_init(void *ctx)
|
blake256_4x32_init(void *ctx)
|
||||||
{
|
{
|
||||||
blake32_4x32_init( ctx, IV256, salt_zero_4x32_small, 14 );
|
blake32_4x32_init( ctx, IV256, 14 );
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@@ -2918,31 +2859,31 @@ blake256_4x32_close(void *ctx, void *dst)
|
|||||||
// Blake-256 8 way
|
// Blake-256 8 way
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256_8way_init(void *cc)
|
blake256_8x32_init(void *cc)
|
||||||
{
|
{
|
||||||
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
|
blake32_8way_init( cc, IV256, 14 );
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256_8way_update(void *cc, const void *data, size_t len)
|
blake256_8x32_update(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
blake32_8way(cc, data, len);
|
blake32_8way(cc, data, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256_8way_close(void *cc, void *dst)
|
blake256_8x32_close(void *cc, void *dst)
|
||||||
{
|
{
|
||||||
blake32_8way_close(cc, 0, 0, dst, 8);
|
blake32_8way_close(cc, 0, 0, dst, 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256_8way_update_le(void *cc, const void *data, size_t len)
|
blake256_8x32_update_le(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
blake32_8way_le(cc, data, len);
|
blake32_8way_le(cc, data, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256_8way_close_le(void *cc, void *dst)
|
blake256_8x32_close_le(void *cc, void *dst)
|
||||||
{
|
{
|
||||||
blake32_8way_close_le(cc, 0, 0, dst, 8);
|
blake32_8way_close_le(cc, 0, 0, dst, 8);
|
||||||
}
|
}
|
||||||
@@ -2952,7 +2893,7 @@ blake256_8way_close_le(void *cc, void *dst)
|
|||||||
// 14 rounds Blake, Decred
|
// 14 rounds Blake, Decred
|
||||||
void blake256r14_4x32_init(void *cc)
|
void blake256r14_4x32_init(void *cc)
|
||||||
{
|
{
|
||||||
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 14 );
|
blake32_4x32_init( cc, IV256, 14 );
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@@ -2969,19 +2910,19 @@ blake256r14_4x32_close(void *cc, void *dst)
|
|||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
void blake256r14_8way_init(void *cc)
|
void blake256r14_8x32_init(void *cc)
|
||||||
{
|
{
|
||||||
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
|
blake32_8way_init( cc, IV256, 14 );
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256r14_8way_update(void *cc, const void *data, size_t len)
|
blake256r14_8x32_update(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
blake32_8way(cc, data, len);
|
blake32_8way(cc, data, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256r14_8way_close(void *cc, void *dst)
|
blake256r14_8x32_close(void *cc, void *dst)
|
||||||
{
|
{
|
||||||
blake32_8way_close(cc, 0, 0, dst, 8);
|
blake32_8way_close(cc, 0, 0, dst, 8);
|
||||||
}
|
}
|
||||||
@@ -2991,7 +2932,7 @@ blake256r14_8way_close(void *cc, void *dst)
|
|||||||
// 8 rounds Blakecoin, Vanilla
|
// 8 rounds Blakecoin, Vanilla
|
||||||
void blake256r8_4x32_init(void *cc)
|
void blake256r8_4x32_init(void *cc)
|
||||||
{
|
{
|
||||||
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 8 );
|
blake32_4x32_init( cc, IV256, 8 );
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@@ -3008,19 +2949,19 @@ blake256r8_4x32_close(void *cc, void *dst)
|
|||||||
|
|
||||||
#if defined (__AVX2__)
|
#if defined (__AVX2__)
|
||||||
|
|
||||||
void blake256r8_8way_init(void *cc)
|
void blake256r8_8x32_init(void *cc)
|
||||||
{
|
{
|
||||||
blake32_8way_init( cc, IV256, salt_zero_8way_small, 8 );
|
blake32_8way_init( cc, IV256, 8 );
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256r8_8way_update(void *cc, const void *data, size_t len)
|
blake256r8_8x32_update(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
blake32_8way(cc, data, len);
|
blake32_8way(cc, data, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256r8_8way_close(void *cc, void *dst)
|
blake256r8_8x32_close(void *cc, void *dst)
|
||||||
{
|
{
|
||||||
blake32_8way_close(cc, 0, 0, dst, 8);
|
blake32_8way_close(cc, 0, 0, dst, 8);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -29,13 +29,6 @@ typedef struct
|
|||||||
|
|
||||||
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||||
const uint32_t T0, const uint32_t T1, int rounds );
|
const uint32_t T0, const uint32_t T1, int rounds );
|
||||||
/*
|
|
||||||
void blake256_init( blake256_context *sc );
|
|
||||||
void blake256_update( blake256_context *sc, const void *data, size_t len );
|
|
||||||
void blake256_close( blake256_context *sc, void *dst );
|
|
||||||
void blake256_full( blake256_context *sc, void *dst, const void *data,
|
|
||||||
size_t len );
|
|
||||||
*/
|
|
||||||
|
|
||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
//
|
//
|
||||||
@@ -55,6 +48,10 @@ typedef blake_4x32_small_context blake256_4x32_context;
|
|||||||
void blake256_4x32_init(void *ctx);
|
void blake256_4x32_init(void *ctx);
|
||||||
void blake256_4x32_update(void *ctx, const void *data, size_t len);
|
void blake256_4x32_update(void *ctx, const void *data, size_t len);
|
||||||
void blake256_4x32_close(void *ctx, void *dst);
|
void blake256_4x32_close(void *ctx, void *dst);
|
||||||
|
void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
|
||||||
|
void *data );
|
||||||
|
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
|
||||||
|
const void *midhash, const void *data, const int rounds );
|
||||||
|
|
||||||
// 14 rounds
|
// 14 rounds
|
||||||
typedef blake_4x32_small_context blake256r14_4x32_context;
|
typedef blake_4x32_small_context blake256r14_4x32_context;
|
||||||
@@ -68,29 +65,6 @@ void blake256r8_4x32_init(void *cc);
|
|||||||
void blake256r8_4x32_update(void *cc, const void *data, size_t len);
|
void blake256r8_4x32_update(void *cc, const void *data, size_t len);
|
||||||
void blake256r8_4x32_close(void *cc, void *dst);
|
void blake256r8_4x32_close(void *cc, void *dst);
|
||||||
|
|
||||||
void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
|
|
||||||
void *data );
|
|
||||||
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
|
|
||||||
const void *midhash, const void *data, const int rounds );
|
|
||||||
|
|
||||||
#define blake_4way_small_context blake256_4x32_context
|
|
||||||
#define blake256_4way_context blake256_4x32_context
|
|
||||||
#define blake256_4way_init blake256_4x32_init
|
|
||||||
#define blake256_4way_update blake256_4x32_update
|
|
||||||
#define blake256_4way_close blake256_4x32_close
|
|
||||||
#define blake256_4way_update_le blake256_4x32_update_le
|
|
||||||
#define blake256_4way_close_le blake256_4x32_close_le
|
|
||||||
#define blake256_4way_round0_prehash_le blake256_4x32_round0_prehash_le
|
|
||||||
#define blake256_4way_final_rounds_le blake256_4x32_final_rounds_le
|
|
||||||
#define blake256r14_4way_context blake256r14_4x32_context
|
|
||||||
#define blake256r14_4way_init blake256r14_4x32_init
|
|
||||||
#define blake256r14_4way_update blake256r14_4x32_update
|
|
||||||
#define blake256r14_4way_close blake256r14_4x32_close
|
|
||||||
#define blake256r8_4way_context blake256r14_4x32_context
|
|
||||||
#define blake256r8_4way_init blake256r14_4x32_init
|
|
||||||
#define blake256r8_4way_update blake256r14_4x32_update
|
|
||||||
#define blake256r8_4way_close blake256r14_4x32_close
|
|
||||||
|
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
|
|
||||||
//////////////////////////////
|
//////////////////////////////
|
||||||
@@ -107,47 +81,30 @@ typedef struct
|
|||||||
} blake_8way_small_context;
|
} blake_8way_small_context;
|
||||||
|
|
||||||
// Default 14 rounds
|
// Default 14 rounds
|
||||||
typedef blake_8way_small_context blake256_8way_context;
|
typedef blake_8way_small_context blake256_8x32_context;
|
||||||
void blake256_8way_init(void *cc);
|
void blake256_8x32_init(void *cc);
|
||||||
void blake256_8way_update(void *cc, const void *data, size_t len);
|
void blake256_8x32_update(void *cc, const void *data, size_t len);
|
||||||
void blake256_8way_close(void *cc, void *dst);
|
void blake256_8x32_close(void *cc, void *dst);
|
||||||
void blake256_8way_update_le(void *cc, const void *data, size_t len);
|
void blake256_8x32_update_le(void *cc, const void *data, size_t len);
|
||||||
void blake256_8way_close_le(void *cc, void *dst);
|
void blake256_8x32_close_le(void *cc, void *dst);
|
||||||
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
|
||||||
void *data );
|
void *data );
|
||||||
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
|
||||||
const void *midhash, const void *data, const int rounds );
|
const void *midhash, const void *data, const int rounds );
|
||||||
|
|
||||||
// 14 rounds, blake, decred
|
// 14 rounds, blake, decred
|
||||||
typedef blake_8way_small_context blake256r14_8way_context;
|
typedef blake_8way_small_context blake256r14_8x32_context;
|
||||||
void blake256r14_8way_init(void *cc);
|
void blake256r14_8x32_init(void *cc);
|
||||||
void blake256r14_8way_update(void *cc, const void *data, size_t len);
|
void blake256r14_8x32_update(void *cc, const void *data, size_t len);
|
||||||
void blake256r14_8way_close(void *cc, void *dst);
|
void blake256r14_8x32_close(void *cc, void *dst);
|
||||||
|
|
||||||
// 8 rounds, blakecoin, vanilla
|
// 8 rounds, blakecoin, vanilla
|
||||||
typedef blake_8way_small_context blake256r8_8way_context;
|
typedef blake_8way_small_context blake256r8_8x32_context;
|
||||||
void blake256r8_8way_init(void *cc);
|
void blake256r8_8x32_init(void *cc);
|
||||||
void blake256r8_8way_update(void *cc, const void *data, size_t len);
|
void blake256r8_8x32_update(void *cc, const void *data, size_t len);
|
||||||
void blake256r8_8way_close(void *cc, void *dst);
|
void blake256r8_8x32_close(void *cc, void *dst);
|
||||||
|
|
||||||
#define blake_8x32_small_context blake256_8way_context
|
#if defined(SIMD512)
|
||||||
#define blake_8x32_init blake256_8way_init
|
|
||||||
#define blake_8x32_update blake256_8way_update
|
|
||||||
#define blake_8x32_close blake256_8way_close
|
|
||||||
#define blake_8x32_update_le blake256_8way_update_le
|
|
||||||
#define blake_8x32_close_le blake256_8way_close_le
|
|
||||||
#define blake_8x32_round0_prehash_le blake256_8way_round0_prehash
|
|
||||||
#define blake_8x32_final_rounds_le blake256_8way_final_rounds_le
|
|
||||||
#define blake256r14_8x32_context blake256r14_8way_context
|
|
||||||
#define blake256r14_8x32_init blake256r14_8way_init
|
|
||||||
#define blake256r14_8x32_update blake256r14_8way_update
|
|
||||||
#define blake256r14_8x32_close blake256r14_8way_close
|
|
||||||
#define blake256r8_8x32_context blake256r14_8way_context
|
|
||||||
#define blake256r8_8x32_init blake256r14_8way_init
|
|
||||||
#define blake256r8_8x32_update blake256r14_8way_update
|
|
||||||
#define blake256r8_8x32_close blake256r14_8way_close
|
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
|
||||||
|
|
||||||
///////////////////////////////////
|
///////////////////////////////////
|
||||||
//
|
//
|
||||||
@@ -163,46 +120,29 @@ typedef struct
|
|||||||
} blake_16way_small_context __attribute__ ((aligned (128)));
|
} blake_16way_small_context __attribute__ ((aligned (128)));
|
||||||
|
|
||||||
// Default 14 rounds
|
// Default 14 rounds
|
||||||
typedef blake_16way_small_context blake256_16way_context;
|
typedef blake_16way_small_context blake256_16x32_context;
|
||||||
void blake256_16way_init(void *cc);
|
void blake256_16x32_init(void *cc);
|
||||||
void blake256_16way_update(void *cc, const void *data, size_t len);
|
void blake256_16x32_update(void *cc, const void *data, size_t len);
|
||||||
void blake256_16way_close(void *cc, void *dst);
|
void blake256_16x32_close(void *cc, void *dst);
|
||||||
// Expects data in little endian order, no byte swap needed
|
// Expects data in little endian order, no byte swap needed
|
||||||
void blake256_16way_update_le(void *cc, const void *data, size_t len);
|
void blake256_16x32_update_le(void *cc, const void *data, size_t len);
|
||||||
void blake256_16way_close_le(void *cc, void *dst);
|
void blake256_16x32_close_le(void *cc, void *dst);
|
||||||
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
|
void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
|
||||||
void *data );
|
void *data );
|
||||||
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
|
||||||
const void *midhash, const void *data, const int rounds );
|
const void *midhash, const void *data, const int rounds );
|
||||||
|
|
||||||
// 14 rounds, blake, decred
|
// 14 rounds, blake, decred
|
||||||
typedef blake_16way_small_context blake256r14_16way_context;
|
typedef blake_16way_small_context blake256r14_16x32_context;
|
||||||
void blake256r14_16way_init(void *cc);
|
void blake256r14_16x32_init(void *cc);
|
||||||
void blake256r14_16way_update(void *cc, const void *data, size_t len);
|
void blake256r14_16x32_update(void *cc, const void *data, size_t len);
|
||||||
void blake256r14_16way_close(void *cc, void *dst);
|
void blake256r14_16x32_close(void *cc, void *dst);
|
||||||
|
|
||||||
// 8 rounds, blakecoin, vanilla
|
// 8 rounds, blakecoin, vanilla
|
||||||
typedef blake_16way_small_context blake256r8_16way_context;
|
typedef blake_16way_small_context blake256r8_16x32_context;
|
||||||
void blake256r8_16way_init(void *cc);
|
void blake256r8_16x32_init(void *cc);
|
||||||
void blake256r8_16way_update(void *cc, const void *data, size_t len);
|
void blake256r8_16x32_update(void *cc, const void *data, size_t len);
|
||||||
void blake256r8_16way_close(void *cc, void *dst);
|
void blake256r8_16x32_close(void *cc, void *dst);
|
||||||
|
|
||||||
#define blake_16x32_small_context blake256_16way_context
|
|
||||||
#define blake_16x32_init blake256_16way_init
|
|
||||||
#define blake_16x32_update blake256_16way_update
|
|
||||||
#define blake_16x32_close blake256_16way_close
|
|
||||||
#define blake_16x32_update_le blake256_16way_update_le
|
|
||||||
#define blake_16x32_close_le blake256_16way_close_le
|
|
||||||
#define blake_16x32_round0_prehash_le blake256_16way_round0_prehash
|
|
||||||
#define blake_16x32_final_rounds_le blake256_16way_final_rounds_le
|
|
||||||
#define blake256r14_16x32_context blake256r14_16way_context
|
|
||||||
#define blake256r14_16x32_init blake256r14_16way_init
|
|
||||||
#define blake256r14_16x32_update blake256r14_16way_update
|
|
||||||
#define blake256r14_16x32_close blake256r14_16way_close
|
|
||||||
#define blake256r8_16x32_context blake256r8_16way_context
|
|
||||||
#define blake256r8_16x32_init blake256r8_16way_init
|
|
||||||
#define blake256r8_16x32_update blake256r8_16way_update
|
|
||||||
#define blake256r8_16x32_close blake256r8_16way_close
|
|
||||||
|
|
||||||
#endif // AVX512
|
#endif // AVX512
|
||||||
#endif // AVX2
|
#endif // AVX2
|
||||||
|
|||||||
@@ -226,7 +226,7 @@ static const uint8_t sigma[12][16] =
|
|||||||
#define Mx_(n) Mx__(n)
|
#define Mx_(n) Mx__(n)
|
||||||
#define Mx__(n) M ## n
|
#define Mx__(n) M ## n
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
#define B2B8W_G(a, b, c, d, x, y) \
|
#define B2B8W_G(a, b, c, d, x, y) \
|
||||||
{ \
|
{ \
|
||||||
@@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] =
|
|||||||
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
|
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
|
||||||
}
|
}
|
||||||
|
|
||||||
static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
|
static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last )
|
||||||
{
|
{
|
||||||
__m512i v[16], m[16];
|
__m512i v[16], m[16];
|
||||||
|
|
||||||
@@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
|
|||||||
ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
|
ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
|
||||||
}
|
}
|
||||||
|
|
||||||
int blake2b_8way_init( blake2b_8way_ctx *ctx )
|
int blake2b_8x64_init( blake2b_8x64_ctx *ctx )
|
||||||
{
|
{
|
||||||
size_t i;
|
size_t i;
|
||||||
|
|
||||||
@@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
|
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
|
||||||
size_t inlen )
|
size_t inlen )
|
||||||
{
|
{
|
||||||
__m512i* in =(__m512i*)input;
|
__m512i* in =(__m512i*)input;
|
||||||
@@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
|
|||||||
ctx->t[0] += ctx->c;
|
ctx->t[0] += ctx->c;
|
||||||
if ( ctx->t[0] < ctx->c )
|
if ( ctx->t[0] < ctx->c )
|
||||||
ctx->t[1]++;
|
ctx->t[1]++;
|
||||||
blake2b_8way_compress( ctx, 0 );
|
blake2b_8x64_compress( ctx, 0 );
|
||||||
ctx->c = 0;
|
ctx->c = 0;
|
||||||
}
|
}
|
||||||
ctx->b[ c++ ] = in[i];
|
ctx->b[ c++ ] = in[i];
|
||||||
@@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
|
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out )
|
||||||
{
|
{
|
||||||
size_t c;
|
size_t c;
|
||||||
c = ctx->c >> 3;
|
c = ctx->c >> 3;
|
||||||
@@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
|
|||||||
ctx->c += 8;
|
ctx->c += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
blake2b_8way_compress( ctx, 1 ); // final block flag = 1
|
blake2b_8x64_compress( ctx, 1 ); // final block flag = 1
|
||||||
|
|
||||||
casti_m512i( out, 0 ) = ctx->h[0];
|
casti_m512i( out, 0 ) = ctx->h[0];
|
||||||
casti_m512i( out, 1 ) = ctx->h[1];
|
casti_m512i( out, 1 ) = ctx->h[1];
|
||||||
@@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = {
|
|||||||
};
|
};
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
|
static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last )
|
||||||
{
|
{
|
||||||
__m256i v[16], m[16];
|
__m256i v[16], m[16];
|
||||||
|
|
||||||
@@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
|
|||||||
ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
|
ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
|
||||||
}
|
}
|
||||||
|
|
||||||
int blake2b_4way_init( blake2b_4way_ctx *ctx )
|
int blake2b_4x64_init( blake2b_4x64_ctx *ctx )
|
||||||
{
|
{
|
||||||
size_t i;
|
size_t i;
|
||||||
|
|
||||||
@@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
|
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
|
||||||
size_t inlen )
|
size_t inlen )
|
||||||
{
|
{
|
||||||
__m256i* in =(__m256i*)input;
|
__m256i* in =(__m256i*)input;
|
||||||
@@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
|
|||||||
ctx->t[0] += ctx->c;
|
ctx->t[0] += ctx->c;
|
||||||
if ( ctx->t[0] < ctx->c )
|
if ( ctx->t[0] < ctx->c )
|
||||||
ctx->t[1]++;
|
ctx->t[1]++;
|
||||||
blake2b_4way_compress( ctx, 0 );
|
blake2b_4x64_compress( ctx, 0 );
|
||||||
ctx->c = 0;
|
ctx->c = 0;
|
||||||
}
|
}
|
||||||
ctx->b[ c++ ] = in[i];
|
ctx->b[ c++ ] = in[i];
|
||||||
@@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
|
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out )
|
||||||
{
|
{
|
||||||
size_t c;
|
size_t c;
|
||||||
c = ctx->c >> 3;
|
c = ctx->c >> 3;
|
||||||
@@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
|
|||||||
ctx->c += 8;
|
ctx->c += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
blake2b_4way_compress( ctx, 1 ); // final block flag = 1
|
blake2b_4x64_compress( ctx, 1 ); // final block flag = 1
|
||||||
|
|
||||||
casti_m256i( out, 0 ) = ctx->h[0];
|
casti_m256i( out, 0 ) = ctx->h[0];
|
||||||
casti_m256i( out, 1 ) = ctx->h[1];
|
casti_m256i( out, 1 ) = ctx->h[1];
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#ifndef __BLAKE2B_HASH_4WAY_H__
|
#ifndef BLAKE2B_HASH_4WAY_H__
|
||||||
#define __BLAKE2B_HASH_4WAY_H__
|
#define BLAKE2B_HASH_4WAY_H__
|
||||||
|
|
||||||
#include "simd-utils.h"
|
#include "simd-utils.h"
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
@@ -14,8 +14,7 @@
|
|||||||
#define ALIGN(x) __attribute__((aligned(x)))
|
#define ALIGN(x) __attribute__((aligned(x)))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(SIMD512)
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
|
||||||
|
|
||||||
typedef struct ALIGN( 64 ) {
|
typedef struct ALIGN( 64 ) {
|
||||||
__m512i b[16]; // input buffer
|
__m512i b[16]; // input buffer
|
||||||
@@ -23,12 +22,12 @@ typedef struct ALIGN( 64 ) {
|
|||||||
uint64_t t[2]; // total number of bytes
|
uint64_t t[2]; // total number of bytes
|
||||||
size_t c; // pointer for b[]
|
size_t c; // pointer for b[]
|
||||||
size_t outlen; // digest size
|
size_t outlen; // digest size
|
||||||
} blake2b_8way_ctx;
|
} blake2b_8x64_ctx;
|
||||||
|
|
||||||
int blake2b_8way_init( blake2b_8way_ctx *ctx );
|
int blake2b_8x64_init( blake2b_8x64_ctx *ctx );
|
||||||
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
|
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
|
||||||
size_t inlen );
|
size_t inlen );
|
||||||
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
|
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -41,12 +40,12 @@ typedef struct ALIGN( 64 ) {
|
|||||||
uint64_t t[2]; // total number of bytes
|
uint64_t t[2]; // total number of bytes
|
||||||
size_t c; // pointer for b[]
|
size_t c; // pointer for b[]
|
||||||
size_t outlen; // digest size
|
size_t outlen; // digest size
|
||||||
} blake2b_4way_ctx;
|
} blake2b_4x64_ctx;
|
||||||
|
|
||||||
int blake2b_4way_init( blake2b_4way_ctx *ctx );
|
int blake2b_4x64_init( blake2b_4x64_ctx *ctx );
|
||||||
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
|
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
|
||||||
size_t inlen );
|
size_t inlen );
|
||||||
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
|
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "blake2b-hash.h"
|
#include "blake2b-hash.h"
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define BLAKE2B_8WAY
|
#define BLAKE2B_8WAY
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
#define BLAKE2B_4WAY
|
#define BLAKE2B_4WAY
|
||||||
@@ -17,7 +17,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
|||||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
|
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
|
||||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
|
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
|
||||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||||
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
|
blake2b_8x64_ctx ctx __attribute__ ((aligned (64)));
|
||||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
@@ -35,9 +35,9 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
|||||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||||
|
|
||||||
blake2b_8way_init( &ctx );
|
blake2b_8x64_init( &ctx );
|
||||||
blake2b_8way_update( &ctx, vdata, 80 );
|
blake2b_8x64_update( &ctx, vdata, 80 );
|
||||||
blake2b_8way_final( &ctx, hash );
|
blake2b_8x64_final( &ctx, hash );
|
||||||
|
|
||||||
for ( int lane = 0; lane < 8; lane++ )
|
for ( int lane = 0; lane < 8; lane++ )
|
||||||
if ( hash7[ lane<<1 ] <= Htarg )
|
if ( hash7[ lane<<1 ] <= Htarg )
|
||||||
@@ -61,10 +61,10 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
|||||||
// Function not used, code inlined.
|
// Function not used, code inlined.
|
||||||
void blake2b_4way_hash(void *output, const void *input)
|
void blake2b_4way_hash(void *output, const void *input)
|
||||||
{
|
{
|
||||||
blake2b_4way_ctx ctx;
|
blake2b_4x64_ctx ctx;
|
||||||
blake2b_4way_init( &ctx );
|
blake2b_4x64_init( &ctx );
|
||||||
blake2b_4way_update( &ctx, input, 80 );
|
blake2b_4x64_update( &ctx, input, 80 );
|
||||||
blake2b_4way_final( &ctx, output );
|
blake2b_4x64_final( &ctx, output );
|
||||||
}
|
}
|
||||||
|
|
||||||
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||||
@@ -73,7 +73,7 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
|||||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
|
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
|
||||||
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
|
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
|
||||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||||
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
|
blake2b_4x64_ctx ctx __attribute__ ((aligned (32)));
|
||||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
@@ -90,9 +90,9 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
|||||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||||
|
|
||||||
blake2b_4way_init( &ctx );
|
blake2b_4x64_init( &ctx );
|
||||||
blake2b_4way_update( &ctx, vdata, 80 );
|
blake2b_4x64_update( &ctx, vdata, 80 );
|
||||||
blake2b_4way_final( &ctx, hash );
|
blake2b_4x64_final( &ctx, hash );
|
||||||
|
|
||||||
for ( int lane = 0; lane < 4; lane++ )
|
for ( int lane = 0; lane < 4; lane++ )
|
||||||
if ( hash7[ lane<<1 ] <= Htarg )
|
if ( hash7[ lane<<1 ] <= Htarg )
|
||||||
|
|||||||
@@ -497,7 +497,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
|
|||||||
|
|
||||||
#endif // __AVX2__
|
#endif // __AVX2__
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
// Blake2s-256 16 way
|
// Blake2s-256 16 way
|
||||||
|
|
||||||
|
|||||||
@@ -11,8 +11,8 @@
|
|||||||
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
|
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||||
*/
|
*/
|
||||||
//#pragma once
|
//#pragma once
|
||||||
#ifndef __BLAKE2S_HASH_4WAY_H__
|
#ifndef BLAKE2S_HASH_4WAY_H__
|
||||||
#define __BLAKE2S_HASH_4WAY_H__ 1
|
#define BLAKE2S_HASH_4WAY_H__ 1
|
||||||
|
|
||||||
#if defined(__SSE2__) || defined(__ARM_NEON)
|
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||||
|
|
||||||
@@ -61,13 +61,18 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
|
|||||||
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
|
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
|
||||||
const void *input, uint64_t inlen );
|
const void *input, uint64_t inlen );
|
||||||
|
|
||||||
|
#define blake2s_4x32_state blake2s_4way_state
|
||||||
|
#define blake2s_4x32_init blake2s_4way_init
|
||||||
|
#define blake2s_4x32_update blake2s_4way_update
|
||||||
|
#define blake2s_4x32_final blake2s_4way_final
|
||||||
|
#define blake2s_4x32_full_blocks blake2s_4way_full_blocks
|
||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
typedef struct ALIGN( 64 ) __blake2s_8way_state
|
typedef struct ALIGN( 64 ) __blake2s_8way_state
|
||||||
{
|
{
|
||||||
__m256i h[8];
|
__m256i h[8];
|
||||||
uint8_t buf[ 32 * 8 ];
|
uint8_t buf[ 64 * 8 ];
|
||||||
uint32_t t[2];
|
uint32_t t[2];
|
||||||
uint32_t f[2];
|
uint32_t f[2];
|
||||||
size_t buflen;
|
size_t buflen;
|
||||||
@@ -81,14 +86,20 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
|
|||||||
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
|
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
|
||||||
const void *input, uint64_t inlen );
|
const void *input, uint64_t inlen );
|
||||||
|
|
||||||
|
#define blake2s_8x32_state blake2s_8way_state
|
||||||
|
#define blake2s_8x32_init blake2s_8way_init
|
||||||
|
#define blake2s_8x32_update blake2s_8way_update
|
||||||
|
#define blake2s_8x32_final blake2s_8way_final
|
||||||
|
#define blake2s_8x32_full_blocks blake2s_8way_full_blocks
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
typedef struct ALIGN( 64 ) __blake2s_16way_state
|
typedef struct ALIGN( 64 ) __blake2s_16way_state
|
||||||
{
|
{
|
||||||
__m512i h[8];
|
__m512i h[8];
|
||||||
uint8_t buf[ 32 * 16 ];
|
uint8_t buf[ 64 * 16 ];
|
||||||
uint32_t t[2];
|
uint32_t t[2];
|
||||||
uint32_t f[2];
|
uint32_t f[2];
|
||||||
size_t buflen;
|
size_t buflen;
|
||||||
@@ -100,6 +111,11 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
|
|||||||
uint64_t inlen );
|
uint64_t inlen );
|
||||||
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
|
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
|
||||||
|
|
||||||
|
#define blake2s_16x32_state blake2s_16way_state
|
||||||
|
#define blake2s_16x32_init blake2s_16way_init
|
||||||
|
#define blake2s_16x32_update blake2s_16way_update
|
||||||
|
#define blake2s_16x32_final blake2s_16way_final
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define BLAKE2S_16WAY
|
#define BLAKE2S_16WAY
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
#define BLAKE2S_8WAY
|
#define BLAKE2S_8WAY
|
||||||
|
|||||||
@@ -349,16 +349,16 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
|
|||||||
Va = v128_add64( Va, v128_add64( Vb, \
|
Va = v128_add64( Va, v128_add64( Vb, \
|
||||||
v128_set64( CBx( r, Sd ) ^ Mx( r, Sc ), \
|
v128_set64( CBx( r, Sd ) ^ Mx( r, Sc ), \
|
||||||
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
|
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
|
||||||
Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
|
Vd = v128_ror64xor( Vd, Va, 32 ); \
|
||||||
Vc = v128_add64( Vc, Vd ); \
|
Vc = v128_add64( Vc, Vd ); \
|
||||||
Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
|
Vb = v128_ror64xor( Vb, Vc, 25 ); \
|
||||||
\
|
\
|
||||||
Va = v128_add64( Va, v128_add64( Vb, \
|
Va = v128_add64( Va, v128_add64( Vb, \
|
||||||
v128_set64( CBx( r, Sc ) ^ Mx( r, Sd ), \
|
v128_set64( CBx( r, Sc ) ^ Mx( r, Sd ), \
|
||||||
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
|
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
|
||||||
Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
|
Vd = v128_ror64xor( Vd, Va, 16 ); \
|
||||||
Vc = v128_add64( Vc, Vd ); \
|
Vc = v128_add64( Vc, Vd ); \
|
||||||
Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
|
Vb = v128_ror64xor( Vb, Vc, 11 ); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define BLAKE512_ROUND( R ) \
|
#define BLAKE512_ROUND( R ) \
|
||||||
@@ -559,7 +559,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
|
|||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
////////////////////////////////////
|
////////////////////////////////////
|
||||||
//
|
//
|
||||||
@@ -617,24 +617,22 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
|
|||||||
VD = v512_64( CB5 ^ T0 ); \
|
VD = v512_64( CB5 ^ T0 ); \
|
||||||
VE = v512_64( CB6 ^ T1 ); \
|
VE = v512_64( CB6 ^ T1 ); \
|
||||||
VF = v512_64( CB7 ^ T1 ); \
|
VF = v512_64( CB7 ^ T1 ); \
|
||||||
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64( \
|
M0 = mm512_bswap_64( *(buf+ 0) ); \
|
||||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
|
M1 = mm512_bswap_64( *(buf+ 1) ); \
|
||||||
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
M2 = mm512_bswap_64( *(buf+ 2) ); \
|
||||||
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
|
M3 = mm512_bswap_64( *(buf+ 3) ); \
|
||||||
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
|
M4 = mm512_bswap_64( *(buf+ 4) ); \
|
||||||
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
|
M5 = mm512_bswap_64( *(buf+ 5) ); \
|
||||||
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
|
M6 = mm512_bswap_64( *(buf+ 6) ); \
|
||||||
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
|
M7 = mm512_bswap_64( *(buf+ 7) ); \
|
||||||
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
|
M8 = mm512_bswap_64( *(buf+ 8) ); \
|
||||||
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
|
M9 = mm512_bswap_64( *(buf+ 9) ); \
|
||||||
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
|
MA = mm512_bswap_64( *(buf+10) ); \
|
||||||
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
|
MB = mm512_bswap_64( *(buf+11) ); \
|
||||||
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
|
MC = mm512_bswap_64( *(buf+12) ); \
|
||||||
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
|
MD = mm512_bswap_64( *(buf+13) ); \
|
||||||
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
|
ME = mm512_bswap_64( *(buf+14) ); \
|
||||||
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
|
MF = mm512_bswap_64( *(buf+15) ); \
|
||||||
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
|
|
||||||
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
|
|
||||||
ROUND_B_8WAY(0); \
|
ROUND_B_8WAY(0); \
|
||||||
ROUND_B_8WAY(1); \
|
ROUND_B_8WAY(1); \
|
||||||
ROUND_B_8WAY(2); \
|
ROUND_B_8WAY(2); \
|
||||||
@@ -661,7 +659,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
|
|||||||
H7 = mm512_xor3( VF, V7, H7 ); \
|
H7 = mm512_xor3( VF, V7, H7 ); \
|
||||||
}
|
}
|
||||||
|
|
||||||
void blake512_8way_compress( blake_8way_big_context *sc )
|
void blake512_8x64_compress( blake_8x64_big_context *sc )
|
||||||
{
|
{
|
||||||
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
|
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
|
||||||
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
|
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
|
||||||
@@ -685,25 +683,22 @@ void blake512_8way_compress( blake_8way_big_context *sc )
|
|||||||
VE = v512_64( CB6 ^ sc->T1 );
|
VE = v512_64( CB6 ^ sc->T1 );
|
||||||
VF = v512_64( CB7 ^ sc->T1 );
|
VF = v512_64( CB7 ^ sc->T1 );
|
||||||
|
|
||||||
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64(
|
M0 = mm512_bswap_64( sc->buf[ 0] );
|
||||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
|
M1 = mm512_bswap_64( sc->buf[ 1] );
|
||||||
|
M2 = mm512_bswap_64( sc->buf[ 2] );
|
||||||
M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
|
M3 = mm512_bswap_64( sc->buf[ 3] );
|
||||||
M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
|
M4 = mm512_bswap_64( sc->buf[ 4] );
|
||||||
M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
|
M5 = mm512_bswap_64( sc->buf[ 5] );
|
||||||
M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
|
M6 = mm512_bswap_64( sc->buf[ 6] );
|
||||||
M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
|
M7 = mm512_bswap_64( sc->buf[ 7] );
|
||||||
M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
|
M8 = mm512_bswap_64( sc->buf[ 8] );
|
||||||
M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
|
M9 = mm512_bswap_64( sc->buf[ 9] );
|
||||||
M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
|
MA = mm512_bswap_64( sc->buf[10] );
|
||||||
M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
|
MB = mm512_bswap_64( sc->buf[11] );
|
||||||
M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
|
MC = mm512_bswap_64( sc->buf[12] );
|
||||||
MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 );
|
MD = mm512_bswap_64( sc->buf[13] );
|
||||||
MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 );
|
ME = mm512_bswap_64( sc->buf[14] );
|
||||||
MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 );
|
MF = mm512_bswap_64( sc->buf[15] );
|
||||||
MD = _mm512_shuffle_epi8( sc->buf[13], shuf_bswap64 );
|
|
||||||
ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 );
|
|
||||||
MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 );
|
|
||||||
|
|
||||||
ROUND_B_8WAY(0);
|
ROUND_B_8WAY(0);
|
||||||
ROUND_B_8WAY(1);
|
ROUND_B_8WAY(1);
|
||||||
@@ -733,7 +728,7 @@ void blake512_8way_compress( blake_8way_big_context *sc )
|
|||||||
}
|
}
|
||||||
|
|
||||||
// won't be used after prehash implemented
|
// won't be used after prehash implemented
|
||||||
void blake512_8way_compress_le( blake_8x64_big_context *sc )
|
void blake512_8x64_compress_le( blake_8x64_big_context *sc )
|
||||||
{
|
{
|
||||||
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
|
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
|
||||||
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
|
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
|
||||||
@@ -1177,7 +1172,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
|
|||||||
{
|
{
|
||||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||||
sc->T1 = sc->T1 + 1;
|
sc->T1 = sc->T1 + 1;
|
||||||
blake512_8way_compress( sc );
|
blake512_8x64_compress( sc );
|
||||||
sc->ptr = 0;
|
sc->ptr = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1213,7 +1208,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
|
|||||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||||
sc->T1 = sc->T1 + 1;
|
sc->T1 = sc->T1 + 1;
|
||||||
|
|
||||||
blake512_8way_compress( sc );
|
blake512_8x64_compress( sc );
|
||||||
|
|
||||||
mm512_block_bswap_64( (__m512i*)dst, sc->H );
|
mm512_block_bswap_64( (__m512i*)dst, sc->H );
|
||||||
}
|
}
|
||||||
@@ -1244,7 +1239,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
|
|||||||
{
|
{
|
||||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||||
sc->T1 = sc->T1 + 1;
|
sc->T1 = sc->T1 + 1;
|
||||||
blake512_8way_compress_le( sc );
|
blake512_8x64_compress_le( sc );
|
||||||
sc->ptr = 0;
|
sc->ptr = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1280,7 +1275,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
|
|||||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||||
sc->T1 = sc->T1 + 1;
|
sc->T1 = sc->T1 + 1;
|
||||||
|
|
||||||
blake512_8way_compress_le( sc );
|
blake512_8x64_compress_le( sc );
|
||||||
|
|
||||||
mm512_block_bswap_64( (__m512i*)dst, sc->H );
|
mm512_block_bswap_64( (__m512i*)dst, sc->H );
|
||||||
}
|
}
|
||||||
@@ -1355,24 +1350,22 @@ blake512_8x64_close(void *cc, void *dst)
|
|||||||
VD = v256_64( CB5 ^ T0 ); \
|
VD = v256_64( CB5 ^ T0 ); \
|
||||||
VE = v256_64( CB6 ^ T1 ); \
|
VE = v256_64( CB6 ^ T1 ); \
|
||||||
VF = v256_64( CB7 ^ T1 ); \
|
VF = v256_64( CB7 ^ T1 ); \
|
||||||
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64( \
|
M0 = mm256_bswap_64( *(buf+ 0) ); \
|
||||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
|
M1 = mm256_bswap_64( *(buf+ 1) ); \
|
||||||
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
M2 = mm256_bswap_64( *(buf+ 2) ); \
|
||||||
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
|
M3 = mm256_bswap_64( *(buf+ 3) ); \
|
||||||
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
|
M4 = mm256_bswap_64( *(buf+ 4) ); \
|
||||||
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
|
M5 = mm256_bswap_64( *(buf+ 5) ); \
|
||||||
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
|
M6 = mm256_bswap_64( *(buf+ 6) ); \
|
||||||
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
|
M7 = mm256_bswap_64( *(buf+ 7) ); \
|
||||||
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
|
M8 = mm256_bswap_64( *(buf+ 8) ); \
|
||||||
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
|
M9 = mm256_bswap_64( *(buf+ 9) ); \
|
||||||
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
|
MA = mm256_bswap_64( *(buf+10) ); \
|
||||||
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
|
MB = mm256_bswap_64( *(buf+11) ); \
|
||||||
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
|
MC = mm256_bswap_64( *(buf+12) ); \
|
||||||
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
|
MD = mm256_bswap_64( *(buf+13) ); \
|
||||||
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
|
ME = mm256_bswap_64( *(buf+14) ); \
|
||||||
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
|
MF = mm256_bswap_64( *(buf+15) ); \
|
||||||
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
|
|
||||||
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
|
|
||||||
ROUND_B_4WAY(0); \
|
ROUND_B_4WAY(0); \
|
||||||
ROUND_B_4WAY(1); \
|
ROUND_B_4WAY(1); \
|
||||||
ROUND_B_4WAY(2); \
|
ROUND_B_4WAY(2); \
|
||||||
@@ -1400,7 +1393,7 @@ blake512_8x64_close(void *cc, void *dst)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void blake512_4way_compress( blake_4x64_big_context *sc )
|
void blake512_4x64_compress( blake_4x64_big_context *sc )
|
||||||
{
|
{
|
||||||
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
|
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
|
||||||
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
|
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
|
||||||
@@ -1423,25 +1416,23 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
|
|||||||
VD = v256_64( CB5 ^ sc->T0 );
|
VD = v256_64( CB5 ^ sc->T0 );
|
||||||
VE = v256_64( CB6 ^ sc->T1 );
|
VE = v256_64( CB6 ^ sc->T1 );
|
||||||
VF = v256_64( CB7 ^ sc->T1 );
|
VF = v256_64( CB7 ^ sc->T1 );
|
||||||
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64(
|
|
||||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
|
|
||||||
|
|
||||||
M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
|
M0 = mm256_bswap_64( sc->buf[ 0] );
|
||||||
M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
|
M1 = mm256_bswap_64( sc->buf[ 1] );
|
||||||
M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
|
M2 = mm256_bswap_64( sc->buf[ 2] );
|
||||||
M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
|
M3 = mm256_bswap_64( sc->buf[ 3] );
|
||||||
M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
|
M4 = mm256_bswap_64( sc->buf[ 4] );
|
||||||
M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
|
M5 = mm256_bswap_64( sc->buf[ 5] );
|
||||||
M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
|
M6 = mm256_bswap_64( sc->buf[ 6] );
|
||||||
M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
|
M7 = mm256_bswap_64( sc->buf[ 7] );
|
||||||
M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
|
M8 = mm256_bswap_64( sc->buf[ 8] );
|
||||||
M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
|
M9 = mm256_bswap_64( sc->buf[ 9] );
|
||||||
MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 );
|
MA = mm256_bswap_64( sc->buf[10] );
|
||||||
MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 );
|
MB = mm256_bswap_64( sc->buf[11] );
|
||||||
MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 );
|
MC = mm256_bswap_64( sc->buf[12] );
|
||||||
MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 );
|
MD = mm256_bswap_64( sc->buf[13] );
|
||||||
ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 );
|
ME = mm256_bswap_64( sc->buf[14] );
|
||||||
MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 );
|
MF = mm256_bswap_64( sc->buf[15] );
|
||||||
|
|
||||||
ROUND_B_4WAY(0);
|
ROUND_B_4WAY(0);
|
||||||
ROUND_B_4WAY(1);
|
ROUND_B_4WAY(1);
|
||||||
@@ -1470,7 +1461,7 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
|
|||||||
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
|
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
|
||||||
}
|
}
|
||||||
|
|
||||||
void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
|
void blake512_4x64_prehash_le( blake512_4x64_context *sc, __m256i *midstate,
|
||||||
const void *data )
|
const void *data )
|
||||||
{
|
{
|
||||||
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
|
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
|
||||||
@@ -1562,7 +1553,7 @@ void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
|
|||||||
midstate[15] = VF;
|
midstate[15] = VF;
|
||||||
}
|
}
|
||||||
|
|
||||||
void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
|
void blake512_4x64_final_le( blake512_4x64_context *sc, void *hash,
|
||||||
const __m256i nonce, const __m256i *midstate )
|
const __m256i nonce, const __m256i *midstate )
|
||||||
{
|
{
|
||||||
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
|
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
|
||||||
@@ -1685,7 +1676,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void blake512_4x64_init( blake_4x64_big_context *sc )
|
void blake512_4x64_init( blake512_4x64_context *sc )
|
||||||
{
|
{
|
||||||
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
|
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
|
||||||
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
|
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
|
||||||
@@ -1798,7 +1789,7 @@ blake64_4way_close( blake_4x64_big_context *sc, void *dst )
|
|||||||
}
|
}
|
||||||
|
|
||||||
// init, update & close
|
// init, update & close
|
||||||
void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
|
void blake512_4x64_full( blake512_4x64_context *sc, void * dst,
|
||||||
const void *data, size_t len )
|
const void *data, size_t len )
|
||||||
{
|
{
|
||||||
|
|
||||||
@@ -1824,7 +1815,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
|
|||||||
{
|
{
|
||||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||||
sc->T1 = sc->T1 + 1;
|
sc->T1 = sc->T1 + 1;
|
||||||
blake512_4way_compress( sc );
|
blake512_4x64_compress( sc );
|
||||||
sc->ptr = 0;
|
sc->ptr = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1859,7 +1850,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
|
|||||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||||
sc->T1 = sc->T1 + 1;
|
sc->T1 = sc->T1 + 1;
|
||||||
|
|
||||||
blake512_4way_compress( sc );
|
blake512_4x64_compress( sc );
|
||||||
|
|
||||||
mm256_block_bswap_64( (__m256i*)dst, sc->H );
|
mm256_block_bswap_64( (__m256i*)dst, sc->H );
|
||||||
}
|
}
|
||||||
@@ -1887,13 +1878,13 @@ blake512_4x64_close(void *cc, void *dst)
|
|||||||
#define GB_2X64( m0, m1, c0, c1, a, b, c, d ) \
|
#define GB_2X64( m0, m1, c0, c1, a, b, c, d ) \
|
||||||
{ \
|
{ \
|
||||||
a = v128_add64( v128_add64( v128_xor( v128_64( c1 ), m0 ), b ), a ); \
|
a = v128_add64( v128_add64( v128_xor( v128_64( c1 ), m0 ), b ), a ); \
|
||||||
d = v128_ror64( v128_xor( d, a ), 32 ); \
|
d = v128_ror64xor( d, a, 32 ); \
|
||||||
c = v128_add64( c, d ); \
|
c = v128_add64( c, d ); \
|
||||||
b = v128_ror64( v128_xor( b, c ), 25 ); \
|
b = v128_ror64xor( b, c, 25 ); \
|
||||||
a = v128_add64( v128_add64( v128_xor( v128_64( c0 ), m1 ), b ), a ); \
|
a = v128_add64( v128_add64( v128_xor( v128_64( c0 ), m1 ), b ), a ); \
|
||||||
d = v128_ror64( v128_xor( d, a ), 16 ); \
|
d = v128_ror64xor( d, a, 16 ); \
|
||||||
c = v128_add64( c, d ); \
|
c = v128_add64( c, d ); \
|
||||||
b = v128_ror64( v128_xor( b, c ), 11 ); \
|
b = v128_ror64xor( b, c, 11 ); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define ROUND_B_2X64(r) \
|
#define ROUND_B_2X64(r) \
|
||||||
@@ -1934,29 +1925,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
|
|||||||
VE = v128_64( CB6 ^ sc->T1 );
|
VE = v128_64( CB6 ^ sc->T1 );
|
||||||
VF = v128_64( CB7 ^ sc->T1 );
|
VF = v128_64( CB7 ^ sc->T1 );
|
||||||
|
|
||||||
#if defined(__SSSE3__)
|
|
||||||
|
|
||||||
const v128u64_t shuf_bswap64 = v128_set64(
|
|
||||||
0x08090a0b0c0d0e0f, 0x0001020304050607 );
|
|
||||||
M0 = v128_shuffle8( sc->buf[ 0], shuf_bswap64 );
|
|
||||||
M1 = v128_shuffle8( sc->buf[ 1], shuf_bswap64 );
|
|
||||||
M2 = v128_shuffle8( sc->buf[ 2], shuf_bswap64 );
|
|
||||||
M3 = v128_shuffle8( sc->buf[ 3], shuf_bswap64 );
|
|
||||||
M4 = v128_shuffle8( sc->buf[ 4], shuf_bswap64 );
|
|
||||||
M5 = v128_shuffle8( sc->buf[ 5], shuf_bswap64 );
|
|
||||||
M6 = v128_shuffle8( sc->buf[ 6], shuf_bswap64 );
|
|
||||||
M7 = v128_shuffle8( sc->buf[ 7], shuf_bswap64 );
|
|
||||||
M8 = v128_shuffle8( sc->buf[ 8], shuf_bswap64 );
|
|
||||||
M9 = v128_shuffle8( sc->buf[ 9], shuf_bswap64 );
|
|
||||||
MA = v128_shuffle8( sc->buf[10], shuf_bswap64 );
|
|
||||||
MB = v128_shuffle8( sc->buf[11], shuf_bswap64 );
|
|
||||||
MC = v128_shuffle8( sc->buf[12], shuf_bswap64 );
|
|
||||||
MD = v128_shuffle8( sc->buf[13], shuf_bswap64 );
|
|
||||||
ME = v128_shuffle8( sc->buf[14], shuf_bswap64 );
|
|
||||||
MF = v128_shuffle8( sc->buf[15], shuf_bswap64 );
|
|
||||||
|
|
||||||
#else // SSE2 & NEON
|
|
||||||
|
|
||||||
M0 = v128_bswap64( sc->buf[ 0] );
|
M0 = v128_bswap64( sc->buf[ 0] );
|
||||||
M1 = v128_bswap64( sc->buf[ 1] );
|
M1 = v128_bswap64( sc->buf[ 1] );
|
||||||
M2 = v128_bswap64( sc->buf[ 2] );
|
M2 = v128_bswap64( sc->buf[ 2] );
|
||||||
@@ -1974,8 +1942,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
|
|||||||
ME = v128_bswap64( sc->buf[14] );
|
ME = v128_bswap64( sc->buf[14] );
|
||||||
MF = v128_bswap64( sc->buf[15] );
|
MF = v128_bswap64( sc->buf[15] );
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ROUND_B_2X64(0);
|
ROUND_B_2X64(0);
|
||||||
ROUND_B_2X64(1);
|
ROUND_B_2X64(1);
|
||||||
ROUND_B_2X64(2);
|
ROUND_B_2X64(2);
|
||||||
@@ -2054,9 +2020,9 @@ void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc,
|
|||||||
// G4 skip nonce
|
// G4 skip nonce
|
||||||
V0 = v128_add64( v128_add64( v128_xor( v128_64( CB9 ), sc->buf[ 8] ), V5 ),
|
V0 = v128_add64( v128_add64( v128_xor( v128_64( CB9 ), sc->buf[ 8] ), V5 ),
|
||||||
V0 );
|
V0 );
|
||||||
VF = v128_ror64( v128_xor( VF, V0 ), 32 );
|
VF = v128_ror64xor( VF, V0, 32 );
|
||||||
VA = v128_add64( VA, VF );
|
VA = v128_add64( VA, VF );
|
||||||
V5 = v128_ror64( v128_xor( V5, VA ), 25 );
|
V5 = v128_ror64xor( V5, VA, 25 );
|
||||||
V0 = v128_add64( V0, V5 );
|
V0 = v128_add64( V0, V5 );
|
||||||
|
|
||||||
GB_2X64( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
|
GB_2X64( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
|
||||||
@@ -2137,9 +2103,9 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
|
|||||||
|
|
||||||
// finish round 0, with the nonce now available
|
// finish round 0, with the nonce now available
|
||||||
V0 = v128_add64( V0, v128_xor( v128_64( CB8 ), M9 ) );
|
V0 = v128_add64( V0, v128_xor( v128_64( CB8 ), M9 ) );
|
||||||
VF = v128_ror64( v128_xor( VF, V0 ), 16 );
|
VF = v128_ror64xor( VF, V0, 16 );
|
||||||
VA = v128_add64( VA, VF );
|
VA = v128_add64( VA, VF );
|
||||||
V5 = v128_ror64( v128_xor( V5, VA ), 11 );
|
V5 = v128_ror64xor( V5, VA, 11 );
|
||||||
|
|
||||||
// Round 1
|
// Round 1
|
||||||
// G0
|
// G0
|
||||||
@@ -2147,34 +2113,34 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
|
|||||||
|
|
||||||
// G1
|
// G1
|
||||||
V1 = v128_add64( V1, V5 );
|
V1 = v128_add64( V1, V5 );
|
||||||
VD = v128_ror64( v128_xor( VD, V1 ), 32 );
|
VD = v128_ror64xor( VD, V1, 32 );
|
||||||
V9 = v128_add64( V9, VD );
|
V9 = v128_add64( V9, VD );
|
||||||
V5 = v128_ror64( v128_xor( V5, V9 ), 25 );
|
V5 = v128_ror64xor( V5, V9, 25 );
|
||||||
V1 = v128_add64( V1, v128_add64( v128_xor( v128_64( CBx(1,2) ), Mx(1,3) ),
|
V1 = v128_add64( V1, v128_add64( v128_xor( v128_64( CBx(1,2) ), Mx(1,3) ),
|
||||||
V5 ) );
|
V5 ) );
|
||||||
VD = v128_ror64( v128_xor( VD, V1 ), 16 );
|
VD = v128_ror64xor( VD, V1, 16 );
|
||||||
V9 = v128_add64( V9, VD );
|
V9 = v128_add64( V9, VD );
|
||||||
V5 = v128_ror64( v128_xor( V5, V9 ), 11 );
|
V5 = v128_ror64xor( V5, V9, 11 );
|
||||||
|
|
||||||
// G2
|
// G2
|
||||||
V2 = v128_add64( V2, v128_xor( v128_64( CBF ), M9 ) );
|
V2 = v128_add64( V2, v128_xor( v128_64( CBF ), M9 ) );
|
||||||
VE = v128_ror64( v128_xor( VE, V2 ), 32 );
|
VE = v128_ror64xor( VE, V2, 32 );
|
||||||
VA = v128_add64( VA, VE );
|
VA = v128_add64( VA, VE );
|
||||||
V6 = v128_ror64( v128_xor( V6, VA ), 25 );
|
V6 = v128_ror64xor( V6, VA, 25 );
|
||||||
V2 = v128_add64( V2, v128_add64( v128_xor( v128_64( CB9 ), MF ), V6 ) );
|
V2 = v128_add64( V2, v128_add64( v128_xor( v128_64( CB9 ), MF ), V6 ) );
|
||||||
VE = v128_ror64( v128_xor( VE, V2 ), 16 );
|
VE = v128_ror64xor( VE, V2, 16 );
|
||||||
VA = v128_add64( VA, VE );
|
VA = v128_add64( VA, VE );
|
||||||
V6 = v128_ror64( v128_xor( V6, VA ), 11 );
|
V6 = v128_ror64xor( V6, VA, 11 );
|
||||||
|
|
||||||
// G3
|
// G3
|
||||||
VF = v128_ror64( v128_xor( VF, V3 ), 32 );
|
VF = v128_ror64xor( VF, V3, 32 );
|
||||||
VB = v128_add64( VB, VF );
|
VB = v128_add64( VB, VF );
|
||||||
V7 = v128_ror64( v128_xor( V7, VB ), 25 );
|
V7 = v128_ror64xor( V7, VB, 25 );
|
||||||
V3 = v128_add64( V3, v128_add64( v128_xor( v128_64( CBx(1, 6) ), Mx(1, 7) ),
|
V3 = v128_add64( V3, v128_add64( v128_xor( v128_64( CBx(1, 6) ), Mx(1, 7) ),
|
||||||
V7 ) );
|
V7 ) );
|
||||||
VF = v128_ror64( v128_xor( VF, V3 ), 16 );
|
VF = v128_ror64xor( VF, V3, 16 );
|
||||||
VB = v128_add64( VB, VF );
|
VB = v128_add64( VB, VF );
|
||||||
V7 = v128_ror64( v128_xor( V7, VB ), 11 );
|
V7 = v128_ror64xor( V7, VB, 11 );
|
||||||
|
|
||||||
// G4, G5, G6, G7
|
// G4, G5, G6, G7
|
||||||
GB_2X64(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
|
GB_2X64(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
|
||||||
|
|||||||
@@ -92,7 +92,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
|
|||||||
#define blake512_4way_prehash_le blake512_4x64_prehash_le
|
#define blake512_4way_prehash_le blake512_4x64_prehash_le
|
||||||
#define blake512_4way_final_le blake512_4x64_final_le
|
#define blake512_4way_final_le blake512_4x64_final_le
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
////////////////////////////
|
////////////////////////////
|
||||||
//
|
//
|
||||||
|
|||||||
@@ -54,10 +54,10 @@ int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
|
|||||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||||
|
|
||||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||||
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
do {
|
do {
|
||||||
blake256_16way_final_rounds_le( hash32, midstate_vars, block0_hash,
|
blake256_16x32_final_rounds_le( hash32, midstate_vars, block0_hash,
|
||||||
block_buf, rounds );
|
block_buf, rounds );
|
||||||
for ( int lane = 0; lane < 16; lane++ )
|
for ( int lane = 0; lane < 16; lane++ )
|
||||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||||
@@ -123,10 +123,10 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
|||||||
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||||
|
|
||||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||||
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
do {
|
do {
|
||||||
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
|
blake256_8x32_final_rounds_le( hash32, midstate_vars, block0_hash,
|
||||||
block_buf, rounds );
|
block_buf, rounds );
|
||||||
for ( int lane = 0; lane < 8; lane++ )
|
for ( int lane = 0; lane < 8; lane++ )
|
||||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||||
@@ -148,16 +148,16 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
|||||||
|
|
||||||
#elif defined (BLAKECOIN_4WAY)
|
#elif defined (BLAKECOIN_4WAY)
|
||||||
|
|
||||||
blake256r8_4way_context blakecoin_4w_ctx;
|
blake256r8_4x32_context blakecoin_4w_ctx;
|
||||||
|
|
||||||
void blakecoin_4way_hash(void *state, const void *input)
|
void blakecoin_4way_hash(void *state, const void *input)
|
||||||
{
|
{
|
||||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||||
blake256r8_4way_context ctx;
|
blake256r8_4x32_context ctx;
|
||||||
|
|
||||||
memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
|
memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
|
||||||
blake256r8_4way_update( &ctx, input + (64<<2), 16 );
|
blake256r8_4x32_update( &ctx, input + (64<<2), 16 );
|
||||||
blake256r8_4way_close( &ctx, vhash );
|
blake256r8_4x32_close( &ctx, vhash );
|
||||||
|
|
||||||
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||||
}
|
}
|
||||||
@@ -178,8 +178,8 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
|
|||||||
HTarget = 0x7f;
|
HTarget = 0x7f;
|
||||||
|
|
||||||
v128_bswap32_intrlv80_4x32( vdata, pdata );
|
v128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||||
blake256r8_4way_init( &blakecoin_4w_ctx );
|
blake256r8_4x32_init( &blakecoin_4w_ctx );
|
||||||
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
|
blake256r8_4x32_update( &blakecoin_4w_ctx, vdata, 64 );
|
||||||
|
|
||||||
do {
|
do {
|
||||||
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define BLAKECOIN_16WAY
|
#define BLAKECOIN_16WAY
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
#define BLAKECOIN_8WAY
|
#define BLAKECOIN_8WAY
|
||||||
|
|||||||
@@ -16,28 +16,27 @@ extern void pentablakehash_4way( void *output, const void *input )
|
|||||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||||
blake512_4way_context ctx;
|
blake512_4x64_context ctx;
|
||||||
|
|
||||||
|
blake512_4x64_init( &ctx );
|
||||||
|
blake512_4x64_update( &ctx, input, 80 );
|
||||||
|
blake512_4x64_close( &ctx, vhash );
|
||||||
|
|
||||||
blake512_4way_init( &ctx );
|
blake512_4x64_init( &ctx );
|
||||||
blake512_4way_update( &ctx, input, 80 );
|
blake512_4x64_update( &ctx, vhash, 64 );
|
||||||
blake512_4way_close( &ctx, vhash );
|
blake512_4x64_close( &ctx, vhash );
|
||||||
|
|
||||||
blake512_4way_init( &ctx );
|
blake512_4x64_init( &ctx );
|
||||||
blake512_4way_update( &ctx, vhash, 64 );
|
blake512_4x64_update( &ctx, vhash, 64 );
|
||||||
blake512_4way_close( &ctx, vhash );
|
blake512_4x64_close( &ctx, vhash );
|
||||||
|
|
||||||
blake512_4way_init( &ctx );
|
blake512_4x64_init( &ctx );
|
||||||
blake512_4way_update( &ctx, vhash, 64 );
|
blake512_4x64_update( &ctx, vhash, 64 );
|
||||||
blake512_4way_close( &ctx, vhash );
|
blake512_4x64_close( &ctx, vhash );
|
||||||
|
|
||||||
blake512_4way_init( &ctx );
|
blake512_4x64_init( &ctx );
|
||||||
blake512_4way_update( &ctx, vhash, 64 );
|
blake512_4x64_update( &ctx, vhash, 64 );
|
||||||
blake512_4way_close( &ctx, vhash );
|
blake512_4x64_close( &ctx, vhash );
|
||||||
|
|
||||||
blake512_4way_init( &ctx );
|
|
||||||
blake512_4way_update( &ctx, vhash, 64 );
|
|
||||||
blake512_4way_close( &ctx, vhash );
|
|
||||||
|
|
||||||
memcpy( output, hash0, 32 );
|
memcpy( output, hash0, 32 );
|
||||||
memcpy( output+32, hash1, 32 );
|
memcpy( output+32, hash1, 32 );
|
||||||
|
|||||||
@@ -227,7 +227,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
|
|||||||
v[14] = S->f[0] ^ blake2s_IV[6];
|
v[14] = S->f[0] ^ blake2s_IV[6];
|
||||||
v[15] = S->f[1] ^ blake2s_IV[7];
|
v[15] = S->f[1] ^ blake2s_IV[7];
|
||||||
|
|
||||||
#if defined(__SSE2__)
|
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||||
|
|
||||||
v128_t *V = (v128_t*)v;
|
v128_t *V = (v128_t*)v;
|
||||||
|
|
||||||
@@ -263,19 +263,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
|
|||||||
V[3] = v128_swap64( V[3] ); \
|
V[3] = v128_swap64( V[3] ); \
|
||||||
V[2] = v128_shufll32( V[2] )
|
V[2] = v128_shufll32( V[2] )
|
||||||
|
|
||||||
BLAKE2S_ROUND(0);
|
|
||||||
BLAKE2S_ROUND(1);
|
|
||||||
BLAKE2S_ROUND(2);
|
|
||||||
BLAKE2S_ROUND(3);
|
|
||||||
BLAKE2S_ROUND(4);
|
|
||||||
BLAKE2S_ROUND(5);
|
|
||||||
BLAKE2S_ROUND(6);
|
|
||||||
BLAKE2S_ROUND(7);
|
|
||||||
BLAKE2S_ROUND(8);
|
|
||||||
BLAKE2S_ROUND(9);
|
|
||||||
|
|
||||||
#undef BLAKE2S_ROUND
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define G(r,i,a,b,c,d) \
|
#define G(r,i,a,b,c,d) \
|
||||||
@@ -290,7 +277,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
|
|||||||
b = SPH_ROTR32(b ^ c, 7); \
|
b = SPH_ROTR32(b ^ c, 7); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define ROUND(r) \
|
#define BLAKE2S_ROUND(r) \
|
||||||
do { \
|
do { \
|
||||||
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
|
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
|
||||||
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
|
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
|
||||||
@@ -302,24 +289,25 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
|
|||||||
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
|
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
ROUND( 0 );
|
|
||||||
ROUND( 1 );
|
|
||||||
ROUND( 2 );
|
|
||||||
ROUND( 3 );
|
|
||||||
ROUND( 4 );
|
|
||||||
ROUND( 5 );
|
|
||||||
ROUND( 6 );
|
|
||||||
ROUND( 7 );
|
|
||||||
ROUND( 8 );
|
|
||||||
ROUND( 9 );
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
BLAKE2S_ROUND(0);
|
||||||
|
BLAKE2S_ROUND(1);
|
||||||
|
BLAKE2S_ROUND(2);
|
||||||
|
BLAKE2S_ROUND(3);
|
||||||
|
BLAKE2S_ROUND(4);
|
||||||
|
BLAKE2S_ROUND(5);
|
||||||
|
BLAKE2S_ROUND(6);
|
||||||
|
BLAKE2S_ROUND(7);
|
||||||
|
BLAKE2S_ROUND(8);
|
||||||
|
BLAKE2S_ROUND(9);
|
||||||
|
|
||||||
|
|
||||||
for( size_t i = 0; i < 8; ++i )
|
for( size_t i = 0; i < 8; ++i )
|
||||||
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
|
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
|
||||||
|
|
||||||
#undef G
|
#undef G
|
||||||
#undef ROUND
|
#undef BLAKE2S_ROUND
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -101,15 +101,15 @@
|
|||||||
{ \
|
{ \
|
||||||
Va = v128_add64( Va, v128_add64( Vb, \
|
Va = v128_add64( Va, v128_add64( Vb, \
|
||||||
v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
|
v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
|
||||||
Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
|
Vd = v128_ror64xor( Vd, Va, 32 ); \
|
||||||
Vc = v128_add64( Vc, Vd ); \
|
Vc = v128_add64( Vc, Vd ); \
|
||||||
Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \
|
Vb = v128_ror64xor( Vb, Vc, 24 ); \
|
||||||
\
|
\
|
||||||
Va = v128_add64( Va, v128_add64( Vb, \
|
Va = v128_add64( Va, v128_add64( Vb, \
|
||||||
v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
|
v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
|
||||||
Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
|
Vd = v128_ror64xor( Vd, Va, 16 ); \
|
||||||
Vc = v128_add64( Vc, Vd ); \
|
Vc = v128_add64( Vc, Vd ); \
|
||||||
Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
|
Vb = v128_ror64xor( Vb, Vc, 63 ); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define BLAKE2B_ROUND( R ) \
|
#define BLAKE2B_ROUND( R ) \
|
||||||
|
|||||||
@@ -39,16 +39,14 @@
|
|||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include "simd-utils.h"
|
#include "simd-utils.h"
|
||||||
|
|
||||||
#define SPH_SIZE_bmw256 256
|
|
||||||
|
|
||||||
#define SPH_SIZE_bmw512 512
|
|
||||||
|
|
||||||
// BMW-256 4 way 32
|
// BMW-256 4 way 32
|
||||||
|
|
||||||
|
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
v128_t buf[64];
|
v128u32_t buf[64];
|
||||||
v128_t H[16];
|
v128u32_t H[16];
|
||||||
size_t ptr;
|
size_t ptr;
|
||||||
uint32_t bit_count; // assume bit_count fits in 32 bits
|
uint32_t bit_count; // assume bit_count fits in 32 bits
|
||||||
} bmw_4way_small_context;
|
} bmw_4way_small_context;
|
||||||
@@ -58,13 +56,19 @@ typedef bmw_4way_small_context bmw256_4way_context;
|
|||||||
void bmw256_4way_init( bmw256_4way_context *ctx );
|
void bmw256_4way_init( bmw256_4way_context *ctx );
|
||||||
|
|
||||||
void bmw256_4way_update(void *cc, const void *data, size_t len);
|
void bmw256_4way_update(void *cc, const void *data, size_t len);
|
||||||
#define bmw256_4way bmw256_4way_update
|
|
||||||
|
|
||||||
void bmw256_4way_close(void *cc, void *dst);
|
void bmw256_4way_close(void *cc, void *dst);
|
||||||
|
|
||||||
void bmw256_4way_addbits_and_close(
|
void bmw256_4way_addbits_and_close(
|
||||||
void *cc, unsigned ub, unsigned n, void *dst);
|
void *cc, unsigned ub, unsigned n, void *dst);
|
||||||
|
|
||||||
|
#define bmw256_4x32_context bmw256_4way_context
|
||||||
|
#define bmw256_4x32_init bmw256_4way_init
|
||||||
|
#define bmw256_4x32_update bmw256_4way_update
|
||||||
|
#define bmw256_4x32_close bmw256_4way_close
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
// BMW-256 8 way 32
|
// BMW-256 8 way 32
|
||||||
@@ -85,9 +89,14 @@ void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
|
|||||||
#define bmw256_8way bmw256_8way_update
|
#define bmw256_8way bmw256_8way_update
|
||||||
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
|
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
|
||||||
|
|
||||||
|
#define bmw256_8x32_context bmw256_8way_context
|
||||||
|
#define bmw256_8x32_init bmw256_8way_init
|
||||||
|
#define bmw256_8x32_update bmw256_8way_update
|
||||||
|
#define bmw256_8x32_close bmw256_8way_close
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
// BMW-256 16 way 32
|
// BMW-256 16 way 32
|
||||||
|
|
||||||
@@ -106,6 +115,11 @@ void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
|
|||||||
size_t len );
|
size_t len );
|
||||||
void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );
|
void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );
|
||||||
|
|
||||||
|
#define bmw256_16x32_context bmw256_16way_context
|
||||||
|
#define bmw256_16x32_init bmw256_16way_init
|
||||||
|
#define bmw256_16x32_update bmw256_16way_update
|
||||||
|
#define bmw256_16x32_close bmw256_16way_close
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// BMW-512 2 way 64
|
// BMW-512 2 way 64
|
||||||
@@ -157,7 +171,7 @@ void bmw512_4way_addbits_and_close(
|
|||||||
|
|
||||||
#endif // __AVX2__
|
#endif // __AVX2__
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
// BMW-512 64 bit 8 way
|
// BMW-512 64 bit 8 way
|
||||||
typedef struct
|
typedef struct
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ extern "C"{
|
|||||||
|
|
||||||
#define LPAR (
|
#define LPAR (
|
||||||
|
|
||||||
#if defined(__SSE2__)
|
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||||
|
|
||||||
// BMW-256 4 way 32
|
// BMW-256 4 way 32
|
||||||
/*
|
/*
|
||||||
@@ -284,9 +284,9 @@ static const uint32_t IV256[] = {
|
|||||||
v128_xor( M[13], H[13] ) ) )
|
v128_xor( M[13], H[13] ) ) )
|
||||||
|
|
||||||
|
|
||||||
void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
|
void compress_small( const v128u32_t *M, const v128u32_t H[16], v128u32_t dH[16] )
|
||||||
{
|
{
|
||||||
v128u64_t qt[32], xl, xh; \
|
v128u32_t qt[32], xl, xh; \
|
||||||
|
|
||||||
qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
|
qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
|
||||||
qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
|
qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
|
||||||
@@ -428,49 +428,25 @@ static const uint32_t final_s[16][4] =
|
|||||||
{ 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae },
|
{ 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae },
|
||||||
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
|
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
|
||||||
};
|
};
|
||||||
/*
|
|
||||||
static const v128u64_t final_s[16] =
|
|
||||||
{
|
|
||||||
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
|
|
||||||
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
|
|
||||||
{ 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 },
|
|
||||||
{ 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 },
|
|
||||||
{ 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 },
|
|
||||||
{ 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 },
|
|
||||||
{ 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 },
|
|
||||||
{ 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 },
|
|
||||||
{ 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 },
|
|
||||||
{ 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 },
|
|
||||||
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
|
|
||||||
{ 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab },
|
|
||||||
{ 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac },
|
|
||||||
{ 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad },
|
|
||||||
{ 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
|
|
||||||
{ 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
|
|
||||||
};
|
|
||||||
*/
|
|
||||||
void bmw256_4way_init( bmw256_4way_context *ctx )
|
void bmw256_4way_init( bmw256_4way_context *ctx )
|
||||||
{
|
{
|
||||||
ctx->H[ 0] = v128_64( 0x4041424340414243 );
|
ctx->H[ 0] = v128_32( 0x40414243 );
|
||||||
ctx->H[ 1] = v128_64( 0x4445464744454647 );
|
ctx->H[ 1] = v128_32( 0x44454647 );
|
||||||
ctx->H[ 2] = v128_64( 0x48494A4B48494A4B );
|
ctx->H[ 2] = v128_32( 0x48494A4B );
|
||||||
ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F );
|
ctx->H[ 3] = v128_32( 0x4C4D4E4F );
|
||||||
ctx->H[ 4] = v128_64( 0x5051525350515253 );
|
ctx->H[ 4] = v128_32( 0x50515253 );
|
||||||
ctx->H[ 5] = v128_64( 0x5455565754555657 );
|
ctx->H[ 5] = v128_32( 0x54555657 );
|
||||||
ctx->H[ 6] = v128_64( 0x58595A5B58595A5B );
|
ctx->H[ 6] = v128_32( 0x58595A5B );
|
||||||
ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F );
|
ctx->H[ 7] = v128_32( 0x5C5D5E5F );
|
||||||
ctx->H[ 8] = v128_64( 0x6061626360616263 );
|
ctx->H[ 8] = v128_32( 0x60616263 );
|
||||||
ctx->H[ 9] = v128_64( 0x6465666764656667 );
|
ctx->H[ 9] = v128_32( 0x64656667 );
|
||||||
ctx->H[10] = v128_64( 0x68696A6B68696A6B );
|
ctx->H[10] = v128_32( 0x68696A6B );
|
||||||
ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F );
|
ctx->H[11] = v128_32( 0x6C6D6E6F );
|
||||||
ctx->H[12] = v128_64( 0x7071727370717273 );
|
ctx->H[12] = v128_32( 0x70717273 );
|
||||||
ctx->H[13] = v128_64( 0x7475767774757677 );
|
ctx->H[13] = v128_32( 0x74757677 );
|
||||||
ctx->H[14] = v128_64( 0x78797A7B78797A7B );
|
ctx->H[14] = v128_32( 0x78797A7B );
|
||||||
ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F );
|
ctx->H[15] = v128_32( 0x7C7D7E7F );
|
||||||
|
|
||||||
|
|
||||||
// for ( int i = 0; i < 16; i++ )
|
|
||||||
// sc->H[i] = v128_32( iv[i] );
|
|
||||||
ctx->ptr = 0;
|
ctx->ptr = 0;
|
||||||
ctx->bit_count = 0;
|
ctx->bit_count = 0;
|
||||||
}
|
}
|
||||||
@@ -478,10 +454,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
|
|||||||
static void
|
static void
|
||||||
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
|
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
v128u64_t *vdata = (v128u64_t*)data;
|
v128u32_t *vdata = (v128u32_t*)data;
|
||||||
v128u64_t *buf;
|
v128u32_t *buf;
|
||||||
v128u64_t htmp[16];
|
v128u32_t htmp[16];
|
||||||
v128u64_t *h1, *h2;
|
v128u32_t *h1, *h2;
|
||||||
size_t ptr;
|
size_t ptr;
|
||||||
const int buf_size = 64; // bytes of one lane, compatible with len
|
const int buf_size = 64; // bytes of one lane, compatible with len
|
||||||
|
|
||||||
@@ -503,7 +479,7 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
|
|||||||
ptr += clen;
|
ptr += clen;
|
||||||
if ( ptr == buf_size )
|
if ( ptr == buf_size )
|
||||||
{
|
{
|
||||||
v128u64_t *ht;
|
v128u32_t *ht;
|
||||||
compress_small( buf, h1, h2 );
|
compress_small( buf, h1, h2 );
|
||||||
ht = h1;
|
ht = h1;
|
||||||
h1 = h2;
|
h1 = h2;
|
||||||
@@ -521,14 +497,14 @@ static void
|
|||||||
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
|
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
|
||||||
void *dst, size_t out_size_w32)
|
void *dst, size_t out_size_w32)
|
||||||
{
|
{
|
||||||
v128u64_t *buf;
|
v128u32_t *buf;
|
||||||
v128u64_t h1[16], h2[16], *h;
|
v128u32_t h1[16], h2[16], *h;
|
||||||
size_t ptr, u, v;
|
size_t ptr, u, v;
|
||||||
const int buf_size = 64; // bytes of one lane, compatible with len
|
const int buf_size = 64; // bytes of one lane, compatible with len
|
||||||
|
|
||||||
buf = sc->buf;
|
buf = sc->buf;
|
||||||
ptr = sc->ptr;
|
ptr = sc->ptr;
|
||||||
buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
|
buf[ ptr>>2 ] = v128_32( 0x00000080 );
|
||||||
ptr += 4;
|
ptr += 4;
|
||||||
h = sc->H;
|
h = sc->H;
|
||||||
|
|
||||||
@@ -548,7 +524,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
|
|||||||
for ( u = 0; u < 16; u ++ )
|
for ( u = 0; u < 16; u ++ )
|
||||||
buf[u] = h2[u];
|
buf[u] = h2[u];
|
||||||
|
|
||||||
compress_small( buf, (v128u64_t*)final_s, h1 );
|
compress_small( buf, (v128u32_t*)final_s, h1 );
|
||||||
|
|
||||||
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
|
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
|
||||||
casti_v128( dst, u ) = h1[v];
|
casti_v128( dst, u ) = h1[v];
|
||||||
@@ -1057,7 +1033,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
|
|||||||
|
|
||||||
#endif // __AVX2__
|
#endif // __AVX2__
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
// BMW-256 16 way 32
|
// BMW-256 16 way 32
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define BMW512_8WAY 1
|
#define BMW512_8WAY 1
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
#define BMW512_4WAY 1
|
#define BMW512_4WAY 1
|
||||||
|
|||||||
@@ -683,8 +683,9 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
|||||||
mj[14] = mm256_rol_64( M[14], 15 );
|
mj[14] = mm256_rol_64( M[14], 15 );
|
||||||
mj[15] = mm256_rol_64( M[15], 16 );
|
mj[15] = mm256_rol_64( M[15], 16 );
|
||||||
|
|
||||||
__m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL );
|
__m256i K = _mm256_set1_epi64x( 0x5555555555555550ULL );
|
||||||
const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL );
|
static const __m256i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
|
||||||
|
0x0555555555555555ULL, 0x0555555555555555ULL };
|
||||||
|
|
||||||
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
|
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
|
||||||
K = _mm256_add_epi64( K, Kincr );
|
K = _mm256_add_epi64( K, Kincr );
|
||||||
@@ -950,7 +951,7 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
|||||||
|
|
||||||
#endif // __AVX2__
|
#endif // __AVX2__
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
// BMW-512 8 WAY
|
// BMW-512 8 WAY
|
||||||
|
|
||||||
@@ -1094,7 +1095,7 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
|||||||
__m512i dH[16] )
|
__m512i dH[16] )
|
||||||
{
|
{
|
||||||
__m512i qt[32], xl, xh;
|
__m512i qt[32], xl, xh;
|
||||||
__m512i mh[16];
|
__m512i mh[16], mj[16];
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
for ( i = 0; i < 16; i++ )
|
for ( i = 0; i < 16; i++ )
|
||||||
@@ -1117,8 +1118,6 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
|||||||
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
|
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
|
||||||
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
|
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
|
||||||
|
|
||||||
__m512i mj[16];
|
|
||||||
|
|
||||||
mj[ 0] = mm512_rol_64( M[ 0], 1 );
|
mj[ 0] = mm512_rol_64( M[ 0], 1 );
|
||||||
mj[ 1] = mm512_rol_64( M[ 1], 2 );
|
mj[ 1] = mm512_rol_64( M[ 1], 2 );
|
||||||
mj[ 2] = mm512_rol_64( M[ 2], 3 );
|
mj[ 2] = mm512_rol_64( M[ 2], 3 );
|
||||||
@@ -1136,8 +1135,11 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
|||||||
mj[14] = mm512_rol_64( M[14], 15 );
|
mj[14] = mm512_rol_64( M[14], 15 );
|
||||||
mj[15] = mm512_rol_64( M[15], 16 );
|
mj[15] = mm512_rol_64( M[15], 16 );
|
||||||
|
|
||||||
__m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL );
|
__m512i K = _mm512_set1_epi64( 0x5555555555555550ULL );
|
||||||
const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL );
|
static const __m512i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
|
||||||
|
0x0555555555555555ULL, 0x0555555555555555ULL,
|
||||||
|
0x0555555555555555ULL, 0x0555555555555555ULL,
|
||||||
|
0x0555555555555555ULL, 0x0555555555555555ULL };
|
||||||
|
|
||||||
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
|
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
|
||||||
K = _mm512_add_epi64( K, Kincr );
|
K = _mm512_add_epi64( K, Kincr );
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ static const uint64_t IV512[] =
|
|||||||
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
|
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
|
||||||
};
|
};
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
// 4 way 128 is handy to avoid reinterleaving in many algos.
|
// 4 way 128 is handy to avoid reinterleaving in many algos.
|
||||||
// If reinterleaving is necessary it may be more efficient to use
|
// If reinterleaving is necessary it may be more efficient to use
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
struct _cube_4way_context
|
struct _cube_4way_context
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ static void transform( cubehashParam *sp )
|
|||||||
int r;
|
int r;
|
||||||
const int rounds = sp->rounds;
|
const int rounds = sp->rounds;
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
register __m512i x0, x1;
|
register __m512i x0, x1;
|
||||||
|
|
||||||
@@ -39,7 +39,7 @@ static void transform( cubehashParam *sp )
|
|||||||
|
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
|
|
||||||
register __m256i x0, x1, x2, x3, y0, y1;
|
register __m256i x0, x1, x2, x3, t0;
|
||||||
|
|
||||||
x0 = _mm256_load_si256( (__m256i*)sp->x );
|
x0 = _mm256_load_si256( (__m256i*)sp->x );
|
||||||
x1 = _mm256_load_si256( (__m256i*)sp->x + 1 );
|
x1 = _mm256_load_si256( (__m256i*)sp->x + 1 );
|
||||||
@@ -50,10 +50,10 @@ static void transform( cubehashParam *sp )
|
|||||||
{
|
{
|
||||||
x2 = _mm256_add_epi32( x0, x2 );
|
x2 = _mm256_add_epi32( x0, x2 );
|
||||||
x3 = _mm256_add_epi32( x1, x3 );
|
x3 = _mm256_add_epi32( x1, x3 );
|
||||||
y0 = mm256_rol_32( x1, 7 );
|
t0 = mm256_rol_32( x1, 7 );
|
||||||
y1 = mm256_rol_32( x0, 7 );
|
x1 = mm256_rol_32( x0, 7 );
|
||||||
x0 = _mm256_xor_si256( y0, x2 );
|
x0 = _mm256_xor_si256( t0, x2 );
|
||||||
x1 = _mm256_xor_si256( y1, x3 );
|
x1 = _mm256_xor_si256( x1, x3 );
|
||||||
x2 = mm256_swap128_64( x2 );
|
x2 = mm256_swap128_64( x2 );
|
||||||
x3 = mm256_swap128_64( x3 );
|
x3 = mm256_swap128_64( x3 );
|
||||||
x2 = _mm256_add_epi32( x0, x2 );
|
x2 = _mm256_add_epi32( x0, x2 );
|
||||||
@@ -75,7 +75,7 @@ static void transform( cubehashParam *sp )
|
|||||||
|
|
||||||
#else // AVX, SSE2, NEON
|
#else // AVX, SSE2, NEON
|
||||||
|
|
||||||
v128_t x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
|
v128_t x0, x1, x2, x3, x4, x5, x6, x7, t0, t1;
|
||||||
|
|
||||||
x0 = casti_v128( sp->x, 0 );
|
x0 = casti_v128( sp->x, 0 );
|
||||||
x1 = casti_v128( sp->x, 1 );
|
x1 = casti_v128( sp->x, 1 );
|
||||||
@@ -92,16 +92,12 @@ static void transform( cubehashParam *sp )
|
|||||||
x5 = v128_add32( x1, x5 );
|
x5 = v128_add32( x1, x5 );
|
||||||
x6 = v128_add32( x2, x6 );
|
x6 = v128_add32( x2, x6 );
|
||||||
x7 = v128_add32( x3, x7 );
|
x7 = v128_add32( x3, x7 );
|
||||||
y0 = x2;
|
t0 = v128_rol32( x2, 7 );
|
||||||
y1 = x3;
|
t1 = v128_rol32( x3, 7 );
|
||||||
y2 = x0;
|
x2 = v128_rol32( x0, 7 );
|
||||||
y3 = x1;
|
x3 = v128_rol32( x1, 7 );
|
||||||
x0 = v128_rol32( y0, 7 );
|
x0 = v128_xor( t0, x4 );
|
||||||
x1 = v128_rol32( y1, 7 );
|
x1 = v128_xor( t1, x5 );
|
||||||
x2 = v128_rol32( y2, 7 );
|
|
||||||
x3 = v128_rol32( y3, 7 );
|
|
||||||
x0 = v128_xor( x0, x4 );
|
|
||||||
x1 = v128_xor( x1, x5 );
|
|
||||||
x2 = v128_xor( x2, x6 );
|
x2 = v128_xor( x2, x6 );
|
||||||
x3 = v128_xor( x3, x7 );
|
x3 = v128_xor( x3, x7 );
|
||||||
x4 = v128_swap64( x4 );
|
x4 = v128_swap64( x4 );
|
||||||
@@ -112,17 +108,13 @@ static void transform( cubehashParam *sp )
|
|||||||
x5 = v128_add32( x1, x5 );
|
x5 = v128_add32( x1, x5 );
|
||||||
x6 = v128_add32( x2, x6 );
|
x6 = v128_add32( x2, x6 );
|
||||||
x7 = v128_add32( x3, x7 );
|
x7 = v128_add32( x3, x7 );
|
||||||
y0 = x1;
|
t0 = v128_rol32( x1, 11 );
|
||||||
y1 = x0;
|
x1 = v128_rol32( x0, 11 );
|
||||||
y2 = x3;
|
t1 = v128_rol32( x3, 11 );
|
||||||
y3 = x2;
|
x3 = v128_rol32( x2, 11 );
|
||||||
x0 = v128_rol32( y0, 11 );
|
x0 = v128_xor( t0, x4 );
|
||||||
x1 = v128_rol32( y1, 11 );
|
|
||||||
x2 = v128_rol32( y2, 11 );
|
|
||||||
x3 = v128_rol32( y3, 11 );
|
|
||||||
x0 = v128_xor( x0, x4 );
|
|
||||||
x1 = v128_xor( x1, x5 );
|
x1 = v128_xor( x1, x5 );
|
||||||
x2 = v128_xor( x2, x6 );
|
x2 = v128_xor( t1, x6 );
|
||||||
x3 = v128_xor( x3, x7 );
|
x3 = v128_xor( x3, x7 );
|
||||||
x4 = v128_swap64_32( x4 );
|
x4 = v128_swap64_32( x4 );
|
||||||
x5 = v128_swap64_32( x5 );
|
x5 = v128_swap64_32( x5 );
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
|||||||
};
|
};
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
#define ECHO_SUBBYTES4(state, j) \
|
#define ECHO_SUBBYTES4(state, j) \
|
||||||
state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \
|
state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
#include "simd-utils.h"
|
#include "simd-utils.h"
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -696,7 +696,7 @@ static void AddModulo512(const void *a,const void *b,void *c)
|
|||||||
|
|
||||||
static void AddXor512(const void *a,const void *b,void *c)
|
static void AddXor512(const void *a,const void *b,void *c)
|
||||||
{
|
{
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
|
casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
|
||||||
casti_m512i( b, 0 ) );
|
casti_m512i( b, 0 ) );
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
|
|||||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||||
K. Matusiewicz, 2011/05/29 */
|
K. Matusiewicz, 2011/05/29 */
|
||||||
|
|
||||||
#if defined(__AVX512VL__)
|
#if defined(VL256)
|
||||||
|
|
||||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||||
/* t_i = a_i + a_{i+1} */\
|
/* t_i = a_i + a_{i+1} */\
|
||||||
|
|||||||
@@ -95,7 +95,7 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
|
|||||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||||
K. Matusiewicz, 2011/05/29 */
|
K. Matusiewicz, 2011/05/29 */
|
||||||
|
|
||||||
#if defined(__AVX512VL__)
|
#if defined(VL256)
|
||||||
|
|
||||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||||
/* t_i = a_i + a_{i+1} */\
|
/* t_i = a_i + a_{i+1} */\
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__VAES__) && defined(SIMD512)
|
||||||
#define GROESTL_4WAY_VAES 1
|
#define GROESTL_4WAY_VAES 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@@ -17,7 +17,7 @@
|
|||||||
|
|
||||||
#if defined(__AVX2__) && defined(__VAES__)
|
#if defined(__AVX2__) && defined(__VAES__)
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
|
|
||||||
int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
|
int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
|
||||||
|
|||||||
@@ -43,7 +43,7 @@
|
|||||||
|
|
||||||
#define SIZE256 (SIZE_512/16)
|
#define SIZE256 (SIZE_512/16)
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
__attribute__ ((aligned (128))) __m512i chaining[SIZE256];
|
__attribute__ ((aligned (128))) __m512i chaining[SIZE256];
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
|
|||||||
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
|
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
|
||||||
};
|
};
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
|
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
|
||||||
0x1d1519111c141810, 0x1f171b131e161a12,
|
0x1d1519111c141810, 0x1f171b131e161a12,
|
||||||
|
|||||||
@@ -17,7 +17,7 @@
|
|||||||
|
|
||||||
#if defined(__AVX2__) && defined(__VAES__)
|
#if defined(__AVX2__) && defined(__VAES__)
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
|
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -33,7 +33,7 @@
|
|||||||
|
|
||||||
#define SIZE512 (SIZE_1024/16)
|
#define SIZE512 (SIZE_1024/16)
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
__attribute__ ((aligned (128))) __m512i chaining[SIZE512];
|
__attribute__ ((aligned (128))) __m512i chaining[SIZE512];
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
|
|||||||
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
|
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
|
||||||
};
|
};
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
|
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
|
||||||
0x1d1519111c141810, 0x1f171b131e161a12,
|
0x1d1519111c141810, 0x1f171b131e161a12,
|
||||||
@@ -239,7 +239,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
|||||||
{ \
|
{ \
|
||||||
/* AddRoundConstant P1024 */\
|
/* AddRoundConstant P1024 */\
|
||||||
xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
|
xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
|
||||||
casti_m128i( round_const_p, round_counter ) ) ); \
|
casti_v128u32( round_const_p, round_counter ) ) ); \
|
||||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||||
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
|
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
|
||||||
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK1 );\
|
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK1 );\
|
||||||
@@ -254,7 +254,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
|||||||
\
|
\
|
||||||
/* AddRoundConstant P1024 */\
|
/* AddRoundConstant P1024 */\
|
||||||
xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
|
xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
|
||||||
casti_m128i( round_const_p, round_counter+1 ) ) ); \
|
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
|
||||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||||
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
|
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
|
||||||
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
|
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
|
||||||
@@ -283,7 +283,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
|||||||
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
|
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
|
||||||
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
|
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
|
||||||
xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
|
xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
|
||||||
casti_m128i( round_const_q, round_counter ) ) ); \
|
casti_v128u32( round_const_q, round_counter ) ) ); \
|
||||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||||
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
|
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
|
||||||
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK3 );\
|
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK3 );\
|
||||||
@@ -306,7 +306,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
|||||||
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
|
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
|
||||||
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
|
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
|
||||||
xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
|
xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
|
||||||
casti_m128i( round_const_q, round_counter+1 ) ) ); \
|
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
|
||||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||||
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
|
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
|
||||||
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
|
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
|
||||||
@@ -812,7 +812,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
|||||||
{ \
|
{ \
|
||||||
/* AddRoundConstant P1024 */\
|
/* AddRoundConstant P1024 */\
|
||||||
xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
|
xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
|
||||||
casti_m128i( round_const_p, round_counter ) ) ); \
|
casti_v128u32( round_const_p, round_counter ) ) ); \
|
||||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||||
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
|
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
|
||||||
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\
|
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\
|
||||||
@@ -827,7 +827,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
|||||||
\
|
\
|
||||||
/* AddRoundConstant P1024 */\
|
/* AddRoundConstant P1024 */\
|
||||||
xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
|
xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
|
||||||
casti_m128i( round_const_p, round_counter+1 ) ) ); \
|
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
|
||||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||||
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
|
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
|
||||||
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
|
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
|
||||||
@@ -856,7 +856,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
|||||||
xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
|
xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
|
||||||
xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
|
xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
|
||||||
xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
|
xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
|
||||||
casti_m128i( round_const_q, round_counter ) ) ); \
|
casti_v128u32( round_const_q, round_counter ) ) ); \
|
||||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||||
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
|
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
|
||||||
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\
|
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\
|
||||||
@@ -879,7 +879,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
|||||||
xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
|
xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
|
||||||
xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
|
xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
|
||||||
xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
|
xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
|
||||||
casti_m128i( round_const_q, round_counter+1 ) ) ); \
|
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
|
||||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||||
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
|
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
|
||||||
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\
|
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ typedef struct {
|
|||||||
#else
|
#else
|
||||||
hashState_groestl groestl;
|
hashState_groestl groestl;
|
||||||
#endif
|
#endif
|
||||||
sha256_8way_context sha;
|
sha256_8x32_context sha;
|
||||||
} myrgr_8way_ctx_holder;
|
} myrgr_8way_ctx_holder;
|
||||||
|
|
||||||
myrgr_8way_ctx_holder myrgr_8way_ctx;
|
myrgr_8way_ctx_holder myrgr_8way_ctx;
|
||||||
@@ -29,7 +29,7 @@ void init_myrgr_8way_ctx()
|
|||||||
#else
|
#else
|
||||||
init_groestl( &myrgr_8way_ctx.groestl, 64 );
|
init_groestl( &myrgr_8way_ctx.groestl, 64 );
|
||||||
#endif
|
#endif
|
||||||
sha256_8way_init( &myrgr_8way_ctx.sha );
|
sha256_8x32_init( &myrgr_8way_ctx.sha );
|
||||||
}
|
}
|
||||||
|
|
||||||
void myriad_8way_hash( void *output, const void *input )
|
void myriad_8way_hash( void *output, const void *input )
|
||||||
@@ -96,8 +96,8 @@ void myriad_8way_hash( void *output, const void *input )
|
|||||||
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
|
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
|
||||||
hash6, hash7 );
|
hash6, hash7 );
|
||||||
|
|
||||||
sha256_8way_update( &ctx.sha, vhash, 64 );
|
sha256_8x32_update( &ctx.sha, vhash, 64 );
|
||||||
sha256_8way_close( &ctx.sha, output );
|
sha256_8x32_close( &ctx.sha, output );
|
||||||
}
|
}
|
||||||
|
|
||||||
int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
|
int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
|
||||||
@@ -156,7 +156,7 @@ int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
|
|||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
hashState_groestl groestl;
|
hashState_groestl groestl;
|
||||||
sha256_4way_context sha;
|
sha256_4x32_context sha;
|
||||||
} myrgr_4way_ctx_holder;
|
} myrgr_4way_ctx_holder;
|
||||||
|
|
||||||
myrgr_4way_ctx_holder myrgr_4way_ctx;
|
myrgr_4way_ctx_holder myrgr_4way_ctx;
|
||||||
@@ -164,7 +164,7 @@ myrgr_4way_ctx_holder myrgr_4way_ctx;
|
|||||||
void init_myrgr_4way_ctx()
|
void init_myrgr_4way_ctx()
|
||||||
{
|
{
|
||||||
init_groestl (&myrgr_4way_ctx.groestl, 64 );
|
init_groestl (&myrgr_4way_ctx.groestl, 64 );
|
||||||
sha256_4way_init( &myrgr_4way_ctx.sha );
|
sha256_4x32_init( &myrgr_4way_ctx.sha );
|
||||||
}
|
}
|
||||||
|
|
||||||
void myriad_4way_hash( void *output, const void *input )
|
void myriad_4way_hash( void *output, const void *input )
|
||||||
@@ -189,8 +189,8 @@ void myriad_4way_hash( void *output, const void *input )
|
|||||||
|
|
||||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||||
|
|
||||||
sha256_4way_update( &ctx.sha, vhash, 64 );
|
sha256_4x32_update( &ctx.sha, vhash, 64 );
|
||||||
sha256_4way_close( &ctx.sha, output );
|
sha256_4x32_close( &ctx.sha, output );
|
||||||
}
|
}
|
||||||
|
|
||||||
int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate )
|
|||||||
init_myrgr_ctx();
|
init_myrgr_ctx();
|
||||||
gate->scanhash = (void*)&scanhash_myriad;
|
gate->scanhash = (void*)&scanhash_myriad;
|
||||||
gate->hash = (void*)&myriad_hash;
|
gate->hash = (void*)&myriad_hash;
|
||||||
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
|
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT;
|
||||||
#endif
|
#endif
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__VAES__) && defined(SIMD512)
|
||||||
#define MYRGR_8WAY 1
|
#define MYRGR_8WAY 1
|
||||||
#elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
|
#elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
|
||||||
#define MYRGR_4WAY 1
|
#define MYRGR_4WAY 1
|
||||||
|
|||||||
@@ -382,12 +382,12 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
|
|||||||
#define S1F MF
|
#define S1F MF
|
||||||
|
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
// Hamsi 8 way AVX512
|
// Hamsi 8 way AVX512
|
||||||
|
|
||||||
// Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have same
|
// Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have same
|
||||||
// timig. However, when tested hashing X13 on i9-9940x using cmplt with zero
|
// timing. However, testing hashing X13 on i9-9940x using cmplt with zero
|
||||||
// had a 3% faster overall hashrate than than using movepi.
|
// had a 3% faster overall hashrate than than using movepi.
|
||||||
|
|
||||||
#define INPUT_BIG8 \
|
#define INPUT_BIG8 \
|
||||||
@@ -418,13 +418,11 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
|
|||||||
tb = mm512_xoror( b, d, a ); \
|
tb = mm512_xoror( b, d, a ); \
|
||||||
a = _mm512_xor_si512( a, c ); \
|
a = _mm512_xor_si512( a, c ); \
|
||||||
b = mm512_xoror( td, tb, a ); \
|
b = mm512_xoror( td, tb, a ); \
|
||||||
td = mm512_xorand( a, td, tb ); \
|
d = _mm512_ternarylogic_epi64( a, td, tb, 0x87 );/* not( xorand( a, td, tb ) ); */ \
|
||||||
a = c; \
|
a = c; \
|
||||||
c = mm512_xor3( tb, b, td ); \
|
c = _mm512_ternarylogic_epi64( tb, b, d, 0x69 ); /* not( xor3( tb, b, d ) ); */ \
|
||||||
d = mm512_not( td ); \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
#define SBOX8( a, b, c, d ) \
|
#define SBOX8( a, b, c, d ) \
|
||||||
do { \
|
do { \
|
||||||
@@ -505,32 +503,28 @@ do { \
|
|||||||
SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
|
SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
|
||||||
SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
|
SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
|
||||||
s4 = mm512_swap64_32( s4 ); \
|
s4 = mm512_swap64_32( s4 ); \
|
||||||
s5 = mm512_swap64_32( s5 ); \
|
t0 = _mm512_mask_shuffle_epi32( s4, 0xaaaa, s5, 0xb1 ); \
|
||||||
sD = mm512_swap64_32( sD ); \
|
sD = mm512_swap64_32( sD ); \
|
||||||
sE = mm512_swap64_32( sE ); \
|
t1 = _mm512_mask_shuffle_epi32( sD, 0xaaaa, sE, 0xb1 ); \
|
||||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
|
|
||||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
|
|
||||||
L8( s0, t0, s9, t1 ); \
|
L8( s0, t0, s9, t1 ); \
|
||||||
s6 = mm512_swap64_32( s6 ); \
|
s6 = mm512_swap64_32( s6 ); \
|
||||||
sF = mm512_swap64_32( sF ); \
|
sF = mm512_swap64_32( sF ); \
|
||||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
|
t2 = _mm512_mask_shuffle_epi32( s6, 0x5555, s5, 0xb1 ); \
|
||||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
|
t3 = _mm512_mask_shuffle_epi32( sF, 0x5555, sE, 0xb1 ); \
|
||||||
L8( s1, t2, sA, t3 ); \
|
L8( s1, t2, sA, t3 ); \
|
||||||
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
|
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
|
||||||
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
|
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
|
||||||
\
|
\
|
||||||
s7 = mm512_swap64_32( s7 ); \
|
t4 = _mm512_mask_shuffle_epi32( s6, 0xaaaa, s7, 0xb1 ); \
|
||||||
sC = mm512_swap64_32( sC ); \
|
t5 = _mm512_mask_shuffle_epi32( sF, 0xaaaa, sC, 0xb1 ); \
|
||||||
t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
|
|
||||||
t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
|
|
||||||
L8( s2, t4, sB, t5 ); \
|
L8( s2, t4, sB, t5 ); \
|
||||||
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
|
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
|
||||||
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
|
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
|
||||||
s6 = mm512_swap64_32( s6 ); \
|
s6 = mm512_swap64_32( s6 ); \
|
||||||
sF = mm512_swap64_32( sF ); \
|
sF = mm512_swap64_32( sF ); \
|
||||||
\
|
\
|
||||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
|
t2 = _mm512_mask_shuffle_epi32( s4, 0x5555, s7, 0xb1 ); \
|
||||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
|
t3 = _mm512_mask_shuffle_epi32( sD, 0x5555, sC, 0xb1 ); \
|
||||||
L8( s3, t2, s8, t3 ); \
|
L8( s3, t2, s8, t3 ); \
|
||||||
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
|
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
|
||||||
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
|
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
|
||||||
@@ -539,21 +533,20 @@ do { \
|
|||||||
s7 = mm512_swap64_32( s7 ); \
|
s7 = mm512_swap64_32( s7 ); \
|
||||||
sC = mm512_swap64_32( sC ); \
|
sC = mm512_swap64_32( sC ); \
|
||||||
\
|
\
|
||||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \
|
t0 = _mm512_mask_shuffle_epi32( s0, 0xaaaa, s8, 0xb1 ); \
|
||||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
|
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
|
||||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \
|
t2 = _mm512_mask_shuffle_epi32( sA, 0x5555, s2, 0xb1 ); \
|
||||||
t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
|
t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
|
||||||
t3 = mm512_swap64_32( t3 ); \
|
t3 = mm512_swap64_32( t3 ); \
|
||||||
L8( t0, t1, t2, t3 ); \
|
L8( t0, t1, t2, t3 ); \
|
||||||
t3 = mm512_swap64_32( t3 ); \
|
|
||||||
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
|
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
|
||||||
s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \
|
s8 = _mm512_mask_shuffle_epi32( s8, 0x5555, t0, 0xb1 ); \
|
||||||
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
|
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
|
||||||
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
|
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
|
||||||
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \
|
s2 = _mm512_mask_shuffle_epi32( s2, 0xaaaa, t2, 0xb1 ); \
|
||||||
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
|
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
|
||||||
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \
|
s3 = _mm512_mask_shuffle_epi32( s3, 0xaaaa, t3, 0xb1 ); \
|
||||||
sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \
|
sB = _mm512_mask_shuffle_epi32( sB, 0x5555, t3, 0xb1 ); \
|
||||||
\
|
\
|
||||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
|
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
|
||||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
|
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
|
||||||
@@ -1061,7 +1054,7 @@ void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
|
|||||||
WRITE_STATE_BIG8( sc );
|
WRITE_STATE_BIG8( sc );
|
||||||
}
|
}
|
||||||
|
|
||||||
void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
|
void hamsi_8way_big_final( hamsi512_8x64_context *sc, __m512i *buf )
|
||||||
{
|
{
|
||||||
__m512i m0, m1, m2, m3, m4, m5, m6, m7;
|
__m512i m0, m1, m2, m3, m4, m5, m6, m7;
|
||||||
|
|
||||||
@@ -1073,7 +1066,7 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
|
|||||||
WRITE_STATE_BIG8( sc );
|
WRITE_STATE_BIG8( sc );
|
||||||
}
|
}
|
||||||
|
|
||||||
void hamsi512_8way_init( hamsi_8way_big_context *sc )
|
void hamsi512_8x64_init( hamsi512_8x64_context *sc )
|
||||||
{
|
{
|
||||||
sc->partial_len = 0;
|
sc->partial_len = 0;
|
||||||
sc->count_high = sc->count_low = 0;
|
sc->count_high = sc->count_low = 0;
|
||||||
@@ -1089,7 +1082,7 @@ void hamsi512_8way_init( hamsi_8way_big_context *sc )
|
|||||||
sc->h[7] = v512_64( iv[7] );
|
sc->h[7] = v512_64( iv[7] );
|
||||||
}
|
}
|
||||||
|
|
||||||
void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
|
void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data,
|
||||||
size_t len )
|
size_t len )
|
||||||
{
|
{
|
||||||
__m512i *vdata = (__m512i*)data;
|
__m512i *vdata = (__m512i*)data;
|
||||||
@@ -1101,7 +1094,7 @@ void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
|
|||||||
sc->partial_len = len;
|
sc->partial_len = len;
|
||||||
}
|
}
|
||||||
|
|
||||||
void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
|
void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst )
|
||||||
{
|
{
|
||||||
__m512i pad[1];
|
__m512i pad[1];
|
||||||
uint32_t ch, cl;
|
uint32_t ch, cl;
|
||||||
@@ -1122,7 +1115,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
|
|||||||
|
|
||||||
// Hamsi 4 way AVX2
|
// Hamsi 4 way AVX2
|
||||||
|
|
||||||
#if defined(__AVX512VL__)
|
#if defined(VL256)
|
||||||
|
|
||||||
#define INPUT_BIG \
|
#define INPUT_BIG \
|
||||||
do { \
|
do { \
|
||||||
@@ -1160,6 +1153,94 @@ do { \
|
|||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
#define INPUT_BIG_sub( db_i ) \
|
||||||
|
{ \
|
||||||
|
const __m256i dm = _mm256_cmpgt_epi64( zero, db_i ); \
|
||||||
|
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
|
||||||
|
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
|
||||||
|
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
|
||||||
|
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
|
||||||
|
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
|
||||||
|
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
|
||||||
|
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
|
||||||
|
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
|
||||||
|
tp += 8; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define INPUT_BIG \
|
||||||
|
{ \
|
||||||
|
const __m256i db = *buf; \
|
||||||
|
const __m256i zero = m256_zero; \
|
||||||
|
const uint64_t *tp = (const uint64_t*)T512; \
|
||||||
|
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,63 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,62 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,61 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,60 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,59 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,58 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,57 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,56 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,55 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,54 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,53 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,52 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,51 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,50 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,49 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,48 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,47 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,46 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,45 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,44 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,43 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,42 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,41 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,40 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,39 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,38 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,37 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,36 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,35 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,34 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,33 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,32 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,31 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,30 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,29 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,28 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,27 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,26 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,25 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,24 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,23 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,22 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,21 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,20 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,19 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,18 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,17 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,16 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,15 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,14 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,13 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,12 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,11 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db,10 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db, 9 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db, 8 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db, 7 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db, 6 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db, 5 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db, 4 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db, 3 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db, 2 ) ); \
|
||||||
|
INPUT_BIG_sub( _mm256_slli_epi64( db, 1 ) ); \
|
||||||
|
INPUT_BIG_sub( db ); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// dependent on the compiler unrolling the loop
|
||||||
#define INPUT_BIG \
|
#define INPUT_BIG \
|
||||||
do { \
|
do { \
|
||||||
__m256i db = *buf; \
|
__m256i db = *buf; \
|
||||||
@@ -1180,8 +1261,9 @@ do { \
|
|||||||
tp += 8; \
|
tp += 8; \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
|
// v3, 15 instructions
|
||||||
#define SBOX( a, b, c, d ) \
|
#define SBOX( a, b, c, d ) \
|
||||||
{ \
|
{ \
|
||||||
__m256i tb, td; \
|
__m256i tb, td; \
|
||||||
@@ -1199,7 +1281,7 @@ do { \
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
/ v2, 16 instructions, 10 TL equivalent instructions
|
/ v2, 16 instructions
|
||||||
#define SBOX( a, b, c, d ) \
|
#define SBOX( a, b, c, d ) \
|
||||||
{ \
|
{ \
|
||||||
__m256i t = mm256_xorand( d, a, c ); \
|
__m256i t = mm256_xorand( d, a, c ); \
|
||||||
@@ -1219,7 +1301,7 @@ do { \
|
|||||||
do { \
|
do { \
|
||||||
a = mm256_rol_32( a, 13 ); \
|
a = mm256_rol_32( a, 13 ); \
|
||||||
c = mm256_rol_32( c, 3 ); \
|
c = mm256_rol_32( c, 3 ); \
|
||||||
b = mm256_xor3( a, b, c ); \
|
b = mm256_xor3( b, a, c ); \
|
||||||
d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
|
d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
|
||||||
b = mm256_rol_32( b, 1 ); \
|
b = mm256_rol_32( b, 1 ); \
|
||||||
d = mm256_rol_32( d, 7 ); \
|
d = mm256_rol_32( d, 7 ); \
|
||||||
@@ -1501,7 +1583,7 @@ do { /* order is important */ \
|
|||||||
sc->h[14] = CE; \
|
sc->h[14] = CE; \
|
||||||
sc->h[15] = CF;
|
sc->h[15] = CF;
|
||||||
|
|
||||||
#if defined(__AVX512VL__)
|
#if defined(VL256)
|
||||||
|
|
||||||
#define INPUT_8X32 \
|
#define INPUT_8X32 \
|
||||||
{ \
|
{ \
|
||||||
@@ -1857,7 +1939,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void * dst,
|
|||||||
|
|
||||||
////////////
|
////////////
|
||||||
|
|
||||||
void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
|
void hamsi_big( hamsi512_4x64_context *sc, __m256i *buf, size_t num )
|
||||||
{
|
{
|
||||||
DECL_STATE_BIG
|
DECL_STATE_BIG
|
||||||
uint32_t tmp;
|
uint32_t tmp;
|
||||||
@@ -1881,7 +1963,7 @@ void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
|
|||||||
WRITE_STATE_BIG( sc );
|
WRITE_STATE_BIG( sc );
|
||||||
}
|
}
|
||||||
|
|
||||||
void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
|
void hamsi_big_final( hamsi512_4x64_context *sc, __m256i *buf )
|
||||||
{
|
{
|
||||||
__m256i m0, m1, m2, m3, m4, m5, m6, m7;
|
__m256i m0, m1, m2, m3, m4, m5, m6, m7;
|
||||||
DECL_STATE_BIG
|
DECL_STATE_BIG
|
||||||
@@ -1892,7 +1974,7 @@ void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
|
|||||||
WRITE_STATE_BIG( sc );
|
WRITE_STATE_BIG( sc );
|
||||||
}
|
}
|
||||||
|
|
||||||
void hamsi512_4way_init( hamsi_4way_big_context *sc )
|
void hamsi512_4x64_init( hamsi512_4x64_context *sc )
|
||||||
{
|
{
|
||||||
sc->partial_len = 0;
|
sc->partial_len = 0;
|
||||||
sc->count_high = sc->count_low = 0;
|
sc->count_high = sc->count_low = 0;
|
||||||
@@ -1907,7 +1989,7 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
|
|||||||
sc->h[7] = v256_64( iv[7] );
|
sc->h[7] = v256_64( iv[7] );
|
||||||
}
|
}
|
||||||
|
|
||||||
void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
|
void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data,
|
||||||
size_t len )
|
size_t len )
|
||||||
{
|
{
|
||||||
__m256i *vdata = (__m256i*)data;
|
__m256i *vdata = (__m256i*)data;
|
||||||
@@ -1919,7 +2001,7 @@ void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
|
|||||||
sc->partial_len = len;
|
sc->partial_len = len;
|
||||||
}
|
}
|
||||||
|
|
||||||
void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
|
void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst )
|
||||||
{
|
{
|
||||||
__m256i pad[1];
|
__m256i pad[1];
|
||||||
uint32_t ch, cl;
|
uint32_t ch, cl;
|
||||||
@@ -1961,6 +2043,94 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
|
|||||||
sc->h[6] = c6; \
|
sc->h[6] = c6; \
|
||||||
sc->h[7] = c7;
|
sc->h[7] = c7;
|
||||||
|
|
||||||
|
#define INPUT_2x64_sub( db_i ) \
|
||||||
|
{ \
|
||||||
|
const v128u64_t dm = v128_cmpgt64( zero, db_i ); \
|
||||||
|
m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
|
||||||
|
m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
|
||||||
|
m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
|
||||||
|
m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
|
||||||
|
m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
|
||||||
|
m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
|
||||||
|
m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
|
||||||
|
m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
|
||||||
|
tp += 8; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define INPUT_2x64 \
|
||||||
|
{ \
|
||||||
|
const v128u64_t db = *buf; \
|
||||||
|
const v128u64_t zero = v128_zero; \
|
||||||
|
const uint64_t *tp = (const uint64_t*)T512; \
|
||||||
|
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,63 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,62 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,61 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,60 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,59 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,58 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,57 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,56 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,55 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,54 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,53 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,52 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,51 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,50 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,49 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,48 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,47 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,46 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,45 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,44 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,43 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,42 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,41 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,40 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,39 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,38 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,37 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,36 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,35 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,34 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,33 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,32 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,31 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,30 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,29 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,28 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,27 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,26 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,25 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,24 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,23 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,22 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,21 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,20 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,19 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,18 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,17 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,16 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,15 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,14 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,13 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,12 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,11 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db,10 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db, 9 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db, 8 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db, 7 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db, 6 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db, 5 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db, 4 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db, 3 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db, 2 ) ); \
|
||||||
|
INPUT_2x64_sub( v128_sl64( db, 1 ) ); \
|
||||||
|
INPUT_2x64_sub( db ); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// Dependent on the compiler unrolling the loop.
|
||||||
#define INPUT_2x64 \
|
#define INPUT_2x64 \
|
||||||
{ \
|
{ \
|
||||||
v128u64_t db = *buf; \
|
v128u64_t db = *buf; \
|
||||||
@@ -1981,6 +2151,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
|
|||||||
tp += 8; \
|
tp += 8; \
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
|
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
|
||||||
#define SBOX_2x64( a, b, c, d ) \
|
#define SBOX_2x64( a, b, c, d ) \
|
||||||
@@ -2001,7 +2172,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
|
|||||||
{ \
|
{ \
|
||||||
a = v128_rol32( a, 13 ); \
|
a = v128_rol32( a, 13 ); \
|
||||||
c = v128_rol32( c, 3 ); \
|
c = v128_rol32( c, 3 ); \
|
||||||
b = v128_xor3( a, b, c ); \
|
b = v128_xor3( c, a, b ); \
|
||||||
d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
|
d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
|
||||||
b = v128_rol32( b, 1 ); \
|
b = v128_rol32( b, 1 ); \
|
||||||
d = v128_rol32( d, 7 ); \
|
d = v128_rol32( d, 7 ); \
|
||||||
|
|||||||
@@ -72,17 +72,17 @@ typedef struct
|
|||||||
size_t partial_len;
|
size_t partial_len;
|
||||||
uint32_t count_high, count_low;
|
uint32_t count_high, count_low;
|
||||||
} hamsi_4way_big_context;
|
} hamsi_4way_big_context;
|
||||||
typedef hamsi_4way_big_context hamsi512_4way_context;
|
typedef hamsi_4way_big_context hamsi512_4x64_context;
|
||||||
|
|
||||||
void hamsi512_4way_init( hamsi512_4way_context *sc );
|
void hamsi512_4x64_init( hamsi512_4x64_context *sc );
|
||||||
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
|
void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data,
|
||||||
size_t len );
|
size_t len );
|
||||||
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
|
void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst );
|
||||||
|
|
||||||
#define hamsi512_4x64_context hamsi512_4way_context
|
#define hamsi512_4way_context hamsi512_4x64_context
|
||||||
#define hamsi512_4x64_init hamsi512_4way_init
|
#define hamsi512_4way_init hamsi512_4x64_init
|
||||||
#define hamsi512_4x64_update hamsi512_4way_update
|
#define hamsi512_4way_update hamsi512_4x64_update
|
||||||
#define hamsi512_4x64_close hamsi512_4way_close
|
#define hamsi512_4way_close hamsi512_4x64_close
|
||||||
|
|
||||||
// Hamsi-512 8x32
|
// Hamsi-512 8x32
|
||||||
|
|
||||||
@@ -104,7 +104,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
// Hamsi-512 8x64
|
// Hamsi-512 8x64
|
||||||
|
|
||||||
@@ -115,17 +115,17 @@ typedef struct
|
|||||||
size_t partial_len;
|
size_t partial_len;
|
||||||
uint32_t count_high, count_low;
|
uint32_t count_high, count_low;
|
||||||
} hamsi_8way_big_context;
|
} hamsi_8way_big_context;
|
||||||
typedef hamsi_8way_big_context hamsi512_8way_context;
|
typedef hamsi_8way_big_context hamsi512_8x64_context;
|
||||||
|
|
||||||
void hamsi512_8way_init( hamsi512_8way_context *sc );
|
void hamsi512_8x64_init( hamsi512_8x64_context *sc );
|
||||||
void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
|
void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data,
|
||||||
size_t len );
|
size_t len );
|
||||||
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
|
void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst );
|
||||||
|
|
||||||
#define hamsi512_8x64_context hamsi512_8way_context
|
#define hamsi512_8way_context hamsi512_8x64_context
|
||||||
#define hamsi512_8x64_init hamsi512_8way_init
|
#define hamsi512_8way_init hamsi512_8x64_init
|
||||||
#define hamsi512_8x64_update hamsi512_8way_update
|
#define hamsi512_8way_update hamsi512_8x64_update
|
||||||
#define hamsi512_8x64_close hamsi512_8way_close
|
#define hamsi512_8way_close hamsi512_8x64_close
|
||||||
|
|
||||||
// Hamsi-512 16x32
|
// Hamsi-512 16x32
|
||||||
|
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ extern "C"{
|
|||||||
#define SPH_SMALL_FOOTPRINT_HAVAL 1
|
#define SPH_SMALL_FOOTPRINT_HAVAL 1
|
||||||
//#endif
|
//#endif
|
||||||
|
|
||||||
#if defined(__AVX512VL__)
|
#if defined(VL256)
|
||||||
|
|
||||||
// ( ~( a ^ b ) ) & c
|
// ( ~( a ^ b ) ) & c
|
||||||
#define v128_andnotxor( a, b, c ) \
|
#define v128_andnotxor( a, b, c ) \
|
||||||
@@ -583,7 +583,7 @@ do { \
|
|||||||
|
|
||||||
// Haval-256 8 way 32 bit avx2
|
// Haval-256 8 way 32 bit avx2
|
||||||
|
|
||||||
#if defined (__AVX512VL__)
|
#if defined (VL256)
|
||||||
|
|
||||||
// ( ~( a ^ b ) ) & c
|
// ( ~( a ^ b ) ) & c
|
||||||
#define mm256_andnotxor( a, b, c ) \
|
#define mm256_andnotxor( a, b, c ) \
|
||||||
@@ -882,7 +882,7 @@ do { \
|
|||||||
|
|
||||||
#endif // AVX2
|
#endif // AVX2
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
// ( ~( a ^ b ) ) & c
|
// ( ~( a ^ b ) ) & c
|
||||||
#define mm512_andnotxor( a, b, c ) \
|
#define mm512_andnotxor( a, b, c ) \
|
||||||
|
|||||||
@@ -82,12 +82,15 @@ typedef struct {
|
|||||||
typedef haval_4way_context haval256_5_4way_context;
|
typedef haval_4way_context haval256_5_4way_context;
|
||||||
|
|
||||||
void haval256_5_4way_init( void *cc );
|
void haval256_5_4way_init( void *cc );
|
||||||
|
|
||||||
void haval256_5_4way_update( void *cc, const void *data, size_t len );
|
void haval256_5_4way_update( void *cc, const void *data, size_t len );
|
||||||
//#define haval256_5_4way haval256_5_4way_update
|
//#define haval256_5_4way haval256_5_4way_update
|
||||||
|
|
||||||
void haval256_5_4way_close( void *cc, void *dst );
|
void haval256_5_4way_close( void *cc, void *dst );
|
||||||
|
|
||||||
|
#define haval256_4x32_context haval256_5_4way_context
|
||||||
|
#define haval256_4x32_init haval256_5_4way_init
|
||||||
|
#define haval256_4x32_update haval256_5_4way_update
|
||||||
|
#define haval256_4x32_close haval256_5_4way_close
|
||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
@@ -100,14 +103,17 @@ typedef struct {
|
|||||||
typedef haval_8way_context haval256_5_8way_context;
|
typedef haval_8way_context haval256_5_8way_context;
|
||||||
|
|
||||||
void haval256_5_8way_init( void *cc );
|
void haval256_5_8way_init( void *cc );
|
||||||
|
|
||||||
void haval256_5_8way_update( void *cc, const void *data, size_t len );
|
void haval256_5_8way_update( void *cc, const void *data, size_t len );
|
||||||
|
|
||||||
void haval256_5_8way_close( void *cc, void *dst );
|
void haval256_5_8way_close( void *cc, void *dst );
|
||||||
|
|
||||||
|
#define haval256_8x32_context haval256_5_8way_context
|
||||||
|
#define haval256_8x32_init haval256_5_8way_init
|
||||||
|
#define haval256_8x32_update haval256_5_8way_update
|
||||||
|
#define haval256_8x32_close haval256_5_8way_close
|
||||||
|
|
||||||
#endif // AVX2
|
#endif // AVX2
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
__m512i buf[32];
|
__m512i buf[32];
|
||||||
@@ -119,11 +125,14 @@ typedef struct {
|
|||||||
typedef haval_16way_context haval256_5_16way_context;
|
typedef haval_16way_context haval256_5_16way_context;
|
||||||
|
|
||||||
void haval256_5_16way_init( void *cc );
|
void haval256_5_16way_init( void *cc );
|
||||||
|
|
||||||
void haval256_5_16way_update( void *cc, const void *data, size_t len );
|
void haval256_5_16way_update( void *cc, const void *data, size_t len );
|
||||||
|
|
||||||
void haval256_5_16way_close( void *cc, void *dst );
|
void haval256_5_16way_close( void *cc, void *dst );
|
||||||
|
|
||||||
|
#define haval256_16x32_context haval256_5_16way_context
|
||||||
|
#define haval256_16x32_init haval256_5_16way_init
|
||||||
|
#define haval256_16x32_update haval256_5_16way_update
|
||||||
|
#define haval256_16x32_close haval256_5_16way_close
|
||||||
|
|
||||||
#endif // AVX512
|
#endif // AVX512
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|||||||
@@ -204,7 +204,7 @@ static const uint64_t IV512[] =
|
|||||||
(state)->H[15] = h7l; \
|
(state)->H[15] = h7l; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
#define Sb_8W(x0, x1, x2, x3, c) \
|
#define Sb_8W(x0, x1, x2, x3, c) \
|
||||||
{ \
|
{ \
|
||||||
@@ -364,8 +364,7 @@ static const uint64_t IV512[] =
|
|||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
#if defined(__AVX512VL__)
|
#if defined(VL256)
|
||||||
//TODO enable for AVX10_256, not used with AVX512VL
|
|
||||||
|
|
||||||
#define notxorandnot( a, b, c ) \
|
#define notxorandnot( a, b, c ) \
|
||||||
_mm256_ternarylogic_epi64( a, b, c, 0x2d )
|
_mm256_ternarylogic_epi64( a, b, c, 0x2d )
|
||||||
@@ -522,7 +521,7 @@ static const uint64_t IV512[] =
|
|||||||
|
|
||||||
#endif // AVX2
|
#endif // AVX2
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
void jh256_8x64_init( jh_8x64_context *sc )
|
void jh256_8x64_init( jh_8x64_context *sc )
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -55,7 +55,7 @@
|
|||||||
* <code>memcpy()</code>).
|
* <code>memcpy()</code>).
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define KECCAK_8WAY 1
|
#define KECCAK_8WAY 1
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
#define KECCAK_4WAY 1
|
#define KECCAK_4WAY 1
|
||||||
@@ -12,7 +12,7 @@
|
|||||||
#define KECCAK_2WAY 1
|
#define KECCAK_2WAY 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define SHA3D_8WAY 1
|
#define SHA3D_8WAY 1
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
#define SHA3D_4WAY 1
|
#define SHA3D_4WAY 1
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ static const uint64_t RC[] = {
|
|||||||
|
|
||||||
#define DO(x) x
|
#define DO(x) x
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
#define INPUT_BUF(size) do { \
|
#define INPUT_BUF(size) do { \
|
||||||
size_t j; \
|
size_t j; \
|
||||||
@@ -161,29 +161,25 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
|
|||||||
static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
|
static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
|
||||||
size_t byte_len, size_t lim )
|
size_t byte_len, size_t lim )
|
||||||
{
|
{
|
||||||
unsigned eb;
|
__m512i tmp[lim + 1] __attribute__ ((aligned (64)));
|
||||||
union {
|
|
||||||
__m512i tmp[lim + 1];
|
|
||||||
uint64_t dummy; /* for alignment */
|
|
||||||
} u;
|
|
||||||
size_t j;
|
size_t j;
|
||||||
size_t m512_len = byte_len >> 3;
|
size_t m512_len = byte_len >> 3;
|
||||||
|
const unsigned eb = hard_coded_eb;
|
||||||
|
|
||||||
eb = hard_coded_eb;
|
|
||||||
if ( kc->ptr == (lim - 8) )
|
if ( kc->ptr == (lim - 8) )
|
||||||
{
|
{
|
||||||
const uint64_t t = eb | 0x8000000000000000;
|
const uint64_t t = eb | 0x8000000000000000;
|
||||||
u.tmp[0] = _mm512_set1_epi64( t );
|
tmp[0] = _mm512_set1_epi64( t );
|
||||||
j = 8;
|
j = 8;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
j = lim - kc->ptr;
|
j = lim - kc->ptr;
|
||||||
u.tmp[0] = _mm512_set1_epi64( eb );
|
tmp[0] = _mm512_set1_epi64( eb );
|
||||||
memset_zero_512( u.tmp + 1, (j>>3) - 2 );
|
memset_zero_512( tmp + 1, (j>>3) - 2 );
|
||||||
u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
|
tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
|
||||||
}
|
}
|
||||||
keccak64_8way_core( kc, u.tmp, j, lim );
|
keccak64_8way_core( kc, tmp, j, lim );
|
||||||
/* Finalize the "lane complement" */
|
/* Finalize the "lane complement" */
|
||||||
NOT64( kc->w[ 1], kc->w[ 1] );
|
NOT64( kc->w[ 1], kc->w[ 1] );
|
||||||
NOT64( kc->w[ 2], kc->w[ 2] );
|
NOT64( kc->w[ 2], kc->w[ 2] );
|
||||||
@@ -194,7 +190,7 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
|
|||||||
memcpy_512( dst, kc->w, m512_len );
|
memcpy_512( dst, kc->w, m512_len );
|
||||||
}
|
}
|
||||||
|
|
||||||
void keccak256_8way_init( void *kc )
|
void keccak256_8x64_init( void *kc )
|
||||||
{
|
{
|
||||||
keccak64_8way_init( kc, 256 );
|
keccak64_8way_init( kc, 256 );
|
||||||
}
|
}
|
||||||
@@ -361,29 +357,25 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
|
|||||||
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
|
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
|
||||||
size_t lim )
|
size_t lim )
|
||||||
{
|
{
|
||||||
unsigned eb;
|
__m256i tmp[lim + 1] __attribute__ ((aligned (32)));
|
||||||
union {
|
|
||||||
__m256i tmp[lim + 1];
|
|
||||||
uint64_t dummy; /* for alignment */
|
|
||||||
} u;
|
|
||||||
size_t j;
|
size_t j;
|
||||||
size_t m256_len = byte_len >> 3;
|
size_t m256_len = byte_len >> 3;
|
||||||
|
const unsigned eb = hard_coded_eb;
|
||||||
|
|
||||||
eb = hard_coded_eb;
|
|
||||||
if ( kc->ptr == (lim - 8) )
|
if ( kc->ptr == (lim - 8) )
|
||||||
{
|
{
|
||||||
const uint64_t t = eb | 0x8000000000000000;
|
const uint64_t t = eb | 0x8000000000000000;
|
||||||
u.tmp[0] = _mm256_set1_epi64x( t );
|
tmp[0] = _mm256_set1_epi64x( t );
|
||||||
j = 8;
|
j = 8;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
j = lim - kc->ptr;
|
j = lim - kc->ptr;
|
||||||
u.tmp[0] = _mm256_set1_epi64x( eb );
|
tmp[0] = _mm256_set1_epi64x( eb );
|
||||||
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
|
memset_zero_256( tmp + 1, (j>>3) - 2 );
|
||||||
u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
|
tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
|
||||||
}
|
}
|
||||||
keccak64_core( kc, u.tmp, j, lim );
|
keccak64_core( kc, tmp, j, lim );
|
||||||
/* Finalize the "lane complement" */
|
/* Finalize the "lane complement" */
|
||||||
NOT64( kc->w[ 1], kc->w[ 1] );
|
NOT64( kc->w[ 1], kc->w[ 1] );
|
||||||
NOT64( kc->w[ 2], kc->w[ 2] );
|
NOT64( kc->w[ 2], kc->w[ 2] );
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include "simd-utils.h"
|
#include "simd-utils.h"
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
void sha3d_hash_8way(void *state, const void *input)
|
void sha3d_hash_8way(void *state, const void *input)
|
||||||
{
|
{
|
||||||
uint32_t buffer[16*8] __attribute__ ((aligned (128)));
|
uint32_t buffer[16*8] __attribute__ ((aligned (128)));
|
||||||
keccak256_8way_context ctx;
|
keccak256_8x64_context ctx;
|
||||||
|
|
||||||
keccak256_8x64_init( &ctx );
|
keccak256_8x64_init( &ctx );
|
||||||
keccak256_8x64_update( &ctx, input, 80 );
|
keccak256_8x64_update( &ctx, input, 80 );
|
||||||
@@ -69,7 +69,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
|
|||||||
void sha3d_hash_4way(void *state, const void *input)
|
void sha3d_hash_4way(void *state, const void *input)
|
||||||
{
|
{
|
||||||
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
|
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
|
||||||
keccak256_4way_context ctx;
|
keccak256_4x64_context ctx;
|
||||||
|
|
||||||
keccak256_4x64_init( &ctx );
|
keccak256_4x64_init( &ctx );
|
||||||
keccak256_4x64_update( &ctx, input, 80 );
|
keccak256_4x64_update( &ctx, input, 80 );
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
#define cns4w(i) mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] )
|
#define cns4w(i) mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] )
|
||||||
|
|
||||||
@@ -80,14 +80,14 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
|
|||||||
__m512i t = a0; \
|
__m512i t = a0; \
|
||||||
a0 = mm512_xoror( a3, a0, a1 ); \
|
a0 = mm512_xoror( a3, a0, a1 ); \
|
||||||
a2 = _mm512_xor_si512( a2, a3 ); \
|
a2 = _mm512_xor_si512( a2, a3 ); \
|
||||||
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
|
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
|
||||||
a3 = mm512_xorand( a2, a3, t ); \
|
a3 = mm512_xorand( a2, a3, t ); \
|
||||||
a2 = mm512_xorand( a1, a2, a0); \
|
a2 = mm512_xorand( a1, a2, a0); \
|
||||||
a1 = _mm512_or_si512( a1, a3 ); \
|
a1 = _mm512_or_si512( a1, a3 ); \
|
||||||
a3 = _mm512_xor_si512( a3, a2 ); \
|
a3 = _mm512_xor_si512( a3, a2 ); \
|
||||||
t = _mm512_xor_si512( t, a1 ); \
|
t = _mm512_xor_si512( t, a1 ); \
|
||||||
a2 = _mm512_and_si512( a2, a1 ); \
|
a2 = _mm512_and_si512( a2, a1 ); \
|
||||||
a1 = mm512_xnor( a1, a0 ); \
|
a1 = mm512_nxor( a1, a0 ); \
|
||||||
a0 = t; \
|
a0 = t; \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -273,8 +273,6 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
|||||||
uint32_t hash[8*4] __attribute((aligned(128)));
|
uint32_t hash[8*4] __attribute((aligned(128)));
|
||||||
__m512i* chainv = state->chainv;
|
__m512i* chainv = state->chainv;
|
||||||
__m512i t[2];
|
__m512i t[2];
|
||||||
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
|
|
||||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
|
||||||
|
|
||||||
/*---- blank round with m=0 ----*/
|
/*---- blank round with m=0 ----*/
|
||||||
rnd512_4way( state, NULL );
|
rnd512_4way( state, NULL );
|
||||||
@@ -289,10 +287,8 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
|||||||
_mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
|
_mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
|
||||||
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
|
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
|
||||||
|
|
||||||
casti_m512i( b,0 ) = _mm512_shuffle_epi8(
|
casti_m512i( b,0 ) = mm512_bswap_32( casti_m512i( hash,0 ) );
|
||||||
casti_m512i( hash,0 ), shuff_bswap32 );
|
casti_m512i( b,1 ) = mm512_bswap_32( casti_m512i( hash,1 ) );
|
||||||
casti_m512i( b,1 ) = _mm512_shuffle_epi8(
|
|
||||||
casti_m512i( hash,1 ), shuff_bswap32 );
|
|
||||||
|
|
||||||
rnd512_4way( state, NULL );
|
rnd512_4way( state, NULL );
|
||||||
|
|
||||||
@@ -306,10 +302,8 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
|||||||
_mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
|
_mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
|
||||||
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
|
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
|
||||||
|
|
||||||
casti_m512i( b,2 ) = _mm512_shuffle_epi8(
|
casti_m512i( b,2 ) = mm512_bswap_32( casti_m512i( hash,0 ) );
|
||||||
casti_m512i( hash,0 ), shuff_bswap32 );
|
casti_m512i( b,3 ) = mm512_bswap_32( casti_m512i( hash,1 ) );
|
||||||
casti_m512i( b,3 ) = _mm512_shuffle_epi8(
|
|
||||||
casti_m512i( hash,1 ), shuff_bswap32 );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
|
int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
|
||||||
@@ -349,16 +343,14 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
|
|||||||
__m512i msg[2];
|
__m512i msg[2];
|
||||||
int i;
|
int i;
|
||||||
int blocks = (int)len >> 5;
|
int blocks = (int)len >> 5;
|
||||||
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
|
|
||||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
|
||||||
|
|
||||||
state->rembytes = (int)len & 0x1F;
|
state->rembytes = (int)len & 0x1F;
|
||||||
|
|
||||||
// full blocks
|
// full blocks
|
||||||
for ( i = 0; i < blocks; i++, vdata+=2 )
|
for ( i = 0; i < blocks; i++, vdata+=2 )
|
||||||
{
|
{
|
||||||
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
msg[0] = mm512_bswap_32( vdata[ 0 ] );
|
||||||
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
|
msg[1] = mm512_bswap_32( vdata[ 1 ] );
|
||||||
rnd512_4way( state, msg );
|
rnd512_4way( state, msg );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -367,7 +359,7 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
|
|||||||
if ( state->rembytes )
|
if ( state->rembytes )
|
||||||
{
|
{
|
||||||
// remaining data bytes
|
// remaining data bytes
|
||||||
buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
|
buffer[0] = mm512_bswap_32( vdata[0] );
|
||||||
buffer[1] = mm512_bcast128lo_64( 0x0000000080000000 );
|
buffer[1] = mm512_bcast128lo_64( 0x0000000080000000 );
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
@@ -434,16 +426,14 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
|
|||||||
__m512i msg[2];
|
__m512i msg[2];
|
||||||
int i;
|
int i;
|
||||||
const int blocks = (int)( inlen >> 5 );
|
const int blocks = (int)( inlen >> 5 );
|
||||||
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
|
|
||||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
|
||||||
|
|
||||||
state->rembytes = inlen & 0x1F;
|
state->rembytes = inlen & 0x1F;
|
||||||
|
|
||||||
// full blocks
|
// full blocks
|
||||||
for ( i = 0; i < blocks; i++, vdata+=2 )
|
for ( i = 0; i < blocks; i++, vdata+=2 )
|
||||||
{
|
{
|
||||||
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
msg[0] = mm512_bswap_32( vdata[ 0 ] );
|
||||||
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
|
msg[1] = mm512_bswap_32( vdata[ 1 ] );
|
||||||
rnd512_4way( state, msg );
|
rnd512_4way( state, msg );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -451,7 +441,7 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
|
|||||||
if ( state->rembytes )
|
if ( state->rembytes )
|
||||||
{
|
{
|
||||||
// padding of partial block
|
// padding of partial block
|
||||||
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
msg[0] = mm512_bswap_32( vdata[ 0 ] );
|
||||||
msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
|
msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
|
||||||
rnd512_4way( state, msg );
|
rnd512_4way( state, msg );
|
||||||
}
|
}
|
||||||
@@ -479,16 +469,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
|||||||
__m512i msg[2];
|
__m512i msg[2];
|
||||||
int i;
|
int i;
|
||||||
const int blocks = (int)( inlen >> 5 );
|
const int blocks = (int)( inlen >> 5 );
|
||||||
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
|
|
||||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
|
||||||
|
|
||||||
state->rembytes = inlen & 0x1F;
|
state->rembytes = inlen & 0x1F;
|
||||||
|
|
||||||
// full blocks
|
// full blocks
|
||||||
for ( i = 0; i < blocks; i++, vdata+=2 )
|
for ( i = 0; i < blocks; i++, vdata+=2 )
|
||||||
{
|
{
|
||||||
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
msg[0] = mm512_bswap_32( vdata[ 0 ] );
|
||||||
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
|
msg[1] = mm512_bswap_32( vdata[ 1 ] );
|
||||||
rnd512_4way( state, msg );
|
rnd512_4way( state, msg );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -496,7 +484,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
|||||||
if ( state->rembytes )
|
if ( state->rembytes )
|
||||||
{
|
{
|
||||||
// padding of partial block
|
// padding of partial block
|
||||||
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
msg[0] = mm512_bswap_32( vdata[ 0 ] );
|
||||||
msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
|
msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
|
||||||
rnd512_4way( state, msg );
|
rnd512_4way( state, msg );
|
||||||
}
|
}
|
||||||
@@ -524,8 +512,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
|||||||
a = _mm256_xor_si256( a, c0 ); \
|
a = _mm256_xor_si256( a, c0 ); \
|
||||||
b = _mm256_xor_si256( b, c1 );
|
b = _mm256_xor_si256( b, c1 );
|
||||||
|
|
||||||
//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512
|
#if defined(VL256)
|
||||||
#if defined(__AVX512VL__)
|
|
||||||
|
|
||||||
#define MULT2( a0, a1 ) \
|
#define MULT2( a0, a1 ) \
|
||||||
{ \
|
{ \
|
||||||
@@ -540,14 +527,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
|||||||
__m256i t = a0; \
|
__m256i t = a0; \
|
||||||
a0 = mm256_xoror( a3, a0, a1 ); \
|
a0 = mm256_xoror( a3, a0, a1 ); \
|
||||||
a2 = _mm256_xor_si256( a2, a3 ); \
|
a2 = _mm256_xor_si256( a2, a3 ); \
|
||||||
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
|
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
|
||||||
a3 = mm256_xorand( a2, a3, t ); \
|
a3 = mm256_xorand( a2, a3, t ); \
|
||||||
a2 = mm256_xorand( a1, a2, a0); \
|
a2 = mm256_xorand( a1, a2, a0); \
|
||||||
a1 = _mm256_or_si256( a1, a3 ); \
|
a1 = _mm256_or_si256( a1, a3 ); \
|
||||||
a3 = _mm256_xor_si256( a3, a2 ); \
|
a3 = _mm256_xor_si256( a3, a2 ); \
|
||||||
t = _mm256_xor_si256( t, a1 ); \
|
t = _mm256_xor_si256( t, a1 ); \
|
||||||
a2 = _mm256_and_si256( a2, a1 ); \
|
a2 = _mm256_and_si256( a2, a1 ); \
|
||||||
a1 = mm256_xnor( a1, a0 ); \
|
a1 = mm256_nxor( a1, a0 ); \
|
||||||
a0 = t; \
|
a0 = t; \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -776,8 +763,6 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
|
|||||||
uint32 hash[8*2] __attribute((aligned(64)));
|
uint32 hash[8*2] __attribute((aligned(64)));
|
||||||
__m256i* chainv = state->chainv;
|
__m256i* chainv = state->chainv;
|
||||||
__m256i t0, t1;
|
__m256i t0, t1;
|
||||||
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
|
|
||||||
0x0405060700010203 );
|
|
||||||
/*---- blank round with m=0 ----*/
|
/*---- blank round with m=0 ----*/
|
||||||
rnd512_2way( state, NULL );
|
rnd512_2way( state, NULL );
|
||||||
|
|
||||||
@@ -792,10 +777,8 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
|
|||||||
_mm256_store_si256( (__m256i*)&hash[0], t0 );
|
_mm256_store_si256( (__m256i*)&hash[0], t0 );
|
||||||
_mm256_store_si256( (__m256i*)&hash[8], t1 );
|
_mm256_store_si256( (__m256i*)&hash[8], t1 );
|
||||||
|
|
||||||
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
|
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
|
||||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
|
||||||
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
|
|
||||||
casti_m256i( hash, 1 ), shuff_bswap32 );
|
|
||||||
|
|
||||||
rnd512_2way( state, NULL );
|
rnd512_2way( state, NULL );
|
||||||
|
|
||||||
@@ -810,10 +793,8 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
|
|||||||
_mm256_store_si256( (__m256i*)&hash[0], t0 );
|
_mm256_store_si256( (__m256i*)&hash[0], t0 );
|
||||||
_mm256_store_si256( (__m256i*)&hash[8], t1 );
|
_mm256_store_si256( (__m256i*)&hash[8], t1 );
|
||||||
|
|
||||||
casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
|
casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
|
||||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
|
||||||
casti_m256i( b, 3 ) = _mm256_shuffle_epi8(
|
|
||||||
casti_m256i( hash, 1 ), shuff_bswap32 );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
|
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
|
||||||
@@ -848,15 +829,13 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
|
|||||||
__m256i msg[2];
|
__m256i msg[2];
|
||||||
int i;
|
int i;
|
||||||
int blocks = (int)len >> 5;
|
int blocks = (int)len >> 5;
|
||||||
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
|
|
||||||
0x0405060700010203 );
|
|
||||||
state-> rembytes = (int)len & 0x1F;
|
state-> rembytes = (int)len & 0x1F;
|
||||||
|
|
||||||
// full blocks
|
// full blocks
|
||||||
for ( i = 0; i < blocks; i++, vdata+=2 )
|
for ( i = 0; i < blocks; i++, vdata+=2 )
|
||||||
{
|
{
|
||||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
msg[0] = mm256_bswap_32( vdata[ 0 ] );
|
||||||
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
|
msg[1] = mm256_bswap_32( vdata[ 1 ] );
|
||||||
rnd512_2way( state, msg );
|
rnd512_2way( state, msg );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -865,7 +844,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
|
|||||||
if ( state->rembytes )
|
if ( state->rembytes )
|
||||||
{
|
{
|
||||||
// remaining data bytes
|
// remaining data bytes
|
||||||
buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
|
buffer[0] = mm256_bswap_32( vdata[0] );
|
||||||
buffer[1] = mm256_bcast128lo_64( 0x0000000080000000 );
|
buffer[1] = mm256_bcast128lo_64( 0x0000000080000000 );
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
@@ -917,16 +896,14 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
|
|||||||
__m256i msg[2];
|
__m256i msg[2];
|
||||||
int i;
|
int i;
|
||||||
const int blocks = (int)( inlen >> 5 );
|
const int blocks = (int)( inlen >> 5 );
|
||||||
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
|
|
||||||
0x0405060700010203 );
|
|
||||||
|
|
||||||
state->rembytes = inlen & 0x1F;
|
state->rembytes = inlen & 0x1F;
|
||||||
|
|
||||||
// full blocks
|
// full blocks
|
||||||
for ( i = 0; i < blocks; i++, vdata+=2 )
|
for ( i = 0; i < blocks; i++, vdata+=2 )
|
||||||
{
|
{
|
||||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
msg[0] = mm256_bswap_32( vdata[ 0 ] );
|
||||||
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
|
msg[1] = mm256_bswap_32( vdata[ 1 ] );
|
||||||
rnd512_2way( state, msg );
|
rnd512_2way( state, msg );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -934,7 +911,7 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
|
|||||||
if ( state->rembytes )
|
if ( state->rembytes )
|
||||||
{
|
{
|
||||||
// padding of partial block
|
// padding of partial block
|
||||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
msg[0] = mm256_bswap_32( vdata[ 0 ] );
|
||||||
msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
|
msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
|
||||||
rnd512_2way( state, msg );
|
rnd512_2way( state, msg );
|
||||||
}
|
}
|
||||||
@@ -962,16 +939,14 @@ int luffa_2way_update_close( luffa_2way_context *state,
|
|||||||
__m256i msg[2];
|
__m256i msg[2];
|
||||||
int i;
|
int i;
|
||||||
const int blocks = (int)( inlen >> 5 );
|
const int blocks = (int)( inlen >> 5 );
|
||||||
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
|
|
||||||
0x0405060700010203 );
|
|
||||||
|
|
||||||
state->rembytes = inlen & 0x1F;
|
state->rembytes = inlen & 0x1F;
|
||||||
|
|
||||||
// full blocks
|
// full blocks
|
||||||
for ( i = 0; i < blocks; i++, vdata+=2 )
|
for ( i = 0; i < blocks; i++, vdata+=2 )
|
||||||
{
|
{
|
||||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
msg[0] = mm256_bswap_32( vdata[ 0 ] );
|
||||||
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
|
msg[1] = mm256_bswap_32( vdata[ 1 ] );
|
||||||
rnd512_2way( state, msg );
|
rnd512_2way( state, msg );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -979,7 +954,7 @@ int luffa_2way_update_close( luffa_2way_context *state,
|
|||||||
if ( state->rembytes )
|
if ( state->rembytes )
|
||||||
{
|
{
|
||||||
// padding of partial block
|
// padding of partial block
|
||||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
msg[0] = mm256_bswap_32( vdata[ 0 ] );
|
||||||
msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
|
msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
|
||||||
rnd512_2way( state, msg );
|
rnd512_2way( state, msg );
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -51,7 +51,7 @@
|
|||||||
#define LIMIT_512 128
|
#define LIMIT_512 128
|
||||||
/*********************************/
|
/*********************************/
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint32_t buffer[8*4];
|
uint32_t buffer[8*4];
|
||||||
|
|||||||
@@ -28,8 +28,7 @@
|
|||||||
a = v128_xor( a, c0 ); \
|
a = v128_xor( a, c0 ); \
|
||||||
b = v128_xor( b, c1 ); \
|
b = v128_xor( b, c1 ); \
|
||||||
|
|
||||||
#if defined(__AVX512VL__)
|
#if defined(VL256)
|
||||||
//TODO enable for AVX10_512 AVX10_256
|
|
||||||
|
|
||||||
#define MULT2( a0, a1 ) \
|
#define MULT2( a0, a1 ) \
|
||||||
{ \
|
{ \
|
||||||
@@ -48,47 +47,40 @@
|
|||||||
a1 = _mm_alignr_epi8( b, a1, 4 ); \
|
a1 = _mm_alignr_epi8( b, a1, 4 ); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#elif defined(__ARM_NEON)
|
|
||||||
|
#elif defined(__ARM_NEON) || defined(__SSE2__)
|
||||||
|
|
||||||
// { a1_0, 0, a1_0, a1_0 }
|
// { a1_0, 0, a1_0, a1_0 }
|
||||||
#define MULT2( a0, a1 ) \
|
#define MULT2( a0, a1 ) \
|
||||||
{ \
|
{ \
|
||||||
v128_t b = v128_xor( a0, v128_and( vdupq_laneq_u32( a1, 0 ), MASK ) ); \
|
v128_t b = v128_xor( a0, v128_and( v128_bcast32( a1 ), MASK ) ); \
|
||||||
a0 = v128_alignr32( a1, b, 1 ); \
|
a0 = v128_alignr32( a1, b, 1 ); \
|
||||||
a1 = v128_alignr32( b, a1, 1 ); \
|
a1 = v128_alignr32( b, a1, 1 ); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#else // assume SSE2
|
#else
|
||||||
|
#warning __FILE__ ":" __LINE__ " Unknown or unsupported CPU architecture."
|
||||||
#define MULT2( a0, a1 ) \
|
|
||||||
{ \
|
|
||||||
v128_t b = v128_xor( a0, v128_and( _mm_shuffle_epi32( a1, 0 ), MASK ) ); \
|
|
||||||
a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
|
|
||||||
a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__AVX512VL__)
|
#if defined(VL256)
|
||||||
//TODO enable for AVX10_512 AVX10_256
|
|
||||||
|
|
||||||
#define SUBCRUMB( a0, a1, a2, a3 ) \
|
#define SUBCRUMB( a0, a1, a2, a3 ) \
|
||||||
{ \
|
{ \
|
||||||
v128_t t = a0; \
|
v128_t t = a0; \
|
||||||
a0 = v128_xoror( a3, a0, a1 ); \
|
a0 = v128_xoror( a3, a0, a1 ); \
|
||||||
a2 = v128_xor( a2, a3 ); \
|
a2 = v128_xor( a2, a3 ); \
|
||||||
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
|
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* ~a1 ^ (a3 & t) */ \
|
||||||
a3 = v128_xorand( a2, a3, t ); \
|
a3 = v128_xorand( a2, a3, t ); \
|
||||||
a2 = v128_xorand( a1, a2, a0 ); \
|
a2 = v128_xorand( a1, a2, a0 ); \
|
||||||
a1 = v128_or( a1, a3 ); \
|
a1 = v128_or( a1, a3 ); \
|
||||||
a3 = v128_xor( a3, a2 ); \
|
a3 = v128_xor( a3, a2 ); \
|
||||||
t = v128_xor( t, a1 ); \
|
t = v128_xor( t, a1 ); \
|
||||||
a2 = v128_and( a2, a1 ); \
|
a2 = v128_and( a2, a1 ); \
|
||||||
a1 = v128_xnor( a1, a0 ); \
|
a1 = v128_nxor( a1, a0 ); \
|
||||||
a0 = t; \
|
a0 = t; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#elif defined(__ARM_NEON) || defined(__SSE2__)
|
||||||
|
|
||||||
#define SUBCRUMB( a0, a1, a2, a3 ) \
|
#define SUBCRUMB( a0, a1, a2, a3 ) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
@@ -68,4 +68,4 @@ int update_and_final_luffa( hashState_luffa *state, void* output,
|
|||||||
|
|
||||||
int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
|
int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
|
||||||
const void* data, size_t inlen );
|
const void* data, size_t inlen );
|
||||||
#endif // LUFFA_FOR_SSE2_H___
|
#endif // LUFFA_FOR_SSE2_H__
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
#include "algo/groestl/sph_groestl.h"
|
#include "algo/groestl/sph_groestl.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define ALLIUM_16WAY 1
|
#define ALLIUM_16WAY 1
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
#define ALLIUM_8WAY 1
|
#define ALLIUM_8WAY 1
|
||||||
@@ -26,9 +26,9 @@
|
|||||||
#if defined (ALLIUM_16WAY)
|
#if defined (ALLIUM_16WAY)
|
||||||
|
|
||||||
typedef union {
|
typedef union {
|
||||||
keccak256_8way_context keccak;
|
keccak256_8x64_context keccak;
|
||||||
cube_4way_2buf_context cube;
|
cube_4way_2buf_context cube;
|
||||||
skein256_8way_context skein;
|
skein256_8x64_context skein;
|
||||||
#if defined(__VAES__)
|
#if defined(__VAES__)
|
||||||
groestl256_4way_context groestl;
|
groestl256_4way_context groestl;
|
||||||
#else
|
#else
|
||||||
@@ -60,7 +60,7 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
|
|||||||
uint32_t hash15[8] __attribute__ ((aligned (32)));
|
uint32_t hash15[8] __attribute__ ((aligned (32)));
|
||||||
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
|
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
||||||
|
|
||||||
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
|
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
|
||||||
@@ -70,12 +70,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
|
|||||||
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
||||||
hash15, 256 );
|
hash15, 256 );
|
||||||
|
|
||||||
keccak256_8way_init( &ctx.keccak );
|
keccak256_8x64_init( &ctx.keccak );
|
||||||
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
|
keccak256_8x64_update( &ctx.keccak, vhashA, 32 );
|
||||||
keccak256_8way_close( &ctx.keccak, vhashA);
|
keccak256_8x64_close( &ctx.keccak, vhashA);
|
||||||
keccak256_8way_init( &ctx.keccak );
|
keccak256_8x64_init( &ctx.keccak );
|
||||||
keccak256_8way_update( &ctx.keccak, vhashB, 32 );
|
keccak256_8x64_update( &ctx.keccak, vhashB, 32 );
|
||||||
keccak256_8way_close( &ctx.keccak, vhashB);
|
keccak256_8x64_close( &ctx.keccak, vhashB);
|
||||||
|
|
||||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
vhashA, 256 );
|
vhashA, 256 );
|
||||||
@@ -153,12 +153,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
|
|||||||
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
||||||
hash15, 256 );
|
hash15, 256 );
|
||||||
|
|
||||||
skein256_8way_init( &ctx.skein );
|
skein256_8x64_init( &ctx.skein );
|
||||||
skein256_8way_update( &ctx.skein, vhashA, 32 );
|
skein256_8x64_update( &ctx.skein, vhashA, 32 );
|
||||||
skein256_8way_close( &ctx.skein, vhashA );
|
skein256_8x64_close( &ctx.skein, vhashA );
|
||||||
skein256_8way_init( &ctx.skein );
|
skein256_8x64_init( &ctx.skein );
|
||||||
skein256_8way_update( &ctx.skein, vhashB, 32 );
|
skein256_8x64_update( &ctx.skein, vhashB, 32 );
|
||||||
skein256_8way_close( &ctx.skein, vhashB );
|
skein256_8x64_close( &ctx.skein, vhashB );
|
||||||
|
|
||||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
vhashA, 256 );
|
vhashA, 256 );
|
||||||
@@ -251,7 +251,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
|||||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
|
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
|
||||||
|
|
||||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||||
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
do {
|
do {
|
||||||
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
|
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||||
@@ -273,9 +273,9 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
|||||||
#elif defined (ALLIUM_8WAY)
|
#elif defined (ALLIUM_8WAY)
|
||||||
|
|
||||||
typedef union {
|
typedef union {
|
||||||
keccak256_4way_context keccak;
|
keccak256_4x64_context keccak;
|
||||||
cube_2way_context cube;
|
cube_2way_context cube;
|
||||||
skein256_4way_context skein;
|
skein256_4x64_context skein;
|
||||||
#if defined(__VAES__)
|
#if defined(__VAES__)
|
||||||
groestl256_2way_context groestl;
|
groestl256_2way_context groestl;
|
||||||
#else
|
#else
|
||||||
@@ -298,19 +298,19 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
|
|||||||
uint64_t *hash7 = (uint64_t*)hash+28;
|
uint64_t *hash7 = (uint64_t*)hash+28;
|
||||||
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
|
blake256_8x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
|
||||||
|
|
||||||
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
vhashA, 256 );
|
vhashA, 256 );
|
||||||
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
|
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
|
||||||
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
|
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
|
||||||
|
|
||||||
keccak256_4way_init( &ctx.keccak );
|
keccak256_4x64_init( &ctx.keccak );
|
||||||
keccak256_4way_update( &ctx.keccak, vhashA, 32 );
|
keccak256_4x64_update( &ctx.keccak, vhashA, 32 );
|
||||||
keccak256_4way_close( &ctx.keccak, vhashA );
|
keccak256_4x64_close( &ctx.keccak, vhashA );
|
||||||
keccak256_4way_init( &ctx.keccak );
|
keccak256_4x64_init( &ctx.keccak );
|
||||||
keccak256_4way_update( &ctx.keccak, vhashB, 32 );
|
keccak256_4x64_update( &ctx.keccak, vhashB, 32 );
|
||||||
keccak256_4way_close( &ctx.keccak, vhashB );
|
keccak256_4x64_close( &ctx.keccak, vhashB );
|
||||||
|
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
|
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
|
||||||
dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
|
dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
|
||||||
@@ -350,12 +350,12 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
|
|||||||
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
|
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
|
||||||
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
|
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
|
||||||
|
|
||||||
skein256_4way_init( &ctx.skein );
|
skein256_4x64_init( &ctx.skein );
|
||||||
skein256_4way_update( &ctx.skein, vhashA, 32 );
|
skein256_4x64_update( &ctx.skein, vhashA, 32 );
|
||||||
skein256_4way_close( &ctx.skein, vhashA );
|
skein256_4x64_close( &ctx.skein, vhashA );
|
||||||
skein256_4way_init( &ctx.skein );
|
skein256_4x64_init( &ctx.skein );
|
||||||
skein256_4way_update( &ctx.skein, vhashB, 32 );
|
skein256_4x64_update( &ctx.skein, vhashB, 32 );
|
||||||
skein256_4way_close( &ctx.skein, vhashB );
|
skein256_4x64_close( &ctx.skein, vhashB );
|
||||||
|
|
||||||
#if defined(__VAES__)
|
#if defined(__VAES__)
|
||||||
|
|
||||||
@@ -433,7 +433,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
|||||||
n+ 3, n+ 2, n+ 1, n );
|
n+ 3, n+ 2, n+ 1, n );
|
||||||
|
|
||||||
// Partialy prehash second block without touching nonces
|
// Partialy prehash second block without touching nonces
|
||||||
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
do {
|
do {
|
||||||
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
|
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||||
@@ -483,7 +483,7 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
|
|||||||
uint64_t *hash3 = (uint64_t*)hash+12;
|
uint64_t *hash3 = (uint64_t*)hash+12;
|
||||||
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
blake256_4way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
|
blake256_4x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
|
||||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhashA, 256 );
|
dintrlv_4x32( hash0, hash1, hash2, hash3, vhashA, 256 );
|
||||||
|
|
||||||
intrlv_2x64( vhashA, hash0, hash1, 256 );
|
intrlv_2x64( vhashA, hash0, hash1, 256 );
|
||||||
@@ -588,7 +588,7 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
|
|||||||
block_buf[15] = v128_32( 640 );
|
block_buf[15] = v128_32( 640 );
|
||||||
|
|
||||||
// Partialy prehash second block without touching nonces
|
// Partialy prehash second block without touching nonces
|
||||||
blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
do {
|
do {
|
||||||
allium_4way_hash( hash, midstate_vars, block0_hash, block_buf );
|
allium_4way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||||
@@ -616,7 +616,6 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
|
|||||||
//
|
//
|
||||||
// 1 way
|
// 1 way
|
||||||
|
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
blake256_context blake;
|
blake256_context blake;
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "lyra2.h"
|
#include "lyra2.h"
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define LYRA2REV3_16WAY 1
|
#define LYRA2REV3_16WAY 1
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
#define LYRA2REV3_8WAY 1
|
#define LYRA2REV3_8WAY 1
|
||||||
@@ -49,7 +49,7 @@ bool init_lyra2rev3_ctx();
|
|||||||
|
|
||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define LYRA2REV2_16WAY 1
|
#define LYRA2REV2_16WAY 1
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
#define LYRA2REV2_8WAY 1
|
#define LYRA2REV2_8WAY 1
|
||||||
@@ -108,7 +108,7 @@ bool lyra2h_thread_init();
|
|||||||
|
|
||||||
/////////////////////////////////////////
|
/////////////////////////////////////////
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define PHI2_8WAY 1
|
#define PHI2_8WAY 1
|
||||||
#elif defined(__AVX2__) && defined(__AES__)
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
#define PHI2_4WAY 1
|
#define PHI2_4WAY 1
|
||||||
|
|||||||
@@ -41,7 +41,7 @@
|
|||||||
// lyra2z330, lyra2h,
|
// lyra2z330, lyra2h,
|
||||||
|
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
|
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
|
|||||||
|
|
||||||
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
|
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
|
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
|
||||||
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
|
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
|
||||||
|
|||||||
@@ -14,12 +14,12 @@ bool lyra2h_4way_thread_init()
|
|||||||
return ( lyra2h_4way_matrix = mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
|
return ( lyra2h_4way_matrix = mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
static __thread blake256_4way_context l2h_4way_blake_mid;
|
static __thread blake256_4x32_context l2h_4way_blake_mid;
|
||||||
|
|
||||||
void lyra2h_4way_midstate( const void* input )
|
void lyra2h_4way_midstate( const void* input )
|
||||||
{
|
{
|
||||||
blake256_4way_init( &l2h_4way_blake_mid );
|
blake256_4x32_init( &l2h_4way_blake_mid );
|
||||||
blake256_4way_update( &l2h_4way_blake_mid, input, 64 );
|
blake256_4x32_update( &l2h_4way_blake_mid, input, 64 );
|
||||||
}
|
}
|
||||||
|
|
||||||
void lyra2h_4way_hash( void *state, const void *input )
|
void lyra2h_4way_hash( void *state, const void *input )
|
||||||
@@ -29,11 +29,11 @@ void lyra2h_4way_hash( void *state, const void *input )
|
|||||||
uint32_t hash2[8] __attribute__ ((aligned (64)));
|
uint32_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
uint32_t hash3[8] __attribute__ ((aligned (64)));
|
uint32_t hash3[8] __attribute__ ((aligned (64)));
|
||||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||||
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
|
blake256_4x32_context ctx_blake __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
|
memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
|
||||||
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
|
blake256_4x32_update( &ctx_blake, input + (64*4), 16 );
|
||||||
blake256_4way_close( &ctx_blake, vhash );
|
blake256_4x32_close( &ctx_blake, vhash );
|
||||||
|
|
||||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||||
|
|
||||||
|
|||||||
@@ -7,25 +7,24 @@
|
|||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
#include "algo/cubehash/cube-hash-2way.h"
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined (LYRA2REV2_16WAY)
|
#if defined (LYRA2REV2_16WAY)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
blake256_16way_context blake;
|
blake256_16x32_context blake;
|
||||||
keccak256_8way_context keccak;
|
keccak256_8x64_context keccak;
|
||||||
cubehashParam cube;
|
cubehashParam cube;
|
||||||
skein256_8way_context skein;
|
skein256_8x64_context skein;
|
||||||
bmw256_16way_context bmw;
|
bmw256_16x32_context bmw;
|
||||||
} lyra2v2_16way_ctx_holder __attribute__ ((aligned (64)));
|
} lyra2v2_16way_ctx_holder __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
static lyra2v2_16way_ctx_holder l2v2_16way_ctx;
|
static lyra2v2_16way_ctx_holder l2v2_16way_ctx;
|
||||||
|
|
||||||
bool init_lyra2rev2_16way_ctx()
|
bool init_lyra2rev2_16way_ctx()
|
||||||
{
|
{
|
||||||
keccak256_8way_init( &l2v2_16way_ctx.keccak );
|
keccak256_8x64_init( &l2v2_16way_ctx.keccak );
|
||||||
cubehashInit( &l2v2_16way_ctx.cube, 256, 16, 32 );
|
cubehashInit( &l2v2_16way_ctx.cube, 256, 16, 32 );
|
||||||
skein256_8way_init( &l2v2_16way_ctx.skein );
|
skein256_8x64_init( &l2v2_16way_ctx.skein );
|
||||||
bmw256_16way_init( &l2v2_16way_ctx.bmw );
|
bmw256_16x32_init( &l2v2_16way_ctx.bmw );
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -51,8 +50,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
|
|||||||
lyra2v2_16way_ctx_holder ctx __attribute__ ((aligned (64)));
|
lyra2v2_16way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||||
memcpy( &ctx, &l2v2_16way_ctx, sizeof(l2v2_16way_ctx) );
|
memcpy( &ctx, &l2v2_16way_ctx, sizeof(l2v2_16way_ctx) );
|
||||||
|
|
||||||
blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
|
blake256_16x32_update( &ctx.blake, input + (64<<4), 16 );
|
||||||
blake256_16way_close( &ctx.blake, vhash );
|
blake256_16x32_close( &ctx.blake, vhash );
|
||||||
|
|
||||||
dintrlv_16x32( hash0, hash1, hash2, hash3,
|
dintrlv_16x32( hash0, hash1, hash2, hash3,
|
||||||
hash4, hash5, hash6, hash7,
|
hash4, hash5, hash6, hash7,
|
||||||
@@ -62,17 +61,17 @@ void lyra2rev2_16way_hash( void *state, const void *input )
|
|||||||
intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
|
intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
|
||||||
hash4, hash5, hash6, hash7, 256 );
|
hash4, hash5, hash6, hash7, 256 );
|
||||||
|
|
||||||
keccak256_8way_update( &ctx.keccak, vhash, 32 );
|
keccak256_8x64_update( &ctx.keccak, vhash, 32 );
|
||||||
keccak256_8way_close( &ctx.keccak, vhash );
|
keccak256_8x64_close( &ctx.keccak, vhash );
|
||||||
|
|
||||||
dintrlv_8x64( hash0, hash1, hash2, hash3,
|
dintrlv_8x64( hash0, hash1, hash2, hash3,
|
||||||
hash4, hash5, hash6, hash7, vhash, 256 );
|
hash4, hash5, hash6, hash7, vhash, 256 );
|
||||||
intrlv_8x64( vhash, hash8, hash9, hash10, hash11,
|
intrlv_8x64( vhash, hash8, hash9, hash10, hash11,
|
||||||
hash12, hash13, hash14, hash15, 256 );
|
hash12, hash13, hash14, hash15, 256 );
|
||||||
|
|
||||||
keccak256_8way_init( &ctx.keccak );
|
keccak256_8x64_init( &ctx.keccak );
|
||||||
keccak256_8way_update( &ctx.keccak, vhash, 32 );
|
keccak256_8x64_update( &ctx.keccak, vhash, 32 );
|
||||||
keccak256_8way_close( &ctx.keccak, vhash );
|
keccak256_8x64_close( &ctx.keccak, vhash );
|
||||||
|
|
||||||
dintrlv_8x64( hash8, hash9, hash10, hash11,
|
dintrlv_8x64( hash8, hash9, hash10, hash11,
|
||||||
hash12, hash13, hash14, hash15, vhash, 256 );
|
hash12, hash13, hash14, hash15, vhash, 256 );
|
||||||
@@ -122,22 +121,21 @@ void lyra2rev2_16way_hash( void *state, const void *input )
|
|||||||
|
|
||||||
intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
|
intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
|
||||||
hash4, hash5, hash6, hash7, 256 );
|
hash4, hash5, hash6, hash7, 256 );
|
||||||
skein256_8way_update( &ctx.skein, vhash, 32 );
|
skein256_8x64_update( &ctx.skein, vhash, 32 );
|
||||||
skein256_8way_close( &ctx.skein, vhash );
|
skein256_8x64_close( &ctx.skein, vhash );
|
||||||
|
|
||||||
dintrlv_8x64( hash0, hash1, hash2, hash3,
|
dintrlv_8x64( hash0, hash1, hash2, hash3,
|
||||||
hash4, hash5, hash6, hash7, vhash, 256 );
|
hash4, hash5, hash6, hash7, vhash, 256 );
|
||||||
intrlv_8x64( vhash, hash8, hash9, hash10, hash11, hash12,
|
intrlv_8x64( vhash, hash8, hash9, hash10, hash11, hash12,
|
||||||
hash13, hash14, hash15, 256 );
|
hash13, hash14, hash15, 256 );
|
||||||
|
|
||||||
skein256_8way_init( &ctx.skein );
|
skein256_8x64_init( &ctx.skein );
|
||||||
skein256_8way_update( &ctx.skein, vhash, 32 );
|
skein256_8x64_update( &ctx.skein, vhash, 32 );
|
||||||
skein256_8way_close( &ctx.skein, vhash );
|
skein256_8x64_close( &ctx.skein, vhash );
|
||||||
|
|
||||||
dintrlv_8x64( hash8, hash9, hash10, hash11,
|
dintrlv_8x64( hash8, hash9, hash10, hash11,
|
||||||
hash12, hash13, hash14, hash15, vhash, 256 );
|
hash12, hash13, hash14, hash15, vhash, 256 );
|
||||||
|
|
||||||
|
|
||||||
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
|
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
|
||||||
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
|
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
|
||||||
cubehash_full( &ctx.cube, (byte*) hash2, 256, (const byte*) hash2, 32 );
|
cubehash_full( &ctx.cube, (byte*) hash2, 256, (const byte*) hash2, 32 );
|
||||||
@@ -160,8 +158,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
|
|||||||
hash8, hash9, hash10, hash11,
|
hash8, hash9, hash10, hash11,
|
||||||
hash12, hash13, hash14, hash15, 256 );
|
hash12, hash13, hash14, hash15, 256 );
|
||||||
|
|
||||||
bmw256_16way_update( &ctx.bmw, vhash, 32 );
|
bmw256_16x32_update( &ctx.bmw, vhash, 32 );
|
||||||
bmw256_16way_close( &ctx.bmw, state );
|
bmw256_16x32_close( &ctx.bmw, state );
|
||||||
}
|
}
|
||||||
|
|
||||||
int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
|
int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
|
||||||
@@ -186,8 +184,8 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
|
|||||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
|
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
|
||||||
blake256_16way_init( &l2v2_16way_ctx.blake );
|
blake256_16x32_init( &l2v2_16way_ctx.blake );
|
||||||
blake256_16way_update( &l2v2_16way_ctx.blake, vdata, 64 );
|
blake256_16x32_update( &l2v2_16way_ctx.blake, vdata, 64 );
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
@@ -214,21 +212,21 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
|
|||||||
#elif defined (LYRA2REV2_8WAY)
|
#elif defined (LYRA2REV2_8WAY)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
blake256_8way_context blake;
|
blake256_8x32_context blake;
|
||||||
keccak256_4way_context keccak;
|
keccak256_4x64_context keccak;
|
||||||
cubehashParam cube;
|
cubehashParam cube;
|
||||||
skein256_4way_context skein;
|
skein256_4x64_context skein;
|
||||||
bmw256_8way_context bmw;
|
bmw256_8x32_context bmw;
|
||||||
} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));
|
} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
static lyra2v2_8way_ctx_holder l2v2_8way_ctx;
|
static lyra2v2_8way_ctx_holder l2v2_8way_ctx;
|
||||||
|
|
||||||
bool init_lyra2rev2_8way_ctx()
|
bool init_lyra2rev2_8way_ctx()
|
||||||
{
|
{
|
||||||
keccak256_4way_init( &l2v2_8way_ctx.keccak );
|
keccak256_4x64_init( &l2v2_8way_ctx.keccak );
|
||||||
cubehashInit( &l2v2_8way_ctx.cube, 256, 16, 32 );
|
cubehashInit( &l2v2_8way_ctx.cube, 256, 16, 32 );
|
||||||
skein256_4way_init( &l2v2_8way_ctx.skein );
|
skein256_4x64_init( &l2v2_8way_ctx.skein );
|
||||||
bmw256_8way_init( &l2v2_8way_ctx.bmw );
|
bmw256_8x32_init( &l2v2_8way_ctx.bmw );
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -246,20 +244,20 @@ void lyra2rev2_8way_hash( void *state, const void *input )
|
|||||||
lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||||
memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );
|
memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );
|
||||||
|
|
||||||
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
|
blake256_8x32_update( &ctx.blake, input + (64<<3), 16 );
|
||||||
blake256_8way_close( &ctx.blake, vhash );
|
blake256_8x32_close( &ctx.blake, vhash );
|
||||||
|
|
||||||
dintrlv_8x32( hash0, hash1, hash2, hash3,
|
dintrlv_8x32( hash0, hash1, hash2, hash3,
|
||||||
hash4, hash5, hash6, hash7, vhash, 256 );
|
hash4, hash5, hash6, hash7, vhash, 256 );
|
||||||
|
|
||||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
|
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||||
keccak256_4way_update( &ctx.keccak, vhash, 32 );
|
keccak256_4x64_update( &ctx.keccak, vhash, 32 );
|
||||||
keccak256_4way_close( &ctx.keccak, vhash );
|
keccak256_4x64_close( &ctx.keccak, vhash );
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
|
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||||
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
|
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
|
||||||
keccak256_4way_init( &ctx.keccak );
|
keccak256_4x64_init( &ctx.keccak );
|
||||||
keccak256_4way_update( &ctx.keccak, vhash, 32 );
|
keccak256_4x64_update( &ctx.keccak, vhash, 32 );
|
||||||
keccak256_4way_close( &ctx.keccak, vhash );
|
keccak256_4x64_close( &ctx.keccak, vhash );
|
||||||
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );
|
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );
|
||||||
|
|
||||||
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
|
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
|
||||||
@@ -282,13 +280,13 @@ void lyra2rev2_8way_hash( void *state, const void *input )
|
|||||||
LYRA2REV2( l2v2_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
|
LYRA2REV2( l2v2_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
|
||||||
|
|
||||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
|
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||||
skein256_4way_update( &ctx.skein, vhash, 32 );
|
skein256_4x64_update( &ctx.skein, vhash, 32 );
|
||||||
skein256_4way_close( &ctx.skein, vhash );
|
skein256_4x64_close( &ctx.skein, vhash );
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
|
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||||
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
|
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
|
||||||
skein256_4way_init( &ctx.skein );
|
skein256_4x64_init( &ctx.skein );
|
||||||
skein256_4way_update( &ctx.skein, vhash, 32 );
|
skein256_4x64_update( &ctx.skein, vhash, 32 );
|
||||||
skein256_4way_close( &ctx.skein, vhash );
|
skein256_4x64_close( &ctx.skein, vhash );
|
||||||
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );
|
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );
|
||||||
|
|
||||||
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
|
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
|
||||||
@@ -303,8 +301,8 @@ void lyra2rev2_8way_hash( void *state, const void *input )
|
|||||||
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
|
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
|
||||||
hash4, hash5, hash6, hash7, 256 );
|
hash4, hash5, hash6, hash7, 256 );
|
||||||
|
|
||||||
bmw256_8way_update( &ctx.bmw, vhash, 32 );
|
bmw256_8x32_update( &ctx.bmw, vhash, 32 );
|
||||||
bmw256_8way_close( &ctx.bmw, state );
|
bmw256_8x32_close( &ctx.bmw, state );
|
||||||
}
|
}
|
||||||
|
|
||||||
int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
|
int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
|
||||||
@@ -328,8 +326,8 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
|
|||||||
|
|
||||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||||
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||||
blake256_8way_init( &l2v2_8way_ctx.blake );
|
blake256_8x32_init( &l2v2_8way_ctx.blake );
|
||||||
blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
|
blake256_8x32_update( &l2v2_8way_ctx.blake, vdata, 64 );
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
@@ -356,21 +354,21 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
|
|||||||
#elif defined (LYRA2REV2_4WAY)
|
#elif defined (LYRA2REV2_4WAY)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
blake256_4way_context blake;
|
blake256_4x32_context blake;
|
||||||
keccak256_4way_context keccak;
|
keccak256_4x64_context keccak;
|
||||||
cubehashParam cube;
|
cubehashParam cube;
|
||||||
skein256_4way_context skein;
|
skein256_4x64_context skein;
|
||||||
bmw256_4way_context bmw;
|
bmw256_4x32_context bmw;
|
||||||
} lyra2v2_4way_ctx_holder;
|
} lyra2v2_4way_ctx_holder;
|
||||||
|
|
||||||
static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
|
static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
|
||||||
|
|
||||||
bool init_lyra2rev2_4way_ctx()
|
bool init_lyra2rev2_4way_ctx()
|
||||||
{
|
{
|
||||||
keccak256_4way_init( &l2v2_4way_ctx.keccak );
|
keccak256_4x64_init( &l2v2_4way_ctx.keccak );
|
||||||
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
|
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
|
||||||
skein256_4way_init( &l2v2_4way_ctx.skein );
|
skein256_4x64_init( &l2v2_4way_ctx.skein );
|
||||||
bmw256_4way_init( &l2v2_4way_ctx.bmw );
|
bmw256_4x32_init( &l2v2_4way_ctx.bmw );
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -385,13 +383,13 @@ void lyra2rev2_4way_hash( void *state, const void *input )
|
|||||||
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||||
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
|
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
|
||||||
|
|
||||||
blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
|
blake256_4x32_update( &ctx.blake, input + (64<<2), 16 );
|
||||||
blake256_4way_close( &ctx.blake, vhash );
|
blake256_4x32_close( &ctx.blake, vhash );
|
||||||
|
|
||||||
rintrlv_4x32_4x64( vhash64, vhash, 256 );
|
rintrlv_4x32_4x64( vhash64, vhash, 256 );
|
||||||
|
|
||||||
keccak256_4way_update( &ctx.keccak, vhash64, 32 );
|
keccak256_4x64_update( &ctx.keccak, vhash64, 32 );
|
||||||
keccak256_4way_close( &ctx.keccak, vhash64 );
|
keccak256_4x64_close( &ctx.keccak, vhash64 );
|
||||||
|
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
||||||
|
|
||||||
@@ -410,8 +408,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )
|
|||||||
|
|
||||||
intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
|
intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
|
||||||
|
|
||||||
skein256_4way_update( &ctx.skein, vhash64, 32 );
|
skein256_4x64_update( &ctx.skein, vhash64, 32 );
|
||||||
skein256_4way_close( &ctx.skein, vhash64 );
|
skein256_4x64_close( &ctx.skein, vhash64 );
|
||||||
|
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
||||||
|
|
||||||
@@ -426,8 +424,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )
|
|||||||
|
|
||||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
|
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||||
|
|
||||||
bmw256_4way_update( &ctx.bmw, vhash, 32 );
|
bmw256_4x32_update( &ctx.bmw, vhash, 32 );
|
||||||
bmw256_4way_close( &ctx.bmw, state );
|
bmw256_4x32_close( &ctx.bmw, state );
|
||||||
}
|
}
|
||||||
|
|
||||||
int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
|
||||||
@@ -451,8 +449,8 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
|
|||||||
|
|
||||||
v128_bswap32_intrlv80_4x32( vdata, pdata );
|
v128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||||
|
|
||||||
blake256_4way_init( &l2v2_4way_ctx.blake );
|
blake256_4x32_init( &l2v2_4way_ctx.blake );
|
||||||
blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 );
|
blake256_4x32_update( &l2v2_4way_ctx.blake, vdata, 64 );
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -9,18 +9,18 @@
|
|||||||
#if defined (LYRA2REV3_16WAY)
|
#if defined (LYRA2REV3_16WAY)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
blake256_16way_context blake;
|
blake256_16x32_context blake;
|
||||||
cube_4way_context cube;
|
cube_4way_context cube;
|
||||||
bmw256_16way_context bmw;
|
bmw256_16x32_context bmw;
|
||||||
} lyra2v3_16way_ctx_holder;
|
} lyra2v3_16way_ctx_holder;
|
||||||
|
|
||||||
static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;
|
static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;
|
||||||
|
|
||||||
bool init_lyra2rev3_16way_ctx()
|
bool init_lyra2rev3_16way_ctx()
|
||||||
{
|
{
|
||||||
blake256_16way_init( &l2v3_16way_ctx.blake );
|
blake256_16x32_init( &l2v3_16way_ctx.blake );
|
||||||
cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
|
cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
|
||||||
bmw256_16way_init( &l2v3_16way_ctx.bmw );
|
bmw256_16x32_init( &l2v3_16way_ctx.bmw );
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -46,8 +46,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
|
|||||||
lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
|
lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||||
memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );
|
memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );
|
||||||
|
|
||||||
blake256_16way_update( &ctx.blake, input + (64*16), 16 );
|
blake256_16x32_update( &ctx.blake, input + (64*16), 16 );
|
||||||
blake256_16way_close( &ctx.blake, vhash );
|
blake256_16x32_close( &ctx.blake, vhash );
|
||||||
|
|
||||||
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
|
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
|
||||||
@@ -120,8 +120,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
|
|||||||
hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
||||||
hash15, 256 );
|
hash15, 256 );
|
||||||
|
|
||||||
bmw256_16way_update( &ctx.bmw, vhash, 32 );
|
bmw256_16x32_update( &ctx.bmw, vhash, 32 );
|
||||||
bmw256_16way_close( &ctx.bmw, state );
|
bmw256_16x32_close( &ctx.bmw, state );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -145,8 +145,8 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
|
|||||||
|
|
||||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||||
|
|
||||||
blake256_16way_init( &l2v3_16way_ctx.blake );
|
blake256_16x32_init( &l2v3_16way_ctx.blake );
|
||||||
blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
|
blake256_16x32_update( &l2v3_16way_ctx.blake, vdata, 64 );
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
@@ -178,18 +178,18 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
|
|||||||
#elif defined (LYRA2REV3_8WAY)
|
#elif defined (LYRA2REV3_8WAY)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
blake256_8way_context blake;
|
blake256_8x32_context blake;
|
||||||
cubehashParam cube;
|
cubehashParam cube;
|
||||||
bmw256_8way_context bmw;
|
bmw256_8x32_context bmw;
|
||||||
} lyra2v3_8way_ctx_holder;
|
} lyra2v3_8way_ctx_holder;
|
||||||
|
|
||||||
static __thread lyra2v3_8way_ctx_holder l2v3_8way_ctx;
|
static __thread lyra2v3_8way_ctx_holder l2v3_8way_ctx;
|
||||||
|
|
||||||
bool init_lyra2rev3_8way_ctx()
|
bool init_lyra2rev3_8way_ctx()
|
||||||
{
|
{
|
||||||
blake256_8way_init( &l2v3_8way_ctx.blake );
|
blake256_8x32_init( &l2v3_8way_ctx.blake );
|
||||||
cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 );
|
cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 );
|
||||||
bmw256_8way_init( &l2v3_8way_ctx.bmw );
|
bmw256_8x32_init( &l2v3_8way_ctx.bmw );
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -207,8 +207,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
|
|||||||
lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||||
memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
|
memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
|
||||||
|
|
||||||
blake256_8way_update( &ctx.blake, input + (64*8), 16 );
|
blake256_8x32_update( &ctx.blake, input + (64*8), 16 );
|
||||||
blake256_8way_close( &ctx.blake, vhash );
|
blake256_8x32_close( &ctx.blake, vhash );
|
||||||
|
|
||||||
dintrlv_8x32( hash0, hash1, hash2, hash3,
|
dintrlv_8x32( hash0, hash1, hash2, hash3,
|
||||||
hash4, hash5, hash6, hash7, vhash, 256 );
|
hash4, hash5, hash6, hash7, vhash, 256 );
|
||||||
@@ -243,8 +243,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
|
|||||||
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
|
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
|
||||||
hash4, hash5, hash6, hash7, 256 );
|
hash4, hash5, hash6, hash7, 256 );
|
||||||
|
|
||||||
bmw256_8way_update( &ctx.bmw, vhash, 32 );
|
bmw256_8x32_update( &ctx.bmw, vhash, 32 );
|
||||||
bmw256_8way_close( &ctx.bmw, state );
|
bmw256_8x32_close( &ctx.bmw, state );
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -269,8 +269,8 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
|
|||||||
|
|
||||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||||
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||||
blake256_8way_init( &l2v3_8way_ctx.blake );
|
blake256_8x32_init( &l2v3_8way_ctx.blake );
|
||||||
blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );
|
blake256_8x32_update( &l2v3_8way_ctx.blake, vdata, 64 );
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
@@ -300,19 +300,18 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
|
|||||||
#if defined (LYRA2REV3_4WAY)
|
#if defined (LYRA2REV3_4WAY)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
blake256_4way_context blake;
|
blake256_4x32_context blake;
|
||||||
cubehashParam cube;
|
cubehashParam cube;
|
||||||
bmw256_4way_context bmw;
|
bmw256_4x32_context bmw;
|
||||||
} lyra2v3_4way_ctx_holder;
|
} lyra2v3_4way_ctx_holder;
|
||||||
|
|
||||||
//static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
|
|
||||||
static __thread lyra2v3_4way_ctx_holder l2v3_4way_ctx;
|
static __thread lyra2v3_4way_ctx_holder l2v3_4way_ctx;
|
||||||
|
|
||||||
bool init_lyra2rev3_4way_ctx()
|
bool init_lyra2rev3_4way_ctx()
|
||||||
{
|
{
|
||||||
blake256_4way_init( &l2v3_4way_ctx.blake );
|
blake256_4x32_init( &l2v3_4way_ctx.blake );
|
||||||
cubehashInit( &l2v3_4way_ctx.cube, 256, 16, 32 );
|
cubehashInit( &l2v3_4way_ctx.cube, 256, 16, 32 );
|
||||||
bmw256_4way_init( &l2v3_4way_ctx.bmw );
|
bmw256_4x32_init( &l2v3_4way_ctx.bmw );
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -326,8 +325,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
|
|||||||
lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||||
memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );
|
memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );
|
||||||
|
|
||||||
blake256_4way_update( &ctx.blake, input + (64*4), 16 );
|
blake256_4x32_update( &ctx.blake, input + (64*4), 16 );
|
||||||
blake256_4way_close( &ctx.blake, vhash );
|
blake256_4x32_close( &ctx.blake, vhash );
|
||||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||||
|
|
||||||
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
|
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
|
||||||
@@ -349,8 +348,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
|
|||||||
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
|
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
|
||||||
|
|
||||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
|
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||||
bmw256_4way_update( &ctx.bmw, vhash, 32 );
|
bmw256_4x32_update( &ctx.bmw, vhash, 32 );
|
||||||
bmw256_4way_close( &ctx.bmw, state );
|
bmw256_4x32_close( &ctx.bmw, state );
|
||||||
}
|
}
|
||||||
|
|
||||||
int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
|
int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
|
||||||
@@ -374,8 +373,8 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
|
|||||||
v128_bswap32_intrlv80_4x32( vdata, pdata );
|
v128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||||
*noncev = _mm_set_epi32( n+3, n+2, n+1, n );
|
*noncev = _mm_set_epi32( n+3, n+2, n+1, n );
|
||||||
|
|
||||||
blake256_4way_init( &l2v3_4way_ctx.blake );
|
blake256_4x32_init( &l2v3_4way_ctx.blake );
|
||||||
blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 );
|
blake256_4x32_update( &l2v3_4way_ctx.blake, vdata, 64 );
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
#include "lyra2.h"
|
#include "lyra2.h"
|
||||||
#include "algo/blake/blake256-hash.h"
|
#include "algo/blake/blake256-hash.h"
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define LYRA2Z_16WAY 1
|
#define LYRA2Z_16WAY 1
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
#define LYRA2Z_8WAY 1
|
#define LYRA2Z_8WAY 1
|
||||||
@@ -45,7 +45,7 @@ static void lyra2z_16way_hash( void *state, const void *midstate_vars,
|
|||||||
uint32_t hash14[8] __attribute__ ((aligned (32)));
|
uint32_t hash14[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t hash15[8] __attribute__ ((aligned (32)));
|
uint32_t hash15[8] __attribute__ ((aligned (32)));
|
||||||
|
|
||||||
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
||||||
|
|
||||||
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
|
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
|
||||||
@@ -139,7 +139,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
|
|||||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||||
|
|
||||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||||
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
do {
|
do {
|
||||||
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
|
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||||
@@ -180,7 +180,7 @@ static void lyra2z_8way_hash( void *state, const void *midstate_vars,
|
|||||||
uint32_t hash7[8] __attribute__ ((aligned (32)));
|
uint32_t hash7[8] __attribute__ ((aligned (32)));
|
||||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
blake256_8x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
||||||
|
|
||||||
dintrlv_8x32( hash0, hash1, hash2, hash3,
|
dintrlv_8x32( hash0, hash1, hash2, hash3,
|
||||||
hash4, hash5, hash6, hash7, vhash, 256 );
|
hash4, hash5, hash6, hash7, vhash, 256 );
|
||||||
@@ -246,7 +246,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
|
|||||||
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||||
|
|
||||||
// Partialy prehash second block without touching nonces
|
// Partialy prehash second block without touching nonces
|
||||||
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
do {
|
do {
|
||||||
lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );
|
lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||||
@@ -279,12 +279,12 @@ bool lyra2z_4way_thread_init()
|
|||||||
return ( lyra2z_4way_matrix = mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
|
return ( lyra2z_4way_matrix = mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
static __thread blake256_4way_context l2z_4way_blake_mid;
|
static __thread blake256_4x32_context l2z_4way_blake_mid;
|
||||||
|
|
||||||
void lyra2z_4way_midstate( const void* input )
|
void lyra2z_4way_midstate( const void* input )
|
||||||
{
|
{
|
||||||
blake256_4way_init( &l2z_4way_blake_mid );
|
blake256_4x32_init( &l2z_4way_blake_mid );
|
||||||
blake256_4way_update( &l2z_4way_blake_mid, input, 64 );
|
blake256_4x32_update( &l2z_4way_blake_mid, input, 64 );
|
||||||
}
|
}
|
||||||
|
|
||||||
void lyra2z_4way_hash( void *hash, const void *midstate_vars,
|
void lyra2z_4way_hash( void *hash, const void *midstate_vars,
|
||||||
@@ -295,15 +295,8 @@ void lyra2z_4way_hash( void *hash, const void *midstate_vars,
|
|||||||
uint32_t hash2[8] __attribute__ ((aligned (64)));
|
uint32_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
uint32_t hash3[8] __attribute__ ((aligned (64)));
|
uint32_t hash3[8] __attribute__ ((aligned (64)));
|
||||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||||
// blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
|
|
||||||
|
|
||||||
blake256_4way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
blake256_4x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
||||||
|
|
||||||
/*
|
|
||||||
memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
|
|
||||||
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
|
|
||||||
blake256_4way_close( &ctx_blake, vhash );
|
|
||||||
*/
|
|
||||||
|
|
||||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||||
|
|
||||||
@@ -357,7 +350,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
|
|||||||
block_buf[15] = v128_32( 640 );
|
block_buf[15] = v128_32( 640 );
|
||||||
|
|
||||||
// Partially prehash second block without touching nonces
|
// Partially prehash second block without touching nonces
|
||||||
blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||||
|
|
||||||
do {
|
do {
|
||||||
lyra2z_4way_hash( hash, midstate_vars, block0_hash, block_buf );
|
lyra2z_4way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||||
@@ -454,11 +447,9 @@ bool register_lyra2z_algo( algo_gate_t* gate )
|
|||||||
#if defined(LYRA2Z_16WAY)
|
#if defined(LYRA2Z_16WAY)
|
||||||
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
|
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
|
||||||
gate->scanhash = (void*)&scanhash_lyra2z_16way;
|
gate->scanhash = (void*)&scanhash_lyra2z_16way;
|
||||||
// gate->hash = (void*)&lyra2z_16way_hash;
|
|
||||||
#elif defined(LYRA2Z_8WAY)
|
#elif defined(LYRA2Z_8WAY)
|
||||||
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
|
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
|
||||||
gate->scanhash = (void*)&scanhash_lyra2z_8way;
|
gate->scanhash = (void*)&scanhash_lyra2z_8way;
|
||||||
// gate->hash = (void*)&lyra2z_8way_hash;
|
|
||||||
#elif defined(LYRA2Z_4WAY)
|
#elif defined(LYRA2Z_4WAY)
|
||||||
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
|
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
|
||||||
gate->scanhash = (void*)&scanhash_lyra2z_4way;
|
gate->scanhash = (void*)&scanhash_lyra2z_4way;
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
#include "algo/gost/sph_gost.h"
|
#include "algo/gost/sph_gost.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
#include "lyra2.h"
|
#include "lyra2.h"
|
||||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__VAES__) && defined(SIMD512)
|
||||||
#include "algo/echo/echo-hash-4way.h"
|
#include "algo/echo/echo-hash-4way.h"
|
||||||
#elif defined(__AES__)
|
#elif defined(__AES__)
|
||||||
#include "algo/echo/aes_ni/hash_api.h"
|
#include "algo/echo/aes_ni/hash_api.h"
|
||||||
|
|||||||
@@ -27,7 +27,7 @@
|
|||||||
#include "lyra2.h"
|
#include "lyra2.h"
|
||||||
#include "simd-utils.h"
|
#include "simd-utils.h"
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
|
inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -43,9 +43,9 @@ static const uint64_t blake2b_IV[8] =
|
|||||||
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
|
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
|
||||||
};
|
};
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
#define G2W_4X64(a,b,c,d) \
|
#define G2W(a,b,c,d) \
|
||||||
a = _mm512_add_epi64( a, b ); \
|
a = _mm512_add_epi64( a, b ); \
|
||||||
d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 32 ); \
|
d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 32 ); \
|
||||||
c = _mm512_add_epi64( c, d ); \
|
c = _mm512_add_epi64( c, d ); \
|
||||||
@@ -56,27 +56,15 @@ static const uint64_t blake2b_IV[8] =
|
|||||||
b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 63 );
|
b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 63 );
|
||||||
|
|
||||||
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||||
G2W_4X64( s0, s1, s2, s3 ); \
|
G2W( s0, s1, s2, s3 ); \
|
||||||
s0 = mm512_shufll256_64( s0 ); \
|
s0 = mm512_shufll256_64( s0 ); \
|
||||||
s3 = mm512_swap256_128( s3 ); \
|
s3 = mm512_swap256_128( s3 ); \
|
||||||
s2 = mm512_shuflr256_64( s2 ); \
|
s2 = mm512_shuflr256_64( s2 ); \
|
||||||
G2W_4X64( s0, s1, s2, s3 ); \
|
G2W( s0, s1, s2, s3 ); \
|
||||||
s0 = mm512_shuflr256_64( s0 ); \
|
s0 = mm512_shuflr256_64( s0 ); \
|
||||||
s3 = mm512_swap256_128( s3 ); \
|
s3 = mm512_swap256_128( s3 ); \
|
||||||
s2 = mm512_shufll256_64( s2 );
|
s2 = mm512_shufll256_64( s2 );
|
||||||
|
|
||||||
/*
|
|
||||||
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
|
||||||
G2W_4X64( s0, s1, s2, s3 ); \
|
|
||||||
s3 = mm512_shufll256_64( s3 ); \
|
|
||||||
s1 = mm512_shuflr256_64( s1); \
|
|
||||||
s2 = mm512_swap256_128( s2 ); \
|
|
||||||
G2W_4X64( s0, s1, s2, s3 ); \
|
|
||||||
s3 = mm512_shuflr256_64( s3 ); \
|
|
||||||
s1 = mm512_shufll256_64( s1 ); \
|
|
||||||
s2 = mm512_swap256_128( s2 );
|
|
||||||
*/
|
|
||||||
|
|
||||||
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
|
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||||
@@ -95,7 +83,7 @@ static const uint64_t blake2b_IV[8] =
|
|||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
#define G_4X64(a,b,c,d) \
|
#define G_AVX2(a,b,c,d) \
|
||||||
a = _mm256_add_epi64( a, b ); \
|
a = _mm256_add_epi64( a, b ); \
|
||||||
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
|
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
|
||||||
c = _mm256_add_epi64( c, d ); \
|
c = _mm256_add_epi64( c, d ); \
|
||||||
@@ -107,27 +95,15 @@ static const uint64_t blake2b_IV[8] =
|
|||||||
|
|
||||||
// Pivot about s1 instead of s0 reduces latency.
|
// Pivot about s1 instead of s0 reduces latency.
|
||||||
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||||
G_4X64( s0, s1, s2, s3 ); \
|
G_AVX2( s0, s1, s2, s3 ); \
|
||||||
s0 = mm256_shufll_64( s0 ); \
|
s0 = mm256_shufll_64( s0 ); \
|
||||||
s3 = mm256_swap_128( s3 ); \
|
s3 = mm256_swap_128( s3 ); \
|
||||||
s2 = mm256_shuflr_64( s2 ); \
|
s2 = mm256_shuflr_64( s2 ); \
|
||||||
G_4X64( s0, s1, s2, s3 ); \
|
G_AVX2( s0, s1, s2, s3 ); \
|
||||||
s0 = mm256_shuflr_64( s0 ); \
|
s0 = mm256_shuflr_64( s0 ); \
|
||||||
s3 = mm256_swap_128( s3 ); \
|
s3 = mm256_swap_128( s3 ); \
|
||||||
s2 = mm256_shufll_64( s2 );
|
s2 = mm256_shufll_64( s2 );
|
||||||
|
|
||||||
/*
|
|
||||||
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
|
||||||
G_4X64( s0, s1, s2, s3 ); \
|
|
||||||
s3 = mm256_shufll_64( s3 ); \
|
|
||||||
s1 = mm256_shuflr_64( s1); \
|
|
||||||
s2 = mm256_swap_128( s2 ); \
|
|
||||||
G_4X64( s0, s1, s2, s3 ); \
|
|
||||||
s3 = mm256_shuflr_64( s3 ); \
|
|
||||||
s1 = mm256_shufll_64( s1 ); \
|
|
||||||
s2 = mm256_swap_128( s2 );
|
|
||||||
*/
|
|
||||||
|
|
||||||
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
|
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
|
||||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||||
@@ -148,29 +124,29 @@ static const uint64_t blake2b_IV[8] =
|
|||||||
|
|
||||||
// process 2 columns in parallel
|
// process 2 columns in parallel
|
||||||
// returns void, all args updated
|
// returns void, all args updated
|
||||||
#define G_2X64(a,b,c,d) \
|
#define G_128(a,b,c,d) \
|
||||||
a = v128_add64( a, b ); \
|
a = v128_add64( a, b ); \
|
||||||
d = v128_ror64( v128_xor( d, a), 32 ); \
|
d = v128_ror64xor( d, a, 32 ); \
|
||||||
c = v128_add64( c, d ); \
|
c = v128_add64( c, d ); \
|
||||||
b = v128_ror64( v128_xor( b, c ), 24 ); \
|
b = v128_ror64xor( b, c, 24 ); \
|
||||||
a = v128_add64( a, b ); \
|
a = v128_add64( a, b ); \
|
||||||
d = v128_ror64( v128_xor( d, a ), 16 ); \
|
d = v128_ror64xor( d, a, 16 ); \
|
||||||
c = v128_add64( c, d ); \
|
c = v128_add64( c, d ); \
|
||||||
b = v128_ror64( v128_xor( b, c ), 63 );
|
b = v128_ror64xor( b, c, 63 );
|
||||||
|
|
||||||
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||||
{ \
|
{ \
|
||||||
v128u64_t t; \
|
v128u64_t t; \
|
||||||
G_2X64( s0, s2, s4, s6 ); \
|
G_128( s0, s2, s4, s6 ); \
|
||||||
G_2X64( s1, s3, s5, s7 ); \
|
G_128( s1, s3, s5, s7 ); \
|
||||||
t = v128_alignr64( s7, s6, 1 ); \
|
t = v128_alignr64( s7, s6, 1 ); \
|
||||||
s6 = v128_alignr64( s6, s7, 1 ); \
|
s6 = v128_alignr64( s6, s7, 1 ); \
|
||||||
s7 = t; \
|
s7 = t; \
|
||||||
t = v128_alignr64( s2, s3, 1 ); \
|
t = v128_alignr64( s2, s3, 1 ); \
|
||||||
s2 = v128_alignr64( s3, s2, 1 ); \
|
s2 = v128_alignr64( s3, s2, 1 ); \
|
||||||
s3 = t; \
|
s3 = t; \
|
||||||
G_2X64( s0, s2, s5, s6 ); \
|
G_128( s0, s2, s5, s6 ); \
|
||||||
G_2X64( s1, s3, s4, s7 ); \
|
G_128( s1, s3, s4, s7 ); \
|
||||||
t = v128_alignr64( s6, s7, 1 ); \
|
t = v128_alignr64( s6, s7, 1 ); \
|
||||||
s6 = v128_alignr64( s7, s6, 1 ); \
|
s6 = v128_alignr64( s7, s6, 1 ); \
|
||||||
s7 = t; \
|
s7 = t; \
|
||||||
@@ -195,10 +171,6 @@ static const uint64_t blake2b_IV[8] =
|
|||||||
|
|
||||||
#endif // AVX2 else SSE2
|
#endif // AVX2 else SSE2
|
||||||
|
|
||||||
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
|
||||||
return ( w >> c ) | ( w << ( 64 - c ) );
|
|
||||||
}
|
|
||||||
|
|
||||||
#define G( r, i, a, b, c, d ) \
|
#define G( r, i, a, b, c, d ) \
|
||||||
{ \
|
{ \
|
||||||
a = a + b; \
|
a = a + b; \
|
||||||
@@ -222,7 +194,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
|||||||
G( r, 7, v[ 3], v[ 4], v[ 9], v[14] );
|
G( r, 7, v[ 3], v[ 4], v[ 9], v[14] );
|
||||||
|
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
union _ovly_512
|
union _ovly_512
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -1,8 +1,6 @@
|
|||||||
#include "cpuminer-config.h"
|
#include "cpuminer-config.h"
|
||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
|
|
||||||
#if !defined(__APPLE__)
|
|
||||||
|
|
||||||
#include <gmp.h>
|
#include <gmp.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
@@ -21,7 +19,7 @@
|
|||||||
#define EPS1 DBL_EPSILON
|
#define EPS1 DBL_EPSILON
|
||||||
#define EPS2 3.0e-11
|
#define EPS2 3.0e-11
|
||||||
|
|
||||||
inline double exp_n( double xt )
|
static inline double exp_n( double xt )
|
||||||
{
|
{
|
||||||
if ( xt < -700.0 )
|
if ( xt < -700.0 )
|
||||||
return 0;
|
return 0;
|
||||||
@@ -33,7 +31,8 @@ inline double exp_n( double xt )
|
|||||||
return exp( xt );
|
return exp( xt );
|
||||||
}
|
}
|
||||||
|
|
||||||
inline double exp_n2( double x1, double x2 )
|
/*
|
||||||
|
static inline double exp_n2( double x1, double x2 )
|
||||||
{
|
{
|
||||||
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
|
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
|
||||||
p5 = 37., p6 = 700.;
|
p5 = 37., p6 = 700.;
|
||||||
@@ -53,6 +52,7 @@ inline double exp_n2( double x1, double x2 )
|
|||||||
else if ( xt > p6 - 1.e-200 )
|
else if ( xt > p6 - 1.e-200 )
|
||||||
return 0.;
|
return 0.;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
double swit2_( double wvnmb )
|
double swit2_( double wvnmb )
|
||||||
{
|
{
|
||||||
@@ -298,15 +298,9 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // not apple
|
|
||||||
|
|
||||||
bool register_m7m_algo( algo_gate_t *gate )
|
bool register_m7m_algo( algo_gate_t *gate )
|
||||||
{
|
{
|
||||||
#if defined(__APPLE__)
|
gate->optimizations = SHA256_OPT;
|
||||||
applog( LOG_ERR, "M7M algo is not supported on MacOS");
|
|
||||||
return false;
|
|
||||||
#else
|
|
||||||
gate->optimizations = SHA_OPT;
|
|
||||||
init_m7m_ctx();
|
init_m7m_ctx();
|
||||||
gate->scanhash = (void*)&scanhash_m7m_hash;
|
gate->scanhash = (void*)&scanhash_m7m_hash;
|
||||||
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
|
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
|
||||||
@@ -315,6 +309,5 @@ bool register_m7m_algo( algo_gate_t *gate )
|
|||||||
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
|
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
|
||||||
opt_target_factor = 65536.0;
|
opt_target_factor = 65536.0;
|
||||||
return true;
|
return true;
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,75 +0,0 @@
|
|||||||
// Copyright (c) 2014 The Magi developers
|
|
||||||
// Distributed under the MIT/X11 software license, see the accompanying
|
|
||||||
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include <cfloat>
|
|
||||||
#include <limits>
|
|
||||||
#include <math.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
#include "magimath.h"
|
|
||||||
|
|
||||||
#define EPS1 (std::numeric_limits<double>::epsilon())
|
|
||||||
#define EPS2 3.0e-11
|
|
||||||
|
|
||||||
static void gauleg(double x1, double x2, double x[], double w[], const int n)
|
|
||||||
{
|
|
||||||
int m,j,i;
|
|
||||||
double z1, z, xm, xl, pp, p3, p2, p1;
|
|
||||||
m=(n+1)/2;
|
|
||||||
xm=0.5*(x2+x1);
|
|
||||||
xl=0.5*(x2-x1);
|
|
||||||
for (i=1;i<=m;i++) {
|
|
||||||
z=cos(3.141592654*(i-0.25)/(n+0.5));
|
|
||||||
do {
|
|
||||||
p1=1.0;
|
|
||||||
p2=0.0;
|
|
||||||
for (j=1;j<=n;j++) {
|
|
||||||
p3=p2;
|
|
||||||
p2=p1;
|
|
||||||
p1=((2.0*j-1.0)*z*p2-(j-1.0)*p3)/j;
|
|
||||||
}
|
|
||||||
pp=n*(z*p1-p2)/(z*z-1.0);
|
|
||||||
z1=z;
|
|
||||||
z=z1-p1/pp;
|
|
||||||
} while (fabs(z-z1) > EPS2);
|
|
||||||
x[i]=xm-xl*z;
|
|
||||||
x[n+1-i]=xm+xl*z;
|
|
||||||
w[i]=2.0*xl/((1.0-z*z)*pp*pp);
|
|
||||||
w[n+1-i]=w[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static double GaussianQuad_N(double func(const double), const double a2, const double b2, const int NptGQ)
|
|
||||||
{
|
|
||||||
double s=0.0;
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#define SW_DIVS 23
|
|
||||||
double x[SW_DIVS+1], w[SW_DIVS+1];
|
|
||||||
#else
|
|
||||||
double x[NptGQ+1], w[NptGQ+1];
|
|
||||||
#endif
|
|
||||||
|
|
||||||
gauleg(a2, b2, x, w, NptGQ);
|
|
||||||
|
|
||||||
for (int j=1; j<=NptGQ; j++) {
|
|
||||||
s += w[j]*func(x[j]);
|
|
||||||
}
|
|
||||||
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
static double swit_(double wvnmb)
|
|
||||||
{
|
|
||||||
return pow( (5.55243*(exp_n(-0.3*wvnmb/15.762) - exp_n(-0.6*wvnmb/15.762)))*wvnmb, 0.5)
|
|
||||||
/ 1034.66 * pow(sin(wvnmb/65.), 2.);
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t sw_(int nnounce, int divs)
|
|
||||||
{
|
|
||||||
double wmax = ((sqrt((double)(nnounce))*(1.+EPS1))/450+100);
|
|
||||||
return ((uint32_t)(GaussianQuad_N(swit_, 0., wmax, divs)*(1.+EPS1)*1.e6));
|
|
||||||
}
|
|
||||||
@@ -1,54 +0,0 @@
|
|||||||
// Copyright (c) 2014 The Magi developers
|
|
||||||
// Distributed under the MIT/X11 software license, see the accompanying
|
|
||||||
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
|
|
||||||
#ifndef MAGI_MATH_H
|
|
||||||
#define MAGI_MATH_H
|
|
||||||
|
|
||||||
#include <math.h>
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
uint32_t sw_(int nnounce, int divs);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
inline double exp_n(double xt)
|
|
||||||
{
|
|
||||||
double p1 = -700.0, p3 = -0.8e-8, p4 = 0.8e-8, p6 = 700.0;
|
|
||||||
if(xt < p1)
|
|
||||||
return 0;
|
|
||||||
else if(xt > p6)
|
|
||||||
return 1e200;
|
|
||||||
else if(xt > p3 && xt < p4)
|
|
||||||
return (1.0 + xt);
|
|
||||||
else
|
|
||||||
return exp(xt);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 1 / (1 + exp(x1-x2))
|
|
||||||
inline double exp_n2(double x1, double x2)
|
|
||||||
{
|
|
||||||
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.;
|
|
||||||
double xt = x1 - x2;
|
|
||||||
if (xt < p1+1.e-200)
|
|
||||||
return 1.;
|
|
||||||
else if (xt > p1 && xt < p2 + 1.e-200)
|
|
||||||
return ( 1. - exp(xt) );
|
|
||||||
else if (xt > p2 && xt < p3 + 1.e-200)
|
|
||||||
return ( 1. / (1. + exp(xt)) );
|
|
||||||
else if (xt > p3 && xt < p4)
|
|
||||||
return ( 1. / (2. + xt) );
|
|
||||||
else if (xt > p4 - 1.e-200 && xt < p5)
|
|
||||||
return ( exp(-xt) / (1. + exp(-xt)) );
|
|
||||||
else if (xt > p5 - 1.e-200 && xt < p6)
|
|
||||||
return ( exp(-xt) );
|
|
||||||
else //if (xt > p6 - 1.e-200)
|
|
||||||
return 0.;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
@@ -4,7 +4,7 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define NIST5_8WAY 1
|
#define NIST5_8WAY 1
|
||||||
#elif defined(__AVX2__) && defined(__AES__)
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
#define NIST5_4WAY 1
|
#define NIST5_4WAY 1
|
||||||
|
|||||||
@@ -71,8 +71,7 @@ do { \
|
|||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define GAMMA_4W(n0, n1, n2, n4) \
|
#define GAMMA_4W(n0, n1, n2, n4) \
|
||||||
(g ## n0 = v128_xor( a ## n0, \
|
(g ## n0 = v128_xor( a ## n0, v128_ornot( a ## n2, a ## n1 ) ) )
|
||||||
v128_or( a ## n1, v128_not( a ## n2 ) ) ) )
|
|
||||||
|
|
||||||
#define PI_ALL_4W do { \
|
#define PI_ALL_4W do { \
|
||||||
a0 = g0; \
|
a0 = g0; \
|
||||||
@@ -312,7 +311,7 @@ do { \
|
|||||||
BUPDATE1_8W( 7, 1 ); \
|
BUPDATE1_8W( 7, 1 ); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#if defined(__AVX512VL__)
|
#if defined(VL256)
|
||||||
|
|
||||||
#define GAMMA_8W(n0, n1, n2, n4) \
|
#define GAMMA_8W(n0, n1, n2, n4) \
|
||||||
( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )
|
( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )
|
||||||
|
|||||||
@@ -18,11 +18,14 @@ typedef struct {
|
|||||||
} panama_4way_context __attribute__ ((aligned (64)));
|
} panama_4way_context __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
void panama_4way_init( void *cc );
|
void panama_4way_init( void *cc );
|
||||||
|
|
||||||
void panama_4way_update( void *cc, const void *data, size_t len );
|
void panama_4way_update( void *cc, const void *data, size_t len );
|
||||||
|
|
||||||
void panama_4way_close( void *cc, void *dst );
|
void panama_4way_close( void *cc, void *dst );
|
||||||
|
|
||||||
|
#define panama_4x32_context panama_4way_context
|
||||||
|
#define panama_4x32_init panama_4way_init
|
||||||
|
#define panama_4x32_update panama_4way_update
|
||||||
|
#define panama_4x32_close panama_4way_close
|
||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
@@ -34,10 +37,13 @@ typedef struct {
|
|||||||
} panama_8way_context __attribute__ ((aligned (128)));
|
} panama_8way_context __attribute__ ((aligned (128)));
|
||||||
|
|
||||||
void panama_8way_init( void *cc );
|
void panama_8way_init( void *cc );
|
||||||
|
|
||||||
void panama_8way_update( void *cc, const void *data, size_t len );
|
void panama_8way_update( void *cc, const void *data, size_t len );
|
||||||
|
|
||||||
void panama_8way_close( void *cc, void *dst );
|
void panama_8way_close( void *cc, void *dst );
|
||||||
|
|
||||||
|
#define panama_8x32_context panama_8way_context
|
||||||
|
#define panama_8x32_init panama_8way_init
|
||||||
|
#define panama_8x32_update panama_8way_update
|
||||||
|
#define panama_8x32_close panama_8way_close
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define ANIME_8WAY 1
|
#define ANIME_8WAY 1
|
||||||
#elif defined(__AVX2__) && defined(__AES__)
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
#define ANIME_4WAY 1
|
#define ANIME_4WAY 1
|
||||||
|
|||||||
@@ -11,7 +11,6 @@
|
|||||||
#include "algo/luffa/luffa-hash-2way.h"
|
#include "algo/luffa/luffa-hash-2way.h"
|
||||||
#include "algo/cubehash/cube-hash-2way.h"
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
#include "algo/simd/nist.h"
|
|
||||||
#include "algo/shavite/sph_shavite.h"
|
#include "algo/shavite/sph_shavite.h"
|
||||||
#include "algo/shavite/shavite-hash-2way.h"
|
#include "algo/shavite/shavite-hash-2way.h"
|
||||||
#include "algo/simd/simd-hash-2way.h"
|
#include "algo/simd/simd-hash-2way.h"
|
||||||
@@ -32,20 +31,20 @@
|
|||||||
|
|
||||||
union _hmq1725_8way_context_overlay
|
union _hmq1725_8way_context_overlay
|
||||||
{
|
{
|
||||||
blake512_8way_context blake;
|
blake512_8x64_context blake;
|
||||||
bmw512_8way_context bmw;
|
bmw512_8x64_context bmw;
|
||||||
skein512_8way_context skein;
|
skein512_8x64_context skein;
|
||||||
jh512_8way_context jh;
|
jh512_8x64_context jh;
|
||||||
keccak512_8way_context keccak;
|
keccak512_8x64_context keccak;
|
||||||
luffa_4way_context luffa;
|
luffa_4way_context luffa;
|
||||||
cube_4way_context cube;
|
cube_4way_context cube;
|
||||||
simd_4way_context simd;
|
simd_4way_context simd;
|
||||||
hamsi512_8way_context hamsi;
|
hamsi512_8x64_context hamsi;
|
||||||
hashState_fugue fugue;
|
hashState_fugue fugue;
|
||||||
shabal512_8way_context shabal;
|
shabal512_8x32_context shabal;
|
||||||
sph_whirlpool_context whirlpool;
|
sph_whirlpool_context whirlpool;
|
||||||
sha512_8way_context sha512;
|
sha512_8x64_context sha512;
|
||||||
haval256_5_8way_context haval;
|
haval256_8x32_context haval;
|
||||||
#if defined(__VAES__)
|
#if defined(__VAES__)
|
||||||
groestl512_4way_context groestl;
|
groestl512_4way_context groestl;
|
||||||
shavite512_4way_context shavite;
|
shavite512_4way_context shavite;
|
||||||
@@ -82,7 +81,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
|||||||
__m512i* vhB = (__m512i*)vhashB;
|
__m512i* vhB = (__m512i*)vhashB;
|
||||||
__m512i* vhC = (__m512i*)vhashC;
|
__m512i* vhC = (__m512i*)vhashC;
|
||||||
|
|
||||||
bmw512_8way_full( &ctx.bmw, vhash, input, 80 );
|
bmw512_8x64_full( &ctx.bmw, vhash, input, 80 );
|
||||||
|
|
||||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
|
||||||
hash4, hash5, hash6, hash7, vhash );
|
hash4, hash5, hash6, hash7, vhash );
|
||||||
@@ -142,26 +141,26 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
|||||||
|
|
||||||
// B
|
// B
|
||||||
if ( likely( vh_mask & 0xff ) )
|
if ( likely( vh_mask & 0xff ) )
|
||||||
skein512_8way_full( &ctx.skein, vhashB, vhash, 64 );
|
skein512_8x64_full( &ctx.skein, vhashB, vhash, 64 );
|
||||||
|
|
||||||
mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );
|
mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );
|
||||||
|
|
||||||
jh512_8way_init( &ctx.jh );
|
jh512_8x64_init( &ctx.jh );
|
||||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
jh512_8x64_update( &ctx.jh, vhash, 64 );
|
||||||
jh512_8way_close( &ctx.jh, vhash );
|
jh512_8x64_close( &ctx.jh, vhash );
|
||||||
|
|
||||||
keccak512_8way_init( &ctx.keccak );
|
keccak512_8x64_init( &ctx.keccak );
|
||||||
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
|
||||||
keccak512_8way_close( &ctx.keccak, vhash );
|
keccak512_8x64_close( &ctx.keccak, vhash );
|
||||||
|
|
||||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||||
|
|
||||||
// A
|
// A
|
||||||
if ( ( vh_mask & 0xff ) != 0xff )
|
if ( ( vh_mask & 0xff ) != 0xff )
|
||||||
blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
|
blake512_8x64_full( &ctx.blake, vhashA, vhash, 64 );
|
||||||
// B
|
// B
|
||||||
if ( vh_mask & 0xff )
|
if ( vh_mask & 0xff )
|
||||||
bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 );
|
bmw512_8x64_full( &ctx.bmw, vhashB, vhash, 64 );
|
||||||
|
|
||||||
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
|
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
|
||||||
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
|
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
|
||||||
@@ -177,16 +176,16 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
|||||||
|
|
||||||
if ( likely( ( vh_mask & 0xff ) != 0xff ) )
|
if ( likely( ( vh_mask & 0xff ) != 0xff ) )
|
||||||
{
|
{
|
||||||
keccak512_8way_init( &ctx.keccak );
|
keccak512_8x64_init( &ctx.keccak );
|
||||||
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
|
||||||
keccak512_8way_close( &ctx.keccak, vhashA );
|
keccak512_8x64_close( &ctx.keccak, vhashA );
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( likely( vh_mask & 0xff ) )
|
if ( likely( vh_mask & 0xff ) )
|
||||||
{
|
{
|
||||||
jh512_8way_init( &ctx.jh );
|
jh512_8x64_init( &ctx.jh );
|
||||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
jh512_8x64_update( &ctx.jh, vhash, 64 );
|
||||||
jh512_8way_close( &ctx.jh, vhashB );
|
jh512_8x64_close( &ctx.jh, vhashB );
|
||||||
}
|
}
|
||||||
|
|
||||||
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
|
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
|
||||||
@@ -252,9 +251,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
|||||||
// B
|
// B
|
||||||
if ( likely( vh_mask & 0xff ) )
|
if ( likely( vh_mask & 0xff ) )
|
||||||
{
|
{
|
||||||
haval256_5_8way_init( &ctx.haval );
|
haval256_8x32_init( &ctx.haval );
|
||||||
haval256_5_8way_update( &ctx.haval, vhash, 64 );
|
haval256_8x32_update( &ctx.haval, vhash, 64 );
|
||||||
haval256_5_8way_close( &ctx.haval, vhash );
|
haval256_8x32_close( &ctx.haval, vhash );
|
||||||
memset( &vhash[8<<3], 0, 32<<3 );
|
memset( &vhash[8<<3], 0, 32<<3 );
|
||||||
rintrlv_8x32_8x64( vhashB, vhash, 512 );
|
rintrlv_8x32_8x64( vhashB, vhash, 512 );
|
||||||
}
|
}
|
||||||
@@ -297,7 +296,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
|
blake512_8x64_full( &ctx.blake, vhash, vhash, 64 );
|
||||||
|
|
||||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||||
|
|
||||||
@@ -352,9 +351,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
|||||||
|
|
||||||
mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );
|
mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );
|
||||||
|
|
||||||
hamsi512_8way_init( &ctx.hamsi );
|
hamsi512_8x64_init( &ctx.hamsi );
|
||||||
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
|
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
|
||||||
hamsi512_8way_close( &ctx.hamsi, vhash );
|
hamsi512_8x64_close( &ctx.hamsi, vhash );
|
||||||
|
|
||||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
|
||||||
hash4, hash5, hash6, hash7, vhash );
|
hash4, hash5, hash6, hash7, vhash );
|
||||||
@@ -430,9 +429,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
|||||||
|
|
||||||
rintrlv_8x64_8x32( vhashA, vhash, 512 );
|
rintrlv_8x64_8x32( vhashA, vhash, 512 );
|
||||||
|
|
||||||
shabal512_8way_init( &ctx.shabal );
|
shabal512_8x32_init( &ctx.shabal );
|
||||||
shabal512_8way_update( &ctx.shabal, vhashA, 64 );
|
shabal512_8x32_update( &ctx.shabal, vhashA, 64 );
|
||||||
shabal512_8way_close( &ctx.shabal, vhash );
|
shabal512_8x32_close( &ctx.shabal, vhash );
|
||||||
|
|
||||||
dintrlv_8x32_512( hash0, hash1, hash2, hash3,
|
dintrlv_8x32_512( hash0, hash1, hash2, hash3,
|
||||||
hash4, hash5, hash6, hash7, vhash );
|
hash4, hash5, hash6, hash7, vhash );
|
||||||
@@ -475,9 +474,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
|||||||
// B
|
// B
|
||||||
if ( likely( vh_mask & 0xff ) )
|
if ( likely( vh_mask & 0xff ) )
|
||||||
{
|
{
|
||||||
sha512_8way_init( &ctx.sha512 );
|
sha512_8x64_init( &ctx.sha512 );
|
||||||
sha512_8way_update( &ctx.sha512, vhash, 64 );
|
sha512_8x64_update( &ctx.sha512, vhash, 64 );
|
||||||
sha512_8way_close( &ctx.sha512, vhashB );
|
sha512_8x64_close( &ctx.sha512, vhashB );
|
||||||
}
|
}
|
||||||
|
|
||||||
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
|
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
|
||||||
@@ -510,9 +509,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
sha512_8way_init( &ctx.sha512 );
|
sha512_8x64_init( &ctx.sha512 );
|
||||||
sha512_8way_update( &ctx.sha512, vhash, 64 );
|
sha512_8x64_update( &ctx.sha512, vhash, 64 );
|
||||||
sha512_8way_close( &ctx.sha512, vhash );
|
sha512_8x64_close( &ctx.sha512, vhash );
|
||||||
|
|
||||||
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
|
||||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
|
||||||
@@ -523,9 +522,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
|||||||
{
|
{
|
||||||
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
hash7 );
|
hash7 );
|
||||||
haval256_5_8way_init( &ctx.haval );
|
haval256_8x32_init( &ctx.haval );
|
||||||
haval256_5_8way_update( &ctx.haval, vhash, 64 );
|
haval256_8x32_update( &ctx.haval, vhash, 64 );
|
||||||
haval256_5_8way_close( &ctx.haval, vhash );
|
haval256_8x32_close( &ctx.haval, vhash );
|
||||||
memset( &vhash[8<<3], 0, 32<<3 );
|
memset( &vhash[8<<3], 0, 32<<3 );
|
||||||
rintrlv_8x32_8x64( vhashA, vhash, 512 );
|
rintrlv_8x32_8x64( vhashA, vhash, 512 );
|
||||||
}
|
}
|
||||||
@@ -552,9 +551,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
|||||||
hash7 );
|
hash7 );
|
||||||
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
|
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
|
||||||
|
|
||||||
bmw512_8way_init( &ctx.bmw );
|
bmw512_8x64_init( &ctx.bmw );
|
||||||
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
bmw512_8x64_update( &ctx.bmw, vhash, 64 );
|
||||||
bmw512_8way_close( &ctx.bmw, state );
|
bmw512_8x64_close( &ctx.bmw, state );
|
||||||
}
|
}
|
||||||
|
|
||||||
int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
|
int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
|
||||||
@@ -606,27 +605,27 @@ int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
|
|||||||
|
|
||||||
union _hmq1725_4way_context_overlay
|
union _hmq1725_4way_context_overlay
|
||||||
{
|
{
|
||||||
blake512_4way_context blake;
|
blake512_4x64_context blake;
|
||||||
bmw512_4way_context bmw;
|
bmw512_4x64_context bmw;
|
||||||
hashState_groestl groestl;
|
hashState_groestl groestl;
|
||||||
skein512_4way_context skein;
|
skein512_4x64_context skein;
|
||||||
jh512_4way_context jh;
|
jh512_4x64_context jh;
|
||||||
keccak512_4way_context keccak;
|
keccak512_4x64_context keccak;
|
||||||
hashState_luffa luffa;
|
hashState_luffa luffa;
|
||||||
luffa_2way_context luffa2;
|
luffa_2way_context luffa2;
|
||||||
cubehashParam cube;
|
cubehashParam cube;
|
||||||
cube_2way_context cube2;
|
cube_2way_context cube2;
|
||||||
sph_shavite512_context shavite;
|
sph_shavite512_context shavite;
|
||||||
hashState_sd sd;
|
simd512_context simd;
|
||||||
shavite512_2way_context shavite2;
|
shavite512_2way_context shavite2;
|
||||||
simd_2way_context simd;
|
simd_2way_context simd_2way;
|
||||||
hashState_echo echo;
|
hashState_echo echo;
|
||||||
hamsi512_4way_context hamsi;
|
hamsi512_4x64_context hamsi;
|
||||||
hashState_fugue fugue;
|
hashState_fugue fugue;
|
||||||
shabal512_4way_context shabal;
|
shabal512_4x32_context shabal;
|
||||||
sph_whirlpool_context whirlpool;
|
sph_whirlpool_context whirlpool;
|
||||||
sha512_4way_context sha512;
|
sha512_4x64_context sha512;
|
||||||
haval256_5_4way_context haval;
|
haval256_4x32_context haval;
|
||||||
#if defined(__VAES__)
|
#if defined(__VAES__)
|
||||||
groestl512_2way_context groestl2;
|
groestl512_2way_context groestl2;
|
||||||
echo_2way_context echo2;
|
echo_2way_context echo2;
|
||||||
@@ -653,9 +652,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
|||||||
__m256i* vhA = (__m256i*)vhashA;
|
__m256i* vhA = (__m256i*)vhashA;
|
||||||
__m256i* vhB = (__m256i*)vhashB;
|
__m256i* vhB = (__m256i*)vhashB;
|
||||||
|
|
||||||
bmw512_4way_init( &ctx.bmw );
|
bmw512_4x64_init( &ctx.bmw );
|
||||||
bmw512_4way_update( &ctx.bmw, input, 80 );
|
bmw512_4x64_update( &ctx.bmw, input, 80 );
|
||||||
bmw512_4way_close( &ctx.bmw, vhash );
|
bmw512_4x64_close( &ctx.bmw, vhash );
|
||||||
|
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||||
|
|
||||||
@@ -687,17 +686,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
|||||||
// B
|
// B
|
||||||
|
|
||||||
if ( h_mask & 0xffffffff )
|
if ( h_mask & 0xffffffff )
|
||||||
skein512_4way_full( &ctx.skein, vhashB, vhash, 64 );
|
skein512_4x64_full( &ctx.skein, vhashB, vhash, 64 );
|
||||||
|
|
||||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||||
|
|
||||||
jh512_4way_init( &ctx.jh );
|
jh512_4x64_init( &ctx.jh );
|
||||||
jh512_4way_update( &ctx.jh, vhash, 64 );
|
jh512_4x64_update( &ctx.jh, vhash, 64 );
|
||||||
jh512_4way_close( &ctx.jh, vhash );
|
jh512_4x64_close( &ctx.jh, vhash );
|
||||||
|
|
||||||
keccak512_4way_init( &ctx.keccak );
|
keccak512_4x64_init( &ctx.keccak );
|
||||||
keccak512_4way_update( &ctx.keccak, vhash, 64 );
|
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
|
||||||
keccak512_4way_close( &ctx.keccak, vhash );
|
keccak512_4x64_close( &ctx.keccak, vhash );
|
||||||
|
|
||||||
// second fork, A = blake parallel, B= bmw parallel.
|
// second fork, A = blake parallel, B= bmw parallel.
|
||||||
|
|
||||||
@@ -705,13 +704,13 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
|||||||
h_mask = _mm256_movemask_epi8( vh_mask );
|
h_mask = _mm256_movemask_epi8( vh_mask );
|
||||||
|
|
||||||
if ( ( h_mask & 0xffffffff ) != 0xffffffff )
|
if ( ( h_mask & 0xffffffff ) != 0xffffffff )
|
||||||
blake512_4way_full( &ctx.blake, vhashA, vhash, 64 );
|
blake512_4x64_full( &ctx.blake, vhashA, vhash, 64 );
|
||||||
|
|
||||||
if ( h_mask & 0xffffffff )
|
if ( h_mask & 0xffffffff )
|
||||||
{
|
{
|
||||||
bmw512_4way_init( &ctx.bmw );
|
bmw512_4x64_init( &ctx.bmw );
|
||||||
bmw512_4way_update( &ctx.bmw, vhash, 64 );
|
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
|
||||||
bmw512_4way_close( &ctx.bmw, vhashB );
|
bmw512_4x64_close( &ctx.bmw, vhashB );
|
||||||
}
|
}
|
||||||
|
|
||||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||||
@@ -734,16 +733,16 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
|||||||
|
|
||||||
if ( ( h_mask & 0xffffffff ) != 0xffffffff )
|
if ( ( h_mask & 0xffffffff ) != 0xffffffff )
|
||||||
{
|
{
|
||||||
keccak512_4way_init( &ctx.keccak );
|
keccak512_4x64_init( &ctx.keccak );
|
||||||
keccak512_4way_update( &ctx.keccak, vhash, 64 );
|
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
|
||||||
keccak512_4way_close( &ctx.keccak, vhashA );
|
keccak512_4x64_close( &ctx.keccak, vhashA );
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( h_mask & 0xffffffff )
|
if ( h_mask & 0xffffffff )
|
||||||
{
|
{
|
||||||
jh512_4way_init( &ctx.jh );
|
jh512_4x64_init( &ctx.jh );
|
||||||
jh512_4way_update( &ctx.jh, vhash, 64 );
|
jh512_4x64_update( &ctx.jh, vhash, 64 );
|
||||||
jh512_4way_close( &ctx.jh, vhashB );
|
jh512_4x64_close( &ctx.jh, vhashB );
|
||||||
}
|
}
|
||||||
|
|
||||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||||
@@ -753,8 +752,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
|||||||
shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
|
shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
|
||||||
shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );
|
shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );
|
||||||
|
|
||||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
simd512_2way_full( &ctx.simd_2way, vhashA, vhashA, 64 );
|
||||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
simd512_2way_full( &ctx.simd_2way, vhashB, vhashB, 64 );
|
||||||
|
|
||||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||||
|
|
||||||
@@ -779,9 +778,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
|||||||
// B
|
// B
|
||||||
if ( h_mask & 0xffffffff )
|
if ( h_mask & 0xffffffff )
|
||||||
{
|
{
|
||||||
haval256_5_4way_init( &ctx.haval );
|
haval256_4x32_init( &ctx.haval );
|
||||||
haval256_5_4way_update( &ctx.haval, vhash, 64 );
|
haval256_4x32_update( &ctx.haval, vhash, 64 );
|
||||||
haval256_5_4way_close( &ctx.haval, vhash );
|
haval256_4x32_close( &ctx.haval, vhash );
|
||||||
memset( &vhash[8<<2], 0, 32<<2 );
|
memset( &vhash[8<<2], 0, 32<<2 );
|
||||||
rintrlv_4x32_4x64( vhashB, vhash, 512 );
|
rintrlv_4x32_4x64( vhashB, vhash, 512 );
|
||||||
}
|
}
|
||||||
@@ -814,7 +813,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
blake512_4way_full( &ctx.blake, vhash, vhash, 64 );
|
blake512_4x64_full( &ctx.blake, vhash, vhash, 64 );
|
||||||
|
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||||
|
|
||||||
@@ -846,9 +845,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
|||||||
|
|
||||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||||
|
|
||||||
hamsi512_4way_init( &ctx.hamsi );
|
hamsi512_4x64_init( &ctx.hamsi );
|
||||||
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
|
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
|
||||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
hamsi512_4x64_close( &ctx.hamsi, vhash );
|
||||||
|
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||||
|
|
||||||
@@ -869,47 +868,31 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
|||||||
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
|
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
|
||||||
(const BitSequence *)hash0, 64 );
|
(const BitSequence *)hash0, 64 );
|
||||||
else
|
else
|
||||||
{
|
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
|
||||||
init_sd( &ctx.sd, 512 );
|
|
||||||
update_final_sd( &ctx.sd, (BitSequence *)hash0,
|
|
||||||
(const BitSequence *)hash0, 512 );
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( hash1[0] & mask ) //4
|
if ( hash1[0] & mask ) //4
|
||||||
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
|
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
|
||||||
(const BitSequence *)hash1, 64 );
|
(const BitSequence *)hash1, 64 );
|
||||||
else
|
else
|
||||||
{
|
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
|
||||||
init_sd( &ctx.sd, 512 );
|
|
||||||
update_final_sd( &ctx.sd, (BitSequence *)hash1,
|
|
||||||
(const BitSequence *)hash1, 512 );
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( hash2[0] & mask ) //4
|
if ( hash2[0] & mask ) //4
|
||||||
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
|
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
|
||||||
(const BitSequence *)hash2, 64 );
|
(const BitSequence *)hash2, 64 );
|
||||||
else
|
else
|
||||||
{
|
simd512_ctx( &ctx.simd, hash2, hash2, 64 );
|
||||||
init_sd( &ctx.sd, 512 );
|
|
||||||
update_final_sd( &ctx.sd, (BitSequence *)hash2,
|
|
||||||
(const BitSequence *)hash2, 512 );
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( hash3[0] & mask ) //4
|
if ( hash3[0] & mask ) //4
|
||||||
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
|
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
|
||||||
(const BitSequence *)hash3, 64 );
|
(const BitSequence *)hash3, 64 );
|
||||||
else
|
else
|
||||||
{
|
simd512_ctx( &ctx.simd, hash3, hash3, 64 );
|
||||||
init_sd( &ctx.sd, 512 );
|
|
||||||
update_final_sd( &ctx.sd, (BitSequence *)hash3,
|
|
||||||
(const BitSequence *)hash3, 512 );
|
|
||||||
}
|
|
||||||
|
|
||||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||||
|
|
||||||
shabal512_4way_init( &ctx.shabal );
|
shabal512_4x32_init( &ctx.shabal );
|
||||||
shabal512_4way_update( &ctx.shabal, vhash, 64 );
|
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
|
||||||
shabal512_4way_close( &ctx.shabal, vhash );
|
shabal512_4x32_close( &ctx.shabal, vhash );
|
||||||
|
|
||||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||||
|
|
||||||
@@ -938,9 +921,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
|||||||
|
|
||||||
if ( h_mask & 0xffffffff )
|
if ( h_mask & 0xffffffff )
|
||||||
{
|
{
|
||||||
sha512_4way_init( &ctx.sha512 );
|
sha512_4x64_init( &ctx.sha512 );
|
||||||
sha512_4way_update( &ctx.sha512, vhash, 64 );
|
sha512_4x64_update( &ctx.sha512, vhash, 64 );
|
||||||
sha512_4way_close( &ctx.sha512, vhashB );
|
sha512_4x64_close( &ctx.sha512, vhashB );
|
||||||
}
|
}
|
||||||
|
|
||||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||||
@@ -967,9 +950,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
sha512_4way_init( &ctx.sha512 );
|
sha512_4x64_init( &ctx.sha512 );
|
||||||
sha512_4way_update( &ctx.sha512, vhash, 64 );
|
sha512_4x64_update( &ctx.sha512, vhash, 64 );
|
||||||
sha512_4way_close( &ctx.sha512, vhash );
|
sha512_4x64_close( &ctx.sha512, vhash );
|
||||||
|
|
||||||
// A = haval parallel, B = Whirlpool serial
|
// A = haval parallel, B = Whirlpool serial
|
||||||
|
|
||||||
@@ -981,9 +964,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
|||||||
|
|
||||||
if ( ( h_mask & 0xffffffff ) != 0xffffffff )
|
if ( ( h_mask & 0xffffffff ) != 0xffffffff )
|
||||||
{
|
{
|
||||||
haval256_5_4way_init( &ctx.haval );
|
haval256_4x32_init( &ctx.haval );
|
||||||
haval256_5_4way_update( &ctx.haval, vhash, 64 );
|
haval256_4x32_update( &ctx.haval, vhash, 64 );
|
||||||
haval256_5_4way_close( &ctx.haval, vhash );
|
haval256_4x32_close( &ctx.haval, vhash );
|
||||||
memset( &vhash[8<<2], 0, 32<<2 );
|
memset( &vhash[8<<2], 0, 32<<2 );
|
||||||
rintrlv_4x32_4x64( vhashA, vhash, 512 );
|
rintrlv_4x32_4x64( vhashA, vhash, 512 );
|
||||||
}
|
}
|
||||||
@@ -1001,9 +984,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
|||||||
|
|
||||||
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
|
||||||
|
|
||||||
bmw512_4way_init( &ctx.bmw );
|
bmw512_4x64_init( &ctx.bmw );
|
||||||
bmw512_4way_update( &ctx.bmw, vhash, 64 );
|
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
|
||||||
bmw512_4way_close( &ctx.bmw, state );
|
bmw512_4x64_close( &ctx.bmw, state );
|
||||||
}
|
}
|
||||||
|
|
||||||
int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define HMQ1725_8WAY 1
|
#define HMQ1725_8WAY 1
|
||||||
#elif defined(__AVX2__) && defined(__AES__)
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
#define HMQ1725_4WAY 1
|
#define HMQ1725_4WAY 1
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define QUARK_8WAY 1
|
#define QUARK_8WAY 1
|
||||||
#elif defined(__AVX2__) && defined(__AES__)
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
#define QUARK_4WAY 1
|
#define QUARK_4WAY 1
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define QUBIT_4WAY 1
|
#define QUBIT_4WAY 1
|
||||||
#elif defined(__AVX2__) && defined(__AES__)
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
#define QUBIT_2WAY 1
|
#define QUBIT_2WAY 1
|
||||||
|
|||||||
@@ -13,7 +13,7 @@
|
|||||||
|
|
||||||
#if defined(LBRY_16WAY)
|
#if defined(LBRY_16WAY)
|
||||||
|
|
||||||
static __thread sha256_16way_context sha256_16w_mid;
|
static __thread sha256_16x32_context sha256_16w_mid;
|
||||||
|
|
||||||
void lbry_16way_hash( void* output, const void* input )
|
void lbry_16way_hash( void* output, const void* input )
|
||||||
{
|
{
|
||||||
@@ -36,17 +36,17 @@ void lbry_16way_hash( void* output, const void* input )
|
|||||||
uint32_t _ALIGN(64) h13[32];
|
uint32_t _ALIGN(64) h13[32];
|
||||||
uint32_t _ALIGN(64) h14[32];
|
uint32_t _ALIGN(64) h14[32];
|
||||||
uint32_t _ALIGN(64) h15[32];
|
uint32_t _ALIGN(64) h15[32];
|
||||||
sha256_16way_context ctx_sha256 __attribute__ ((aligned (64)));
|
sha256_16x32_context ctx_sha256 __attribute__ ((aligned (64)));
|
||||||
sha512_8way_context ctx_sha512;
|
sha512_8x64_context ctx_sha512;
|
||||||
ripemd160_16way_context ctx_ripemd;
|
ripemd160_16x32_context ctx_ripemd;
|
||||||
|
|
||||||
memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) );
|
memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) );
|
||||||
sha256_16way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
|
sha256_16x32_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
|
||||||
sha256_16way_close( &ctx_sha256, vhashA );
|
sha256_16x32_close( &ctx_sha256, vhashA );
|
||||||
|
|
||||||
sha256_16way_init( &ctx_sha256 );
|
sha256_16x32_init( &ctx_sha256 );
|
||||||
sha256_16way_update( &ctx_sha256, vhashA, 32 );
|
sha256_16x32_update( &ctx_sha256, vhashA, 32 );
|
||||||
sha256_16way_close( &ctx_sha256, vhashA );
|
sha256_16x32_close( &ctx_sha256, vhashA );
|
||||||
|
|
||||||
// reinterleave to do sha512 4-way 64 bit twice.
|
// reinterleave to do sha512 4-way 64 bit twice.
|
||||||
dintrlv_16x32( h0, h1, h2, h3, h4, h5, h6, h7,
|
dintrlv_16x32( h0, h1, h2, h3, h4, h5, h6, h7,
|
||||||
@@ -54,13 +54,13 @@ void lbry_16way_hash( void* output, const void* input )
|
|||||||
intrlv_8x64( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 256 );
|
intrlv_8x64( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 256 );
|
||||||
intrlv_8x64( vhashB, h8, h9, h10, h11, h12, h13, h14, h15, 256 );
|
intrlv_8x64( vhashB, h8, h9, h10, h11, h12, h13, h14, h15, 256 );
|
||||||
|
|
||||||
sha512_8way_init( &ctx_sha512 );
|
sha512_8x64_init( &ctx_sha512 );
|
||||||
sha512_8way_update( &ctx_sha512, vhashA, 32 );
|
sha512_8x64_update( &ctx_sha512, vhashA, 32 );
|
||||||
sha512_8way_close( &ctx_sha512, vhashA );
|
sha512_8x64_close( &ctx_sha512, vhashA );
|
||||||
|
|
||||||
sha512_8way_init( &ctx_sha512 );
|
sha512_8x64_init( &ctx_sha512 );
|
||||||
sha512_8way_update( &ctx_sha512, vhashB, 32 );
|
sha512_8x64_update( &ctx_sha512, vhashB, 32 );
|
||||||
sha512_8way_close( &ctx_sha512, vhashB );
|
sha512_8x64_close( &ctx_sha512, vhashB );
|
||||||
|
|
||||||
// back to 8-way 32 bit
|
// back to 8-way 32 bit
|
||||||
dintrlv_8x64( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 512 );
|
dintrlv_8x64( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 512 );
|
||||||
@@ -68,22 +68,22 @@ void lbry_16way_hash( void* output, const void* input )
|
|||||||
intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7,
|
intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7,
|
||||||
h8, h9, h10, h11, h12, h13, h14, h15, 512 );
|
h8, h9, h10, h11, h12, h13, h14, h15, 512 );
|
||||||
|
|
||||||
ripemd160_16way_init( &ctx_ripemd );
|
ripemd160_16x32_init( &ctx_ripemd );
|
||||||
ripemd160_16way_update( &ctx_ripemd, vhashA, 32 );
|
ripemd160_16x32_update( &ctx_ripemd, vhashA, 32 );
|
||||||
ripemd160_16way_close( &ctx_ripemd, vhashB );
|
ripemd160_16x32_close( &ctx_ripemd, vhashB );
|
||||||
|
|
||||||
ripemd160_16way_init( &ctx_ripemd );
|
ripemd160_16x32_init( &ctx_ripemd );
|
||||||
ripemd160_16way_update( &ctx_ripemd, vhashA+(8<<4), 32 );
|
ripemd160_16x32_update( &ctx_ripemd, vhashA+(8<<4), 32 );
|
||||||
ripemd160_16way_close( &ctx_ripemd, vhashC );
|
ripemd160_16x32_close( &ctx_ripemd, vhashC );
|
||||||
|
|
||||||
sha256_16way_init( &ctx_sha256 );
|
sha256_16x32_init( &ctx_sha256 );
|
||||||
sha256_16way_update( &ctx_sha256, vhashB, 20 );
|
sha256_16x32_update( &ctx_sha256, vhashB, 20 );
|
||||||
sha256_16way_update( &ctx_sha256, vhashC, 20 );
|
sha256_16x32_update( &ctx_sha256, vhashC, 20 );
|
||||||
sha256_16way_close( &ctx_sha256, vhashA );
|
sha256_16x32_close( &ctx_sha256, vhashA );
|
||||||
|
|
||||||
sha256_16way_init( &ctx_sha256 );
|
sha256_16x32_init( &ctx_sha256 );
|
||||||
sha256_16way_update( &ctx_sha256, vhashA, 32 );
|
sha256_16x32_update( &ctx_sha256, vhashA, 32 );
|
||||||
sha256_16way_close( &ctx_sha256, output );
|
sha256_16x32_close( &ctx_sha256, output );
|
||||||
}
|
}
|
||||||
|
|
||||||
int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
|
int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
|
||||||
@@ -104,19 +104,19 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
|
|||||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||||
|
|
||||||
// we need bigendian data...
|
// we need bigendian data...
|
||||||
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
|
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
|
||||||
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
|
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
|
||||||
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
|
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
|
||||||
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
|
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
|
||||||
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
|
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
|
||||||
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
|
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
|
||||||
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
|
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
|
||||||
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
|
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
|
||||||
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
|
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
|
||||||
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
|
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
|
||||||
|
|
||||||
sha256_16way_init( &sha256_16w_mid );
|
sha256_16x32_init( &sha256_16w_mid );
|
||||||
sha256_16way_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE );
|
sha256_16x32_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE );
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
@@ -144,7 +144,7 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
|
|||||||
|
|
||||||
#elif defined(LBRY_8WAY)
|
#elif defined(LBRY_8WAY)
|
||||||
|
|
||||||
static __thread sha256_8way_context sha256_8w_mid;
|
static __thread sha256_8x32_context sha256_8w_mid;
|
||||||
|
|
||||||
void lbry_8way_hash( void* output, const void* input )
|
void lbry_8way_hash( void* output, const void* input )
|
||||||
{
|
{
|
||||||
@@ -159,52 +159,52 @@ void lbry_8way_hash( void* output, const void* input )
|
|||||||
uint32_t _ALIGN(32) h5[32];
|
uint32_t _ALIGN(32) h5[32];
|
||||||
uint32_t _ALIGN(32) h6[32];
|
uint32_t _ALIGN(32) h6[32];
|
||||||
uint32_t _ALIGN(32) h7[32];
|
uint32_t _ALIGN(32) h7[32];
|
||||||
sha256_8way_context ctx_sha256 __attribute__ ((aligned (64)));
|
sha256_8x32_context ctx_sha256 __attribute__ ((aligned (64)));
|
||||||
sha512_4way_context ctx_sha512;
|
sha512_4x64_context ctx_sha512;
|
||||||
ripemd160_8way_context ctx_ripemd;
|
ripemd160_8x32_context ctx_ripemd;
|
||||||
|
|
||||||
memcpy( &ctx_sha256, &sha256_8w_mid, sizeof(ctx_sha256) );
|
memcpy( &ctx_sha256, &sha256_8w_mid, sizeof(ctx_sha256) );
|
||||||
sha256_8way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
|
sha256_8x32_update( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
|
||||||
sha256_8way_close( &ctx_sha256, vhashA );
|
sha256_8x32_close( &ctx_sha256, vhashA );
|
||||||
|
|
||||||
sha256_8way_init( &ctx_sha256 );
|
sha256_8x32_init( &ctx_sha256 );
|
||||||
sha256_8way_update( &ctx_sha256, vhashA, 32 );
|
sha256_8x32_update( &ctx_sha256, vhashA, 32 );
|
||||||
sha256_8way_close( &ctx_sha256, vhashA );
|
sha256_8x32_close( &ctx_sha256, vhashA );
|
||||||
|
|
||||||
// reinterleave to do sha512 4-way 64 bit twice.
|
// reinterleave to do sha512 4-way 64 bit twice.
|
||||||
dintrlv_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
|
dintrlv_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
|
||||||
intrlv_4x64( vhashA, h0, h1, h2, h3, 256 );
|
intrlv_4x64( vhashA, h0, h1, h2, h3, 256 );
|
||||||
intrlv_4x64( vhashB, h4, h5, h6, h7, 256 );
|
intrlv_4x64( vhashB, h4, h5, h6, h7, 256 );
|
||||||
|
|
||||||
sha512_4way_init( &ctx_sha512 );
|
sha512_4x64_init( &ctx_sha512 );
|
||||||
sha512_4way_update( &ctx_sha512, vhashA, 32 );
|
sha512_4x64_update( &ctx_sha512, vhashA, 32 );
|
||||||
sha512_4way_close( &ctx_sha512, vhashA );
|
sha512_4x64_close( &ctx_sha512, vhashA );
|
||||||
|
|
||||||
sha512_4way_init( &ctx_sha512 );
|
sha512_4x64_init( &ctx_sha512 );
|
||||||
sha512_4way_update( &ctx_sha512, vhashB, 32 );
|
sha512_4x64_update( &ctx_sha512, vhashB, 32 );
|
||||||
sha512_4way_close( &ctx_sha512, vhashB );
|
sha512_4x64_close( &ctx_sha512, vhashB );
|
||||||
|
|
||||||
// back to 8-way 32 bit
|
// back to 8-way 32 bit
|
||||||
dintrlv_4x64( h0, h1, h2, h3, vhashA, 512 );
|
dintrlv_4x64( h0, h1, h2, h3, vhashA, 512 );
|
||||||
dintrlv_4x64( h4, h5, h6, h7, vhashB, 512 );
|
dintrlv_4x64( h4, h5, h6, h7, vhashB, 512 );
|
||||||
intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
|
intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
|
||||||
|
|
||||||
ripemd160_8way_init( &ctx_ripemd );
|
ripemd160_8x32_init( &ctx_ripemd );
|
||||||
ripemd160_8way_update( &ctx_ripemd, vhashA, 32 );
|
ripemd160_8x32_update( &ctx_ripemd, vhashA, 32 );
|
||||||
ripemd160_8way_close( &ctx_ripemd, vhashB );
|
ripemd160_8x32_close( &ctx_ripemd, vhashB );
|
||||||
|
|
||||||
ripemd160_8way_init( &ctx_ripemd );
|
ripemd160_8x32_init( &ctx_ripemd );
|
||||||
ripemd160_8way_update( &ctx_ripemd, vhashA+(8<<3), 32 );
|
ripemd160_8x32_update( &ctx_ripemd, vhashA+(8<<3), 32 );
|
||||||
ripemd160_8way_close( &ctx_ripemd, vhashC );
|
ripemd160_8x32_close( &ctx_ripemd, vhashC );
|
||||||
|
|
||||||
sha256_8way_init( &ctx_sha256 );
|
sha256_8x32_init( &ctx_sha256 );
|
||||||
sha256_8way_update( &ctx_sha256, vhashB, 20 );
|
sha256_8x32_update( &ctx_sha256, vhashB, 20 );
|
||||||
sha256_8way_update( &ctx_sha256, vhashC, 20 );
|
sha256_8x32_update( &ctx_sha256, vhashC, 20 );
|
||||||
sha256_8way_close( &ctx_sha256, vhashA );
|
sha256_8x32_close( &ctx_sha256, vhashA );
|
||||||
|
|
||||||
sha256_8way_init( &ctx_sha256 );
|
sha256_8x32_init( &ctx_sha256 );
|
||||||
sha256_8way_update( &ctx_sha256, vhashA, 32 );
|
sha256_8x32_update( &ctx_sha256, vhashA, 32 );
|
||||||
sha256_8way_close( &ctx_sha256, output );
|
sha256_8x32_close( &ctx_sha256, output );
|
||||||
}
|
}
|
||||||
|
|
||||||
int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
|
int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
|
||||||
@@ -224,19 +224,19 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
|
|||||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||||
|
|
||||||
// we need bigendian data...
|
// we need bigendian data...
|
||||||
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
|
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
|
||||||
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
|
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
|
||||||
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
|
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
|
||||||
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
|
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
|
||||||
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
|
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
|
||||||
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
|
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
|
||||||
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
|
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
|
||||||
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
|
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
|
||||||
intrlv_8x32( vdata, edata, edata, edata, edata,
|
intrlv_8x32( vdata, edata, edata, edata, edata,
|
||||||
edata, edata, edata, edata, 1024 );
|
edata, edata, edata, edata, 1024 );
|
||||||
|
|
||||||
sha256_8way_init( &sha256_8w_mid );
|
sha256_8x32_init( &sha256_8w_mid );
|
||||||
sha256_8way_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
|
sha256_8x32_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -51,7 +51,6 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
|
|||||||
|
|
||||||
bool register_lbry_algo( algo_gate_t* gate )
|
bool register_lbry_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
// gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
|
|
||||||
#if defined (LBRY_16WAY)
|
#if defined (LBRY_16WAY)
|
||||||
gate->scanhash = (void*)&scanhash_lbry_16way;
|
gate->scanhash = (void*)&scanhash_lbry_16way;
|
||||||
gate->hash = (void*)&lbry_16way_hash;
|
gate->hash = (void*)&lbry_16way_hash;
|
||||||
@@ -67,7 +66,7 @@ bool register_lbry_algo( algo_gate_t* gate )
|
|||||||
#else
|
#else
|
||||||
gate->scanhash = (void*)&scanhash_lbry;
|
gate->scanhash = (void*)&scanhash_lbry;
|
||||||
gate->hash = (void*)&lbry_hash;
|
gate->hash = (void*)&lbry_hash;
|
||||||
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
|
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA256_OPT;
|
||||||
#endif
|
#endif
|
||||||
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
|
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
|
||||||
gate->build_extraheader = (void*)&lbry_build_extraheader;
|
gate->build_extraheader = (void*)&lbry_build_extraheader;
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define LBRY_16WAY 1
|
#define LBRY_16WAY 1
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
#define LBRY_8WAY 1
|
#define LBRY_8WAY 1
|
||||||
|
|||||||
@@ -35,13 +35,13 @@ static const uint32_t IV[5] =
|
|||||||
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
|
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
|
||||||
|
|
||||||
#define F3(x, y, z) \
|
#define F3(x, y, z) \
|
||||||
_mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z )
|
_mm_xor_si128( v128_ornot( y, x ), z )
|
||||||
|
|
||||||
#define F4(x, y, z) \
|
#define F4(x, y, z) \
|
||||||
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
|
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
|
||||||
|
|
||||||
#define F5(x, y, z) \
|
#define F5(x, y, z) \
|
||||||
_mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) )
|
_mm_xor_si128( x, v128_ornot( z, y ) )
|
||||||
|
|
||||||
#define RR(a, b, c, d, e, f, s, r, k) \
|
#define RR(a, b, c, d, e, f, s, r, k) \
|
||||||
do{ \
|
do{ \
|
||||||
@@ -57,7 +57,7 @@ do{ \
|
|||||||
#define ROUND2(a, b, c, d, e, f, s, r, k) \
|
#define ROUND2(a, b, c, d, e, f, s, r, k) \
|
||||||
RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
|
RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
|
||||||
|
|
||||||
static void ripemd160_4way_round( ripemd160_4way_context *sc )
|
static void ripemd160_4x32_round( ripemd160_4x32_context *sc )
|
||||||
{
|
{
|
||||||
const __m128i *in = (__m128i*)sc->buf;
|
const __m128i *in = (__m128i*)sc->buf;
|
||||||
__m128i *h = (__m128i*)sc->val;
|
__m128i *h = (__m128i*)sc->val;
|
||||||
@@ -249,7 +249,7 @@ static void ripemd160_4way_round( ripemd160_4way_context *sc )
|
|||||||
h[0] = tmp;
|
h[0] = tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ripemd160_4way_init( ripemd160_4way_context *sc )
|
void ripemd160_4x32_init( ripemd160_4x32_context *sc )
|
||||||
{
|
{
|
||||||
sc->val[0] = _mm_set1_epi64x( 0x6745230167452301 );
|
sc->val[0] = _mm_set1_epi64x( 0x6745230167452301 );
|
||||||
sc->val[1] = _mm_set1_epi64x( 0xEFCDAB89EFCDAB89 );
|
sc->val[1] = _mm_set1_epi64x( 0xEFCDAB89EFCDAB89 );
|
||||||
@@ -259,7 +259,7 @@ void ripemd160_4way_init( ripemd160_4way_context *sc )
|
|||||||
sc->count_high = sc->count_low = 0;
|
sc->count_high = sc->count_low = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
|
void ripemd160_4x32_update( ripemd160_4x32_context *sc, const void *data,
|
||||||
size_t len )
|
size_t len )
|
||||||
{
|
{
|
||||||
__m128i *vdata = (__m128i*)data;
|
__m128i *vdata = (__m128i*)data;
|
||||||
@@ -281,7 +281,7 @@ void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
|
|||||||
len -= clen;
|
len -= clen;
|
||||||
if ( ptr == block_size )
|
if ( ptr == block_size )
|
||||||
{
|
{
|
||||||
ripemd160_4way_round( sc );
|
ripemd160_4x32_round( sc );
|
||||||
ptr = 0;
|
ptr = 0;
|
||||||
}
|
}
|
||||||
clow = sc->count_low;
|
clow = sc->count_low;
|
||||||
@@ -292,7 +292,7 @@ void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
|
void ripemd160_4x32_close( ripemd160_4x32_context *sc, void *dst )
|
||||||
{
|
{
|
||||||
unsigned ptr, u;
|
unsigned ptr, u;
|
||||||
uint32_t low, high;
|
uint32_t low, high;
|
||||||
@@ -306,7 +306,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
|
|||||||
if ( ptr > pad )
|
if ( ptr > pad )
|
||||||
{
|
{
|
||||||
memset_zero_128( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
|
memset_zero_128( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
|
||||||
ripemd160_4way_round( sc );
|
ripemd160_4x32_round( sc );
|
||||||
memset_zero_128( sc->buf, pad>>2 );
|
memset_zero_128( sc->buf, pad>>2 );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -317,9 +317,9 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
|
|||||||
low = low << 3;
|
low = low << 3;
|
||||||
sc->buf[ pad>>2 ] = _mm_set1_epi32( low );
|
sc->buf[ pad>>2 ] = _mm_set1_epi32( low );
|
||||||
sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
|
sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
|
||||||
ripemd160_4way_round( sc );
|
ripemd160_4x32_round( sc );
|
||||||
for (u = 0; u < 5; u ++)
|
for (u = 0; u < 5; u ++)
|
||||||
casti_m128i( dst, u ) = sc->val[u];
|
casti_v128u32( dst, u ) = sc->val[u];
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
@@ -335,13 +335,13 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
|
|||||||
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( y, z ), x ), z )
|
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( y, z ), x ), z )
|
||||||
|
|
||||||
#define F8W_3(x, y, z) \
|
#define F8W_3(x, y, z) \
|
||||||
_mm256_xor_si256( _mm256_or_si256( x, mm256_not( y ) ), z )
|
_mm256_xor_si256( mm256_ornot( y, x ), z )
|
||||||
|
|
||||||
#define F8W_4(x, y, z) \
|
#define F8W_4(x, y, z) \
|
||||||
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( x, y ), z ), y )
|
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( x, y ), z ), y )
|
||||||
|
|
||||||
#define F8W_5(x, y, z) \
|
#define F8W_5(x, y, z) \
|
||||||
_mm256_xor_si256( x, _mm256_or_si256( y, mm256_not( z ) ) )
|
_mm256_xor_si256( x, mm256_ornot( z, y ) )
|
||||||
|
|
||||||
#define RR_8W(a, b, c, d, e, f, s, r, k) \
|
#define RR_8W(a, b, c, d, e, f, s, r, k) \
|
||||||
do{ \
|
do{ \
|
||||||
@@ -357,7 +357,7 @@ do{ \
|
|||||||
#define ROUND2_8W(a, b, c, d, e, f, s, r, k) \
|
#define ROUND2_8W(a, b, c, d, e, f, s, r, k) \
|
||||||
RR_8W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
|
RR_8W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
|
||||||
|
|
||||||
static void ripemd160_8way_round( ripemd160_8way_context *sc )
|
static void ripemd160_8x32_round( ripemd160_8x32_context *sc )
|
||||||
{
|
{
|
||||||
const __m256i *in = (__m256i*)sc->buf;
|
const __m256i *in = (__m256i*)sc->buf;
|
||||||
__m256i *h = (__m256i*)sc->val;
|
__m256i *h = (__m256i*)sc->val;
|
||||||
@@ -550,7 +550,7 @@ static void ripemd160_8way_round( ripemd160_8way_context *sc )
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void ripemd160_8way_init( ripemd160_8way_context *sc )
|
void ripemd160_8x32_init( ripemd160_8x32_context *sc )
|
||||||
{
|
{
|
||||||
sc->val[0] = _mm256_set1_epi64x( 0x6745230167452301 );
|
sc->val[0] = _mm256_set1_epi64x( 0x6745230167452301 );
|
||||||
sc->val[1] = _mm256_set1_epi64x( 0xEFCDAB89EFCDAB89 );
|
sc->val[1] = _mm256_set1_epi64x( 0xEFCDAB89EFCDAB89 );
|
||||||
@@ -560,7 +560,7 @@ void ripemd160_8way_init( ripemd160_8way_context *sc )
|
|||||||
sc->count_high = sc->count_low = 0;
|
sc->count_high = sc->count_low = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
|
void ripemd160_8x32_update( ripemd160_8x32_context *sc, const void *data,
|
||||||
size_t len )
|
size_t len )
|
||||||
{
|
{
|
||||||
__m256i *vdata = (__m256i*)data;
|
__m256i *vdata = (__m256i*)data;
|
||||||
@@ -582,7 +582,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
|
|||||||
len -= clen;
|
len -= clen;
|
||||||
if ( ptr == block_size )
|
if ( ptr == block_size )
|
||||||
{
|
{
|
||||||
ripemd160_8way_round( sc );
|
ripemd160_8x32_round( sc );
|
||||||
ptr = 0;
|
ptr = 0;
|
||||||
}
|
}
|
||||||
clow = sc->count_low;
|
clow = sc->count_low;
|
||||||
@@ -593,7 +593,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
|
void ripemd160_8x32_close( ripemd160_8x32_context *sc, void *dst )
|
||||||
{
|
{
|
||||||
unsigned ptr, u;
|
unsigned ptr, u;
|
||||||
uint32_t low, high;
|
uint32_t low, high;
|
||||||
@@ -607,7 +607,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
|
|||||||
if ( ptr > pad )
|
if ( ptr > pad )
|
||||||
{
|
{
|
||||||
memset_zero_256( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
|
memset_zero_256( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
|
||||||
ripemd160_8way_round( sc );
|
ripemd160_8x32_round( sc );
|
||||||
memset_zero_256( sc->buf, pad>>2 );
|
memset_zero_256( sc->buf, pad>>2 );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -618,18 +618,17 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
|
|||||||
low = low << 3;
|
low = low << 3;
|
||||||
sc->buf[ pad>>2 ] = _mm256_set1_epi32( low );
|
sc->buf[ pad>>2 ] = _mm256_set1_epi32( low );
|
||||||
sc->buf[ (pad>>2) + 1 ] = _mm256_set1_epi32( high );
|
sc->buf[ (pad>>2) + 1 ] = _mm256_set1_epi32( high );
|
||||||
ripemd160_8way_round( sc );
|
ripemd160_8x32_round( sc );
|
||||||
for (u = 0; u < 5; u ++)
|
for (u = 0; u < 5; u ++)
|
||||||
casti_m256i( dst, u ) = sc->val[u];
|
casti_m256i( dst, u ) = sc->val[u];
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // __AVX2__
|
#endif // __AVX2__
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
// RIPEMD-160 16 way
|
// RIPEMD-160 16 way
|
||||||
|
|
||||||
|
|
||||||
#define F16W_1(x, y, z) \
|
#define F16W_1(x, y, z) \
|
||||||
_mm512_xor_si512( _mm512_xor_si512( x, y ), z )
|
_mm512_xor_si512( _mm512_xor_si512( x, y ), z )
|
||||||
|
|
||||||
@@ -659,7 +658,7 @@ do{ \
|
|||||||
#define ROUND2_16W(a, b, c, d, e, f, s, r, k) \
|
#define ROUND2_16W(a, b, c, d, e, f, s, r, k) \
|
||||||
RR_16W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
|
RR_16W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
|
||||||
|
|
||||||
static void ripemd160_16way_round( ripemd160_16way_context *sc )
|
static void ripemd160_16x32_round( ripemd160_16x32_context *sc )
|
||||||
{
|
{
|
||||||
const __m512i *in = (__m512i*)sc->buf;
|
const __m512i *in = (__m512i*)sc->buf;
|
||||||
__m512i *h = (__m512i*)sc->val;
|
__m512i *h = (__m512i*)sc->val;
|
||||||
@@ -851,7 +850,7 @@ static void ripemd160_16way_round( ripemd160_16way_context *sc )
|
|||||||
h[0] = tmp;
|
h[0] = tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ripemd160_16way_init( ripemd160_16way_context *sc )
|
void ripemd160_16x32_init( ripemd160_16x32_context *sc )
|
||||||
{
|
{
|
||||||
sc->val[0] = _mm512_set1_epi64( 0x6745230167452301 );
|
sc->val[0] = _mm512_set1_epi64( 0x6745230167452301 );
|
||||||
sc->val[1] = _mm512_set1_epi64( 0xEFCDAB89EFCDAB89 );
|
sc->val[1] = _mm512_set1_epi64( 0xEFCDAB89EFCDAB89 );
|
||||||
@@ -861,7 +860,7 @@ void ripemd160_16way_init( ripemd160_16way_context *sc )
|
|||||||
sc->count_high = sc->count_low = 0;
|
sc->count_high = sc->count_low = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
|
void ripemd160_16x32_update( ripemd160_16x32_context *sc, const void *data,
|
||||||
size_t len )
|
size_t len )
|
||||||
{
|
{
|
||||||
__m512i *vdata = (__m512i*)data;
|
__m512i *vdata = (__m512i*)data;
|
||||||
@@ -883,7 +882,7 @@ void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
|
|||||||
len -= clen;
|
len -= clen;
|
||||||
if ( ptr == block_size )
|
if ( ptr == block_size )
|
||||||
{
|
{
|
||||||
ripemd160_16way_round( sc );
|
ripemd160_16x32_round( sc );
|
||||||
ptr = 0;
|
ptr = 0;
|
||||||
}
|
}
|
||||||
clow = sc->count_low;
|
clow = sc->count_low;
|
||||||
@@ -894,7 +893,7 @@ void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
|
void ripemd160_16x32_close( ripemd160_16x32_context *sc, void *dst )
|
||||||
{
|
{
|
||||||
unsigned ptr, u;
|
unsigned ptr, u;
|
||||||
uint32_t low, high;
|
uint32_t low, high;
|
||||||
@@ -908,7 +907,7 @@ void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
|
|||||||
if ( ptr > pad )
|
if ( ptr > pad )
|
||||||
{
|
{
|
||||||
memset_zero_512( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
|
memset_zero_512( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
|
||||||
ripemd160_16way_round( sc );
|
ripemd160_16x32_round( sc );
|
||||||
memset_zero_512( sc->buf, pad>>2 );
|
memset_zero_512( sc->buf, pad>>2 );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -919,7 +918,7 @@ void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
|
|||||||
low = low << 3;
|
low = low << 3;
|
||||||
sc->buf[ pad>>2 ] = _mm512_set1_epi32( low );
|
sc->buf[ pad>>2 ] = _mm512_set1_epi32( low );
|
||||||
sc->buf[ (pad>>2) + 1 ] = _mm512_set1_epi32( high );
|
sc->buf[ (pad>>2) + 1 ] = _mm512_set1_epi32( high );
|
||||||
ripemd160_16way_round( sc );
|
ripemd160_16x32_round( sc );
|
||||||
for (u = 0; u < 5; u ++)
|
for (u = 0; u < 5; u ++)
|
||||||
casti_m512i( dst, u ) = sc->val[u];
|
casti_m512i( dst, u ) = sc->val[u];
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,12 +12,12 @@ typedef struct
|
|||||||
__m128i buf[64>>2];
|
__m128i buf[64>>2];
|
||||||
__m128i val[5];
|
__m128i val[5];
|
||||||
uint32_t count_high, count_low;
|
uint32_t count_high, count_low;
|
||||||
} __attribute__ ((aligned (64))) ripemd160_4way_context;
|
} __attribute__ ((aligned (64))) ripemd160_4x32_context;
|
||||||
|
|
||||||
void ripemd160_4way_init( ripemd160_4way_context *sc );
|
void ripemd160_4x32_init( ripemd160_4x32_context *sc );
|
||||||
void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
|
void ripemd160_4x32_update( ripemd160_4x32_context *sc, const void *data,
|
||||||
size_t len );
|
size_t len );
|
||||||
void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst );
|
void ripemd160_4x32_close( ripemd160_4x32_context *sc, void *dst );
|
||||||
|
|
||||||
#if defined (__AVX2__)
|
#if defined (__AVX2__)
|
||||||
|
|
||||||
@@ -26,26 +26,26 @@ typedef struct
|
|||||||
__m256i buf[64>>2];
|
__m256i buf[64>>2];
|
||||||
__m256i val[5];
|
__m256i val[5];
|
||||||
uint32_t count_high, count_low;
|
uint32_t count_high, count_low;
|
||||||
} __attribute__ ((aligned (128))) ripemd160_8way_context;
|
} __attribute__ ((aligned (128))) ripemd160_8x32_context;
|
||||||
|
|
||||||
void ripemd160_8way_init( ripemd160_8way_context *sc );
|
void ripemd160_8x32_init( ripemd160_8x32_context *sc );
|
||||||
void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
|
void ripemd160_8x32_update( ripemd160_8x32_context *sc, const void *data,
|
||||||
size_t len );
|
size_t len );
|
||||||
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );
|
void ripemd160_8x32_close( ripemd160_8x32_context *sc, void *dst );
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
__m512i buf[64>>2];
|
__m512i buf[64>>2];
|
||||||
__m512i val[5];
|
__m512i val[5];
|
||||||
uint32_t count_high, count_low;
|
uint32_t count_high, count_low;
|
||||||
} __attribute__ ((aligned (128))) ripemd160_16way_context;
|
} __attribute__ ((aligned (128))) ripemd160_16x32_context;
|
||||||
|
|
||||||
void ripemd160_16way_init( ripemd160_16way_context *sc );
|
void ripemd160_16x32_init( ripemd160_16x32_context *sc );
|
||||||
void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
|
void ripemd160_16x32_update( ripemd160_16x32_context *sc, const void *data,
|
||||||
size_t len );
|
size_t len );
|
||||||
void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst );
|
void ripemd160_16x32_close( ripemd160_16x32_context *sc, void *dst );
|
||||||
|
|
||||||
#endif // AVX512
|
#endif // AVX512
|
||||||
#endif // __AVX2__
|
#endif // __AVX2__
|
||||||
|
|||||||
@@ -46,7 +46,7 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __GNUC__
|
#ifdef __GNUC__
|
||||||
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__)
|
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__) || defined(__APPLE__)
|
||||||
#define ASM 0
|
#define ASM 0
|
||||||
#else
|
#else
|
||||||
#define ASM 1
|
#define ASM 1
|
||||||
@@ -597,6 +597,45 @@ static void blake2s_compress(blake2s_state *S, const void *buf) {
|
|||||||
v[13] = S->t[1] ^ blake2s_IV[5];
|
v[13] = S->t[1] ^ blake2s_IV[5];
|
||||||
v[14] = S->f[0] ^ blake2s_IV[6];
|
v[14] = S->f[0] ^ blake2s_IV[6];
|
||||||
v[15] = S->f[1] ^ blake2s_IV[7];
|
v[15] = S->f[1] ^ blake2s_IV[7];
|
||||||
|
|
||||||
|
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||||
|
|
||||||
|
v128_t *V = (v128_t*)v;
|
||||||
|
|
||||||
|
#define ROUND( r ) \
|
||||||
|
V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
|
||||||
|
m[blake2s_sigma[r][ 6]], m[blake2s_sigma[r][ 4]], \
|
||||||
|
m[blake2s_sigma[r][ 2]], m[blake2s_sigma[r][ 0]] ) ) ); \
|
||||||
|
V[3] = v128_ror32( v128_xor( V[3], V[0] ), 16 ); \
|
||||||
|
V[2] = v128_add32( V[2], V[3] ); \
|
||||||
|
V[1] = v128_ror32( v128_xor( V[1], V[2] ), 12 ); \
|
||||||
|
V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
|
||||||
|
m[blake2s_sigma[r][ 7]], m[blake2s_sigma[r][ 5]], \
|
||||||
|
m[blake2s_sigma[r][ 3]], m[blake2s_sigma[r][ 1]] ) ) ); \
|
||||||
|
V[3] = v128_ror32( v128_xor( V[3], V[0] ), 8 ); \
|
||||||
|
V[2] = v128_add32( V[2], V[3] ); \
|
||||||
|
V[1] = v128_ror32( v128_xor( V[1], V[2] ), 7 ); \
|
||||||
|
V[0] = v128_shufll32( V[0] ); \
|
||||||
|
V[3] = v128_swap64( V[3] ); \
|
||||||
|
V[2] = v128_shuflr32( V[2] ); \
|
||||||
|
V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
|
||||||
|
m[blake2s_sigma[r][12]], m[blake2s_sigma[r][10]], \
|
||||||
|
m[blake2s_sigma[r][ 8]], m[blake2s_sigma[r][14]] ) ) ); \
|
||||||
|
V[3] = v128_ror32( v128_xor( V[3], V[0] ), 16 ); \
|
||||||
|
V[2] = v128_add32( V[2], V[3] ); \
|
||||||
|
V[1] = v128_ror32( v128_xor( V[1], V[2] ), 12 ); \
|
||||||
|
V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
|
||||||
|
m[blake2s_sigma[r][13]], m[blake2s_sigma[r][11]], \
|
||||||
|
m[blake2s_sigma[r][ 9]], m[blake2s_sigma[r][15]] ) ) ); \
|
||||||
|
V[3] = v128_ror32( v128_xor( V[3], V[0] ), 8 ); \
|
||||||
|
V[2] = v128_add32( V[2], V[3] ); \
|
||||||
|
V[1] = v128_ror32( v128_xor( V[1], V[2] ), 7 ); \
|
||||||
|
V[0] = v128_shuflr32( V[0] ); \
|
||||||
|
V[3] = v128_swap64( V[3] ); \
|
||||||
|
V[2] = v128_shufll32( V[2] )
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
#define G(r,i,a,b,c,d) \
|
#define G(r,i,a,b,c,d) \
|
||||||
do { \
|
do { \
|
||||||
a = a + b + m[blake2s_sigma[r][2*i+0]]; \
|
a = a + b + m[blake2s_sigma[r][2*i+0]]; \
|
||||||
@@ -619,6 +658,9 @@ static void blake2s_compress(blake2s_state *S, const void *buf) {
|
|||||||
G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \
|
G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \
|
||||||
G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \
|
G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
ROUND(0);
|
ROUND(0);
|
||||||
ROUND(1);
|
ROUND(1);
|
||||||
ROUND(2);
|
ROUND(2);
|
||||||
|
|||||||
@@ -745,7 +745,7 @@ do{ \
|
|||||||
SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS;
|
SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS;
|
||||||
|
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
// Tested OK but very slow
|
// Tested OK but very slow
|
||||||
// 16 way parallel, requires 16x32 interleaving
|
// 16 way parallel, requires 16x32 interleaving
|
||||||
@@ -2074,7 +2074,7 @@ void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N )
|
|||||||
v128_ovly v;
|
v128_ovly v;
|
||||||
for ( int l = 0; l < 4; l++ )
|
for ( int l = 0; l < 4; l++ )
|
||||||
v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
|
v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
|
||||||
X[i] = v128_xor( X[i], v.m128 );
|
X[i] = v128_xor( X[i], v.v128 );
|
||||||
}
|
}
|
||||||
|
|
||||||
xor_salsa8_4way( &X[ 0], &X[16] );
|
xor_salsa8_4way( &X[ 0], &X[16] );
|
||||||
@@ -2211,10 +2211,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
|
|||||||
// X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 }
|
// X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 }
|
||||||
// X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 }
|
// X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 }
|
||||||
|
|
||||||
y[0].m128 = X0;
|
y[0].v128 = X0;
|
||||||
y[1].m128 = X1;
|
y[1].v128 = X1;
|
||||||
y[2].m128 = X2;
|
y[2].v128 = X2;
|
||||||
y[3].m128 = X3;
|
y[3].v128 = X3;
|
||||||
|
|
||||||
z[0].u32[0] = y[0].u32[0];
|
z[0].u32[0] = y[0].u32[0];
|
||||||
z[0].u32[3] = y[1].u32[0];
|
z[0].u32[3] = y[1].u32[0];
|
||||||
@@ -2236,10 +2236,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
|
|||||||
z[3].u32[1] = y[2].u32[3];
|
z[3].u32[1] = y[2].u32[3];
|
||||||
z[3].u32[0] = y[3].u32[3];
|
z[3].u32[0] = y[3].u32[3];
|
||||||
|
|
||||||
B[0] = v128_add32( B[0], z[0].m128 );
|
B[0] = v128_add32( B[0], z[0].v128 );
|
||||||
B[1] = v128_add32( B[1], z[1].m128 );
|
B[1] = v128_add32( B[1], z[1].v128 );
|
||||||
B[2] = v128_add32( B[2], z[2].m128 );
|
B[2] = v128_add32( B[2], z[2].v128 );
|
||||||
B[3] = v128_add32( B[3], z[3].m128 );
|
B[3] = v128_add32( B[3], z[3].v128 );
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -2404,14 +2404,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
|
|||||||
/*
|
/*
|
||||||
v128_ovly ya[4], za[4], yb[4], zb[4];
|
v128_ovly ya[4], za[4], yb[4], zb[4];
|
||||||
|
|
||||||
ya[0].m128 = XA[0];
|
ya[0].v128 = XA[0];
|
||||||
yb[0].m128 = XB[0];
|
yb[0].v128 = XB[0];
|
||||||
ya[1].m128 = XA[1];
|
ya[1].v128 = XA[1];
|
||||||
yb[1].m128 = XB[1];
|
yb[1].v128 = XB[1];
|
||||||
ya[2].m128 = XA[2];
|
ya[2].v128 = XA[2];
|
||||||
yb[2].m128 = XB[2];
|
yb[2].v128 = XB[2];
|
||||||
ya[3].m128 = XA[3];
|
ya[3].v128 = XA[3];
|
||||||
yb[3].m128 = XB[3];
|
yb[3].v128 = XB[3];
|
||||||
|
|
||||||
za[0].u32[0] = ya[0].u32[0];
|
za[0].u32[0] = ya[0].u32[0];
|
||||||
zb[0].u32[0] = yb[0].u32[0];
|
zb[0].u32[0] = yb[0].u32[0];
|
||||||
@@ -2449,14 +2449,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
|
|||||||
za[3].u32[3] = ya[0].u32[3];
|
za[3].u32[3] = ya[0].u32[3];
|
||||||
zb[3].u32[3] = yb[0].u32[3];
|
zb[3].u32[3] = yb[0].u32[3];
|
||||||
|
|
||||||
XA[0] = za[0].m128;
|
XA[0] = za[0].v128;
|
||||||
XB[0] = zb[0].m128;
|
XB[0] = zb[0].v128;
|
||||||
XA[1] = za[1].m128;
|
XA[1] = za[1].v128;
|
||||||
XB[1] = zb[1].m128;
|
XB[1] = zb[1].v128;
|
||||||
XA[2] = za[2].m128;
|
XA[2] = za[2].v128;
|
||||||
XB[2] = zb[2].m128;
|
XB[2] = zb[2].v128;
|
||||||
XA[3] = za[3].m128;
|
XA[3] = za[3].v128;
|
||||||
XB[3] = zb[3].m128;
|
XB[3] = zb[3].v128;
|
||||||
*/
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2487,7 +2487,7 @@ static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb,
|
|||||||
XA3 = BA[3] = v128_xor( BA[3], CA[3] );
|
XA3 = BA[3] = v128_xor( BA[3], CA[3] );
|
||||||
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
|
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
SALSA_8ROUNDS_SIMD128_2BUF;
|
SALSA_8ROUNDS_SIMD128_2BUF;
|
||||||
|
|
||||||
@@ -2770,18 +2770,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
|
|||||||
/*
|
/*
|
||||||
v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];
|
v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];
|
||||||
|
|
||||||
ya[0].m128 = XA[0];
|
ya[0].v128 = XA[0];
|
||||||
yb[0].m128 = XB[0];
|
yb[0].v128 = XB[0];
|
||||||
yc[0].m128 = XC[0];
|
yc[0].v128 = XC[0];
|
||||||
ya[1].m128 = XA[1];
|
ya[1].v128 = XA[1];
|
||||||
yb[1].m128 = XB[1];
|
yb[1].v128 = XB[1];
|
||||||
yc[1].m128 = XC[1];
|
yc[1].v128 = XC[1];
|
||||||
ya[2].m128 = XA[2];
|
ya[2].v128 = XA[2];
|
||||||
yb[2].m128 = XB[2];
|
yb[2].v128 = XB[2];
|
||||||
yc[2].m128 = XC[2];
|
yc[2].v128 = XC[2];
|
||||||
ya[3].m128 = XA[3];
|
ya[3].v128 = XA[3];
|
||||||
yb[3].m128 = XB[3];
|
yb[3].v128 = XB[3];
|
||||||
yc[3].m128 = XC[3];
|
yc[3].v128 = XC[3];
|
||||||
|
|
||||||
za[0].u32[0] = ya[0].u32[0];
|
za[0].u32[0] = ya[0].u32[0];
|
||||||
zb[0].u32[0] = yb[0].u32[0];
|
zb[0].u32[0] = yb[0].u32[0];
|
||||||
@@ -2835,18 +2835,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
|
|||||||
zb[3].u32[3] = yb[0].u32[3];
|
zb[3].u32[3] = yb[0].u32[3];
|
||||||
zc[3].u32[3] = yc[0].u32[3];
|
zc[3].u32[3] = yc[0].u32[3];
|
||||||
|
|
||||||
XA[0] = za[0].m128;
|
XA[0] = za[0].v128;
|
||||||
XB[0] = zb[0].m128;
|
XB[0] = zb[0].v128;
|
||||||
XC[0] = zc[0].m128;
|
XC[0] = zc[0].v128;
|
||||||
XA[1] = za[1].m128;
|
XA[1] = za[1].v128;
|
||||||
XB[1] = zb[1].m128;
|
XB[1] = zb[1].v128;
|
||||||
XC[1] = zc[1].m128;
|
XC[1] = zc[1].v128;
|
||||||
XA[2] = za[2].m128;
|
XA[2] = za[2].v128;
|
||||||
XB[2] = zb[2].m128;
|
XB[2] = zb[2].v128;
|
||||||
XC[2] = zc[2].m128;
|
XC[2] = zc[2].v128;
|
||||||
XA[3] = za[3].m128;
|
XA[3] = za[3].v128;
|
||||||
XB[3] = zb[3].m128;
|
XB[3] = zb[3].v128;
|
||||||
XC[3] = zc[3].m128;
|
XC[3] = zc[3].v128;
|
||||||
*/
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2886,7 +2886,7 @@ static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc,
|
|||||||
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
|
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
|
||||||
XC3 = BC[3] = v128_xor( BC[3], CC[3] );
|
XC3 = BC[3] = v128_xor( BC[3], CC[3] );
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
SALSA_8ROUNDS_SIMD128_3BUF;
|
SALSA_8ROUNDS_SIMD128_3BUF;
|
||||||
|
|
||||||
@@ -3049,7 +3049,7 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
|
|||||||
xf = (B[15] ^= C[15]);
|
xf = (B[15] ^= C[15]);
|
||||||
|
|
||||||
|
|
||||||
#define ROL32( a, c ) ror32( a, c )
|
#define ROL32( a, c ) rol32( a, c )
|
||||||
#define ADD32( a, b ) ( (a)+(b) )
|
#define ADD32( a, b ) ( (a)+(b) )
|
||||||
#define XOR( a, b ) ( (a)^(b) )
|
#define XOR( a, b ) ( (a)^(b) )
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );
|
void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );
|
||||||
|
|
||||||
|
|||||||
@@ -35,7 +35,7 @@
|
|||||||
//#include <mm_malloc.h>
|
//#include <mm_malloc.h>
|
||||||
#include "malloc-huge.h"
|
#include "malloc-huge.h"
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define SCRYPT_THROUGHPUT 16
|
#define SCRYPT_THROUGHPUT 16
|
||||||
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
|
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
|
||||||
#define SCRYPT_THROUGHPUT 2
|
#define SCRYPT_THROUGHPUT 2
|
||||||
@@ -274,9 +274,6 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
|
|||||||
|
|
||||||
#endif // SHA
|
#endif // SHA
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static const uint32_t keypad_4way[ 4*12 ] __attribute((aligned(32))) =
|
static const uint32_t keypad_4way[ 4*12 ] __attribute((aligned(32))) =
|
||||||
{
|
{
|
||||||
0x80000000, 0x80000000, 0x80000000, 0x80000000,
|
0x80000000, 0x80000000, 0x80000000, 0x80000000,
|
||||||
@@ -339,7 +336,7 @@ static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = {
|
|||||||
};
|
};
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static inline void sha256_4way_init_state( void *state )
|
static inline void sha256_4x32_init_state( void *state )
|
||||||
{
|
{
|
||||||
casti_v128( state, 0 ) = v128_32( 0x6A09E667 );
|
casti_v128( state, 0 ) = v128_32( 0x6A09E667 );
|
||||||
casti_v128( state, 1 ) = v128_32( 0xBB67AE85 );
|
casti_v128( state, 1 ) = v128_32( 0xBB67AE85 );
|
||||||
@@ -362,21 +359,21 @@ static inline void HMAC_SHA256_80_init_4way( const uint32_t *key,
|
|||||||
memcpy( pad, key + 4*16, 4*16 );
|
memcpy( pad, key + 4*16, 4*16 );
|
||||||
memcpy( pad + 4*4, keypad_4way, 4*48 );
|
memcpy( pad + 4*4, keypad_4way, 4*48 );
|
||||||
|
|
||||||
sha256_4way_transform_le( (v128_t*)ihash, (v128_t*)pad,
|
sha256_4x32_transform_le( (v128_t*)ihash, (v128_t*)pad,
|
||||||
(const v128_t*)tstate );
|
(const v128_t*)tstate );
|
||||||
|
|
||||||
sha256_4way_init_state( tstate );
|
sha256_4x32_init_state( tstate );
|
||||||
|
|
||||||
for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
|
for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
|
||||||
for ( ; i < 4*16; i++ ) pad[i] = 0x5c5c5c5c;
|
for ( ; i < 4*16; i++ ) pad[i] = 0x5c5c5c5c;
|
||||||
|
|
||||||
sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)pad,
|
sha256_4x32_transform_le( (v128_t*)ostate, (v128_t*)pad,
|
||||||
(const v128_t*)tstate );
|
(const v128_t*)tstate );
|
||||||
|
|
||||||
for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
|
for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
|
||||||
for ( ; i < 4*16; i++ ) pad[i] = 0x36363636;
|
for ( ; i < 4*16; i++ ) pad[i] = 0x36363636;
|
||||||
|
|
||||||
sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)pad,
|
sha256_4x32_transform_le( (v128_t*)tstate, (v128_t*)pad,
|
||||||
(const v128_t*)tstate );
|
(const v128_t*)tstate );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -389,7 +386,7 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
|
|||||||
uint32_t _ALIGN(16) obuf[4 * 16];
|
uint32_t _ALIGN(16) obuf[4 * 16];
|
||||||
int i, j;
|
int i, j;
|
||||||
|
|
||||||
sha256_4way_transform_le( (v128_t*)istate, (v128_t*)salt,
|
sha256_4x32_transform_le( (v128_t*)istate, (v128_t*)salt,
|
||||||
(const v128_t*)tstate );
|
(const v128_t*)tstate );
|
||||||
|
|
||||||
memcpy(ibuf, salt + 4 * 16, 4 * 16);
|
memcpy(ibuf, salt + 4 * 16, 4 * 16);
|
||||||
@@ -403,10 +400,10 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
|
|||||||
ibuf[4 * 4 + 2] = i + 1;
|
ibuf[4 * 4 + 2] = i + 1;
|
||||||
ibuf[4 * 4 + 3] = i + 1;
|
ibuf[4 * 4 + 3] = i + 1;
|
||||||
|
|
||||||
sha256_4way_transform_le( (v128_t*)obuf, (v128_t*)ibuf,
|
sha256_4x32_transform_le( (v128_t*)obuf, (v128_t*)ibuf,
|
||||||
(const v128_t*)istate );
|
(const v128_t*)istate );
|
||||||
|
|
||||||
sha256_4way_transform_le( (v128_t*)ostate2, (v128_t*)obuf,
|
sha256_4x32_transform_le( (v128_t*)ostate2, (v128_t*)obuf,
|
||||||
(const v128_t*)ostate );
|
(const v128_t*)ostate );
|
||||||
|
|
||||||
for ( j = 0; j < 4 * 8; j++ )
|
for ( j = 0; j < 4 * 8; j++ )
|
||||||
@@ -421,9 +418,9 @@ static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate,
|
|||||||
uint32_t _ALIGN(64) buf[4 * 16];
|
uint32_t _ALIGN(64) buf[4 * 16];
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)salt,
|
sha256_4x32_transform_be( (v128_t*)tstate, (v128_t*)salt,
|
||||||
(const v128_t*)tstate );
|
(const v128_t*)tstate );
|
||||||
sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)( salt + 4*16),
|
sha256_4x32_transform_be( (v128_t*)tstate, (v128_t*)( salt + 4*16),
|
||||||
(const v128_t*)tstate );
|
(const v128_t*)tstate );
|
||||||
|
|
||||||
final[ 0] = v128_32( 0x00000001 );
|
final[ 0] = v128_32( 0x00000001 );
|
||||||
@@ -434,20 +431,20 @@ static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate,
|
|||||||
= v128_xor( final[ 0], final[ 0] ); //_mm_setzero_si128();
|
= v128_xor( final[ 0], final[ 0] ); //_mm_setzero_si128();
|
||||||
final[15] = v128_32 ( 0x00000620 );
|
final[15] = v128_32 ( 0x00000620 );
|
||||||
|
|
||||||
sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)final,
|
sha256_4x32_transform_le( (v128_t*)tstate, (v128_t*)final,
|
||||||
(const v128_t*)tstate );
|
(const v128_t*)tstate );
|
||||||
|
|
||||||
memcpy(buf, tstate, 4 * 32);
|
memcpy(buf, tstate, 4 * 32);
|
||||||
memcpy(buf + 4 * 8, outerpad_4way, 4 * 32);
|
memcpy(buf + 4 * 8, outerpad_4way, 4 * 32);
|
||||||
|
|
||||||
sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)buf,
|
sha256_4x32_transform_le( (v128_t*)ostate, (v128_t*)buf,
|
||||||
(const v128_t*)ostate );
|
(const v128_t*)ostate );
|
||||||
|
|
||||||
for ( i = 0; i < 4 * 8; i++ )
|
for ( i = 0; i < 4 * 8; i++ )
|
||||||
output[i] = bswap_32( ostate[i] );
|
output[i] = bswap_32( ostate[i] );
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAVE_SHA256_8WAY
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = {
|
static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = {
|
||||||
@@ -470,7 +467,7 @@ static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = {
|
|||||||
};
|
};
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static inline void sha256_8way_init_state( void *state )
|
static inline void sha256_8x32_init_state( void *state )
|
||||||
{
|
{
|
||||||
casti_m256i( state, 0 ) = _mm256_set1_epi32( 0x6A09E667 );
|
casti_m256i( state, 0 ) = _mm256_set1_epi32( 0x6A09E667 );
|
||||||
casti_m256i( state, 1 ) = _mm256_set1_epi32( 0xBB67AE85 );
|
casti_m256i( state, 1 ) = _mm256_set1_epi32( 0xBB67AE85 );
|
||||||
@@ -494,21 +491,21 @@ static inline void HMAC_SHA256_80_init_8way( const uint32_t *key,
|
|||||||
memset( pad + 8*5, 0x00, 8*40 );
|
memset( pad + 8*5, 0x00, 8*40 );
|
||||||
for ( i = 0; i < 8; i++ ) pad[ 8*15 + i ] = 0x00000280;
|
for ( i = 0; i < 8; i++ ) pad[ 8*15 + i ] = 0x00000280;
|
||||||
|
|
||||||
sha256_8way_transform_le( (__m256i*)ihash, (__m256i*)pad,
|
sha256_8x32_transform_le( (__m256i*)ihash, (__m256i*)pad,
|
||||||
(const __m256i*)tstate );
|
(const __m256i*)tstate );
|
||||||
|
|
||||||
sha256_8way_init_state( tstate );
|
sha256_8x32_init_state( tstate );
|
||||||
|
|
||||||
for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
|
for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
|
||||||
for ( ; i < 8*16; i++ ) pad[i] = 0x5c5c5c5c;
|
for ( ; i < 8*16; i++ ) pad[i] = 0x5c5c5c5c;
|
||||||
|
|
||||||
sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)pad,
|
sha256_8x32_transform_le( (__m256i*)ostate, (__m256i*)pad,
|
||||||
(const __m256i*)tstate );
|
(const __m256i*)tstate );
|
||||||
|
|
||||||
for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
|
for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
|
||||||
for ( ; i < 8*16; i++ ) pad[i] = 0x36363636;
|
for ( ; i < 8*16; i++ ) pad[i] = 0x36363636;
|
||||||
|
|
||||||
sha256_8way_transform_le( (__m256i*)tstate, (__m256i*)pad,
|
sha256_8x32_transform_le( (__m256i*)tstate, (__m256i*)pad,
|
||||||
(const __m256i*)tstate );
|
(const __m256i*)tstate );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -521,7 +518,7 @@ static inline void PBKDF2_SHA256_80_128_8way( const uint32_t *tstate,
|
|||||||
uint32_t _ALIGN(32) obuf[8 * 16];
|
uint32_t _ALIGN(32) obuf[8 * 16];
|
||||||
int i, j;
|
int i, j;
|
||||||
|
|
||||||
sha256_8way_transform_le( (__m256i*)istate, (__m256i*)salt,
|
sha256_8x32_transform_le( (__m256i*)istate, (__m256i*)salt,
|
||||||
(const __m256i*)tstate );
|
(const __m256i*)tstate );
|
||||||
|
|
||||||
memcpy( ibuf, salt + 8*16, 8*16 );
|
memcpy( ibuf, salt + 8*16, 8*16 );
|
||||||
@@ -544,10 +541,10 @@ static inline void PBKDF2_SHA256_80_128_8way( const uint32_t *tstate,
|
|||||||
ibuf[8 * 4 + 6] = i + 1;
|
ibuf[8 * 4 + 6] = i + 1;
|
||||||
ibuf[8 * 4 + 7] = i + 1;
|
ibuf[8 * 4 + 7] = i + 1;
|
||||||
|
|
||||||
sha256_8way_transform_le( (__m256i*)obuf, (__m256i*)ibuf,
|
sha256_8x32_transform_le( (__m256i*)obuf, (__m256i*)ibuf,
|
||||||
(const __m256i*)istate );
|
(const __m256i*)istate );
|
||||||
|
|
||||||
sha256_8way_transform_le( (__m256i*)ostate2, (__m256i*)obuf,
|
sha256_8x32_transform_le( (__m256i*)ostate2, (__m256i*)obuf,
|
||||||
(const __m256i*)ostate );
|
(const __m256i*)ostate );
|
||||||
|
|
||||||
for ( j = 0; j < 8*8; j++ )
|
for ( j = 0; j < 8*8; j++ )
|
||||||
@@ -562,9 +559,9 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
|
|||||||
uint32_t _ALIGN(128) buf[ 8*16 ];
|
uint32_t _ALIGN(128) buf[ 8*16 ];
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)salt,
|
sha256_8x32_transform_be( (__m256i*)tstate, (__m256i*)salt,
|
||||||
(const __m256i*)tstate );
|
(const __m256i*)tstate );
|
||||||
sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)( salt + 8*16),
|
sha256_8x32_transform_be( (__m256i*)tstate, (__m256i*)( salt + 8*16),
|
||||||
(const __m256i*)tstate );
|
(const __m256i*)tstate );
|
||||||
|
|
||||||
final[ 0] = _mm256_set1_epi32( 0x00000001 );
|
final[ 0] = _mm256_set1_epi32( 0x00000001 );
|
||||||
@@ -575,7 +572,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
|
|||||||
= _mm256_setzero_si256();
|
= _mm256_setzero_si256();
|
||||||
final[15] = _mm256_set1_epi32 ( 0x00000620 );
|
final[15] = _mm256_set1_epi32 ( 0x00000620 );
|
||||||
|
|
||||||
sha256_8way_transform_le( (__m256i*)tstate, final,
|
sha256_8x32_transform_le( (__m256i*)tstate, final,
|
||||||
(const __m256i*)tstate );
|
(const __m256i*)tstate );
|
||||||
|
|
||||||
memcpy( buf, tstate, 8*32 );
|
memcpy( buf, tstate, 8*32 );
|
||||||
@@ -583,18 +580,18 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
|
|||||||
memset( buf + 8*9, 0x00, 8*24 );
|
memset( buf + 8*9, 0x00, 8*24 );
|
||||||
for ( i = 0; i < 8; i++ ) buf[ 8*15 + i ] = 0x00000300;
|
for ( i = 0; i < 8; i++ ) buf[ 8*15 + i ] = 0x00000300;
|
||||||
|
|
||||||
sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)buf,
|
sha256_8x32_transform_le( (__m256i*)ostate, (__m256i*)buf,
|
||||||
(const __m256i*)ostate );
|
(const __m256i*)ostate );
|
||||||
|
|
||||||
for (i = 0; i < 8 * 8; i++)
|
for (i = 0; i < 8 * 8; i++)
|
||||||
output[i] = bswap_32(ostate[i]);
|
output[i] = bswap_32(ostate[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* HAVE_SHA256_8WAY */
|
#endif //AVX2
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
static inline void sha256_16way_init_state( void *state )
|
static inline void sha256_16x32_init_state( void *state )
|
||||||
{
|
{
|
||||||
casti_m512i( state, 0 ) = _mm512_set1_epi32( 0x6A09E667 );
|
casti_m512i( state, 0 ) = _mm512_set1_epi32( 0x6A09E667 );
|
||||||
casti_m512i( state, 1 ) = _mm512_set1_epi32( 0xBB67AE85 );
|
casti_m512i( state, 1 ) = _mm512_set1_epi32( 0xBB67AE85 );
|
||||||
@@ -618,21 +615,21 @@ static inline void HMAC_SHA256_80_init_16way( const uint32_t *key,
|
|||||||
memset( pad + 16*5, 0x00, 16*40 );
|
memset( pad + 16*5, 0x00, 16*40 );
|
||||||
for ( i = 0; i < 16; i++ ) pad[ 16*15 + i ] = 0x00000280;
|
for ( i = 0; i < 16; i++ ) pad[ 16*15 + i ] = 0x00000280;
|
||||||
|
|
||||||
sha256_16way_transform_le( (__m512i*)ihash, (__m512i*)pad,
|
sha256_16x32_transform_le( (__m512i*)ihash, (__m512i*)pad,
|
||||||
(const __m512i*)tstate );
|
(const __m512i*)tstate );
|
||||||
|
|
||||||
sha256_16way_init_state( tstate );
|
sha256_16x32_init_state( tstate );
|
||||||
|
|
||||||
for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
|
for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
|
||||||
for ( ; i < 16*16; i++ ) pad[i] = 0x5c5c5c5c;
|
for ( ; i < 16*16; i++ ) pad[i] = 0x5c5c5c5c;
|
||||||
|
|
||||||
sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)pad,
|
sha256_16x32_transform_le( (__m512i*)ostate, (__m512i*)pad,
|
||||||
(const __m512i*)tstate );
|
(const __m512i*)tstate );
|
||||||
|
|
||||||
for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
|
for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
|
||||||
for ( ; i < 16*16; i++ ) pad[i] = 0x36363636;
|
for ( ; i < 16*16; i++ ) pad[i] = 0x36363636;
|
||||||
|
|
||||||
sha256_16way_transform_le( (__m512i*)tstate, (__m512i*)pad,
|
sha256_16x32_transform_le( (__m512i*)tstate, (__m512i*)pad,
|
||||||
(const __m512i*)tstate );
|
(const __m512i*)tstate );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -645,7 +642,7 @@ static inline void PBKDF2_SHA256_80_128_16way( const uint32_t *tstate,
|
|||||||
uint32_t _ALIGN(128) ostate2[ 16*8 ];
|
uint32_t _ALIGN(128) ostate2[ 16*8 ];
|
||||||
int i, j;
|
int i, j;
|
||||||
|
|
||||||
sha256_16way_transform_le( (__m512i*)istate, (__m512i*)salt,
|
sha256_16x32_transform_le( (__m512i*)istate, (__m512i*)salt,
|
||||||
(const __m512i*)tstate );
|
(const __m512i*)tstate );
|
||||||
|
|
||||||
memcpy( ibuf, salt + 16*16, 16*16 );
|
memcpy( ibuf, salt + 16*16, 16*16 );
|
||||||
@@ -676,10 +673,10 @@ static inline void PBKDF2_SHA256_80_128_16way( const uint32_t *tstate,
|
|||||||
ibuf[ 16*4 + 14 ] = i + 1;
|
ibuf[ 16*4 + 14 ] = i + 1;
|
||||||
ibuf[ 16*4 + 15 ] = i + 1;
|
ibuf[ 16*4 + 15 ] = i + 1;
|
||||||
|
|
||||||
sha256_16way_transform_le( (__m512i*)obuf, (__m512i*)ibuf,
|
sha256_16x32_transform_le( (__m512i*)obuf, (__m512i*)ibuf,
|
||||||
(const __m512i*)istate );
|
(const __m512i*)istate );
|
||||||
|
|
||||||
sha256_16way_transform_le( (__m512i*)ostate2, (__m512i*)obuf,
|
sha256_16x32_transform_le( (__m512i*)ostate2, (__m512i*)obuf,
|
||||||
(const __m512i*)ostate );
|
(const __m512i*)ostate );
|
||||||
|
|
||||||
for ( j = 0; j < 16*8; j++ )
|
for ( j = 0; j < 16*8; j++ )
|
||||||
@@ -694,9 +691,9 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
|
|||||||
uint32_t _ALIGN(128) buf[ 16*16 ];
|
uint32_t _ALIGN(128) buf[ 16*16 ];
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)salt,
|
sha256_16x32_transform_be( (__m512i*)tstate, (__m512i*)salt,
|
||||||
(const __m512i*)tstate );
|
(const __m512i*)tstate );
|
||||||
sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)( salt + 16*16),
|
sha256_16x32_transform_be( (__m512i*)tstate, (__m512i*)( salt + 16*16),
|
||||||
(const __m512i*)tstate );
|
(const __m512i*)tstate );
|
||||||
|
|
||||||
final[ 0] = _mm512_set1_epi32( 0x00000001 );
|
final[ 0] = _mm512_set1_epi32( 0x00000001 );
|
||||||
@@ -707,7 +704,7 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
|
|||||||
= _mm512_setzero_si512();
|
= _mm512_setzero_si512();
|
||||||
final[15] = _mm512_set1_epi32 ( 0x00000620 );
|
final[15] = _mm512_set1_epi32 ( 0x00000620 );
|
||||||
|
|
||||||
sha256_16way_transform_le( (__m512i*)tstate, final,
|
sha256_16x32_transform_le( (__m512i*)tstate, final,
|
||||||
(const __m512i*)tstate );
|
(const __m512i*)tstate );
|
||||||
|
|
||||||
memcpy( buf, tstate, 16*32 );
|
memcpy( buf, tstate, 16*32 );
|
||||||
@@ -715,7 +712,7 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
|
|||||||
memset( buf + 16*9, 0x00, 16*24 );
|
memset( buf + 16*9, 0x00, 16*24 );
|
||||||
for ( i = 0; i < 16; i++ ) buf[ 16*15 + i ] = 0x00000300;
|
for ( i = 0; i < 16; i++ ) buf[ 16*15 + i ] = 0x00000300;
|
||||||
|
|
||||||
sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)buf,
|
sha256_16x32_transform_le( (__m512i*)ostate, (__m512i*)buf,
|
||||||
(const __m512i*)ostate );
|
(const __m512i*)ostate );
|
||||||
|
|
||||||
for ( i = 0; i < 16*8; i++ )
|
for ( i = 0; i < 16*8; i++ )
|
||||||
@@ -724,25 +721,10 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate,
|
|||||||
|
|
||||||
#endif // AVX512
|
#endif // AVX512
|
||||||
|
|
||||||
#define SCRYPT_MAX_WAYS 12
|
|
||||||
#define HAVE_SCRYPT_3WAY 1
|
|
||||||
void scrypt_core(uint32_t *X, uint32_t *V, int N);
|
|
||||||
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
|
|
||||||
|
|
||||||
#if defined(__AVX2__)
|
|
||||||
#undef SCRYPT_MAX_WAYS
|
|
||||||
#define SCRYPT_MAX_WAYS 24
|
|
||||||
#define HAVE_SCRYPT_6WAY 1
|
|
||||||
void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef SCRYPT_MAX_WAYS
|
|
||||||
#define SCRYPT_MAX_WAYS 1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "scrypt-core-4way.h"
|
#include "scrypt-core-4way.h"
|
||||||
|
|
||||||
/*
|
#if ( SCRYPT_THROUGHPUT == 1 )
|
||||||
|
|
||||||
static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
|
static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
|
||||||
uint32_t *midstate, int N, int thr_id )
|
uint32_t *midstate, int N, int thr_id )
|
||||||
{
|
{
|
||||||
@@ -752,15 +734,12 @@ static bool scrypt_N_1_1_256( const uint32_t *input, uint32_t *output,
|
|||||||
memcpy(tstate, midstate, 32);
|
memcpy(tstate, midstate, 32);
|
||||||
HMAC_SHA256_80_init(input, tstate, ostate);
|
HMAC_SHA256_80_init(input, tstate, ostate);
|
||||||
PBKDF2_SHA256_80_128(tstate, ostate, input, X);
|
PBKDF2_SHA256_80_128(tstate, ostate, input, X);
|
||||||
|
scrypt_core_1way( X, scratchbuf, N );
|
||||||
scrypt_core_simd128( X, scratchbuf, N ); // woring
|
|
||||||
// scrypt_core_1way( X, V, N ); // working
|
|
||||||
// scrypt_core(X, V, N);
|
|
||||||
|
|
||||||
PBKDF2_SHA256_128_32(tstate, ostate, X, output);
|
PBKDF2_SHA256_128_32(tstate, ostate, X, output);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#if ( SCRYPT_THROUGHPUT == 8 )
|
#if ( SCRYPT_THROUGHPUT == 8 )
|
||||||
|
|
||||||
@@ -1201,20 +1180,6 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
|
|||||||
if ( work_restart[thrid].restart ) return 0;
|
if ( work_restart[thrid].restart ) return 0;
|
||||||
scrypt_core_simd128_2buf( X+448, V, N );
|
scrypt_core_simd128_2buf( X+448, V, N );
|
||||||
********************/
|
********************/
|
||||||
/*
|
|
||||||
scrypt_core_3way( X, V, N );
|
|
||||||
if ( work_restart[thrid].restart ) return 0;
|
|
||||||
scrypt_core_3way( X+ 96, V, N );
|
|
||||||
if ( work_restart[thrid].restart ) return 0;
|
|
||||||
scrypt_core_simd128_2buf( X+192, V, N );
|
|
||||||
if ( work_restart[thrid].restart ) return 0;
|
|
||||||
scrypt_core_3way( X+256, V, N );
|
|
||||||
if ( work_restart[thrid].restart ) return 0;
|
|
||||||
scrypt_core_3way( X+352, V, N );
|
|
||||||
if ( work_restart[thrid].restart ) return 0;
|
|
||||||
scrypt_core_simd128_2buf( X+448, V, N );
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
if ( work_restart[thrid].restart ) return 0;
|
if ( work_restart[thrid].restart ) return 0;
|
||||||
|
|
||||||
@@ -1321,8 +1286,7 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||||
// SSE2
|
|
||||||
|
|
||||||
static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
|
static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
|
||||||
uint32_t *midstate, int N, int thrid )
|
uint32_t *midstate, int N, int thrid )
|
||||||
@@ -1481,7 +1445,7 @@ bool scrypt_miner_thread_init( int thr_id )
|
|||||||
bool register_scrypt_algo( algo_gate_t* gate )
|
bool register_scrypt_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
|
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
|
||||||
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
|
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | SHA256_OPT | NEON_OPT;
|
||||||
#else
|
#else
|
||||||
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||||
#endif
|
#endif
|
||||||
@@ -1491,31 +1455,31 @@ bool register_scrypt_algo( algo_gate_t* gate )
|
|||||||
opt_param_n = opt_param_n ? opt_param_n : 1024;
|
opt_param_n = opt_param_n ? opt_param_n : 1024;
|
||||||
applog( LOG_INFO,"Scrypt paramaters: N= %d, R= 1", opt_param_n );
|
applog( LOG_INFO,"Scrypt paramaters: N= %d, R= 1", opt_param_n );
|
||||||
|
|
||||||
// scrypt_throughput defined at compile time and used to replace
|
switch ( SCRYPT_THROUGHPUT )
|
||||||
// MAX_WAYS to reduce memory usage.
|
{
|
||||||
|
case 16: // AVX512
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
|
||||||
// scrypt_throughput = 16;
|
|
||||||
if ( opt_param_n > 0x4000 )
|
if ( opt_param_n > 0x4000 )
|
||||||
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
|
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
|
||||||
else
|
else
|
||||||
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
|
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
|
||||||
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
|
break;
|
||||||
// scrypt_throughput = 2;
|
case 2: // SHA256
|
||||||
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
|
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
|
||||||
#elif defined(__AVX2__)
|
break;
|
||||||
// scrypt_throughput = 8;
|
case 8: // AVX2
|
||||||
if ( opt_param_n > 0x4000 )
|
if ( opt_param_n > 0x4000 )
|
||||||
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
|
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
|
||||||
else
|
else
|
||||||
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
|
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
|
||||||
#else
|
break;
|
||||||
// scrypt_throughput = 4;
|
case 4: // SSE2, NEON
|
||||||
if ( opt_param_n > 0x4000 )
|
if ( opt_param_n > 0x4000 )
|
||||||
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
|
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
|
||||||
else
|
else
|
||||||
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
|
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
|
||||||
#endif
|
default:
|
||||||
|
scratchbuf_size = opt_param_n; // 1 way
|
||||||
|
}
|
||||||
|
|
||||||
char t_units[4] = {0};
|
char t_units[4] = {0};
|
||||||
char d_units[4] = {0};
|
char d_units[4] = {0};
|
||||||
|
|||||||
@@ -31,7 +31,7 @@
|
|||||||
#include "hmac-sha256-hash-4way.h"
|
#include "hmac-sha256-hash-4way.h"
|
||||||
#include "compat.h"
|
#include "compat.h"
|
||||||
|
|
||||||
#if defined(__SSE2__)
|
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||||
// HMAC 4-way SSE2
|
// HMAC 4-way SSE2
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -62,30 +62,30 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
|
|||||||
/* If Klen > 64, the key is really SHA256(K). */
|
/* If Klen > 64, the key is really SHA256(K). */
|
||||||
if ( Klen > 64 )
|
if ( Klen > 64 )
|
||||||
{
|
{
|
||||||
sha256_4way_init( &ctx->ictx );
|
sha256_4x32_init( &ctx->ictx );
|
||||||
sha256_4way_update( &ctx->ictx, K, Klen );
|
sha256_4x32_update( &ctx->ictx, K, Klen );
|
||||||
sha256_4way_close( &ctx->ictx, khash );
|
sha256_4x32_close( &ctx->ictx, khash );
|
||||||
K = khash;
|
K = khash;
|
||||||
Klen = 32;
|
Klen = 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
|
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
|
||||||
sha256_4way_init( &ctx->ictx );
|
sha256_4x32_init( &ctx->ictx );
|
||||||
memset( pad, 0x36, 64*4 );
|
memset( pad, 0x36, 64*4 );
|
||||||
|
|
||||||
for ( i = 0; i < Klen; i++ )
|
for ( i = 0; i < Klen; i++ )
|
||||||
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
|
casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ),
|
||||||
casti_m128i( K, i ) );
|
casti_v128u32( K, i ) );
|
||||||
|
|
||||||
sha256_4way_update( &ctx->ictx, pad, 64 );
|
sha256_4x32_update( &ctx->ictx, pad, 64 );
|
||||||
|
|
||||||
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
|
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
|
||||||
sha256_4way_init( &ctx->octx );
|
sha256_4x32_init( &ctx->octx );
|
||||||
memset( pad, 0x5c, 64*4 );
|
memset( pad, 0x5c, 64*4 );
|
||||||
for ( i = 0; i < Klen/4; i++ )
|
for ( i = 0; i < Klen/4; i++ )
|
||||||
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
|
casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ),
|
||||||
casti_m128i( K, i ) );
|
casti_v128u32( K, i ) );
|
||||||
sha256_4way_update( &ctx->octx, pad, 64 );
|
sha256_4x32_update( &ctx->octx, pad, 64 );
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Add bytes to the HMAC-SHA256 operation. */
|
/* Add bytes to the HMAC-SHA256 operation. */
|
||||||
@@ -94,7 +94,7 @@ hmac_sha256_4way_update( hmac_sha256_4way_context *ctx, const void *in,
|
|||||||
size_t len )
|
size_t len )
|
||||||
{
|
{
|
||||||
/* Feed data to the inner SHA256 operation. */
|
/* Feed data to the inner SHA256 operation. */
|
||||||
sha256_4way_update( &ctx->ictx, in, len );
|
sha256_4x32_update( &ctx->ictx, in, len );
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Finish an HMAC-SHA256 operation. */
|
/* Finish an HMAC-SHA256 operation. */
|
||||||
@@ -104,13 +104,13 @@ hmac_sha256_4way_close( hmac_sha256_4way_context *ctx, void *digest )
|
|||||||
unsigned char ihash[32*4] __attribute__ ((aligned (64)));
|
unsigned char ihash[32*4] __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
/* Finish the inner SHA256 operation. */
|
/* Finish the inner SHA256 operation. */
|
||||||
sha256_4way_close( &ctx->ictx, ihash );
|
sha256_4x32_close( &ctx->ictx, ihash );
|
||||||
|
|
||||||
/* Feed the inner hash to the outer SHA256 operation. */
|
/* Feed the inner hash to the outer SHA256 operation. */
|
||||||
sha256_4way_update( &ctx->octx, ihash, 32 );
|
sha256_4x32_update( &ctx->octx, ihash, 32 );
|
||||||
|
|
||||||
/* Finish the outer SHA256 operation. */
|
/* Finish the outer SHA256 operation. */
|
||||||
sha256_4way_close( &ctx->octx, digest );
|
sha256_4x32_close( &ctx->octx, digest );
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -126,7 +126,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
|
|||||||
hmac_sha256_4way_context PShctx, hctx;
|
hmac_sha256_4way_context PShctx, hctx;
|
||||||
uint8_t _ALIGN(128) T[32*4];
|
uint8_t _ALIGN(128) T[32*4];
|
||||||
uint8_t _ALIGN(128) U[32*4];
|
uint8_t _ALIGN(128) U[32*4];
|
||||||
__m128i ivec;
|
v128u32_t ivec;
|
||||||
size_t i, clen;
|
size_t i, clen;
|
||||||
uint64_t j;
|
uint64_t j;
|
||||||
int k;
|
int k;
|
||||||
@@ -139,7 +139,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
|
|||||||
for ( i = 0; i * 32 < dkLen; i++ )
|
for ( i = 0; i * 32 < dkLen; i++ )
|
||||||
{
|
{
|
||||||
/* Generate INT(i + 1). */
|
/* Generate INT(i + 1). */
|
||||||
ivec = _mm_set1_epi32( bswap_32( i+1 ) );
|
ivec = v128_32( bswap_32( i+1 ) );
|
||||||
|
|
||||||
/* Compute U_1 = PRF(P, S || INT(i)). */
|
/* Compute U_1 = PRF(P, S || INT(i)). */
|
||||||
memcpy( &hctx, &PShctx, sizeof(hmac_sha256_4way_context) );
|
memcpy( &hctx, &PShctx, sizeof(hmac_sha256_4way_context) );
|
||||||
@@ -158,8 +158,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
|
|||||||
|
|
||||||
/* ... xor U_j ... */
|
/* ... xor U_j ... */
|
||||||
for ( k = 0; k < 8; k++ )
|
for ( k = 0; k < 8; k++ )
|
||||||
casti_m128i( T, k ) = _mm_xor_si128( casti_m128i( T, k ),
|
casti_v128u32( T, k ) = v128_xor( casti_v128u32( T, k ),
|
||||||
casti_m128i( U, k ) );
|
casti_v128u32( U, k ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Copy as many bytes as necessary into buf. */
|
/* Copy as many bytes as necessary into buf. */
|
||||||
@@ -199,30 +199,30 @@ hmac_sha256_8way_init( hmac_sha256_8way_context *ctx, const void *_K,
|
|||||||
/* If Klen > 64, the key is really SHA256(K). */
|
/* If Klen > 64, the key is really SHA256(K). */
|
||||||
if ( Klen > 64 )
|
if ( Klen > 64 )
|
||||||
{
|
{
|
||||||
sha256_8way_init( &ctx->ictx );
|
sha256_8x32_init( &ctx->ictx );
|
||||||
sha256_8way_update( &ctx->ictx, K, Klen );
|
sha256_8x32_update( &ctx->ictx, K, Klen );
|
||||||
sha256_8way_close( &ctx->ictx, khash );
|
sha256_8x32_close( &ctx->ictx, khash );
|
||||||
K = khash;
|
K = khash;
|
||||||
Klen = 32;
|
Klen = 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
|
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
|
||||||
sha256_8way_init( &ctx->ictx );
|
sha256_8x32_init( &ctx->ictx );
|
||||||
memset( pad, 0x36, 64*8);
|
memset( pad, 0x36, 64*8);
|
||||||
|
|
||||||
for ( i = 0; i < Klen/4; i++ )
|
for ( i = 0; i < Klen/4; i++ )
|
||||||
casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
|
casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
|
||||||
casti_m256i( K, i ) );
|
casti_m256i( K, i ) );
|
||||||
|
|
||||||
sha256_8way_update( &ctx->ictx, pad, 64 );
|
sha256_8x32_update( &ctx->ictx, pad, 64 );
|
||||||
|
|
||||||
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
|
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
|
||||||
sha256_8way_init( &ctx->octx );
|
sha256_8x32_init( &ctx->octx );
|
||||||
memset( pad, 0x5c, 64*8 );
|
memset( pad, 0x5c, 64*8 );
|
||||||
for ( i = 0; i < Klen/4; i++ )
|
for ( i = 0; i < Klen/4; i++ )
|
||||||
casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
|
casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
|
||||||
casti_m256i( K, i ) );
|
casti_m256i( K, i ) );
|
||||||
sha256_8way_update( &ctx->octx, pad, 64 );
|
sha256_8x32_update( &ctx->octx, pad, 64 );
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@@ -230,7 +230,7 @@ hmac_sha256_8way_update( hmac_sha256_8way_context *ctx, const void *in,
|
|||||||
size_t len )
|
size_t len )
|
||||||
{
|
{
|
||||||
/* Feed data to the inner SHA256 operation. */
|
/* Feed data to the inner SHA256 operation. */
|
||||||
sha256_8way_update( &ctx->ictx, in, len );
|
sha256_8x32_update( &ctx->ictx, in, len );
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Finish an HMAC-SHA256 operation. */
|
/* Finish an HMAC-SHA256 operation. */
|
||||||
@@ -240,13 +240,13 @@ hmac_sha256_8way_close( hmac_sha256_8way_context *ctx, void *digest )
|
|||||||
unsigned char ihash[32*8] __attribute__ ((aligned (128)));
|
unsigned char ihash[32*8] __attribute__ ((aligned (128)));
|
||||||
|
|
||||||
/* Finish the inner SHA256 operation. */
|
/* Finish the inner SHA256 operation. */
|
||||||
sha256_8way_close( &ctx->ictx, ihash );
|
sha256_8x32_close( &ctx->ictx, ihash );
|
||||||
|
|
||||||
/* Feed the inner hash to the outer SHA256 operation. */
|
/* Feed the inner hash to the outer SHA256 operation. */
|
||||||
sha256_8way_update( &ctx->octx, ihash, 32 );
|
sha256_8x32_update( &ctx->octx, ihash, 32 );
|
||||||
|
|
||||||
/* Finish the outer SHA256 operation. */
|
/* Finish the outer SHA256 operation. */
|
||||||
sha256_8way_close( &ctx->octx, digest );
|
sha256_8x32_close( &ctx->octx, digest );
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -306,7 +306,7 @@ pbkdf2_sha256_8way( uint8_t *buf, size_t dkLen, const uint8_t *passwd,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
// HMAC 16-way AVX512
|
// HMAC 16-way AVX512
|
||||||
|
|
||||||
@@ -332,21 +332,21 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K,
|
|||||||
/* If Klen > 64, the key is really SHA256(K). */
|
/* If Klen > 64, the key is really SHA256(K). */
|
||||||
if ( Klen > 64 )
|
if ( Klen > 64 )
|
||||||
{
|
{
|
||||||
sha256_16way_init( &ctx->ictx );
|
sha256_16x32_init( &ctx->ictx );
|
||||||
sha256_16way_update( &ctx->ictx, K, Klen );
|
sha256_16x32_update( &ctx->ictx, K, Klen );
|
||||||
sha256_16way_close( &ctx->ictx, khash );
|
sha256_16x32_close( &ctx->ictx, khash );
|
||||||
K = khash;
|
K = khash;
|
||||||
Klen = 32;
|
Klen = 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
|
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
|
||||||
sha256_16way_init( &ctx->ictx );
|
sha256_16x32_init( &ctx->ictx );
|
||||||
memset( pad, 0x36, 64*16 );
|
memset( pad, 0x36, 64*16 );
|
||||||
|
|
||||||
for ( i = 0; i < Klen; i++ )
|
for ( i = 0; i < Klen; i++ )
|
||||||
casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
|
casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
|
||||||
casti_m512i( K, i ) );
|
casti_m512i( K, i ) );
|
||||||
sha256_16way_update( &ctx->ictx, pad, 64 );
|
sha256_16x32_update( &ctx->ictx, pad, 64 );
|
||||||
|
|
||||||
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
|
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
|
||||||
sha256_16way_init( &ctx->octx );
|
sha256_16way_init( &ctx->octx );
|
||||||
@@ -354,7 +354,7 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K,
|
|||||||
for ( i = 0; i < Klen/4; i++ )
|
for ( i = 0; i < Klen/4; i++ )
|
||||||
casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
|
casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
|
||||||
casti_m512i( K, i ) );
|
casti_m512i( K, i ) );
|
||||||
sha256_16way_update( &ctx->octx, pad, 64 );
|
sha256_16x32_update( &ctx->octx, pad, 64 );
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@@ -362,7 +362,7 @@ hmac_sha256_16way_update( hmac_sha256_16way_context *ctx, const void *in,
|
|||||||
size_t len )
|
size_t len )
|
||||||
{
|
{
|
||||||
/* Feed data to the inner SHA256 operation. */
|
/* Feed data to the inner SHA256 operation. */
|
||||||
sha256_16way_update( &ctx->ictx, in, len );
|
sha256_16x32_update( &ctx->ictx, in, len );
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Finish an HMAC-SHA256 operation. */
|
/* Finish an HMAC-SHA256 operation. */
|
||||||
@@ -372,13 +372,13 @@ hmac_sha256_16way_close( hmac_sha256_16way_context *ctx, void *digest )
|
|||||||
unsigned char ihash[32*16] __attribute__ ((aligned (128)));
|
unsigned char ihash[32*16] __attribute__ ((aligned (128)));
|
||||||
|
|
||||||
/* Finish the inner SHA256 operation. */
|
/* Finish the inner SHA256 operation. */
|
||||||
sha256_16way_close( &ctx->ictx, ihash );
|
sha256_16x32_close( &ctx->ictx, ihash );
|
||||||
|
|
||||||
/* Feed the inner hash to the outer SHA256 operation. */
|
/* Feed the inner hash to the outer SHA256 operation. */
|
||||||
sha256_16way_update( &ctx->octx, ihash, 32 );
|
sha256_16x32_update( &ctx->octx, ihash, 32 );
|
||||||
|
|
||||||
/* Finish the outer SHA256 operation. */
|
/* Finish the outer SHA256 operation. */
|
||||||
sha256_16way_close( &ctx->octx, digest );
|
sha256_16x32_close( &ctx->octx, digest );
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
/*-
|
/*-
|
||||||
* Copyright 2005,2007,2009 Colin Percival
|
* Copyright 2005,2007,2009 Colin Percival
|
||||||
* Copyright 2020 JayDDee@gmailcom
|
* Copyright 2020 JayDDee246@gmailcom
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
@@ -38,11 +38,12 @@
|
|||||||
#include "simd-utils.h"
|
#include "simd-utils.h"
|
||||||
#include "sha256-hash.h"
|
#include "sha256-hash.h"
|
||||||
|
|
||||||
#if defined(__SSE2__)
|
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||||
|
|
||||||
typedef struct _hmac_sha256_4way_context
|
typedef struct _hmac_sha256_4way_context
|
||||||
{
|
{
|
||||||
sha256_4way_context ictx;
|
sha256_4x32_context ictx;
|
||||||
sha256_4way_context octx;
|
sha256_4x32_context octx;
|
||||||
} hmac_sha256_4way_context;
|
} hmac_sha256_4way_context;
|
||||||
|
|
||||||
//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
|
//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
|
||||||
@@ -67,8 +68,8 @@ void pbkdf2_sha256_4way( uint8_t *, size_t, const uint8_t *, size_t,
|
|||||||
|
|
||||||
typedef struct _hmac_sha256_8way_context
|
typedef struct _hmac_sha256_8way_context
|
||||||
{
|
{
|
||||||
sha256_8way_context ictx;
|
sha256_8x32_context ictx;
|
||||||
sha256_8way_context octx;
|
sha256_8x32_context octx;
|
||||||
} hmac_sha256_8way_context;
|
} hmac_sha256_8way_context;
|
||||||
|
|
||||||
//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
|
//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
|
||||||
@@ -84,12 +85,12 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t,
|
|||||||
|
|
||||||
#endif // AVX2
|
#endif // AVX2
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
typedef struct _hmac_sha256_16way_context
|
typedef struct _hmac_sha256_16way_context
|
||||||
{
|
{
|
||||||
sha256_16way_context ictx;
|
sha256_16x32_context ictx;
|
||||||
sha256_16way_context octx;
|
sha256_16x32_context octx;
|
||||||
} hmac_sha256_16way_context;
|
} hmac_sha256_16way_context;
|
||||||
|
|
||||||
//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
|
//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
|
||||||
|
|||||||
@@ -205,7 +205,7 @@ void sha1_x86_sha_transform_be( uint32_t *state_out, const void *input,
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_SHA2)
|
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
|
||||||
|
|
||||||
#define sha1_neon_rounds( state_out, data, state_in ) \
|
#define sha1_neon_rounds( state_out, data, state_in ) \
|
||||||
{ \
|
{ \
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ static const uint32_t K256[64] =
|
|||||||
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
|
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||||
// SHA-256 4 way SSE2
|
// SHA-256 4 way SSE2
|
||||||
|
|
||||||
#define CHs(X, Y, Z) \
|
#define CHs(X, Y, Z) \
|
||||||
@@ -309,142 +310,6 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data,
|
|||||||
v128_store( state_out + 7, H );
|
v128_store( state_out + 7, H );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# if 0
|
|
||||||
|
|
||||||
// Working correctly but still slower
|
|
||||||
int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
|
|
||||||
const v128_t *state_in, const uint32_t *target )
|
|
||||||
{
|
|
||||||
v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
|
|
||||||
v128_t vmask, targ, hash;
|
|
||||||
int t6_mask, flip;
|
|
||||||
v128_t W[16]; v128_memcpy( W, data, 16 );
|
|
||||||
|
|
||||||
A = v128_load( state_in );
|
|
||||||
B = v128_load( state_in+1 );
|
|
||||||
C = v128_load( state_in+2 );
|
|
||||||
D = v128_load( state_in+3 );
|
|
||||||
E = v128_load( state_in+4 );
|
|
||||||
F = v128_load( state_in+5 );
|
|
||||||
G = v128_load( state_in+6 );
|
|
||||||
H = v128_load( state_in+7 );
|
|
||||||
|
|
||||||
const v128_t IV7 = H;
|
|
||||||
const v128_t IV6 = G;
|
|
||||||
|
|
||||||
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
|
|
||||||
SHA256_4X32_MSG_EXPANSION( W );
|
|
||||||
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
|
|
||||||
SHA256_4X32_MSG_EXPANSION( W );
|
|
||||||
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
|
|
||||||
|
|
||||||
W[ 0] = SHA256_4X32_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
|
|
||||||
W[ 1] = SHA256_4X32_MEXP( W[15], W[10], W[ 2], W[ 1] );
|
|
||||||
W[ 2] = SHA256_4X32_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
|
|
||||||
W[ 3] = SHA256_4X32_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
|
|
||||||
W[ 4] = SHA256_4X32_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
|
|
||||||
W[ 5] = SHA256_4X32_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
|
|
||||||
W[ 6] = SHA256_4X32_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
|
|
||||||
W[ 7] = SHA256_4X32_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
|
|
||||||
W[ 8] = SHA256_4X32_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
|
|
||||||
W[ 9] = SHA256_4X32_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
|
|
||||||
W[10] = SHA256_4X32_MEXP( W[ 8], W[ 3], W[11], W[10] );
|
|
||||||
W[11] = SHA256_4X32_MEXP( W[ 9], W[ 4], W[12], W[11] );
|
|
||||||
W[12] = SHA256_4X32_MEXP( W[10], W[ 5], W[13], W[12] );
|
|
||||||
|
|
||||||
v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C );
|
|
||||||
|
|
||||||
SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
|
|
||||||
SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
|
|
||||||
SHA256_4X32_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
|
|
||||||
SHA256_4X32_ROUND( F, G, H, A, B, C, D, E, 3, 48 );
|
|
||||||
SHA256_4X32_ROUND( E, F, G, H, A, B, C, D, 4, 48 );
|
|
||||||
SHA256_4X32_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
|
|
||||||
SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
|
|
||||||
SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
|
|
||||||
SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 8, 48 );
|
|
||||||
SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
|
|
||||||
|
|
||||||
T0 = v128_add32( v128_32( K256[58] ),
|
|
||||||
v128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
|
|
||||||
B = v128_add32( B, T0 );
|
|
||||||
|
|
||||||
T1 = v128_add32( v128_32( K256[59] ),
|
|
||||||
v128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
|
|
||||||
A = v128_add32( A, T1 );
|
|
||||||
|
|
||||||
T2 = v128_add32( v128_32( K256[60] ),
|
|
||||||
v128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
|
|
||||||
H = v128_add32( H, T2 );
|
|
||||||
|
|
||||||
targ = v128_32( target[7] );
|
|
||||||
hash = v128_bswap32( v128_add32( H, IV7 ) );
|
|
||||||
|
|
||||||
flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
|
|
||||||
|
|
||||||
if ( likely(
|
|
||||||
0xf == ( flip ^ v128_movmask32( v128_cmpgt32( hash, targ ) ) ) ))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
t6_mask = v128_movmask32( vmask = v128_cmpeq32( hash, targ ) );
|
|
||||||
|
|
||||||
// round 58 part 2
|
|
||||||
F = v128_add32( T0, v128_add32( BSG2_0( G ), MAJs( G, H, A ) ) );
|
|
||||||
|
|
||||||
// round 61 part 1
|
|
||||||
W[13] = SHA256_4X32_MEXP( W[11], W[ 6], W[14], W[13] );
|
|
||||||
T0 = v128_add32( v128_32( K256[61] ),
|
|
||||||
v128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
|
|
||||||
G = v128_add32( G, T0 );
|
|
||||||
|
|
||||||
if ( t6_mask )
|
|
||||||
{
|
|
||||||
targ = v128_and( vmask, v128_32( target[6] ) );
|
|
||||||
hash = v128_bswap32( v128_add32( G, IV6 ) );
|
|
||||||
|
|
||||||
if ( ( 0 != ( t6_mask & v128_movmask32( v128_cmpeq32( hash, targ ) ) ) ))
|
|
||||||
return 0;
|
|
||||||
else
|
|
||||||
{
|
|
||||||
flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
|
|
||||||
if ( 0 != ( t6_mask & ( flip ^ v128_movmask32(
|
|
||||||
v128_cmpgt32( hash, targ ) ) ) ) )
|
|
||||||
return 0;
|
|
||||||
else if ( target[6] == 0x80000000 )
|
|
||||||
{
|
|
||||||
if ( 0 == ( t6_mask & v128_movmask32(
|
|
||||||
v128_cmpgt32( hash, v128_xor( hash, hash ) ) ) ) )
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// rounds 59 to 61 part 2
|
|
||||||
E = v128_add32( T1, v128_add32( BSG2_0( F ), MAJs( F, G, H ) ) );
|
|
||||||
D = v128_add32( T2, v128_add32( BSG2_0( E ), MAJs( E, F, G ) ) );
|
|
||||||
C = v128_add32( T0, v128_add32( BSG2_0( D ), MAJs( D, E, F ) ) );
|
|
||||||
|
|
||||||
// rounds 62 & 63
|
|
||||||
W[14] = SHA256_4X32_MEXP( W[12], W[ 7], W[15], W[14] );
|
|
||||||
W[15] = SHA256_4X32_MEXP( W[13], W[ 8], W[ 0], W[15] );
|
|
||||||
|
|
||||||
SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 14, 48 );
|
|
||||||
SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 15, 48 );
|
|
||||||
|
|
||||||
state_out[0] = v128_add32( state_in[0], A );
|
|
||||||
state_out[1] = v128_add32( state_in[1], B );
|
|
||||||
state_out[2] = v128_add32( state_in[2], C );
|
|
||||||
state_out[3] = v128_add32( state_in[3], D );
|
|
||||||
state_out[4] = v128_add32( state_in[4], E );
|
|
||||||
state_out[5] = v128_add32( state_in[5], F );
|
|
||||||
state_out[6] = v128_add32( state_in[6], G );
|
|
||||||
state_out[7] = v128_add32( state_in[7], H );
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
void sha256_4x32_init( sha256_4x32_context *sc )
|
void sha256_4x32_init( sha256_4x32_context *sc )
|
||||||
{
|
{
|
||||||
sc->count_high = sc->count_low = 0;
|
sc->count_high = sc->count_low = 0;
|
||||||
@@ -529,28 +394,30 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
|
|||||||
sha256_4x32_close( &ctx, dst );
|
sha256_4x32_close( &ctx, dst );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif // SSE2 || NEON
|
||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
// SHA-256 8 way
|
// SHA-256 8 way
|
||||||
|
|
||||||
#define BSG2_0x(x) \
|
#define BSG2_0x(x) \
|
||||||
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 2 ), \
|
mm256_xor3( mm256_ror_32( x, 2 ), \
|
||||||
mm256_ror_32( x, 13 ) ), \
|
mm256_ror_32( x, 13 ), \
|
||||||
mm256_ror_32( x, 22 ) )
|
mm256_ror_32( x, 22 ) )
|
||||||
|
|
||||||
#define BSG2_1x(x) \
|
#define BSG2_1x(x) \
|
||||||
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 6 ), \
|
mm256_xor3( mm256_ror_32( x, 6 ), \
|
||||||
mm256_ror_32( x, 11 ) ), \
|
mm256_ror_32( x, 11 ), \
|
||||||
mm256_ror_32( x, 25 ) )
|
mm256_ror_32( x, 25 ) )
|
||||||
|
|
||||||
#define SSG2_0x(x) \
|
#define SSG2_0x(x) \
|
||||||
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 7 ), \
|
mm256_xor3( mm256_ror_32( x, 7 ), \
|
||||||
mm256_ror_32( x, 18 ) ), \
|
mm256_ror_32( x, 18 ), \
|
||||||
_mm256_srli_epi32( x, 3 ) )
|
_mm256_srli_epi32( x, 3 ) )
|
||||||
|
|
||||||
#define SSG2_1x(x) \
|
#define SSG2_1x(x) \
|
||||||
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 17 ), \
|
mm256_xor3( mm256_ror_32( x, 17 ), \
|
||||||
mm256_ror_32( x, 19 ) ), \
|
mm256_ror_32( x, 19 ), \
|
||||||
_mm256_srli_epi32( x, 10 ) )
|
_mm256_srli_epi32( x, 10 ) )
|
||||||
|
|
||||||
#define SHA256_8WAY_MEXP( a, b, c, d ) \
|
#define SHA256_8WAY_MEXP( a, b, c, d ) \
|
||||||
@@ -574,62 +441,6 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
|
|||||||
W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] ); \
|
W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] ); \
|
||||||
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
|
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
|
||||||
|
|
||||||
|
|
||||||
// With AVX512VL ternary logic optimizations are available.
|
|
||||||
// If not optimize by forwarding the result of X^Y in MAJ to the next round
|
|
||||||
// to avoid recalculating it as Y^Z. This optimization is not applicable
|
|
||||||
// when MAJ is optimized with ternary logic.
|
|
||||||
|
|
||||||
#if defined(__AVX512VL__)
|
|
||||||
|
|
||||||
#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )
|
|
||||||
|
|
||||||
#define MAJx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 )
|
|
||||||
|
|
||||||
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
|
|
||||||
do { \
|
|
||||||
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i) ] ), \
|
|
||||||
W[ i ] ); \
|
|
||||||
__m256i T1 = BSG2_1x( E ); \
|
|
||||||
__m256i T2 = BSG2_0x( A ); \
|
|
||||||
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
|
|
||||||
T1 = _mm256_add_epi32( T1, H ); \
|
|
||||||
T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \
|
|
||||||
T1 = _mm256_add_epi32( T1, T0 ); \
|
|
||||||
D = _mm256_add_epi32( D, T1 ); \
|
|
||||||
H = _mm256_add_epi32( T1, T2 ); \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
#define SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
|
|
||||||
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, j ); \
|
|
||||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, j ); \
|
|
||||||
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, j ); \
|
|
||||||
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, j ); \
|
|
||||||
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, j ); \
|
|
||||||
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, j ); \
|
|
||||||
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, j ); \
|
|
||||||
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, j ); \
|
|
||||||
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, j ); \
|
|
||||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, j ); \
|
|
||||||
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 10, j ); \
|
|
||||||
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 11, j ); \
|
|
||||||
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 12, j ); \
|
|
||||||
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 13, j ); \
|
|
||||||
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, j ); \
|
|
||||||
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, j );
|
|
||||||
|
|
||||||
// Not used with AVX512, needed to satisfy the compiler
|
|
||||||
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
|
||||||
{ \
|
|
||||||
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
|
|
||||||
v256_32( K256[(i)+(j)] ) ); \
|
|
||||||
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
|
|
||||||
D = _mm256_add_epi32( D, T1 ); \
|
|
||||||
H = _mm256_add_epi32( T1, T2 ); \
|
|
||||||
}
|
|
||||||
|
|
||||||
#else // AVX2
|
|
||||||
|
|
||||||
#define CHx(X, Y, Z) \
|
#define CHx(X, Y, Z) \
|
||||||
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
|
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
|
||||||
|
|
||||||
@@ -641,61 +452,58 @@ do { \
|
|||||||
|
|
||||||
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
||||||
{ \
|
{ \
|
||||||
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
|
H = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
|
||||||
v256_32( K256[(i)+(j)] ) ); \
|
v256_32( K256[(i)+(j)] ) ); \
|
||||||
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
|
__m256i T = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
|
||||||
Y_xor_Z = X_xor_Y; \
|
Y_xor_Z = X_xor_Y; \
|
||||||
D = _mm256_add_epi32( D, T1 ); \
|
D = _mm256_add_epi32( D, H ); \
|
||||||
H = _mm256_add_epi32( T1, T2 ); \
|
H = _mm256_add_epi32( H, T ); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
|
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
|
||||||
do { \
|
{ \
|
||||||
__m256i T0 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \
|
__m256i T1 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \
|
||||||
__m256i T1 = BSG2_1x( E ); \
|
H = _mm256_add_epi32( H, BSG2_1x( E ) ); \
|
||||||
__m256i T2 = BSG2_0x( A ); \
|
__m256i T2 = BSG2_0x( A ); \
|
||||||
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
|
T1 = _mm256_add_epi32( T1, CHx( E, F, G ) ); \
|
||||||
T1 = _mm256_add_epi32( T1, H ); \
|
|
||||||
T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \
|
T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \
|
||||||
T1 = _mm256_add_epi32( T1, T0 ); \
|
H = _mm256_add_epi32( H, T1 ); \
|
||||||
Y_xor_Z = X_xor_Y; \
|
Y_xor_Z = X_xor_Y; \
|
||||||
D = _mm256_add_epi32( D, T1 ); \
|
D = _mm256_add_epi32( D, H ); \
|
||||||
H = _mm256_add_epi32( T1, T2 ); \
|
H = _mm256_add_epi32( H, T2 ); \
|
||||||
} while (0)
|
}
|
||||||
|
|
||||||
// read Y_xor_Z, update X_xor_Y
|
// read Y_xor_Z, update X_xor_Y
|
||||||
#define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \
|
#define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \
|
||||||
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
|
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
|
||||||
Y_xor_Z ) )
|
Y_xor_Z ) )
|
||||||
|
|
||||||
// start with toc initialized to y^z: toc = B ^ C
|
// start with toc initialized to y^z, toc = B ^ C for first round.
|
||||||
// First round reads toc as Y_xor_Z and saves X_xor_Y as tic.
|
// First round reads toc as Y_xor_Z and saves X_xor_Y as tic.
|
||||||
// Second round reads tic as Y_xor_Z and saves X_xor_Y as toc.
|
// Second round reads tic as Y_xor_Z and saves X_xor_Y as toc.
|
||||||
|
|
||||||
#define SHA256_8WAY_2ROUNDS( A, B, C, D, E, F, G, H, i0, i1, j ) \
|
#define SHA256_8WAY_2ROUNDS( A, B, C, D, E, F, G, H, i0, i1, j ) \
|
||||||
do { \
|
{ \
|
||||||
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \
|
__m256i T1 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \
|
||||||
W[ i0 ] ); \
|
W[ i0 ] ); \
|
||||||
__m256i T1 = BSG2_1x( E ); \
|
H = _mm256_add_epi32( H, BSG2_1x( E ) ); \
|
||||||
__m256i T2 = BSG2_0x( A ); \
|
__m256i T2 = BSG2_0x( A ); \
|
||||||
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
|
T1 = _mm256_add_epi32( T1, CHx( E, F, G ) ); \
|
||||||
T1 = _mm256_add_epi32( T1, H ); \
|
|
||||||
T2 = _mm256_add_epi32( T2, MAJ_2step( A, B, C, tic, toc ) ); \
|
T2 = _mm256_add_epi32( T2, MAJ_2step( A, B, C, tic, toc ) ); \
|
||||||
T1 = _mm256_add_epi32( T1, T0 ); \
|
H = _mm256_add_epi32( H, T1 ); \
|
||||||
D = _mm256_add_epi32( D, T1 ); \
|
D = _mm256_add_epi32( D, H ); \
|
||||||
H = _mm256_add_epi32( T1, T2 ); \
|
H = _mm256_add_epi32( H, T2 ); \
|
||||||
\
|
\
|
||||||
T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \
|
T1 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \
|
||||||
W[ (i1) ] ); \
|
W[ (i1) ] ); \
|
||||||
T1 = BSG2_1x( D ); \
|
G = _mm256_add_epi32( G, BSG2_1x( D ) ); \
|
||||||
T2 = BSG2_0x( H ); \
|
T2 = BSG2_0x( H ); \
|
||||||
T0 = _mm256_add_epi32( T0, CHx( D, E, F ) ); \
|
T1 = _mm256_add_epi32( T1, CHx( D, E, F ) ); \
|
||||||
T1 = _mm256_add_epi32( T1, G ); \
|
|
||||||
T2 = _mm256_add_epi32( T2, MAJ_2step( H, A, B, toc, tic ) ); \
|
T2 = _mm256_add_epi32( T2, MAJ_2step( H, A, B, toc, tic ) ); \
|
||||||
T1 = _mm256_add_epi32( T1, T0 ); \
|
G = _mm256_add_epi32( G, T1 ); \
|
||||||
C = _mm256_add_epi32( C, T1 ); \
|
C = _mm256_add_epi32( C, G ); \
|
||||||
G = _mm256_add_epi32( T1, T2 ); \
|
G = _mm256_add_epi32( G, T2 ); \
|
||||||
} while (0)
|
}
|
||||||
|
|
||||||
#define SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
|
#define SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
|
||||||
{ \
|
{ \
|
||||||
@@ -710,8 +518,6 @@ do { \
|
|||||||
SHA256_8WAY_2ROUNDS( C, D, E, F, G, H, A, B, 14, 15, j ); \
|
SHA256_8WAY_2ROUNDS( C, D, E, F, G, H, A, B, 14, 15, j ); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // AVX512VL else AVX2
|
|
||||||
|
|
||||||
static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W,
|
static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W,
|
||||||
const __m256i *in ) \
|
const __m256i *in ) \
|
||||||
{
|
{
|
||||||
@@ -745,7 +551,7 @@ static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// accepts LE input data
|
// accepts LE input data
|
||||||
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
|
void sha256_8x32_transform_le( __m256i *state_out, const __m256i *data,
|
||||||
const __m256i *state_in )
|
const __m256i *state_in )
|
||||||
{
|
{
|
||||||
__m256i W[16];
|
__m256i W[16];
|
||||||
@@ -754,7 +560,7 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Accepts BE input data, need to bswap
|
// Accepts BE input data, need to bswap
|
||||||
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
|
void sha256_8x32_transform_be( __m256i *state_out, const __m256i *data,
|
||||||
const __m256i *state_in )
|
const __m256i *state_in )
|
||||||
{
|
{
|
||||||
__m256i W[16];
|
__m256i W[16];
|
||||||
@@ -764,7 +570,7 @@ void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Aggressive prehashing, LE byte order
|
// Aggressive prehashing, LE byte order
|
||||||
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
void sha256_8x32_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
||||||
const __m256i *W, const __m256i *state_in )
|
const __m256i *W, const __m256i *state_in )
|
||||||
{
|
{
|
||||||
__m256i A, B, C, D, E, F, G, H, T1;
|
__m256i A, B, C, D, E, F, G, H, T1;
|
||||||
@@ -788,9 +594,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
|||||||
G = _mm256_load_si256( state_in + 6 );
|
G = _mm256_load_si256( state_in + 6 );
|
||||||
H = _mm256_load_si256( state_in + 7 );
|
H = _mm256_load_si256( state_in + 7 );
|
||||||
|
|
||||||
#if !defined(__AVX512VL__)
|
|
||||||
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
|
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
|
||||||
#endif
|
|
||||||
|
|
||||||
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
|
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
|
||||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
|
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
|
||||||
@@ -813,7 +617,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
|||||||
_mm256_store_si256( state_mid + 7, H );
|
_mm256_store_si256( state_mid + 7, H );
|
||||||
}
|
}
|
||||||
|
|
||||||
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
|
||||||
const __m256i *state_in, const __m256i *state_mid, const __m256i *X )
|
const __m256i *state_in, const __m256i *state_mid, const __m256i *X )
|
||||||
{
|
{
|
||||||
__m256i A, B, C, D, E, F, G, H;
|
__m256i A, B, C, D, E, F, G, H;
|
||||||
@@ -830,9 +634,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
|||||||
G = _mm256_load_si256( state_mid + 6 );
|
G = _mm256_load_si256( state_mid + 6 );
|
||||||
H = _mm256_load_si256( state_mid + 7 );
|
H = _mm256_load_si256( state_mid + 7 );
|
||||||
|
|
||||||
#if !defined(__AVX512VL__)
|
|
||||||
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
|
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
|
||||||
#endif
|
|
||||||
|
|
||||||
// round 3 part 2, add nonces
|
// round 3 part 2, add nonces
|
||||||
A = _mm256_add_epi32( A, W[3] );
|
A = _mm256_add_epi32( A, W[3] );
|
||||||
@@ -914,15 +716,13 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
|||||||
_mm256_store_si256( state_out + 7, H );
|
_mm256_store_si256( state_out + 7, H );
|
||||||
}
|
}
|
||||||
|
|
||||||
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||||
const __m256i *state_in, const uint32_t *target )
|
const __m256i *state_in, const uint32_t *target )
|
||||||
{
|
{
|
||||||
__m256i A, B, C, D, E, F, G, H, T0, T1, T2;
|
__m256i A, B, C, D, E, F, G, H, G57, H56;
|
||||||
__m256i vmask, targ, hash;
|
__m256i vmask, targ, hash;
|
||||||
__m256i W[16]; memcpy_256( W, data, 16 );
|
__m256i W[16]; memcpy_256( W, data, 16 );
|
||||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
uint8_t flip, t6_mask, t7_mask;
|
||||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
|
||||||
uint8_t flip, t6_mask;
|
|
||||||
|
|
||||||
A = _mm256_load_si256( state_in );
|
A = _mm256_load_si256( state_in );
|
||||||
B = _mm256_load_si256( state_in+1 );
|
B = _mm256_load_si256( state_in+1 );
|
||||||
@@ -933,12 +733,10 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
|||||||
G = _mm256_load_si256( state_in+6 );
|
G = _mm256_load_si256( state_in+6 );
|
||||||
H = _mm256_load_si256( state_in+7 );
|
H = _mm256_load_si256( state_in+7 );
|
||||||
|
|
||||||
const __m256i IV7 = H;
|
const __m256i istate6 = G;
|
||||||
const __m256i IV6 = G;
|
const __m256i istate7 = H;
|
||||||
|
|
||||||
#if !defined(__AVX512VL__)
|
|
||||||
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
|
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
|
||||||
#endif
|
|
||||||
|
|
||||||
// rounds 0 to 16, ignore zero padding W[9..14]
|
// rounds 0 to 16, ignore zero padding W[9..14]
|
||||||
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
|
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
|
||||||
@@ -981,11 +779,9 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
|||||||
W[11] = SHA256_8WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
|
W[11] = SHA256_8WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
|
||||||
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );
|
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );
|
||||||
|
|
||||||
#if !defined(__AVX512VL__)
|
|
||||||
Y_xor_Z = _mm256_xor_si256( B, C );
|
Y_xor_Z = _mm256_xor_si256( B, C );
|
||||||
#endif
|
|
||||||
|
|
||||||
// rounds 48 to 57
|
// Rounds 48 to 55
|
||||||
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
|
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
|
||||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
|
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
|
||||||
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
|
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
|
||||||
@@ -994,77 +790,83 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
|||||||
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
|
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
|
||||||
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
|
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
|
||||||
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
|
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
|
||||||
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 48 );
|
|
||||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
|
|
||||||
|
|
||||||
// round 58 to 60 part 1
|
// Round 56
|
||||||
T0 = _mm256_add_epi32( v256_32( K256[58] ),
|
H = _mm256_add_epi32( v256_32( K256[56] ),
|
||||||
|
mm256_add4_32( BSG2_1x( E ), CHx( E, F, G ), W[ 8], H ) );
|
||||||
|
D = _mm256_add_epi32( D, H );
|
||||||
|
H56 = _mm256_add_epi32( H, _mm256_add_epi32( BSG2_0x( A ),
|
||||||
|
MAJx( A, B, C ) ) );
|
||||||
|
Y_xor_Z = X_xor_Y;
|
||||||
|
|
||||||
|
// Rounds 57 to 60 part 1
|
||||||
|
G = _mm256_add_epi32( v256_32( K256[57] ),
|
||||||
|
mm256_add4_32( BSG2_1x( D ), CHx( D, E, F ), W[ 9], G ) );
|
||||||
|
C = _mm256_add_epi32( C, G );
|
||||||
|
G57 = _mm256_add_epi32( G, MAJx( H56, A, B ) );
|
||||||
|
|
||||||
|
F = _mm256_add_epi32( v256_32( K256[58] ),
|
||||||
mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), W[10], F ) );
|
mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), W[10], F ) );
|
||||||
B = _mm256_add_epi32( B, T0 );
|
B = _mm256_add_epi32( B, F );
|
||||||
|
|
||||||
T1 = _mm256_add_epi32( v256_32( K256[59] ),
|
E = _mm256_add_epi32( v256_32( K256[59] ),
|
||||||
mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), W[11], E ) );
|
mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), W[11], E ) );
|
||||||
A = _mm256_add_epi32( A, T1 );
|
A = _mm256_add_epi32( A, E );
|
||||||
|
|
||||||
T2 = _mm256_add_epi32( v256_32( K256[60] ),
|
D = _mm256_add_epi32( v256_32( K256[60] ),
|
||||||
mm256_add4_32( BSG2_1x( A ), CHx( A, B, C ), W[12], D ) );
|
mm256_add4_32( BSG2_1x( A ), CHx( A, B, C ), W[12], D ) );
|
||||||
H = _mm256_add_epi32( H, T2 );
|
H = _mm256_add_epi32( H56, D );
|
||||||
|
|
||||||
// Got H, test it.
|
// Got H, test it.
|
||||||
|
hash = mm256_bswap_32( _mm256_add_epi32( H, istate7 ) );
|
||||||
targ = v256_32( target[7] );
|
targ = v256_32( target[7] );
|
||||||
hash = _mm256_shuffle_epi8( _mm256_add_epi32( H, IV7 ), bswap_shuf );
|
// A simple unsigned LE test is complicated by the lack of a cmple
|
||||||
if ( target[7] )
|
// instruction, and lack of unsigned compares in AVX2.
|
||||||
{
|
|
||||||
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
|
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
|
||||||
if ( likely( 0xff == ( flip ^
|
if ( likely( 0xff == ( t7_mask = ( flip ^
|
||||||
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ))
|
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) )))
|
||||||
return 0;
|
return 0;
|
||||||
}
|
|
||||||
t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) );
|
t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) );
|
||||||
|
|
||||||
// round 58 part 2
|
// Round 57 part 2
|
||||||
F = _mm256_add_epi32( T0, _mm256_add_epi32( BSG2_0x( G ),
|
G57 = _mm256_add_epi32( G57, BSG2_0x( H56 ) );
|
||||||
MAJx( G, H, A ) ) );
|
Y_xor_Z = X_xor_Y;
|
||||||
// round 61 part 1
|
|
||||||
W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] );
|
|
||||||
T0 = _mm256_add_epi32( v256_32( K256[61] ),
|
|
||||||
mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) );
|
|
||||||
G = _mm256_add_epi32( G, T0 );
|
|
||||||
|
|
||||||
if ( t6_mask )
|
// Round 61 part 1
|
||||||
|
W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] );
|
||||||
|
C = _mm256_add_epi32( v256_32( K256[61] ),
|
||||||
|
mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) );
|
||||||
|
G = _mm256_add_epi32( G57, C );
|
||||||
|
|
||||||
|
if ( t6_mask == (0xff & ~t7_mask ) )
|
||||||
{
|
{
|
||||||
// Testing H was inconclusive: hash7 == target7, need to test G
|
// Testing H was inconclusive: hash7 == target7, need to test G
|
||||||
targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
|
targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
|
||||||
hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
|
hash = mm256_bswap_32( _mm256_add_epi32( G, istate6 ) );
|
||||||
|
|
||||||
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
|
|
||||||
_mm256_cmpeq_epi32( hash, targ ) ) ) ))
|
|
||||||
{
|
|
||||||
flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
|
flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
|
||||||
if ( likely( 0 != ( t6_mask & ( flip ^
|
if ( likely( 0 != ( t6_mask & ( flip ^
|
||||||
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ) ))
|
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ) ))
|
||||||
return 0;
|
return 0;
|
||||||
if ( likely( ( target[6] == 0x80000000 )
|
|
||||||
&& ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32(
|
|
||||||
hash, _mm256_xor_si256( hash, hash ) ) ) ) ) ))
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
// else inconclusive, testing targ5 isn't practical, finish hashing
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// At this point either the hash will be good or the test was inconclusive.
|
// Rounds 58 to 61 part 2
|
||||||
// If the latter it's probably a high target difficulty with a nearly equal
|
F = _mm256_add_epi32( F, _mm256_add_epi32( BSG2_0x( G57 ),
|
||||||
// high difficulty hash that has a good chance of being good.
|
MAJx( G57, H, A ) ) );
|
||||||
|
Y_xor_Z = X_xor_Y;
|
||||||
|
|
||||||
// rounds 59 to 61 part 2
|
E = _mm256_add_epi32( E, _mm256_add_epi32( BSG2_0x( F ),
|
||||||
E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x( F ),
|
MAJx( F, G57, H ) ) );
|
||||||
MAJx( F, G, H ) ) );
|
Y_xor_Z = X_xor_Y;
|
||||||
D = _mm256_add_epi32( T2, _mm256_add_epi32( BSG2_0x( E ),
|
|
||||||
MAJx( E, F, G ) ) );
|
D = _mm256_add_epi32( D, _mm256_add_epi32( BSG2_0x( E ),
|
||||||
C = _mm256_add_epi32( T0, _mm256_add_epi32( BSG2_0x( D ),
|
MAJx( E, F, G57 ) ) );
|
||||||
|
Y_xor_Z = X_xor_Y;
|
||||||
|
|
||||||
|
C = _mm256_add_epi32( C, _mm256_add_epi32( BSG2_0x( D ),
|
||||||
MAJx( D, E, F ) ) );
|
MAJx( D, E, F ) ) );
|
||||||
|
Y_xor_Z = X_xor_Y;
|
||||||
|
|
||||||
// rounds 62 & 63
|
// Rounds 62 & 63
|
||||||
W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] );
|
W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] );
|
||||||
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
|
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
|
||||||
|
|
||||||
@@ -1083,8 +885,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void sha256_8x32_init( sha256_8x32_context *sc )
|
||||||
void sha256_8way_init( sha256_8way_context *sc )
|
|
||||||
{
|
{
|
||||||
sc->count_high = sc->count_low = 0;
|
sc->count_high = sc->count_low = 0;
|
||||||
sc->val[0] = v256_32( sha256_iv[0] );
|
sc->val[0] = v256_32( sha256_iv[0] );
|
||||||
@@ -1100,7 +901,7 @@ void sha256_8way_init( sha256_8way_context *sc )
|
|||||||
// need to handle odd byte length for yespower.
|
// need to handle odd byte length for yespower.
|
||||||
// Assume only last update is odd.
|
// Assume only last update is odd.
|
||||||
|
|
||||||
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
|
void sha256_8x32_update( sha256_8x32_context *sc, const void *data, size_t len )
|
||||||
{
|
{
|
||||||
__m256i *vdata = (__m256i*)data;
|
__m256i *vdata = (__m256i*)data;
|
||||||
size_t ptr;
|
size_t ptr;
|
||||||
@@ -1121,7 +922,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
|
|||||||
len -= clen;
|
len -= clen;
|
||||||
if ( ptr == buf_size )
|
if ( ptr == buf_size )
|
||||||
{
|
{
|
||||||
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
|
sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
|
||||||
ptr = 0;
|
ptr = 0;
|
||||||
}
|
}
|
||||||
clow = sc->count_low;
|
clow = sc->count_low;
|
||||||
@@ -1132,7 +933,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void sha256_8way_close( sha256_8way_context *sc, void *dst )
|
void sha256_8x32_close( sha256_8x32_context *sc, void *dst )
|
||||||
{
|
{
|
||||||
unsigned ptr;
|
unsigned ptr;
|
||||||
uint32_t low, high;
|
uint32_t low, high;
|
||||||
@@ -1146,7 +947,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
|
|||||||
if ( ptr > pad )
|
if ( ptr > pad )
|
||||||
{
|
{
|
||||||
memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
|
memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
|
||||||
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
|
sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
|
||||||
memset_zero_256( sc->buf, pad >> 2 );
|
memset_zero_256( sc->buf, pad >> 2 );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -1159,20 +960,20 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
|
|||||||
sc->buf[ pad >> 2 ] = v256_32( bswap_32( high ) );
|
sc->buf[ pad >> 2 ] = v256_32( bswap_32( high ) );
|
||||||
sc->buf[ ( pad+4 ) >> 2 ] = v256_32( bswap_32( low ) );
|
sc->buf[ ( pad+4 ) >> 2 ] = v256_32( bswap_32( low ) );
|
||||||
|
|
||||||
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
|
sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
|
||||||
|
|
||||||
mm256_block_bswap_32( dst, sc->val );
|
mm256_block_bswap_32( dst, sc->val );
|
||||||
}
|
}
|
||||||
|
|
||||||
void sha256_8way_full( void *dst, const void *data, size_t len )
|
void sha256_8x32_full( void *dst, const void *data, size_t len )
|
||||||
{
|
{
|
||||||
sha256_8way_context ctx;
|
sha256_8x32_context ctx;
|
||||||
sha256_8way_init( &ctx );
|
sha256_8x32_init( &ctx );
|
||||||
sha256_8way_update( &ctx, data, len );
|
sha256_8x32_update( &ctx, data, len );
|
||||||
sha256_8way_close( &ctx, dst );
|
sha256_8x32_close( &ctx, dst );
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
// SHA-256 16 way
|
// SHA-256 16 way
|
||||||
|
|
||||||
@@ -1218,40 +1019,26 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
|
|||||||
W[15] = SHA256_16WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
|
W[15] = SHA256_16WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
|
||||||
|
|
||||||
#define SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
|
#define SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
|
||||||
do { \
|
{ \
|
||||||
__m512i T0 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \
|
__m512i T1 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \
|
||||||
__m512i T1 = BSG2_1x16( E ); \
|
H = _mm512_add_epi32( H, BSG2_1x16( E ) ); \
|
||||||
__m512i T2 = BSG2_0x16( A ); \
|
__m512i T2 = BSG2_0x16( A ); \
|
||||||
T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \
|
T1 = _mm512_add_epi32( T1, CHx16( E, F, G ) ); \
|
||||||
T1 = _mm512_add_epi32( T1, H ); \
|
|
||||||
T2 = _mm512_add_epi32( T2, MAJx16( A, B, C ) ); \
|
T2 = _mm512_add_epi32( T2, MAJx16( A, B, C ) ); \
|
||||||
T1 = _mm512_add_epi32( T1, T0 ); \
|
H = _mm512_add_epi32( H, T1 ); \
|
||||||
D = _mm512_add_epi32( D, T1 ); \
|
D = _mm512_add_epi32( D, H ); \
|
||||||
H = _mm512_add_epi32( T1, T2 ); \
|
H = _mm512_add_epi32( H, T2 ); \
|
||||||
} while (0)
|
}
|
||||||
|
|
||||||
#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
||||||
{ \
|
{ \
|
||||||
__m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
|
H = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
|
||||||
v512_32( K256[(i)+(j)] ) ); \
|
v512_32( K256[(i)+(j)] ) ); \
|
||||||
__m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
|
__m512i T = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
|
||||||
D = _mm512_add_epi32( D, T1 ); \
|
D = _mm512_add_epi32( D, H ); \
|
||||||
H = _mm512_add_epi32( T1, T2 ); \
|
H = _mm512_add_epi32( H, T ); \
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
#define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \
|
|
||||||
do { \
|
|
||||||
__m512i T1, T2; \
|
|
||||||
__m512i K = v512_32( K256[( (j)+(i) )] ); \
|
|
||||||
T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \
|
|
||||||
K, W[i] ) ); \
|
|
||||||
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
|
|
||||||
D = _mm512_add_epi32( D, T1 ); \
|
|
||||||
H = _mm512_add_epi32( T1, T2 ); \
|
|
||||||
} while (0)
|
|
||||||
*/
|
|
||||||
|
|
||||||
#define SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
|
#define SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
|
||||||
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, j ); \
|
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, j ); \
|
||||||
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, j ); \
|
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, j ); \
|
||||||
@@ -1302,7 +1089,7 @@ static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// accepts LE input data
|
// accepts LE input data
|
||||||
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
|
void sha256_16x32_transform_le( __m512i *state_out, const __m512i *data,
|
||||||
const __m512i *state_in )
|
const __m512i *state_in )
|
||||||
{
|
{
|
||||||
__m512i W[16];
|
__m512i W[16];
|
||||||
@@ -1311,7 +1098,7 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Accepts BE input data, need to bswap
|
// Accepts BE input data, need to bswap
|
||||||
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
|
void sha256_16x32_transform_be( __m512i *state_out, const __m512i *data,
|
||||||
const __m512i *state_in )
|
const __m512i *state_in )
|
||||||
{
|
{
|
||||||
__m512i W[16];
|
__m512i W[16];
|
||||||
@@ -1321,7 +1108,7 @@ void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Aggressive prehashing, LE byte order
|
// Aggressive prehashing, LE byte order
|
||||||
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
void sha256_16x32_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||||
const __m512i *W, const __m512i *state_in )
|
const __m512i *W, const __m512i *state_in )
|
||||||
{
|
{
|
||||||
__m512i A, B, C, D, E, F, G, H, T1;
|
__m512i A, B, C, D, E, F, G, H, T1;
|
||||||
@@ -1369,7 +1156,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
|||||||
_mm512_store_si512( state_mid + 7, H );
|
_mm512_store_si512( state_mid + 7, H );
|
||||||
}
|
}
|
||||||
|
|
||||||
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data,
|
||||||
const __m512i *state_in, const __m512i *state_mid, const __m512i *X )
|
const __m512i *state_in, const __m512i *state_mid, const __m512i *X )
|
||||||
{
|
{
|
||||||
__m512i A, B, C, D, E, F, G, H;
|
__m512i A, B, C, D, E, F, G, H;
|
||||||
@@ -1470,15 +1257,12 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
|||||||
|
|
||||||
// returns 0 if hash aborted early and invalid,
|
// returns 0 if hash aborted early and invalid,
|
||||||
// returns 1 for completed hash with at least one valid candidate.
|
// returns 1 for completed hash with at least one valid candidate.
|
||||||
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||||
const __m512i *state_in, const uint32_t *target )
|
const __m512i *state_in, const uint32_t *target )
|
||||||
{
|
{
|
||||||
__m512i A, B, C, D, E, F, G, H, hash, targ;
|
__m512i A, B, C, D, E, F, G, H, hash, targ, G57, H56;
|
||||||
__m512i T0, T1, T2;
|
|
||||||
__m512i W[16]; memcpy_512( W, data, 16 );
|
__m512i W[16]; memcpy_512( W, data, 16 );
|
||||||
__mmask16 t6_mask;
|
__mmask16 mask;
|
||||||
const __m512i bswap_shuf = mm512_bcast_m128( _mm_set_epi64x(
|
|
||||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
|
||||||
|
|
||||||
A = _mm512_load_si512( state_in );
|
A = _mm512_load_si512( state_in );
|
||||||
B = _mm512_load_si512( state_in+1 );
|
B = _mm512_load_si512( state_in+1 );
|
||||||
@@ -1489,8 +1273,8 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
|||||||
G = _mm512_load_si512( state_in+6 );
|
G = _mm512_load_si512( state_in+6 );
|
||||||
H = _mm512_load_si512( state_in+7 );
|
H = _mm512_load_si512( state_in+7 );
|
||||||
|
|
||||||
const __m512i IV6 = G;
|
const __m512i istate6 = G;
|
||||||
const __m512i IV7 = H;
|
const __m512i istate7 = H;
|
||||||
|
|
||||||
// rounds 0 to 8
|
// rounds 0 to 8
|
||||||
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
|
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
|
||||||
@@ -1562,7 +1346,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
|||||||
W[11] = SHA256_16WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
|
W[11] = SHA256_16WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
|
||||||
W[12] = SHA256_16WAY_MEXP( W[10], W[ 5], W[13], W[12] );
|
W[12] = SHA256_16WAY_MEXP( W[10], W[ 5], W[13], W[12] );
|
||||||
|
|
||||||
// Rounds 48 to 57
|
// Rounds 48 to 55
|
||||||
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
|
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
|
||||||
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
|
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
|
||||||
SHA256_16WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
|
SHA256_16WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
|
||||||
@@ -1571,62 +1355,67 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
|||||||
SHA256_16WAY_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
|
SHA256_16WAY_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
|
||||||
SHA256_16WAY_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
|
SHA256_16WAY_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
|
||||||
SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
|
SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
|
||||||
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 8, 48 );
|
|
||||||
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
|
|
||||||
|
|
||||||
// rounds 58 to 60 part 1
|
// Round 56
|
||||||
T0 = _mm512_add_epi32( v512_32( K256[58] ),
|
H = _mm512_add_epi32( v512_32( K256[56] ),
|
||||||
|
mm512_add4_32( BSG2_1x16( E ), CHx16( E, F, G ), W[ 8], H ) );
|
||||||
|
D = _mm512_add_epi32( D, H );
|
||||||
|
H56 = _mm512_add_epi32( H, _mm512_add_epi32( BSG2_0x16( A ),
|
||||||
|
MAJx16( A, B, C ) ) );
|
||||||
|
|
||||||
|
// Rounds 57 to 60 part 1
|
||||||
|
G = _mm512_add_epi32( v512_32( K256[57] ),
|
||||||
|
mm512_add4_32( BSG2_1x16( D ), CHx16( D, E, F ), W[ 9], G ) );
|
||||||
|
C = _mm512_add_epi32( C, G );
|
||||||
|
G57 = _mm512_add_epi32( G, MAJx16( H56, A, B ) );
|
||||||
|
|
||||||
|
F = _mm512_add_epi32( v512_32( K256[58] ),
|
||||||
mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) );
|
mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) );
|
||||||
B = _mm512_add_epi32( B, T0 );
|
B = _mm512_add_epi32( B, F );
|
||||||
|
|
||||||
T1 = _mm512_add_epi32( v512_32( K256[59] ),
|
E = _mm512_add_epi32( v512_32( K256[59] ),
|
||||||
mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) );
|
mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) );
|
||||||
A = _mm512_add_epi32( A, T1 );
|
A = _mm512_add_epi32( A, E );
|
||||||
|
|
||||||
T2 = _mm512_add_epi32( v512_32( K256[60] ),
|
D = _mm512_add_epi32( v512_32( K256[60] ),
|
||||||
mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ) );
|
mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ) );
|
||||||
H = _mm512_add_epi32( H, T2 );
|
H = _mm512_add_epi32( H56, D );
|
||||||
|
|
||||||
// got H, test it against target[7]
|
// got final H, test it against target[7]
|
||||||
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
|
hash = mm512_bswap_32( _mm512_add_epi32( H , istate7 ) );
|
||||||
targ = v512_32( target[7] );
|
targ = v512_32( target[7] );
|
||||||
if ( target[7] )
|
if ( likely( 0 == ( mask = _mm512_cmple_epu32_mask( hash, targ ) ) ))
|
||||||
if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) ))
|
|
||||||
return 0;
|
return 0;
|
||||||
t6_mask = _mm512_cmpeq_epi32_mask( hash, targ );
|
|
||||||
|
|
||||||
// round 58 part 2
|
// Round 57 part 2
|
||||||
F = _mm512_add_epi32( T0, _mm512_add_epi32( BSG2_0x16( G ),
|
G57 = _mm512_add_epi32( G57, BSG2_0x16( H56 ) );
|
||||||
MAJx16( G, H, A ) ) );
|
|
||||||
|
|
||||||
// round 61 part 1
|
// Round 61 part 1
|
||||||
W[13] = SHA256_16WAY_MEXP( W[11], W[ 6], W[14], W[13] );
|
W[13] = SHA256_16WAY_MEXP( W[11], W[ 6], W[14], W[13] );
|
||||||
T0 = _mm512_add_epi32( v512_32( K256[61] ),
|
C = _mm512_add_epi32( v512_32( K256[61] ),
|
||||||
mm512_add4_32( BSG2_1x16( H ), CHx16( H, A, B ), W[13], C ) );
|
mm512_add4_32( BSG2_1x16( H ), CHx16( H, A, B ), W[13], C ) );
|
||||||
G = _mm512_add_epi32( G, T0 );
|
G = _mm512_add_epi32( G57, C );
|
||||||
|
|
||||||
// got G, test it against target[6] if indicated
|
// got final G, test it against target[6] if indicated.
|
||||||
if ( (uint16_t)t6_mask )
|
if ( mask == _mm512_cmpeq_epi32_mask( hash, targ ) )
|
||||||
{
|
{
|
||||||
hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
|
hash = mm512_bswap_32( _mm512_add_epi32( G, istate6 ) );
|
||||||
targ = v512_32( target[6] );
|
targ = v512_32( target[6] );
|
||||||
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
|
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( mask, hash, targ ) ))
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// round 59 part 2
|
// Round 58 to 61 part 2
|
||||||
E = _mm512_add_epi32( T1, _mm512_add_epi32( BSG2_0x16( F ),
|
F = _mm512_add_epi32( F, _mm512_add_epi32( BSG2_0x16( G57 ),
|
||||||
MAJx16( F, G, H ) ) );
|
MAJx16( G57, H, A ) ) );
|
||||||
|
E = _mm512_add_epi32( E, _mm512_add_epi32( BSG2_0x16( F ),
|
||||||
// round 60 part 2
|
MAJx16( F, G57, H ) ) );
|
||||||
D = _mm512_add_epi32( T2, _mm512_add_epi32( BSG2_0x16( E ),
|
D = _mm512_add_epi32( D, _mm512_add_epi32( BSG2_0x16( E ),
|
||||||
MAJx16( E, F, G ) ) );
|
MAJx16( E, F, G57 ) ) );
|
||||||
|
C = _mm512_add_epi32( C, _mm512_add_epi32( BSG2_0x16( D ),
|
||||||
// round 61 part 2
|
|
||||||
C = _mm512_add_epi32( T0, _mm512_add_epi32( BSG2_0x16( D ),
|
|
||||||
MAJx16( D, E, F ) ) );
|
MAJx16( D, E, F ) ) );
|
||||||
|
|
||||||
// rounds 62, 63
|
// Rounds 62, 63
|
||||||
W[14] = SHA256_16WAY_MEXP( W[12], W[ 7], W[15], W[14] );
|
W[14] = SHA256_16WAY_MEXP( W[12], W[ 7], W[15], W[14] );
|
||||||
W[15] = SHA256_16WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
|
W[15] = SHA256_16WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
|
||||||
|
|
||||||
@@ -1644,7 +1433,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
void sha256_16way_init( sha256_16way_context *sc )
|
void sha256_16x32_init( sha256_16x32_context *sc )
|
||||||
{
|
{
|
||||||
sc->count_high = sc->count_low = 0;
|
sc->count_high = sc->count_low = 0;
|
||||||
sc->val[0] = v512_32( sha256_iv[0] );
|
sc->val[0] = v512_32( sha256_iv[0] );
|
||||||
@@ -1657,7 +1446,7 @@ void sha256_16way_init( sha256_16way_context *sc )
|
|||||||
sc->val[7] = v512_32( sha256_iv[7] );
|
sc->val[7] = v512_32( sha256_iv[7] );
|
||||||
}
|
}
|
||||||
|
|
||||||
void sha256_16way_update( sha256_16way_context *sc, const void *data,
|
void sha256_16x32_update( sha256_16x32_context *sc, const void *data,
|
||||||
size_t len )
|
size_t len )
|
||||||
{
|
{
|
||||||
__m512i *vdata = (__m512i*)data;
|
__m512i *vdata = (__m512i*)data;
|
||||||
@@ -1679,7 +1468,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data,
|
|||||||
len -= clen;
|
len -= clen;
|
||||||
if ( ptr == buf_size )
|
if ( ptr == buf_size )
|
||||||
{
|
{
|
||||||
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
|
sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
|
||||||
ptr = 0;
|
ptr = 0;
|
||||||
}
|
}
|
||||||
clow = sc->count_low;
|
clow = sc->count_low;
|
||||||
@@ -1690,7 +1479,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void sha256_16way_close( sha256_16way_context *sc, void *dst )
|
void sha256_16x32_close( sha256_16x32_context *sc, void *dst )
|
||||||
{
|
{
|
||||||
unsigned ptr;
|
unsigned ptr;
|
||||||
uint32_t low, high;
|
uint32_t low, high;
|
||||||
@@ -1704,7 +1493,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
|
|||||||
if ( ptr > pad )
|
if ( ptr > pad )
|
||||||
{
|
{
|
||||||
memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
|
memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
|
||||||
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
|
sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
|
||||||
memset_zero_512( sc->buf, pad >> 2 );
|
memset_zero_512( sc->buf, pad >> 2 );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -1717,17 +1506,17 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
|
|||||||
sc->buf[ pad >> 2 ] = v512_32( bswap_32( high ) );
|
sc->buf[ pad >> 2 ] = v512_32( bswap_32( high ) );
|
||||||
sc->buf[ ( pad+4 ) >> 2 ] = v512_32( bswap_32( low ) );
|
sc->buf[ ( pad+4 ) >> 2 ] = v512_32( bswap_32( low ) );
|
||||||
|
|
||||||
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
|
sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
|
||||||
|
|
||||||
mm512_block_bswap_32( dst, sc->val );
|
mm512_block_bswap_32( dst, sc->val );
|
||||||
}
|
}
|
||||||
|
|
||||||
void sha256_16way_full( void *dst, const void *data, size_t len )
|
void sha256_16x32_full( void *dst, const void *data, size_t len )
|
||||||
{
|
{
|
||||||
sha256_16way_context ctx;
|
sha256_16x32_context ctx;
|
||||||
sha256_16way_init( &ctx );
|
sha256_16x32_init( &ctx );
|
||||||
sha256_16way_update( &ctx, data, len );
|
sha256_16x32_update( &ctx, data, len );
|
||||||
sha256_16way_close( &ctx, dst );
|
sha256_16x32_close( &ctx, dst );
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef CH
|
#undef CH
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
#include "sha256-hash.h"
|
#include "sha256-hash.h"
|
||||||
|
|
||||||
#if ( defined(__x86_64__) && defined(__SHA__) ) || defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
|
#if ( defined(__x86_64__) && defined(__SHA__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) )
|
||||||
|
|
||||||
static const uint32_t SHA256_IV[8] =
|
static const uint32_t SHA256_IV[8] =
|
||||||
{
|
{
|
||||||
@@ -10,6 +10,28 @@ static const uint32_t SHA256_IV[8] =
|
|||||||
|
|
||||||
#if defined(__x86_64__) && defined(__SHA__)
|
#if defined(__x86_64__) && defined(__SHA__)
|
||||||
|
|
||||||
|
|
||||||
|
/* common code used for rounds 12 through 51 */
|
||||||
|
|
||||||
|
#define sha256_generic_qround( s0, s1, m, a, b, c ) \
|
||||||
|
TMP = _mm_alignr_epi8( a, c, 4 ); \
|
||||||
|
s1 = _mm_sha256rnds2_epu32( s1, s0, m ); \
|
||||||
|
b = _mm_add_epi32( b, TMP ); \
|
||||||
|
b = _mm_sha256msg2_epu32( b, a ); \
|
||||||
|
m = _mm_shuffle_epi32( m, 0x0e ); \
|
||||||
|
s0 = _mm_sha256rnds2_epu32( s0, s1, m ); \
|
||||||
|
c = _mm_sha256msg1_epu32( c, a );
|
||||||
|
|
||||||
|
// r12-15
|
||||||
|
// sha256_generic_qround( s0, s1, m, t3, t0, t2 )
|
||||||
|
// r16-19
|
||||||
|
// sha256_generic_qround( s0, s1, m, t0, t1, t3 )
|
||||||
|
// r20-23
|
||||||
|
// sha256_generic_qround( s0, s1, m, t1, t2, t0 )
|
||||||
|
// r24-27
|
||||||
|
// sha256_generic_qround( s0, s1, m, t2, t3, t1 ) ...
|
||||||
|
|
||||||
|
|
||||||
#define sha256_opt_rounds( state_out, input, state_in ) \
|
#define sha256_opt_rounds( state_out, input, state_in ) \
|
||||||
{ \
|
{ \
|
||||||
__m128i STATE0, STATE1; \
|
__m128i STATE0, STATE1; \
|
||||||
@@ -189,7 +211,7 @@ static const uint32_t SHA256_IV[8] =
|
|||||||
_mm_store_si128( (__m128i*) &state_out[4], STATE1 ); \
|
_mm_store_si128( (__m128i*) &state_out[4], STATE1 ); \
|
||||||
}
|
}
|
||||||
|
|
||||||
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
|
void sha256_x86_sha_transform_le( uint32_t *state_out, const void *input,
|
||||||
const uint32_t *state_in )
|
const uint32_t *state_in )
|
||||||
{
|
{
|
||||||
#define load_msg( m, i ) casti_v128( m, i )
|
#define load_msg( m, i ) casti_v128( m, i )
|
||||||
@@ -197,7 +219,7 @@ void sha256_opt_transform_le( uint32_t *state_out, const void *input,
|
|||||||
#undef load_msg
|
#undef load_msg
|
||||||
}
|
}
|
||||||
|
|
||||||
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
|
void sha256_x86_sha_transform_be( uint32_t *state_out, const void *input,
|
||||||
const uint32_t *state_in )
|
const uint32_t *state_in )
|
||||||
{
|
{
|
||||||
#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) )
|
#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) )
|
||||||
@@ -517,7 +539,7 @@ void sha256_opt_transform_be( uint32_t *state_out, const void *input,
|
|||||||
_mm_store_si128( (__m128i*) &out_Y[4], STATE1_Y ); \
|
_mm_store_si128( (__m128i*) &out_Y[4], STATE1_Y ); \
|
||||||
}
|
}
|
||||||
|
|
||||||
void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y,
|
void sha256_x86_x2sha_transform_le( uint32_t *out_X, uint32_t*out_Y,
|
||||||
const void *msg_X, const void *msg_Y,
|
const void *msg_X, const void *msg_Y,
|
||||||
const uint32_t *in_X, const uint32_t *in_Y )
|
const uint32_t *in_X, const uint32_t *in_Y )
|
||||||
{
|
{
|
||||||
@@ -526,7 +548,7 @@ void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y,
|
|||||||
#undef load_msg
|
#undef load_msg
|
||||||
}
|
}
|
||||||
|
|
||||||
void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y,
|
void sha256_x86_x2sha_transform_be( uint32_t *out_X, uint32_t*out_Y,
|
||||||
const void *msg_X, const void *msg_Y,
|
const void *msg_X, const void *msg_Y,
|
||||||
const uint32_t *in_X, const uint32_t *in_Y )
|
const uint32_t *in_X, const uint32_t *in_Y )
|
||||||
{
|
{
|
||||||
@@ -541,14 +563,14 @@ void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y,
|
|||||||
// The goal is to avoid any redundant processing in final. Prehash is almost
|
// The goal is to avoid any redundant processing in final. Prehash is almost
|
||||||
// 4 rounds total, only missing the final addition of the nonce.
|
// 4 rounds total, only missing the final addition of the nonce.
|
||||||
// Nonce must be set to zero for prehash.
|
// Nonce must be set to zero for prehash.
|
||||||
void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
|
void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
|
||||||
uint32_t *sstate, const uint32_t *istate )
|
uint32_t *sstate, const uint32_t *istate )
|
||||||
{
|
{
|
||||||
__m128i STATE0, STATE1, MSG, TMP;
|
__m128i STATE0, STATE1, MSG, TMP;
|
||||||
|
|
||||||
// Load initial values
|
// Load initial values
|
||||||
TMP = casti_m128i( istate, 0 );
|
TMP = casti_v128u32( istate, 0 );
|
||||||
STATE1 = casti_m128i( istate, 1 );
|
STATE1 = casti_v128u32( istate, 1 );
|
||||||
|
|
||||||
TMP = _mm_shuffle_epi32( TMP, 0xB1 ); // CDAB
|
TMP = _mm_shuffle_epi32( TMP, 0xB1 ); // CDAB
|
||||||
STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); // EFGH
|
STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); // EFGH
|
||||||
@@ -556,20 +578,20 @@ void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
|
|||||||
STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH
|
STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH
|
||||||
|
|
||||||
// Save current hash
|
// Save current hash
|
||||||
casti_m128i( sstate, 0 ) = STATE0;
|
casti_v128u32( sstate, 0 ) = STATE0;
|
||||||
casti_m128i( sstate, 1 ) = STATE1;
|
casti_v128u32( sstate, 1 ) = STATE1;
|
||||||
|
|
||||||
// Rounds 0 to 3
|
// Rounds 0 to 3
|
||||||
MSG = casti_m128i( msg, 0 );
|
MSG = casti_v128u32( msg, 0 );
|
||||||
TMP = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL );
|
TMP = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL );
|
||||||
MSG = _mm_add_epi32( MSG, TMP );
|
MSG = _mm_add_epi32( MSG, TMP );
|
||||||
STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
|
STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
|
||||||
MSG = _mm_shuffle_epi32( MSG, 0x0E );
|
MSG = _mm_shuffle_epi32( MSG, 0x0E );
|
||||||
casti_m128i( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
|
casti_v128u32( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
|
||||||
casti_m128i( ostate, 1 ) = STATE1;
|
casti_v128u32( ostate, 1 ) = STATE1;
|
||||||
}
|
}
|
||||||
|
|
||||||
void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
|
void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
|
||||||
const void *msg_X, const void *msg_Y,
|
const void *msg_X, const void *msg_Y,
|
||||||
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
|
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
|
||||||
const uint32_t *state_save_X, const uint32_t *state_save_Y )
|
const uint32_t *state_save_X, const uint32_t *state_save_Y )
|
||||||
@@ -579,22 +601,22 @@ void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
|
|||||||
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
|
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
|
||||||
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
|
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
|
||||||
|
|
||||||
STATE0_X = casti_m128i( state_mid_X, 0 );
|
STATE0_X = casti_v128u32( state_mid_X, 0 );
|
||||||
STATE1_X = casti_m128i( state_mid_X, 1 );
|
STATE1_X = casti_v128u32( state_mid_X, 1 );
|
||||||
STATE0_Y = casti_m128i( state_mid_Y, 0 );
|
STATE0_Y = casti_v128u32( state_mid_Y, 0 );
|
||||||
STATE1_Y = casti_m128i( state_mid_Y, 1 );
|
STATE1_Y = casti_v128u32( state_mid_Y, 1 );
|
||||||
|
|
||||||
// Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
|
// Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
|
||||||
TMSG0_X = casti_m128i( msg_X, 0 );
|
TMSG0_X = casti_v128u32( msg_X, 0 );
|
||||||
TMSG0_Y = casti_m128i( msg_Y, 0 );
|
TMSG0_Y = casti_v128u32( msg_Y, 0 );
|
||||||
TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
|
TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
|
||||||
TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
|
TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
|
||||||
STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
|
STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
|
||||||
STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );
|
STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );
|
||||||
|
|
||||||
// Rounds 4 to 7
|
// Rounds 4 to 7
|
||||||
TMSG1_X = casti_m128i( msg_X, 1 );
|
TMSG1_X = casti_v128u32( msg_X, 1 );
|
||||||
TMSG1_Y = casti_m128i( msg_Y, 1 );
|
TMSG1_Y = casti_v128u32( msg_Y, 1 );
|
||||||
TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL );
|
TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL );
|
||||||
MSG_X = _mm_add_epi32( TMSG1_X, TMP_X );
|
MSG_X = _mm_add_epi32( TMSG1_X, TMP_X );
|
||||||
MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X );
|
MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X );
|
||||||
@@ -616,8 +638,8 @@ void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
|
|||||||
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_X );
|
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_X );
|
||||||
|
|
||||||
// Rounds 12 to 15
|
// Rounds 12 to 15
|
||||||
TMSG3_X = casti_m128i( msg_X, 3 );
|
TMSG3_X = casti_v128u32( msg_X, 3 );
|
||||||
TMSG3_Y = casti_m128i( msg_Y, 3 );
|
TMSG3_Y = casti_v128u32( msg_Y, 3 );
|
||||||
TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL );
|
TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL );
|
||||||
MSG_X = _mm_add_epi32( TMSG3_X, TMP_X );
|
MSG_X = _mm_add_epi32( TMSG3_X, TMP_X );
|
||||||
MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X );
|
MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X );
|
||||||
@@ -845,20 +867,20 @@ void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
|
|||||||
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
|
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
|
||||||
|
|
||||||
// Add saved state to new state
|
// Add saved state to new state
|
||||||
STATE0_X = _mm_add_epi32( STATE0_X, casti_m128i( state_save_X, 0 ) );
|
STATE0_X = _mm_add_epi32( STATE0_X, casti_v128u32( state_save_X, 0 ) );
|
||||||
STATE1_X = _mm_add_epi32( STATE1_X, casti_m128i( state_save_X, 1 ) );
|
STATE1_X = _mm_add_epi32( STATE1_X, casti_v128u32( state_save_X, 1 ) );
|
||||||
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_m128i( state_save_Y, 0 ) );
|
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_v128u32( state_save_Y, 0 ) );
|
||||||
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_m128i( state_save_Y, 1 ) );
|
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_v128u32( state_save_Y, 1 ) );
|
||||||
|
|
||||||
// Unshuffle & save state
|
// Unshuffle & save state
|
||||||
TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); // FEBA
|
TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); // FEBA
|
||||||
TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B );
|
TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B );
|
||||||
STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); // DCHG
|
STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); // DCHG
|
||||||
STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 );
|
STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 );
|
||||||
casti_m128i( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
|
casti_v128u32( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
|
||||||
casti_m128i( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
|
casti_v128u32( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
|
||||||
casti_m128i( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
|
casti_v128u32( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
|
||||||
casti_m128i( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
|
casti_v128u32( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // SHA
|
#endif // SHA
|
||||||
@@ -887,14 +909,14 @@ static const uint32_t K256[64] =
|
|||||||
|
|
||||||
#define sha256_neon_rounds( state_out, input, state_in ) \
|
#define sha256_neon_rounds( state_out, input, state_in ) \
|
||||||
{ \
|
{ \
|
||||||
uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; \
|
uint32x4_t STATE0, STATE1, ABCD_SAVE, EFGH_SAVE; \
|
||||||
uint32x4_t MSG0, MSG1, MSG2, MSG3; \
|
uint32x4_t MSG0, MSG1, MSG2, MSG3; \
|
||||||
uint32x4_t TMP0, TMP1, TMP2; \
|
uint32x4_t TMP0, TMP1, TMP2; \
|
||||||
\
|
\
|
||||||
STATE0 = vld1q_u32( state_in ); \
|
STATE0 = vld1q_u32( state_in ); \
|
||||||
STATE1 = vld1q_u32( state_in+4 ); \
|
STATE1 = vld1q_u32( state_in+4 ); \
|
||||||
ABEF_SAVE = STATE0; \
|
ABCD_SAVE = STATE0; \
|
||||||
CDGH_SAVE = STATE1; \
|
EFGH_SAVE = STATE1; \
|
||||||
\
|
\
|
||||||
MSG0 = load_msg( input, 0 ); \
|
MSG0 = load_msg( input, 0 ); \
|
||||||
MSG1 = load_msg( input, 1 ); \
|
MSG1 = load_msg( input, 1 ); \
|
||||||
@@ -1004,8 +1026,8 @@ static const uint32_t K256[64] =
|
|||||||
TMP2 = STATE0; \
|
TMP2 = STATE0; \
|
||||||
STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \
|
STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \
|
||||||
STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \
|
STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \
|
||||||
STATE0 = vaddq_u32( STATE0, ABEF_SAVE ); \
|
STATE0 = vaddq_u32( STATE0, ABCD_SAVE ); \
|
||||||
STATE1 = vaddq_u32( STATE1, CDGH_SAVE ); \
|
STATE1 = vaddq_u32( STATE1, EFGH_SAVE ); \
|
||||||
vst1q_u32( state_out , STATE0 ); \
|
vst1q_u32( state_out , STATE0 ); \
|
||||||
vst1q_u32( state_out+4, STATE1 ); \
|
vst1q_u32( state_out+4, STATE1 ); \
|
||||||
}
|
}
|
||||||
@@ -1029,8 +1051,8 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
|
|||||||
#define sha256_neon_x2sha_rounds( state_out_X, state_out_Y, input_X, \
|
#define sha256_neon_x2sha_rounds( state_out_X, state_out_Y, input_X, \
|
||||||
input_Y, state_in_X, state_in_Y ) \
|
input_Y, state_in_X, state_in_Y ) \
|
||||||
{ \
|
{ \
|
||||||
uint32x4_t STATE0_X, STATE1_X, ABEF_SAVE_X, CDGH_SAVE_X; \
|
uint32x4_t STATE0_X, STATE1_X, ABCD_SAVE_X, EFGH_SAVE_X; \
|
||||||
uint32x4_t STATE0_Y, STATE1_Y, ABEF_SAVE_Y, CDGH_SAVE_Y; \
|
uint32x4_t STATE0_Y, STATE1_Y, ABCD_SAVE_Y, EFGH_SAVE_Y; \
|
||||||
uint32x4_t MSG0_X, MSG1_X, MSG2_X, MSG3_X; \
|
uint32x4_t MSG0_X, MSG1_X, MSG2_X, MSG3_X; \
|
||||||
uint32x4_t MSG0_Y, MSG1_Y, MSG2_Y, MSG3_Y; \
|
uint32x4_t MSG0_Y, MSG1_Y, MSG2_Y, MSG3_Y; \
|
||||||
uint32x4_t TMP0_X, TMP1_X, TMP2_X; \
|
uint32x4_t TMP0_X, TMP1_X, TMP2_X; \
|
||||||
@@ -1040,10 +1062,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
|
|||||||
STATE0_Y = vld1q_u32( state_in_Y ); \
|
STATE0_Y = vld1q_u32( state_in_Y ); \
|
||||||
STATE1_X = vld1q_u32( state_in_X+4 ); \
|
STATE1_X = vld1q_u32( state_in_X+4 ); \
|
||||||
STATE1_Y = vld1q_u32( state_in_Y+4 ); \
|
STATE1_Y = vld1q_u32( state_in_Y+4 ); \
|
||||||
ABEF_SAVE_X = STATE0_X; \
|
ABCD_SAVE_X = STATE0_X; \
|
||||||
ABEF_SAVE_Y = STATE0_Y; \
|
ABCD_SAVE_Y = STATE0_Y; \
|
||||||
CDGH_SAVE_X = STATE1_X; \
|
EFGH_SAVE_X = STATE1_X; \
|
||||||
CDGH_SAVE_Y = STATE1_Y; \
|
EFGH_SAVE_Y = STATE1_Y; \
|
||||||
\
|
\
|
||||||
MSG0_X = load_msg( input_X, 0 ); \
|
MSG0_X = load_msg( input_X, 0 ); \
|
||||||
MSG0_Y = load_msg( input_Y, 0 ); \
|
MSG0_Y = load_msg( input_Y, 0 ); \
|
||||||
@@ -1245,10 +1267,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
|
|||||||
STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \
|
STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \
|
||||||
STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \
|
STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \
|
||||||
STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \
|
STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \
|
||||||
STATE0_X = vaddq_u32( STATE0_X, ABEF_SAVE_X ); \
|
STATE0_X = vaddq_u32( STATE0_X, ABCD_SAVE_X ); \
|
||||||
STATE0_Y = vaddq_u32( STATE0_Y, ABEF_SAVE_Y ); \
|
STATE0_Y = vaddq_u32( STATE0_Y, ABCD_SAVE_Y ); \
|
||||||
STATE1_X = vaddq_u32( STATE1_X, CDGH_SAVE_X ); \
|
STATE1_X = vaddq_u32( STATE1_X, EFGH_SAVE_X ); \
|
||||||
STATE1_Y = vaddq_u32( STATE1_Y, CDGH_SAVE_Y ); \
|
STATE1_Y = vaddq_u32( STATE1_Y, EFGH_SAVE_Y ); \
|
||||||
vst1q_u32( state_out_X , STATE0_X ); \
|
vst1q_u32( state_out_X , STATE0_X ); \
|
||||||
vst1q_u32( state_out_Y , STATE0_Y ); \
|
vst1q_u32( state_out_Y , STATE0_Y ); \
|
||||||
vst1q_u32( state_out_X+4, STATE1_X ); \
|
vst1q_u32( state_out_X+4, STATE1_X ); \
|
||||||
|
|||||||
@@ -5,27 +5,21 @@
|
|||||||
#include "simd-utils.h"
|
#include "simd-utils.h"
|
||||||
#include "cpuminer-config.h"
|
#include "cpuminer-config.h"
|
||||||
|
|
||||||
// generic interface
|
static const uint32_t SHA256_IV[8];
|
||||||
|
|
||||||
|
#if defined(__x86_64__) && defined(__SHA__)
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
unsigned char buf[64]; /* first field, for alignment */
|
unsigned char buf[64];
|
||||||
uint32_t state[8];
|
uint32_t state[8];
|
||||||
uint64_t count;
|
uint64_t count;
|
||||||
} sha256_context __attribute__((aligned(64)));
|
} sha256_context __attribute__((aligned(64)));
|
||||||
|
|
||||||
static const uint32_t SHA256_IV[8];
|
|
||||||
|
|
||||||
void sha256_full( void *hash, const void *data, size_t len );
|
void sha256_full( void *hash, const void *data, size_t len );
|
||||||
void sha256_update( sha256_context *ctx, const void *data, size_t len );
|
void sha256_update( sha256_context *ctx, const void *data, size_t len );
|
||||||
void sha256_final( sha256_context *ctx, void *hash );
|
void sha256_final( sha256_context *ctx, void *hash );
|
||||||
void sha256_ctx_init( sha256_context *ctx );
|
void sha256_ctx_init( sha256_context *ctx );
|
||||||
void sha256_transform_le( uint32_t *state_out, const uint32_t *data,
|
|
||||||
const uint32_t *state_in );
|
|
||||||
void sha256_transform_be( uint32_t *state_out, const uint32_t *data,
|
|
||||||
const uint32_t *state_in );
|
|
||||||
|
|
||||||
#if defined(__x86_64__) && defined(__SHA__)
|
|
||||||
|
|
||||||
void sha256_x86_sha_transform_le( uint32_t *state_out, const void *input,
|
void sha256_x86_sha_transform_le( uint32_t *state_out, const void *input,
|
||||||
const uint32_t *state_in );
|
const uint32_t *state_in );
|
||||||
@@ -50,14 +44,6 @@ void sha256_x86_x2sha_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y
|
|||||||
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
|
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
|
||||||
const uint32_t *state_save_X, const uint32_t *state_save_Y );
|
const uint32_t *state_save_X, const uint32_t *state_save_Y );
|
||||||
|
|
||||||
// Temporary during name transition
|
|
||||||
#define sha256_opt_transform_le sha256_x86_sha_transform_le
|
|
||||||
#define sha256_opt_transform_be sha256_x86_sha_transform_be
|
|
||||||
#define sha256_ni2x_transform_le sha256_x86_x2sha_transform_le
|
|
||||||
#define sha256_ni2x_transform_be sha256_x86_x2sha_transform_be
|
|
||||||
#define sha256_ni_prehash_3rounds sha256_x86_sha_prehash_3rounds
|
|
||||||
#define sha256_ni2x_final_rounds sha256_x86_x2sha_final_rounds
|
|
||||||
|
|
||||||
// generic API
|
// generic API
|
||||||
#define sha256_transform_le sha256_x86_sha_transform_le
|
#define sha256_transform_le sha256_x86_sha_transform_le
|
||||||
#define sha256_transform_be sha256_x86_sha_transform_be
|
#define sha256_transform_be sha256_x86_sha_transform_be
|
||||||
@@ -68,6 +54,20 @@ void sha256_x86_x2sha_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y
|
|||||||
|
|
||||||
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
|
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
|
||||||
|
|
||||||
|
// SHA-256 AArch64 with NEON & SHA2
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
unsigned char buf[64];
|
||||||
|
uint32_t state[8];
|
||||||
|
uint64_t count;
|
||||||
|
} sha256_context __attribute__((aligned(64)));
|
||||||
|
|
||||||
|
void sha256_full( void *hash, const void *data, size_t len );
|
||||||
|
void sha256_update( sha256_context *ctx, const void *data, size_t len );
|
||||||
|
void sha256_final( sha256_context *ctx, void *hash );
|
||||||
|
void sha256_ctx_init( sha256_context *ctx );
|
||||||
|
|
||||||
void sha256_neon_sha_transform_be( uint32_t *state_out, const void *input,
|
void sha256_neon_sha_transform_be( uint32_t *state_out, const void *input,
|
||||||
const uint32_t *state_in );
|
const uint32_t *state_in );
|
||||||
void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
|
void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
|
||||||
@@ -89,14 +89,6 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
|
|||||||
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
|
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
|
||||||
const uint32_t *state_save_X, const uint32_t *state_save_Y );
|
const uint32_t *state_save_X, const uint32_t *state_save_Y );
|
||||||
|
|
||||||
// Temporary during name transition
|
|
||||||
#define sha256_transform_le sha256_neon_sha_transform_le
|
|
||||||
#define sha256_transform_be sha256_neon_sha_transform_be
|
|
||||||
#define sha256_2x_transform_le sha256_neon_x2sha_transform_le
|
|
||||||
#define sha256_2x_transform_be sha256_neon_x2sha_transform_be
|
|
||||||
#define sha256_prehash_3rounds sha256_neon_sha_prehash_3rounds
|
|
||||||
#define sha256_2x_final_rounds sha256_neon_x2sha_final_rounds
|
|
||||||
|
|
||||||
// generic API
|
// generic API
|
||||||
#define sha256_transform_le sha256_neon_sha_transform_le
|
#define sha256_transform_le sha256_neon_sha_transform_le
|
||||||
#define sha256_transform_be sha256_neon_sha_transform_be
|
#define sha256_transform_be sha256_neon_sha_transform_be
|
||||||
@@ -106,9 +98,11 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
|
|||||||
#define sha256_2x_final_rounds sha256_neon_x2sha_final_rounds
|
#define sha256_2x_final_rounds sha256_neon_x2sha_final_rounds
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
// without HW acceleration...
|
// without HW acceleration...
|
||||||
#include "sph_sha2.h"
|
#include "sph_sha2.h"
|
||||||
|
|
||||||
|
#define sha256_context sph_sha256_context
|
||||||
#define sha256_full sph_sha256_full
|
#define sha256_full sph_sha256_full
|
||||||
#define sha256_ctx_init sph_sha256_init
|
#define sha256_ctx_init sph_sha256_init
|
||||||
#define sha256_update sph_sha256
|
#define sha256_update sph_sha256
|
||||||
@@ -117,12 +111,11 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
|
|||||||
#define sha256_transform_be sph_sha256_transform_be
|
#define sha256_transform_be sph_sha256_transform_be
|
||||||
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
|
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
// SHA-256 16 way
|
// SHA-256 16 way x86_64
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
@@ -162,7 +155,7 @@ int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
|
|||||||
|
|
||||||
#if defined (__AVX2__)
|
#if defined (__AVX2__)
|
||||||
|
|
||||||
// SHA-256 8 way
|
// SHA-256 8 way x86_64
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
@@ -187,21 +180,10 @@ void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
|
|||||||
int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
|
int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||||
const __m256i *state_in, const uint32_t *target );
|
const __m256i *state_in, const uint32_t *target );
|
||||||
|
|
||||||
// Temporary API during naming transition
|
|
||||||
#define sha256_8way_context sha256_8x32_context
|
|
||||||
#define sha256_8way_init sha256_8x32_init
|
|
||||||
#define sha256_8way_update sha256_8x32_update
|
|
||||||
#define sha256_8way_close sha256_8x32_close
|
|
||||||
#define sha256_8way_full sha256_8x32_full
|
|
||||||
#define sha256_8way_transform_le sha256_8x32_transform_le
|
|
||||||
#define sha256_8way_transform_be sha256_8x32_transform_be
|
|
||||||
#define sha256_8way_prehash_3rounds sha256_8x32_prehash_3rounds
|
|
||||||
#define sha256_8way_final_rounds sha256_8x32_final_rounds
|
|
||||||
#define sha256_8way_transform_le_short sha256_8x32_transform_le_short
|
|
||||||
|
|
||||||
#endif // AVX2
|
#endif // AVX2
|
||||||
|
|
||||||
// SHA-256 4 way
|
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||||
|
// SHA-256 4 way x86_64 with SSE2 or AArch64 with NEON
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
@@ -226,16 +208,5 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data,
|
|||||||
int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
|
int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
|
||||||
const v128_t *state_in, const uint32_t *target );
|
const v128_t *state_in, const uint32_t *target );
|
||||||
|
|
||||||
// Temporary API during naming transition
|
#endif // SSE2 || NEON
|
||||||
#define sha256_4way_context sha256_4x32_context
|
#endif // SHA256_HASH_H__
|
||||||
#define sha256_4way_init sha256_4x32_init
|
|
||||||
#define sha256_4way_update sha256_4x32_update
|
|
||||||
#define sha256_4way_close sha256_4x32_close
|
|
||||||
#define sha256_4way_full sha256_4x32_full
|
|
||||||
#define sha256_4way_transform_le sha256_4x32_transform_le
|
|
||||||
#define sha256_4way_transform_be sha256_4x32_transform_be
|
|
||||||
#define sha256_4way_prehash_3rounds sha256_4x32_prehash_3rounds
|
|
||||||
#define sha256_4way_final_rounds sha256_4x32_final_rounds
|
|
||||||
#define sha256_4way_transform_le_short sha256_4x32_transform_le_short
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|||||||
@@ -32,8 +32,6 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
|
|||||||
uint32_t n = first_nonce;
|
uint32_t n = first_nonce;
|
||||||
const int thr_id = mythr->id;
|
const int thr_id = mythr->id;
|
||||||
const bool bench = opt_benchmark;
|
const bool bench = opt_benchmark;
|
||||||
const v128_t shuf_bswap32 =
|
|
||||||
v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
|
||||||
|
|
||||||
// hash first 64 byte block of data
|
// hash first 64 byte block of data
|
||||||
sha256_transform_le( mstatea, pdata, sha256_iv );
|
sha256_transform_le( mstatea, pdata, sha256_iv );
|
||||||
@@ -69,10 +67,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
|
|||||||
|
|
||||||
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
|
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
|
||||||
{
|
{
|
||||||
casti_v128( hasha, 0 ) =
|
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
|
||||||
_mm_shuffle_epi8( casti_v128( hasha, 0 ), shuf_bswap32 );
|
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
|
||||||
casti_v128( hasha, 1 ) =
|
|
||||||
_mm_shuffle_epi8( casti_v128( hasha, 1 ), shuf_bswap32 );
|
|
||||||
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
|
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
|
||||||
{
|
{
|
||||||
pdata[19] = n;
|
pdata[19] = n;
|
||||||
@@ -81,10 +77,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
|
|||||||
}
|
}
|
||||||
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
|
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
|
||||||
{
|
{
|
||||||
casti_v128( hashb, 0 ) =
|
casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
|
||||||
_mm_shuffle_epi8( casti_v128( hashb, 0 ), shuf_bswap32 );
|
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
|
||||||
casti_v128( hashb, 1 ) =
|
|
||||||
_mm_shuffle_epi8( casti_v128( hashb, 1 ), shuf_bswap32 );
|
|
||||||
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
|
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
|
||||||
{
|
{
|
||||||
pdata[19] = n+1;
|
pdata[19] = n+1;
|
||||||
@@ -204,8 +198,6 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
|||||||
const int thr_id = mythr->id;
|
const int thr_id = mythr->id;
|
||||||
const __m512i sixteen = v512_32( 16 );
|
const __m512i sixteen = v512_32( 16 );
|
||||||
const bool bench = opt_benchmark;
|
const bool bench = opt_benchmark;
|
||||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
|
||||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
|
||||||
|
|
||||||
// prehash first block directly from pdata
|
// prehash first block directly from pdata
|
||||||
sha256_transform_le( phash, pdata, sha256_iv );
|
sha256_transform_le( phash, pdata, sha256_iv );
|
||||||
@@ -231,7 +223,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
|||||||
buf[15] = v512_32( 80*8 ); // bit count
|
buf[15] = v512_32( 80*8 ); // bit count
|
||||||
|
|
||||||
// partially pre-expand & prehash second message block, avoiding the nonces
|
// partially pre-expand & prehash second message block, avoiding the nonces
|
||||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
sha256_16x32_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||||
|
|
||||||
// vectorize IV for second hash
|
// vectorize IV for second hash
|
||||||
istate[0] = v512_32( sha256_iv[0] );
|
istate[0] = v512_32( sha256_iv[0] );
|
||||||
@@ -250,15 +242,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
|||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
|
sha256_16x32_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
|
||||||
if ( unlikely( sha256_16way_transform_le_short(
|
if ( unlikely( sha256_16x32_transform_le_short(
|
||||||
hash32, block, istate, ptarget ) ) )
|
hash32, block, istate, ptarget ) ) )
|
||||||
{
|
{
|
||||||
for ( int lane = 0; lane < 16; lane++ )
|
for ( int lane = 0; lane < 16; lane++ )
|
||||||
{
|
{
|
||||||
extr_lane_16x32( phash, hash32, lane, 256 );
|
extr_lane_16x32( phash, hash32, lane, 256 );
|
||||||
casti_m256i( phash, 0 ) =
|
casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
|
||||||
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
|
|
||||||
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
|
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
|
||||||
{
|
{
|
||||||
pdata[19] = n + lane;
|
pdata[19] = n + lane;
|
||||||
@@ -299,8 +290,6 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
|||||||
const bool bench = opt_benchmark;
|
const bool bench = opt_benchmark;
|
||||||
const __m256i last_byte = v256_32( 0x80000000 );
|
const __m256i last_byte = v256_32( 0x80000000 );
|
||||||
const __m256i eight = v256_32( 8 );
|
const __m256i eight = v256_32( 8 );
|
||||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
|
||||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
|
||||||
|
|
||||||
for ( int i = 0; i < 19; i++ )
|
for ( int i = 0; i < 19; i++ )
|
||||||
vdata[i] = v256_32( pdata[i] );
|
vdata[i] = v256_32( pdata[i] );
|
||||||
@@ -325,22 +314,22 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
|||||||
istate[6] = v256_32( sha256_iv[6] );
|
istate[6] = v256_32( sha256_iv[6] );
|
||||||
istate[7] = v256_32( sha256_iv[7] );
|
istate[7] = v256_32( sha256_iv[7] );
|
||||||
|
|
||||||
sha256_8way_transform_le( mstate1, vdata, istate );
|
sha256_8x32_transform_le( mstate1, vdata, istate );
|
||||||
|
|
||||||
// Do 3 rounds on the first 12 bytes of the next block
|
// Do 3 rounds on the first 12 bytes of the next block
|
||||||
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
|
sha256_8x32_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
|
sha256_8x32_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
|
||||||
if ( unlikely( sha256_8way_transform_le_short( hash32, block,
|
if ( unlikely( sha256_8x32_transform_le_short( hash32, block,
|
||||||
istate, ptarget ) ) )
|
istate, ptarget ) ) )
|
||||||
{
|
{
|
||||||
for ( int lane = 0; lane < 8; lane++ )
|
for ( int lane = 0; lane < 8; lane++ )
|
||||||
{
|
{
|
||||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||||
casti_m256i( lane_hash, 0 ) =
|
casti_m256i( lane_hash, 0 ) =
|
||||||
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
|
mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
|
||||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||||
{
|
{
|
||||||
pdata[19] = n + lane;
|
pdata[19] = n + lane;
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define SHA256D_16WAY 1
|
#define SHA256D_16WAY 1
|
||||||
#elif defined(__SHA__)
|
#elif defined(__SHA__)
|
||||||
#define SHA256D_SHA 1
|
#define SHA256D_SHA 1
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user