Mirror of https://github.com/JayDDee/cpuminer-opt.git (synced 2025-09-17 23:44:27 +00:00)

Compare commits (21 commits):

66191db93c
dd99580a4c
1ed18bf22e
1d9341ee92
a45a333b40
2b1037a7c7
06624a0ff2
8e91bfbe19
47e24b50e8
c47c4a8885
042d13d1e1
4f930574cc
9d3a46c355
4e3f1b926f
045b42babf
fc696dbbe5
f3fde95f27
0a78013cbe
26b9429589
e043698442
46dca7a493
Makefile.am (90 changed lines)
@@ -1,27 +1,49 @@
if WANT_JANSSON
JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson
if HAVE_APPLE
# MacOS uses Homebrew to install needed packages but they aren't linked for
# the jansson test in configure. Ignore the failed test & link them now,
# different path for different CPU arch.

if ARCH_ARM64
EXTRA_INCLUDES = -I/opt/homebrew/include
EXTRA_LIBS = -L/opt/homebrew/lib
else
JANSSON_INCLUDES=
EXTRA_INCLUDES = -I/usr/local/include
EXTRA_LIBS = -L/usr/local/lib
endif

EXTRA_DIST = example-cfg.json nomacro.pl
else

SUBDIRS = compat
if WANT_JANSSON
# Can't find jansson libraries, compile the included source code.
EXTRA_INCLUDES = -I$(top_srcdir)/compat/jansson
EXTRA_LIBS = -L$(top_srcdir)/compat/jansson
else
EXTRA_INCLUDES =
EXTRA_LIBS =
endif

ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) -I.
endif

bin_PROGRAMS = cpuminer
EXTRA_DIST = example-cfg.json nomacro.pl

dist_man_MANS = cpuminer.1
SUBDIRS = compat

ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(EXTRA_INCLUDES) -I.

bin_PROGRAMS = cpuminer

dist_man_MANS = cpuminer.1

cpuminer_SOURCES = \
dummy.cpp \
cpu-miner.c \
util.c \
api.c \
sysinfos.c \
algo-gate-api.c\
malloc-huge.c \
simd-utils/simd-constants.c \
algo/argon2d/argon2d-gate.c \
algo/argon2d/blake2/blake2b.c \
algo/argon2d/argon2d/argon2.c \
@@ -79,11 +101,6 @@ cpuminer_SOURCES = \
algo/hamsi/hamsi-hash-4way.c \
algo/haval/haval.c \
algo/haval/haval-hash-4way.c \
algo/hodl/aes.c \
algo/hodl/hodl-gate.c \
algo/hodl/hodl-wolf.c \
algo/hodl/sha512_avx.c \
algo/hodl/sha512_avx2.c \
algo/jh/sph_jh.c \
algo/jh/jh-hash-4way.c \
algo/jh/jha-gate.c \
@@ -118,7 +135,6 @@ cpuminer_SOURCES = \
algo/lyra2/phi2-4way.c \
algo/lyra2/phi2.c \
algo/m7m/m7m.c \
algo/m7m/magimath.cpp \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
@@ -148,6 +164,8 @@ cpuminer_SOURCES = \
algo/scrypt/scrypt.c \
algo/scrypt/scrypt-core-4way.c \
algo/scrypt/neoscrypt.c \
algo/sha/sha1.c \
algo/sha/sha1-hash.c \
algo/sha/sha256-hash.c \
algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \
@@ -156,7 +174,6 @@ cpuminer_SOURCES = \
algo/sha/hmac-sha256-hash.c \
algo/sha/hmac-sha256-hash-4way.c \
algo/sha/sha256d.c \
algo/sha/sha2.c \
algo/sha/sha256d-4way.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
@@ -170,9 +187,6 @@ cpuminer_SOURCES = \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite-hash-2way.c \
algo/shavite/shavite-hash-4way.c \
algo/shavite/shavite.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/sph_simd.c \
algo/simd/simd-hash-2way.c \
algo/skein/sph_skein.c \
@@ -254,6 +268,7 @@ cpuminer_SOURCES = \
algo/x16/x16rt.c \
algo/x16/x16rt-4way.c \
algo/x16/hex.c \
algo/x16/x20r.c \
algo/x16/x21s-4way.c \
algo/x16/x21s.c \
algo/x16/minotaur.c \
@@ -278,39 +293,29 @@ cpuminer_SOURCES = \
algo/yespower/yespower-opt.c \
algo/yespower/yespower-ref.c \
algo/yespower/yespower-blake2b-ref.c

disable_flags =

if USE_ASM
cpuminer_SOURCES += asm/neoscrypt_asm.S
if ARCH_x86
cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S
endif
if ARCH_x86_64
cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S
endif
if ARCH_ARM
cpuminer_SOURCES += asm/sha2-arm.S asm/scrypt-arm.S
endif
else
disable_flags += -DNOASM
endif

if HAVE_WINDOWS
cpuminer_SOURCES += compat/winansi.c
endif

cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lssl -lcrypto -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
if USE_ASM
disable_flags =
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags = -DNOASM
endif

if HAVE_WINDOWS
cpuminer_CFLAGS += -Wl,--stack,10485760
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = $(EXTRA_LIBS) @LIBCURL@ -ljansson @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)

if ARCH_ARM64
cpuminer_CFLAGS += -flax-vector-conversions
endif

if HAVE_WINDOWS

# use to profile an object
# gprof_cflags = -pg -g3
# cpuminer_LDFLAGS += -pg
@@ -324,5 +329,4 @@ cpuminer-neoscrypt.o: neoscrypt.c
@echo "CUSTOM ${@}: ${filter %.o,${^}} ${filter %.c,${^}}"
$(CC) $(common_ccflags) -g -O3 $(gprof_cflags) -MT $@ -MD -MP -c -o $@ $<

endif
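Makefile.am above is autotools input; the project is built from source with the usual autotools sequence. A typical Linux build looks like the following (flags as used in the project's build scripts; adjust or drop -march=native for your CPU, and see INSTALL_LINUX for the authoritative steps):

    ./autogen.sh
    CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
    make -j $(nproc)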
README.md (36 changed lines)
@@ -36,34 +36,18 @@ for compile instructions.
Requirements
------------

1. An x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. Further optimizations are available
on some algorithms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.

32 bit CPUs are not supported.
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
are not supported.
1. A 64 bit CPU supporting x86_64 (Intel or AMD) or aarch64 (ARM).
x86_64 requires SSE2, aarch64 requires armv8 & NEON.

Mobile CPUs, such as those in laptop computers, are not recommended because
they aren't designed for the extreme heat of operating at full load for
extended periods of time.

Older CPUs and the ARM architecture may be supported by cpuminer-multi by TPruvot.

2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
including Mint and Centos, are known to work and have all dependencies
in their repositories. Others may work but may require more effort. Older
versions such as Centos 6 don't work due to missing features.

Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
binaries. Windows XP 64 bit is YMMV.

FreeBSD is not actively tested but should work, YMMV.
MacOS, OS X and Android are not supported.
2. 64 bit operating system including Linux, Windows, MacOS, or BSD.
Android, iOS and alt OSs like Haiku & ReactOS are not supported.

3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols, or
RPC getwork using http:// or https://.
GBT is YMMV.
RPC getblocktemplate using http:// or https://.
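For reference, connecting with each protocol looks like this on the command line (the pool host, port, and wallet are hypothetical placeholders; -a, -o, -u and -p are the standard cpuminer options):

    cpuminer -a x17 -o stratum+tcp://pool.example.com:3333 -u YOUR_WALLET -p x
    cpuminer -a sha256d -o stratum+ssl://pool.example.com:4333 -u YOUR_WALLET -p x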
Supported Algorithms
--------------------
@@ -71,9 +55,9 @@ Supported Algorithms
allium        Garlicoin
anime         Animecoin
argon2        Argon2 coin (AR2)
argon2d250    argon2d-crds, Credits (CRDS)
argon2d500    argon2d-dyn, Dynamic (DYN)
argon2d4096   argon2d-uis, Unitus (UIS)
argon2d250
argon2d500
argon2d4096
blake         Blake-256
blake2b       Blake2-512
blake2s       Blake2-256
@@ -87,7 +71,6 @@ Supported Algorithms
groestl       Groestl coin
hex           x16r-hex
hmq1725
hodl          Hodlcoin
jha           Jackpotcoin
keccak        Maxcoin
keccakc       Creative coin
@@ -115,9 +98,11 @@ Supported Algorithms
scrypt:N      scrypt(N, 1, 1)
scryptn2      scrypt(1048576, 1, 1)
sha256d       Double SHA-256
sha256dt
sha256q       Quad SHA-256
sha256t       Triple SHA-256
sha3d         Double keccak256 (BSHA3)
sha512256d
skein         Skein+Sha (Skeincoin)
skein2        Double Skein (Woodcoin)
skunk         Signatum (SIGT)
@@ -145,6 +130,7 @@ Supported Algorithms
x16rt-veil    veil
x16s
x17
x20r
x21s
x22i
x25x
RELEASE_NOTES (167 changed lines)
@@ -27,17 +27,19 @@ See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
Requirements
------------

Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
- An x86_64 architecture CPU with a minimum of SSE2 support. This includes Intel Core2 and newer and AMD equivalents.
- Arm CPU supporting AArch64 and NEON.

64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
are not supported. FreeBSD YMMV.
32 bit CPUs are not supported.

ARM requirements (Beta):
Older CPUs are supported by the open source cpuminer-multi by TPruvot, but at reduced performance.

CPU: Armv8 and NEON, SHA2 & AES are optional
OS: Linux distribution built for AArch64.
Packages: source code only.
Mining on mobile devices that meet the requirements is not recommended due to the risk of
overheating and damaging the battery. Mining has unlimited demand; it will push any device
to or beyond its limits. There is also a fire risk with overheated lithium batteries.

Beware of apps claiming "mobile only mining". There is no such thing; they aren't miners.
If a mobile CPU can mine it, any CPU can.

See wiki for details.

@@ -73,6 +75,155 @@ If not what makes it happen or not happen?
Change Log
----------

v25.4

x86_64: improved handling of vector constants used for byte permutations.
x86_64: removed hooks for cancelled AVX10-256.
Minor bug fixes & improvements.
More code cleanup.

v25.3

#442, #443: Fixed a regression in Makefile.am.
Removed algo features log display.
Some code cleanup.

v25.2

ARM: Fixed regression from v25.1 that could cause a build failure.
BSD: FreeBSD is now supported. Other BSDs may also work.
MacOS: build with the installed jansson library instead of compiling the included source code.
Windows: removed "_WIN32_WINNT=0x0601", which was a downgrade on Win11.
Changed build.sh shell from bash to sh.

v25.1

MacOS ARM64: m7m algo is now working.
MacOS ARM64: can now be compiled with GCC.
MacOS x86_64: now working when compiled with GCC.
Fixed some minor bugs & removed some obsolete code.

v24.8

ARM: Apple MacOS on M series CPU is now supported when compiled from source
code, see Wiki for details.
ARM: Fix incorrect compiler version display when using clang.
build.sh can now be used to compile all targets; arm_build.sh & build_msys2.sh
have been removed.
Windows: MSys2 build now enables CPU groups by default, prebuilt binaries
continue to be compiled with CPU groups disabled.

v24.7

ARM: compile works for Windows using MSys2 & MinGW, see wiki for details.

v24.6

ARM: Fixed scryptn2, x16*, broken in v24.2.
ARM: Small improvement to interleaving.
Eliminated some potential compile errors in code that was dependent on
compiler optimisations.
x86_64: improved support for AVX10 compilation, needs GCC-14 or higher.

v24.5

Fix MinGW compile error after MSys2 upgrade to GCC-14.2.
#427: GBT: Improved handling of new work.
Removed shavite3 algo.

v24.4

x86_64: fixed a bug in the ornot macro for AVX2 which broke some algos in v24.2.
x86_64: fixed a bug in the alignr macros for SSE2.
ARM: CPU feature reporting enhancements.
Some code cleanup.

v24.3

ARM: CPU feature detection and reporting is now working.
ARM: Verthash is now working.
ARM: Small speedup for yescrypt, yespower & argon2d.
Code cleanup.

v24.2

x86_64: Fixed blake2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4.
x86_64: Initial support for CPUs with AVX10, needs GCC-14.
ARM NEON: Various code optimisations.

v24.1

#414: fix bug in merkle error handling.
#416: change $nproc to $(nproc) in build scripts.
#420: change some inline function definitions to static inline.
#413: Fix formatting error for share result log when using no-color.
Faster 2 way interleaving.
Cleanup sha256 architecture targeting.

v23.15

Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
ARM: Fugue AES optimizations enabled.
ARM: quark, qubit, x11gost algos optimized with NEON & AES.

v23.14

ARM: Groestl AES optimizations enabled.
All: Small optimization to Shabal 4way.
x86_64: Extend Shabal 4way support to SSE2 from SSE4.1.
All: deleted some unused files.

v23.13

Added x20r algo.
Eliminated redundant hash order calculations for the x16r family.

v23.12

Several bug fixes and speed improvements for the x16r family for all CPU architectures.

v23.11

This is a release candidate for full AArch64 support, marking the end of the Beta phase.
Fixed hmq1725 & x25x algos, SSE2 & NEON, broken in v3.23.4.
Most CPU-mineable SHA3 algos (X*) upgraded to 2-way SSE2 & NEON.

v23.10

x86_64: Fixed scrypt, scryptn2 algos SSE2.
Fixed sha512256d algo AVX2, SSE2, NEON.
Fixed a bug in Skein N-way that reduced performance.
ARM: Skein optimized for NEON, SHA2 & SSE2.
Skein2 algo 2-way optimized for NEON & SSE2.

v23.9

x86_64: fixed minotaurx crash, broken in 23.7.
ARM: #407 fix compile error due to incorrect type casting for vrev instruction argument.

v23.8

Cpuminer-opt is no longer dependent on OpenSSL.
Removed Hodl algo.
Removed legacy Sha256 & Scrypt ASM code.
ARM: Echo AES is working and enabled for x17.

v23.7

Fixed blake2s, broken in v3.23.4.
ARM: SHA2 extension tested and working.
ARM: sha512256d fully optimized.
ARM: X17 more optimizations.
ARM: AES extension working for Shavite.
ARM errata: CPU features AES & SHA256 are not reported when available.

v23.6

ARM: Sha256dt, Sha256t, Sha256d 4-way now working and fully optimized for NEON; SHA also enabled but untested.
x86: Sha256dt, Sha256t, Sha256d faster SSE2 4-way.
ARM: Scrypt, Scryptn2 fully optimized for NEON; SHA also enabled but untested.
Linux: added a log when the miner is started as root to discourage doing so.

v23.5

New version numbering drops the leading 3: the major version is now the calendar year and the minor version identifies planned releases during the year.
algo-gate-api.c

@@ -184,7 +184,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,

#endif

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

//int scanhash_8way_64_64( struct work *work, uint32_t max_nonce,
//                         uint64_t *hashes_done, struct thr_info *mythr )
@@ -263,8 +263,8 @@ static void init_algo_gate( algo_gate_t* gate )
gate->build_block_header = (void*)&std_build_block_header;
gate->build_extraheader = (void*)&std_build_extraheader;
gate->set_work_data_endian = (void*)&do_nothing;
gate->resync_threads = (void*)&do_nothing;
gate->do_this_thread = (void*)&return_true;
// gate->resync_threads = (void*)&do_nothing;
// gate->do_this_thread = (void*)&return_true;
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
gate->get_work_data_size = (void*)&std_get_work_data_size;
gate->optimizations = EMPTY_SET;
@@ -295,8 +295,8 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
{
case ALGO_ALLIUM: rc = register_allium_algo ( gate ); break;
case ALGO_ANIME: rc = register_anime_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d_crds_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d_dyn_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d250_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d500_algo ( gate ); break;
case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break;
case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break;
case ALGO_BLAKE: rc = register_blake_algo ( gate ); break;
@@ -310,7 +310,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_GROESTL: rc = register_groestl_algo ( gate ); break;
case ALGO_HEX: rc = register_hex_algo ( gate ); break;
case ALGO_HMQ1725: rc = register_hmq1725_algo ( gate ); break;
case ALGO_HODL: rc = register_hodl_algo ( gate ); break;
case ALGO_JHA: rc = register_jha_algo ( gate ); break;
case ALGO_KECCAK: rc = register_keccak_algo ( gate ); break;
case ALGO_KECCAKC: rc = register_keccakc_algo ( gate ); break;
@@ -341,7 +340,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
case ALGO_SHA512256D: rc = register_sha512256d_algo ( gate ); break;
case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
case ALGO_SKUNK: rc = register_skunk_algo ( gate ); break;
@@ -369,6 +367,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X16RT_VEIL: rc = register_x16rt_veil_algo ( gate ); break;
case ALGO_X16S: rc = register_x16s_algo ( gate ); break;
case ALGO_X17: rc = register_x17_algo ( gate ); break;
case ALGO_X20R: rc = register_x20r_algo ( gate ); break;
case ALGO_X21S: rc = register_x21s_algo ( gate ); break;
case ALGO_X22I: rc = register_x22i_algo ( gate ); break;
case ALGO_X25X: rc = register_x25x_algo ( gate ); break;
@@ -417,8 +416,6 @@ void exec_hash_function( int algo, void *output, const void *pdata )
const char* const algo_alias_map[][2] =
{
// alias proper
{ "argon2d-dyn", "argon2d500" },
{ "argon2d-uis", "argon2d4096" },
{ "bcd", "x13bcd" },
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },
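algo_alias_map maps user-facing aliases to canonical algo names (the argon2d aliases removed above correspond to the renames in this change). A minimal sketch of the lookup the table implies (the helper name and the assumption of a NULL-terminated table are mine, not the project's):

    #include <strings.h>   // strcasecmp

    // Return the canonical algo name for a user-supplied name, or the
    // name itself when it is not an alias.  Sketch only: assumes the
    // table ends with a NULL alias entry.
    static const char* resolve_algo_alias( const char *name )
    {
       for ( int i = 0; algo_alias_map[i][0] != NULL; i++ )
          if ( strcasecmp( name, algo_alias_map[i][0] ) == 0 )
             return algo_alias_map[i][1];
       return name;
    }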
algo-gate-api.h

@@ -98,25 +98,27 @@ typedef uint32_t set_t
#define AVX512_OPT 1 << 6 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
#define VAES_OPT 1 << 8 // Icelake, Zen3
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // AArch64
#define SHA256_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
#define NEON_OPT 1 << 11 // AArch64
#define AVX10_256 1 << 12
#define AVX10_512 1 << 13

// AVX10 does not have explicit algo features:
// AVX10_512 is compatible with AVX512 + VAES
// AVX10_256 is compatible with AVX2 + VAES

// return set containing all elements from sets a & b
inline set_t set_union ( set_t a, set_t b ) { return a | b; }
static inline set_t set_union ( set_t a, set_t b ) { return a | b; }

// return set containing the common elements of sets a & b
inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
static inline set_t set_intsec ( set_t a, set_t b) { return a & b; }

// all elements in set a are included in set b
inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
static inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }

// no elements in set a are included in set b
inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
static inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
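These set operations drive feature gating over the bitmask flags defined above. A minimal usage sketch (the cpu_features value and the helper name are illustrative assumptions; feature detection itself is not shown):

    // Decide whether an algo may take its AES code path on this CPU.
    // gate->optimizations is the set_t built by a register_*_algo
    // function; cpu_features would come from CPUID / HWCAP detection.
    static bool algo_can_use_aes( const algo_gate_t *gate, set_t cpu_features )
    {
       set_t usable = set_intsec( gate->optimizations, cpu_features );
       return set_incl( AES_OPT, usable );
    }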
typedef struct
{
@@ -163,10 +165,10 @@ char* ( *malloc_txs_request ) ( struct work* );
void ( *set_work_data_endian ) ( struct work* );

// Diverge mining threads
bool ( *do_this_thread ) ( int );
//bool ( *do_this_thread ) ( int );

// After do_this_thread
void ( *resync_threads ) ( int, struct work* );
//void ( *resync_threads ) ( int, struct work* );

json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );

@@ -246,7 +248,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,

#endif

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

//int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce,
//                              uint64_t *hashes_done, struct thr_info *mythr );
argon2d-gate.c

@@ -6,9 +6,7 @@ static const size_t INPUT_BYTES = 80; // Length of a block header in bytes. Inpu
static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
static const unsigned int DEFAULT_ARGON2_FLAG = 2; // Same as ARGON2_DEFAULT_FLAGS

// Credits

void argon2d_crds_hash( void *output, const void *input )
void argon2d250_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
@@ -34,7 +32,7 @@ void argon2d_crds_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}

int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
@@ -50,7 +48,7 @@ int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,

do {
be32enc(&edata[19], nonce);
argon2d_crds_hash( hash, edata );
argon2d250_hash( hash, edata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
@@ -64,18 +62,16 @@ int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
return 0;
}

bool register_argon2d_crds_algo( algo_gate_t* gate )
bool register_argon2d250_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d_crds;
gate->hash = (void*)&argon2d_crds_hash;
gate->scanhash = (void*)&scanhash_argon2d250;
gate->hash = (void*)&argon2d250_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 65536.0;
return true;
}

// Dynamic

void argon2d_dyn_hash( void *output, const void *input )
void argon2d500_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
@@ -101,7 +97,7 @@ void argon2d_dyn_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}

int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
@@ -118,7 +114,7 @@ int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
do
{
edata[19] = nonce;
argon2d_dyn_hash( hash, edata );
argon2d500_hash( hash, edata );
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
&& !bench ) )
{
@@ -133,17 +129,15 @@ int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
return 0;
}

bool register_argon2d_dyn_algo( algo_gate_t* gate )
bool register_argon2d500_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d_dyn;
gate->hash = (void*)&argon2d_dyn_hash;
gate->scanhash = (void*)&scanhash_argon2d500;
gate->hash = (void*)&argon2d500_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 65536.0;
return true;
}

// Unitus

int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
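Both scanners above follow the same algo-gate scanhash contract. Schematically (a sketch of the convention visible in the diff; algo_hash stands in for the per-algo hash, and project headers are assumed for struct work, struct thr_info, valid_hash, submit_solution and work_restart):

    // Scan nonces until the target is met or max_nonce is reached.
    // Some algos big-endian encode the nonce via be32enc instead of a
    // plain store, as in the argon2d250 variant above.
    int scanhash_sketch( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
    {
       uint32_t hash[8], edata[20];
       uint32_t *pdata = work->data;
       uint32_t *ptarget = work->target;
       uint32_t nonce = pdata[19];
       const uint32_t first_nonce = nonce;

       memcpy( edata, pdata, 76 );           // 80-byte header minus nonce
       do
       {
          edata[19] = nonce;                 // try the next nonce
          algo_hash( hash, edata );          // per-algo hash of the header
          if ( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget ) )
          {
             pdata[19] = nonce;              // record the winning nonce
             submit_solution( work, hash, mythr );
          }
          nonce++;
       } while ( nonce < max_nonce && !work_restart[mythr->id].restart );

       *hashes_done = nonce - first_nonce;
       pdata[19] = nonce;
       return 0;
    }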
argon2d-gate.h

@@ -5,19 +5,19 @@
#include <stdint.h>

// Credits: version = 0x10, m_cost = 250.
bool register_argon2d_crds_algo( algo_gate_t* gate );
bool register_argon2d250_algo( algo_gate_t* gate );

void argon2d_crds_hash( void *state, const void *input );
void argon2d250_hash( void *state, const void *input );

int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

// Dynamic: version = 0x10, m_cost = 500.
bool register_argon2d_dyn_algo( algo_gate_t* gate );
bool register_argon2d500_algo( algo_gate_t* gate );

void argon2d_dyn_hash( void *state, const void *input );
void argon2d500_hash( void *state, const void *input );

int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
@@ -35,7 +35,7 @@
* @pre all block pointers must be valid
*/

#if defined(__AVX512F__)
#if defined(SIMD512)

static inline __m512i blamka( __m512i x, __m512i y )
{
@@ -136,10 +136,10 @@ static void fill_block( __m256i *state, const block *ref_block,

#else // SSE2

static void fill_block( v128_t *state, const block *ref_block,
static void fill_block( v128u64_t *state, const block *ref_block,
block *next_block, int with_xor )
{
v128_t block_XY[ARGON2_OWORDS_IN_BLOCK];
v128u64_t block_XY[ARGON2_OWORDS_IN_BLOCK];
unsigned int i;

if ( with_xor )
@@ -237,12 +237,12 @@ void fill_segment(const argon2_instance_t *instance,
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index, i;
#if defined(__AVX512F__)
#if defined(SIMD512)
__m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
#elif defined(__AVX2__)
__m256i state[ARGON2_HWORDS_IN_BLOCK];
#else
v128_t state[ARGON2_OWORDS_IN_BLOCK];
v128u64_t state[ARGON2_OWORDS_IN_BLOCK];
#endif
// int data_independent_addressing;
@@ -21,58 +21,48 @@
#include "blake2-impl.h"
#include "simd-utils.h"

#if !defined(__AVX512F__)

#if !defined(SIMD512)

#if !defined(__AVX2__)

static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
const v128_t z = v128_mulw32(x, y);
return v128_add64(v128_add64(x, y), v128_add64(z, z));
static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
{
const v128u64_t z = v128_mulw32( x, y );
return (v128u32_t)v128_add64( v128_add64( (v128u64_t)x, (v128u64_t)y ),
v128_add64( z, z ) );
}
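fBlaMka is the vectorized form of Argon2's BlaMka primitive, f(x, y) = x + y + 2 * lo32(x) * lo32(y), applied per 64-bit lane (v128_mulw32 is the widening 32x32->64 multiply). A scalar reference, for orientation:

    #include <stdint.h>

    // Scalar BlaMka: widening multiply of the low 32 bits of each input,
    // then x + y + 2*m.  This is what fBlaMka computes in each lane.
    static inline uint64_t blamka_scalar( uint64_t x, uint64_t y )
    {
        uint64_t m = (uint64_t)(uint32_t)x * (uint32_t)y;
        return x + y + 2 * m;
    }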
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = v128_xor(D0, A0); \
D1 = v128_xor(D1, A1); \
\
D0 = v128_ror64(D0, 32); \
D1 = v128_ror64(D1, 32); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = v128_xor(B0, C0); \
B1 = v128_xor(B1, C1); \
\
B0 = v128_ror64(B0, 24); \
B1 = v128_ror64(B1, 24); \
} while ((void)0, 0)
#define G1( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
A0 = fBlaMka( A0, B0 ); \
A1 = fBlaMka( A1, B1 ); \
D0 = v128_xor( D0, A0 ); \
D1 = v128_xor( D1, A1 ); \
D0 = v128_ror64( D0, 32 ); \
D1 = v128_ror64( D1, 32 ); \
C0 = fBlaMka( C0, D0 ); \
C1 = fBlaMka( C1, D1 ); \
B0 = v128_xor( B0, C0 ); \
B1 = v128_xor( B1, C1 ); \
B0 = v128_ror64( B0, 24 ); \
B1 = v128_ror64( B1, 24 ); \
}

#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = v128_xor(D0, A0); \
D1 = v128_xor(D1, A1); \
\
D0 = v128_ror64(D0, 16); \
D1 = v128_ror64(D1, 16); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = v128_xor(B0, C0); \
B1 = v128_xor(B1, C1); \
\
B0 = v128_ror64(B0, 63); \
B1 = v128_ror64(B1, 63); \
} while ((void)0, 0)
#define G2( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
A0 = fBlaMka( A0, B0 ); \
A1 = fBlaMka( A1, B1 ); \
D0 = v128_xor( D0, A0 ); \
D1 = v128_xor( D1, A1 ); \
D0 = v128_ror64( D0, 16 ); \
D1 = v128_ror64( D1, 16 ); \
C0 = fBlaMka( C0, D0 ); \
C1 = fBlaMka( C1, D1 ); \
B0 = v128_xor( B0, C0 ); \
B1 = v128_xor( B1, C1 ); \
B0 = v128_ror64( B0, 63 ); \
B1 = v128_ror64( B1, 63 ); \
}

#if defined(__SSSE3__) || defined(__ARM_NEON)
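Taken together, G1 and G2 are the two halves of the BLAKE2b G function with BlaMka in place of plain addition (rotations 32 and 24, then 16 and 63); each macro works on two lane pairs at once. A scalar equivalent of one full column/diagonal step, using blamka_scalar from the sketch above:

    static inline uint64_t ror64( uint64_t x, int r )
    {
        return ( x >> r ) | ( x << (64 - r) );
    }

    // First half = G1, second half = G2.
    static inline void g_scalar( uint64_t *a, uint64_t *b,
                                 uint64_t *c, uint64_t *d )
    {
        *a = blamka_scalar( *a, *b );  *d = ror64( *d ^ *a, 32 );
        *c = blamka_scalar( *c, *d );  *b = ror64( *b ^ *c, 24 );
        *a = blamka_scalar( *a, *b );  *d = ror64( *d ^ *a, 16 );
        *c = blamka_scalar( *c, *d );  *b = ror64( *b ^ *c, 63 );
    }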
@@ -6,15 +6,15 @@

#if defined (BLAKE_4WAY)

blake256r14_4way_context blake_4w_ctx;
blake256r14_4x32_context blake_4w_ctx;

void blakehash_4way(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r14_4way_context ctx;
blake256r14_4x32_context ctx;
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
blake256r14_4way_update( &ctx, input + (64<<2), 16 );
blake256r14_4way_close( &ctx, vhash );
blake256r14_4x32_update( &ctx, input + (64<<2), 16 );
blake256r14_4x32_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
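dintrlv_4x32 de-interleaves the 4-lane vectorized result back into four contiguous 256-bit hashes: in the interleaved buffer, 32-bit word i of lane l sits at index 4*i + l. A plain-C reference of that layout (the real helper is vectorized; the _ref suffix marks this as an illustration):

    #include <stdint.h>

    // Scatter an interleaved 4-lane buffer into per-lane outputs.
    // bits is the per-lane output size; 256 bits = 8 words per lane.
    static void dintrlv_4x32_ref( uint32_t *d0, uint32_t *d1,
                                  uint32_t *d2, uint32_t *d3,
                                  const uint32_t *v, int bits )
    {
        for ( int i = 0; i < bits / 32; i++ )
        {
            d0[i] = v[ 4*i     ];
            d1[i] = v[ 4*i + 1 ];
            d2[i] = v[ 4*i + 2 ];
            d3[i] = v[ 4*i + 3 ];
        }
    }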
@@ -35,11 +35,11 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
HTarget = 0x7f;

v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r14_4way_init( &blake_4w_ctx );
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
blake256r14_4x32_init( &blake_4w_ctx );
blake256r14_4x32_update( &blake_4w_ctx, vdata, 64 );

do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );

blakehash_4way( hash, vdata );

@@ -61,15 +61,15 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,

#if defined(BLAKE_8WAY)

blake256r14_8way_context blake_8w_ctx;
blake256r14_8x32_context blake_8w_ctx;

void blakehash_8way( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256r14_8way_context ctx;
blake256r14_8x32_context ctx;
memcpy( &ctx, &blake_8w_ctx, sizeof ctx );
blake256r14_8way( &ctx, input + (64<<3), 16 );
blake256r14_8way_close( &ctx, vhash );
blake256r14_8x32( &ctx, input + (64<<3), 16 );
blake256r14_8x32_close( &ctx, vhash );
_dintrlv_8x32( state, state+ 32, state+ 64, state+ 96,
state+128, state+160, state+192, state+224,
vhash, 256 );
@@ -93,8 +93,8 @@ int scanhash_blake_8way( struct work *work, uint32_t max_nonce,

mm256_bswap32_intrlv80_8x32( vdata, pdata );

blake256r14_8way_init( &blake_8w_ctx );
blake256r14_8way( &blake_8w_ctx, vdata, 64 );
blake256r14_8x32_init( &blake_8w_ctx );
blake256r14_8x32( &blake_8w_ctx, vdata, 64 );

do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
@@ -423,33 +423,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
(state)->T1 = T1; \
} while (0)

#if defined(__SSSE3__)

#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ); \
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
M3 = _mm_shuffle_epi8( buf[ 3], shuf_bswap32 ); \
M4 = _mm_shuffle_epi8( buf[ 4], shuf_bswap32 ); \
M5 = _mm_shuffle_epi8( buf[ 5], shuf_bswap32 ); \
M6 = _mm_shuffle_epi8( buf[ 6], shuf_bswap32 ); \
M7 = _mm_shuffle_epi8( buf[ 7], shuf_bswap32 ); \
M8 = _mm_shuffle_epi8( buf[ 8], shuf_bswap32 ); \
M9 = _mm_shuffle_epi8( buf[ 9], shuf_bswap32 ); \
MA = _mm_shuffle_epi8( buf[10], shuf_bswap32 ); \
MB = _mm_shuffle_epi8( buf[11], shuf_bswap32 ); \
MC = _mm_shuffle_epi8( buf[12], shuf_bswap32 ); \
MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
}

#else // SSE2

#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
M0 = v128_bswap32( buf[0] ); \
@@ -470,8 +443,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
MF = v128_bswap32( buf[15] ); \
}

#endif // SSSE3 else SSE2
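In the removed SSSE3 path, the constant 0x0c0d0e0f08090a0b / 0x0405060700010203 is a byte-permutation mask: each mask byte names the source byte for that destination position, so every 32-bit lane comes out byte-reversed. A standalone illustration of the same trick:

    #include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8

    // Byte-swap each of the four 32-bit lanes of x.  Reading the mask
    // from the low byte up: 3,2,1,0 | 7,6,5,4 | 11,10,9,8 | 15,14,13,12.
    static inline __m128i bswap32x4( __m128i x )
    {
        const __m128i mask = _mm_set_epi64x( 0x0c0d0e0f08090a0bULL,
                                             0x0405060700010203ULL );
        return _mm_shuffle_epi8( x, mask );
    }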
#define COMPRESS32_4X32( rounds ) \
{ \
v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
@@ -926,22 +897,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
ROUND_S_4X32_3;
}

#if defined(__SSSE3__)

const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );

H[0] = _mm_shuffle_epi8( mm128_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm_shuffle_epi8( mm128_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm_shuffle_epi8( mm128_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm_shuffle_epi8( mm128_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm_shuffle_epi8( mm128_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm_shuffle_epi8( mm128_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm_shuffle_epi8( mm128_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm_shuffle_epi8( mm128_xor3( VF, V7, h[7] ), shuf_bswap32 );

#else

H[0] = v128_bswap32( v128_xor3( V8, V0, h[0] ) );
H[1] = v128_bswap32( v128_xor3( V9, V1, h[1] ) );
H[2] = v128_bswap32( v128_xor3( VA, V2, h[2] ) );
@@ -950,8 +905,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
H[5] = v128_bswap32( v128_xor3( VD, V5, h[5] ) );
H[6] = v128_bswap32( v128_xor3( VE, V6, h[6] ) );
H[7] = v128_bswap32( v128_xor3( VF, V7, h[7] ) );

#endif
}

#if defined (__AVX2__)
@@ -1291,24 +1244,22 @@ do { \
VD = v256_32( T0 ^ 0x299F31D0 ); \
VE = v256_32( T1 ^ 0x082EFA98 ); \
VF = v256_32( T1 ^ 0xEC4E6C89 ); \
const __m256i shuf_bswap32 = mm256_set2_64( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
M0 = mm256_bswap_32( * buf ); \
M1 = mm256_bswap_32( *(buf+ 1) ); \
M2 = mm256_bswap_32( *(buf+ 2) ); \
M3 = mm256_bswap_32( *(buf+ 3) ); \
M4 = mm256_bswap_32( *(buf+ 4) ); \
M5 = mm256_bswap_32( *(buf+ 5) ); \
M6 = mm256_bswap_32( *(buf+ 6) ); \
M7 = mm256_bswap_32( *(buf+ 7) ); \
M8 = mm256_bswap_32( *(buf+ 8) ); \
M9 = mm256_bswap_32( *(buf+ 9) ); \
MA = mm256_bswap_32( *(buf+10) ); \
MB = mm256_bswap_32( *(buf+11) ); \
MC = mm256_bswap_32( *(buf+12) ); \
MD = mm256_bswap_32( *(buf+13) ); \
ME = mm256_bswap_32( *(buf+14) ); \
MF = mm256_bswap_32( *(buf+15) ); \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
@@ -1401,7 +1352,7 @@ do { \
H7 = mm256_xor3( VF, V7, H7 ); \
}

void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
void *data )
{
__m256i *M = (__m256i*)data;
@@ -1491,7 +1442,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
_mm256_xor_si256( v256_32( CSE ), M[15] ) );
}

void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds )
{
__m256i *H = (__m256i*)final_hash;
@@ -1596,22 +1547,19 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
ROUND256_8WAY_3;
}

const __m256i shuf_bswap32 =
mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );

H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 );
H[0] = mm256_bswap_32( mm256_xor3( V8, V0, h[0] ) );
H[1] = mm256_bswap_32( mm256_xor3( V9, V1, h[1] ) );
H[2] = mm256_bswap_32( mm256_xor3( VA, V2, h[2] ) );
H[3] = mm256_bswap_32( mm256_xor3( VB, V3, h[3] ) );
H[4] = mm256_bswap_32( mm256_xor3( VC, V4, h[4] ) );
H[5] = mm256_bswap_32( mm256_xor3( VD, V5, h[5] ) );
H[6] = mm256_bswap_32( mm256_xor3( VE, V6, h[6] ) );
H[7] = mm256_bswap_32( mm256_xor3( VF, V7, h[7] ) );
}

#endif

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

///////////////////////////////////////
//
@@ -1933,8 +1881,6 @@ do { \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -1951,22 +1897,22 @@ do { \
VD = v512_32( T0 ^ 0x299F31D0 ); \
VE = v512_32( T1 ^ 0x082EFA98 ); \
VF = v512_32( T1 ^ 0xEC4E6C89 ); \
M0 = _mm512_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
M0 = mm512_bswap_32( * buf ); \
M1 = mm512_bswap_32( *(buf+ 1) ); \
M2 = mm512_bswap_32( *(buf+ 2) ); \
M3 = mm512_bswap_32( *(buf+ 3) ); \
M4 = mm512_bswap_32( *(buf+ 4) ); \
M5 = mm512_bswap_32( *(buf+ 5) ); \
M6 = mm512_bswap_32( *(buf+ 6) ); \
M7 = mm512_bswap_32( *(buf+ 7) ); \
M8 = mm512_bswap_32( *(buf+ 8) ); \
M9 = mm512_bswap_32( *(buf+ 9) ); \
MA = mm512_bswap_32( *(buf+10) ); \
MB = mm512_bswap_32( *(buf+11) ); \
MC = mm512_bswap_32( *(buf+12) ); \
MD = mm512_bswap_32( *(buf+13) ); \
ME = mm512_bswap_32( *(buf+14) ); \
MF = mm512_bswap_32( *(buf+15) ); \
ROUND_S_16WAY(0); \
ROUND_S_16WAY(1); \
ROUND_S_16WAY(2); \
@@ -2063,7 +2009,7 @@ do { \
// is constant for every nonce and only needs to be run once per job. The
// second part is run for each nonce using the precalculated midstate and the
// hash from the first block.
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
void *data )
{
__m512i *M = (__m512i*)data;
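The comment above describes the standard midstate optimization: the first 64-byte block of the 80-byte header never changes within a job, so its work is hoisted out of the nonce loop. In outline (a sketch of the calling pattern, not the project's exact control flow; set_lane_nonces is a hypothetical helper):

    // Once per job: precompute the nonce-independent part of round 0
    // from the first block's hash (midhash) and the second block data.
    static void scan_job_sketch( void *vdata, void *midstate, void *midhash,
                                 void *hash, uint32_t n, uint32_t max_nonce )
    {
       blake256_16x32_round0_prehash_le( midstate, midhash, vdata );
       do
       {
          set_lane_nonces( vdata, n );   // hypothetical: write 16 lane nonces
          blake256_16x32_final_rounds_le( hash, midstate, midhash, vdata, 14 );
          n += 16;                       // 16 lanes hashed per pass
       } while ( n < max_nonce );
    }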
@@ -2157,7 +2103,7 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
}

// Default is 14 rounds, blakecoin & vanilla are 8.
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds )
{
__m512i *H = (__m512i*)final_hash;
@@ -2274,27 +2220,23 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
}

// Byte swap final hash
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 );
H[0] = mm512_bswap_32( mm512_xor3( V8, V0, h[0] ) );
H[1] = mm512_bswap_32( mm512_xor3( V9, V1, h[1] ) );
H[2] = mm512_bswap_32( mm512_xor3( VA, V2, h[2] ) );
H[3] = mm512_bswap_32( mm512_xor3( VB, V3, h[3] ) );
H[4] = mm512_bswap_32( mm512_xor3( VC, V4, h[4] ) );
H[5] = mm512_bswap_32( mm512_xor3( VD, V5, h[5] ) );
H[6] = mm512_bswap_32( mm512_xor3( VE, V6, h[6] ) );
H[7] = mm512_bswap_32( mm512_xor3( VF, V7, h[7] ) );
}

#endif

// Blake-256 4 way

static const uint32_t salt_zero_4x32_small[4] = { 0, 0, 0, 0 };

static void
blake32_4x32_init( blake_4x32_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
int rounds )
{
casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
@@ -2404,11 +2346,10 @@ blake32_4x32_close( blake_4x32_small_context *ctx, unsigned ub, unsigned n,

// Blake-256 8 way

static const uint32_t salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };

static void
blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
const uint32_t *salt, int rounds )
blake32_8way_init( blake256_8x32_context *sc, const uint32_t *iv,
int rounds )
{
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E6676A09E667 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE85BB67AE85 );
@@ -2424,7 +2365,7 @@ blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
}

static void
blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
blake32_8way( blake256_8x32_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
@@ -2466,7 +2407,7 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
}

static void
blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
blake32_8way_close( blake256_8x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m256i buf[16];
@@ -2520,7 +2461,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
}

static void
blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
blake32_8way_le( blake256_8x32_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
@@ -2562,7 +2503,7 @@ blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
}

static void
blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
blake32_8way_close_le( blake256_8x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m256i buf[16];
@@ -2617,13 +2558,13 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
#endif

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

// Blake-256 16 way AVX512

static void
blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
const uint32_t *salt, int rounds )
blake32_16way_init( blake256_16x32_context *sc, const uint32_t *iv,
int rounds )
{
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E6676A09E667 );
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE85BB67AE85 );
@@ -2639,7 +2580,7 @@ blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
}

static void
blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
blake32_16way( blake256_16x32_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
@@ -2679,7 +2620,7 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
sc->ptr = ptr;
}
static void
blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
blake32_16way_close( blake256_16x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m512i buf[16];
@@ -2733,7 +2674,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
}

static void
blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
blake32_16way_le( blake256_16x32_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
@@ -2776,7 +2717,7 @@ blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
}

static void
blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
blake32_16way_close_le( blake256_16x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m512i buf[16];
@@ -2827,65 +2768,65 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
}

void
blake256_16way_init(void *cc)
blake256_16x32_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_16way_init( cc, IV256, 14 );
}

void
blake256_16way_update(void *cc, const void *data, size_t len)
blake256_16x32_update(void *cc, const void *data, size_t len)
{
blake32_16way(cc, data, len);
}

void
blake256_16way_close(void *cc, void *dst)
blake256_16x32_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}

void
blake256_16way_update_le(void *cc, const void *data, size_t len)
blake256_16x32_update_le(void *cc, const void *data, size_t len)
{
blake32_16way_le(cc, data, len);
}

void
blake256_16way_close_le(void *cc, void *dst)
blake256_16x32_close_le(void *cc, void *dst)
{
blake32_16way_close_le(cc, 0, 0, dst, 8);
}

void blake256r14_16way_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_16way_init( cc, IV256, 14 );
}

void
blake256r14_16way_update(void *cc, const void *data, size_t len)
blake256r14_16x32_update(void *cc, const void *data, size_t len)
{
blake32_16way(cc, data, len);
}

void
blake256r14_16way_close(void *cc, void *dst)
blake256r14_16x32_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}

void blake256r8_16way_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 );
blake32_16way_init( cc, IV256, 8 );
}

void
blake256r8_16way_update(void *cc, const void *data, size_t len)
blake256r8_16x32_update(void *cc, const void *data, size_t len)
{
blake32_16way(cc, data, len);
}

void
blake256r8_16way_close(void *cc, void *dst)
blake256r8_16x32_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
@@ -2898,7 +2839,7 @@ blake256r8_16way_close(void *cc, void *dst)
void
blake256_4x32_init(void *ctx)
{
blake32_4x32_init( ctx, IV256, salt_zero_4x32_small, 14 );
blake32_4x32_init( ctx, IV256, 14 );
}

void
@@ -2918,31 +2859,31 @@ blake256_4x32_close(void *ctx, void *dst)
// Blake-256 8 way

void
blake256_8way_init(void *cc)
blake256_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_8way_init( cc, IV256, 14 );
}

void
blake256_8way_update(void *cc, const void *data, size_t len)
blake256_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}

void
blake256_8way_close(void *cc, void *dst)
blake256_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}

void
blake256_8way_update_le(void *cc, const void *data, size_t len)
blake256_8x32_update_le(void *cc, const void *data, size_t len)
{
blake32_8way_le(cc, data, len);
}

void
blake256_8way_close_le(void *cc, void *dst)
blake256_8x32_close_le(void *cc, void *dst)
{
blake32_8way_close_le(cc, 0, 0, dst, 8);
}
@@ -2952,7 +2893,7 @@ blake256_8way_close_le(void *cc, void *dst)
// 14 rounds Blake, Decred
void blake256r14_4x32_init(void *cc)
{
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 14 );
blake32_4x32_init( cc, IV256, 14 );
}

void
@@ -2969,19 +2910,19 @@ blake256r14_4x32_close(void *cc, void *dst)

#if defined(__AVX2__)

void blake256r14_8way_init(void *cc)
void blake256r14_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_8way_init( cc, IV256, 14 );
}

void
blake256r14_8way_update(void *cc, const void *data, size_t len)
blake256r14_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}

void
blake256r14_8way_close(void *cc, void *dst)
blake256r14_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}
@@ -2991,7 +2932,7 @@ blake256r14_8way_close(void *cc, void *dst)
// 8 rounds Blakecoin, Vanilla
void blake256r8_4x32_init(void *cc)
{
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 8 );
blake32_4x32_init( cc, IV256, 8 );
}

void
@@ -3008,19 +2949,19 @@ blake256r8_4x32_close(void *cc, void *dst)

#if defined (__AVX2__)

void blake256r8_8way_init(void *cc)
void blake256r8_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 8 );
blake32_8way_init( cc, IV256, 8 );
}

void
blake256r8_8way_update(void *cc, const void *data, size_t len)
blake256r8_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}

void
blake256r8_8way_close(void *cc, void *dst)
blake256r8_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}
@@ -29,13 +29,6 @@ typedef struct
 
 void blake256_transform_le( uint32_t *H, const uint32_t *buf,
                             const uint32_t T0, const uint32_t T1, int rounds );
-/*
-void blake256_init( blake256_context *sc );
-void blake256_update( blake256_context *sc, const void *data, size_t len );
-void blake256_close( blake256_context *sc, void *dst );
-void blake256_full( blake256_context *sc, void *dst, const void *data,
-                    size_t len );
-*/
 
 //////////////////////////////////
 //
@@ -55,6 +48,10 @@ typedef blake_4x32_small_context blake256_4x32_context;
 void blake256_4x32_init(void *ctx);
 void blake256_4x32_update(void *ctx, const void *data, size_t len);
 void blake256_4x32_close(void *ctx, void *dst);
+void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
+                                      void *data );
+void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
+                     const void *midhash, const void *data, const int rounds );
 
 // 14 rounds
 typedef blake_4x32_small_context blake256r14_4x32_context;
@@ -68,29 +65,6 @@ void blake256r8_4x32_init(void *cc);
 void blake256r8_4x32_update(void *cc, const void *data, size_t len);
 void blake256r8_4x32_close(void *cc, void *dst);
 
-void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
-                                      void *data );
-void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
-                     const void *midhash, const void *data, const int rounds );
-
-#define blake_4way_small_context        blake256_4x32_context
-#define blake256_4way_context           blake256_4x32_context
-#define blake256_4way_init              blake256_4x32_init
-#define blake256_4way_update            blake256_4x32_update
-#define blake256_4way_close             blake256_4x32_close
-#define blake256_4way_update_le         blake256_4x32_update_le
-#define blake256_4way_close_le          blake256_4x32_close_le
-#define blake256_4way_round0_prehash_le blake256_4x32_round0_prehash_le
-#define blake256_4way_final_rounds_le   blake256_4x32_final_rounds_le
-#define blake256r14_4way_context        blake256r14_4x32_context
-#define blake256r14_4way_init           blake256r14_4x32_init
-#define blake256r14_4way_update         blake256r14_4x32_update
-#define blake256r14_4way_close          blake256r14_4x32_close
-#define blake256r8_4way_context         blake256r14_4x32_context
-#define blake256r8_4way_init            blake256r14_4x32_init
-#define blake256r8_4way_update          blake256r14_4x32_update
-#define blake256r8_4way_close           blake256r14_4x32_close
 
 #ifdef __AVX2__
 
 //////////////////////////////
@@ -107,47 +81,30 @@ typedef struct
 } blake_8way_small_context;
 
 // Default 14 rounds
-typedef blake_8way_small_context blake256_8way_context;
-void blake256_8way_init(void *cc);
-void blake256_8way_update(void *cc, const void *data, size_t len);
-void blake256_8way_close(void *cc, void *dst);
-void blake256_8way_update_le(void *cc, const void *data, size_t len);
-void blake256_8way_close_le(void *cc, void *dst);
-void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
+typedef blake_8way_small_context blake256_8x32_context;
+void blake256_8x32_init(void *cc);
+void blake256_8x32_update(void *cc, const void *data, size_t len);
+void blake256_8x32_close(void *cc, void *dst);
+void blake256_8x32_update_le(void *cc, const void *data, size_t len);
+void blake256_8x32_close_le(void *cc, void *dst);
+void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
                                       void *data );
-void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
+void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
                     const void *midhash, const void *data, const int rounds );
 
 // 14 rounds, blake, decred
-typedef blake_8way_small_context blake256r14_8way_context;
-void blake256r14_8way_init(void *cc);
-void blake256r14_8way_update(void *cc, const void *data, size_t len);
-void blake256r14_8way_close(void *cc, void *dst);
+typedef blake_8way_small_context blake256r14_8x32_context;
+void blake256r14_8x32_init(void *cc);
+void blake256r14_8x32_update(void *cc, const void *data, size_t len);
+void blake256r14_8x32_close(void *cc, void *dst);
 
 // 8 rounds, blakecoin, vanilla
-typedef blake_8way_small_context blake256r8_8way_context;
-void blake256r8_8way_init(void *cc);
-void blake256r8_8way_update(void *cc, const void *data, size_t len);
-void blake256r8_8way_close(void *cc, void *dst);
+typedef blake_8way_small_context blake256r8_8x32_context;
+void blake256r8_8x32_init(void *cc);
+void blake256r8_8x32_update(void *cc, const void *data, size_t len);
+void blake256r8_8x32_close(void *cc, void *dst);
 
-#define blake_8x32_small_context      blake256_8way_context
-#define blake_8x32_init               blake256_8way_init
-#define blake_8x32_update             blake256_8way_update
-#define blake_8x32_close              blake256_8way_close
-#define blake_8x32_update_le          blake256_8way_update_le
-#define blake_8x32_close_le           blake256_8way_close_le
-#define blake_8x32_round0_prehash_le  blake256_8way_round0_prehash
-#define blake_8x32_final_rounds_le    blake256_8way_final_rounds_le
-#define blake256r14_8x32_context      blake256r14_8way_context
-#define blake256r14_8x32_init         blake256r14_8way_init
-#define blake256r14_8x32_update       blake256r14_8way_update
-#define blake256r14_8x32_close        blake256r14_8way_close
-#define blake256r8_8x32_context       blake256r14_8way_context
-#define blake256r8_8x32_init          blake256r14_8way_init
-#define blake256r8_8x32_update        blake256r14_8way_update
-#define blake256r8_8x32_close         blake256r14_8way_close
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 ///////////////////////////////////
 //
@@ -163,46 +120,29 @@ typedef struct
 } blake_16way_small_context __attribute__ ((aligned (128)));
 
 // Default 14 rounds
-typedef blake_16way_small_context blake256_16way_context;
-void blake256_16way_init(void *cc);
-void blake256_16way_update(void *cc, const void *data, size_t len);
-void blake256_16way_close(void *cc, void *dst);
+typedef blake_16way_small_context blake256_16x32_context;
+void blake256_16x32_init(void *cc);
+void blake256_16x32_update(void *cc, const void *data, size_t len);
+void blake256_16x32_close(void *cc, void *dst);
 // Expects data in little endian order, no byte swap needed
-void blake256_16way_update_le(void *cc, const void *data, size_t len);
-void blake256_16way_close_le(void *cc, void *dst);
-void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
+void blake256_16x32_update_le(void *cc, const void *data, size_t len);
+void blake256_16x32_close_le(void *cc, void *dst);
+void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
                                        void *data );
-void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
+void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
                     const void *midhash, const void *data, const int rounds );
 
 // 14 rounds, blake, decred
-typedef blake_16way_small_context blake256r14_16way_context;
-void blake256r14_16way_init(void *cc);
-void blake256r14_16way_update(void *cc, const void *data, size_t len);
-void blake256r14_16way_close(void *cc, void *dst);
+typedef blake_16way_small_context blake256r14_16x32_context;
+void blake256r14_16x32_init(void *cc);
+void blake256r14_16x32_update(void *cc, const void *data, size_t len);
+void blake256r14_16x32_close(void *cc, void *dst);
 
 // 8 rounds, blakecoin, vanilla
-typedef blake_16way_small_context blake256r8_16way_context;
-void blake256r8_16way_init(void *cc);
-void blake256r8_16way_update(void *cc, const void *data, size_t len);
-void blake256r8_16way_close(void *cc, void *dst);
-
-#define blake_16x32_small_context     blake256_16way_context
-#define blake_16x32_init              blake256_16way_init
-#define blake_16x32_update            blake256_16way_update
-#define blake_16x32_close             blake256_16way_close
-#define blake_16x32_update_le         blake256_16way_update_le
-#define blake_16x32_close_le          blake256_16way_close_le
-#define blake_16x32_round0_prehash_le blake256_16way_round0_prehash
-#define blake_16x32_final_rounds_le   blake256_16way_final_rounds_le
-#define blake256r14_16x32_context     blake256r14_16way_context
-#define blake256r14_16x32_init        blake256r14_16way_init
-#define blake256r14_16x32_update      blake256r14_16way_update
-#define blake256r14_16x32_close       blake256r14_16way_close
-#define blake256r8_16x32_context      blake256r8_16way_context
-#define blake256r8_16x32_init         blake256r8_16way_init
-#define blake256r8_16x32_update       blake256r8_16way_update
-#define blake256r8_16x32_close        blake256r8_16way_close
+typedef blake_16way_small_context blake256r8_16x32_context;
+void blake256r8_16x32_init(void *cc);
+void blake256r8_16x32_update(void *cc, const void *data, size_t len);
+void blake256r8_16x32_close(void *cc, void *dst);
 
 #endif // AVX512
 #endif // AVX2
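The renames in this header follow a lanes-by-word-width scheme: *_8way becomes *_8x32 (eight interleaved 32-bit lanes) for BLAKE-256, and the 64-bit code further down uses *_4x64 (four 64-bit lanes). With the old *_4way compatibility defines removed, call sites use the new names directly. A minimal sketch of a call site, assuming this header's file name (not confirmed by the diff):

   #include "blake-hash-4way.h"   // assumed header name

   // One 80-byte block across four interleaved 32-bit lanes.
   void blake256_hash80_4x32( void *digest, const void *data )
   {
      blake256_4x32_context ctx;
      blake256_4x32_init( &ctx );
      blake256_4x32_update( &ctx, data, 80 );
      blake256_4x32_close( &ctx, digest );
   }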
@@ -226,7 +226,7 @@ static const uint8_t sigma[12][16] =
 #define Mx_(n)    Mx__(n)
 #define Mx__(n)   M ## n
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 #define B2B8W_G(a, b, c, d, x, y) \
 { \
@@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] =
    v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
 }
 
-static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
+static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last )
 {
    __m512i v[16], m[16];
 
@@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
    ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
 }
 
-int blake2b_8way_init( blake2b_8way_ctx *ctx )
+int blake2b_8x64_init( blake2b_8x64_ctx *ctx )
 {
    size_t i;
 
@@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
 }
 
 
-void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
+void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
                           size_t inlen )
 {
    __m512i* in =(__m512i*)input;
@@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
          ctx->t[0] += ctx->c;
          if ( ctx->t[0] < ctx->c )
             ctx->t[1]++;
-         blake2b_8way_compress( ctx, 0 );
+         blake2b_8x64_compress( ctx, 0 );
          ctx->c = 0;
       }
       ctx->b[ c++ ] = in[i];
@@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
    }
 }
 
-void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
+void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out )
 {
    size_t c;
    c = ctx->c >> 3;
@@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
       ctx->c += 8;
    }
 
-   blake2b_8way_compress( ctx, 1 );   // final block flag = 1
+   blake2b_8x64_compress( ctx, 1 );   // final block flag = 1
 
    casti_m512i( out, 0 ) = ctx->h[0];
    casti_m512i( out, 1 ) = ctx->h[1];
@@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = {
 };
 */
 
-static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
+static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last )
 {
    __m256i v[16], m[16];
 
@@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
    ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
 }
 
-int blake2b_4way_init( blake2b_4way_ctx *ctx )
+int blake2b_4x64_init( blake2b_4x64_ctx *ctx )
 {
    size_t i;
 
@@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
    return 0;
 }
 
-void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
                           size_t inlen )
 {
    __m256i* in =(__m256i*)input;
@@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
          ctx->t[0] += ctx->c;
         if ( ctx->t[0] < ctx->c )
            ctx->t[1]++;
-         blake2b_4way_compress( ctx, 0 );
+         blake2b_4x64_compress( ctx, 0 );
          ctx->c = 0;
       }
       ctx->b[ c++ ] = in[i];
@@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
    }
 }
 
-void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
+void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out )
 {
    size_t c;
    c = ctx->c >> 3;
@@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
      ctx->c += 8;
    }
 
-   blake2b_4way_compress( ctx, 1 );   // final block flag = 1
+   blake2b_4x64_compress( ctx, 1 );   // final block flag = 1
 
    casti_m256i( out, 0 ) = ctx->h[0];
    casti_m256i( out, 1 ) = ctx->h[1];
@@ -1,6 +1,6 @@
 #pragma once
-#ifndef __BLAKE2B_HASH_4WAY_H__
-#define __BLAKE2B_HASH_4WAY_H__
+#ifndef BLAKE2B_HASH_4WAY_H__
+#define BLAKE2B_HASH_4WAY_H__
 
 #include "simd-utils.h"
 #include <stddef.h>
@@ -14,8 +14,7 @@
 #define ALIGN(x) __attribute__((aligned(x)))
 #endif
 
-
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct ALIGN( 64 ) {
    __m512i b[16];   // input buffer
@@ -23,12 +22,12 @@ typedef struct ALIGN( 64 ) {
    uint64_t t[2];   // total number of bytes
    size_t c;        // pointer for b[]
    size_t outlen;   // digest size
-} blake2b_8way_ctx;
+} blake2b_8x64_ctx;
 
-int blake2b_8way_init( blake2b_8way_ctx *ctx );
-void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
+int blake2b_8x64_init( blake2b_8x64_ctx *ctx );
+void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
                           size_t inlen );
-void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
+void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
 
 #endif
 
@@ -41,12 +40,12 @@ typedef struct ALIGN( 64 ) {
    uint64_t t[2];   // total number of bytes
    size_t c;        // pointer for b[]
    size_t outlen;   // digest size
-} blake2b_4way_ctx;
+} blake2b_4x64_ctx;
 
-int blake2b_4way_init( blake2b_4way_ctx *ctx );
-void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+int blake2b_4x64_init( blake2b_4x64_ctx *ctx );
+void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
                           size_t inlen );
-void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
+void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
 
 #endif
@@ -3,7 +3,7 @@
 #include <stdint.h>
 #include "blake2b-hash.h"
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   #define BLAKE2B_8WAY
 #elif defined(__AVX2__)
   #define BLAKE2B_4WAY
@@ -17,7 +17,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
    uint32_t hash[8*8] __attribute__ ((aligned (128)));;
    uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
+   blake2b_8x64_ctx ctx __attribute__ ((aligned (64)));
    uint32_t *hash7 = &(hash[49]);   // 3*16+1
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
@@ -35,9 +35,9 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
                  _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                                    n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
 
-      blake2b_8way_init( &ctx );
-      blake2b_8way_update( &ctx, vdata, 80 );
-      blake2b_8way_final( &ctx, hash );
+      blake2b_8x64_init( &ctx );
+      blake2b_8x64_update( &ctx, vdata, 80 );
+      blake2b_8x64_final( &ctx, hash );
 
      for ( int lane = 0; lane < 8; lane++ )
      if ( hash7[ lane<<1 ] <= Htarg )
@@ -61,10 +61,10 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
 // Function not used, code inlined.
 void blake2b_4way_hash(void *output, const void *input)
 {
-   blake2b_4way_ctx ctx;
-   blake2b_4way_init( &ctx );
-   blake2b_4way_update( &ctx, input, 80 );
-   blake2b_4way_final( &ctx, output );
+   blake2b_4x64_ctx ctx;
+   blake2b_4x64_init( &ctx );
+   blake2b_4x64_update( &ctx, input, 80 );
+   blake2b_4x64_final( &ctx, output );
 }
 
 int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
@@ -73,7 +73,7 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
    uint32_t hash[8*4] __attribute__ ((aligned (64)));;
    uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
+   blake2b_4x64_ctx ctx __attribute__ ((aligned (32)));
    uint32_t *hash7 = &(hash[25]);   // 3*8+1
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
@@ -90,9 +90,9 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
                  _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
 
-      blake2b_4way_init( &ctx );
-      blake2b_4way_update( &ctx, vdata, 80 );
-      blake2b_4way_final( &ctx, hash );
+      blake2b_4x64_init( &ctx );
+      blake2b_4x64_update( &ctx, vdata, 80 );
+      blake2b_4x64_final( &ctx, hash );
 
      for ( int lane = 0; lane < 4; lane++ )
      if ( hash7[ lane<<1 ] <= Htarg )
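The `hash7` pointers above depend on the word-major lane interleaving: for N 64-bit lanes, 64-bit output word W of lane L sits at 64-bit index W*N+L, so viewed as 32-bit words, `&hash[3*16+1]` (8-way) and `&hash[3*8+1]` (4-way) address the high half of word 3 for lane 0, and `hash7[ lane<<1 ]` walks that same high half across lanes. A sketch of the arithmetic, assuming this little-endian layout (which matches the 3*16+1 and 3*8+1 comments):

   #include <stdint.h>

   // Hypothetical helper: high 32 bits of 64-bit output word 3 for one lane,
   // given an N-lane, word-major interleaved hash buffer.
   static inline uint32_t lane_word3_hi( const uint32_t *hash, int lanes,
                                         int lane )
   {
      return hash[ (3*lanes + lane)*2 + 1 ];   // == hash7[ lane<<1 ] above
   }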
@@ -497,7 +497,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
 
 #endif // __AVX2__
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // Blake2s-256 16 way
 
@@ -11,8 +8,8 @@
 * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
 */
 //#pragma once
-#ifndef __BLAKE2S_HASH_4WAY_H__
-#define __BLAKE2S_HASH_4WAY_H__ 1
+#ifndef BLAKE2S_HASH_4WAY_H__
+#define BLAKE2S_HASH_4WAY_H__ 1
 
 #if defined(__SSE2__) || defined(__ARM_NEON)
 
@@ -29,20 +29,20 @@
 #define ALIGN(x) __attribute__((aligned(x)))
 #endif
 
-typedef struct __blake2s_nway_param
-{
-   uint8_t  digest_length; // 1
-   uint8_t  key_length;    // 2
-   uint8_t  fanout;        // 3
-   uint8_t  depth;         // 4
-   uint32_t leaf_length;   // 8
-   uint8_t  node_offset[6];// 14
-   uint8_t  node_depth;    // 15
-   uint8_t  inner_length;  // 16
-   // uint8_t  reserved[0];
-   uint8_t  salt[8];       // 24
-   uint8_t  personal[8];   // 32
-} blake2s_nway_param;
+typedef struct __blake2s_nway_param
+{
+   uint8_t  digest_length; // 1
+   uint8_t  key_length;    // 2
+   uint8_t  fanout;        // 3
+   uint8_t  depth;         // 4
+   uint32_t leaf_length;   // 8
+   uint8_t  node_offset[6];// 14
+   uint8_t  node_depth;    // 15
+   uint8_t  inner_length;  // 16
+   // uint8_t  reserved[0];
+   uint8_t  salt[8];       // 24
+   uint8_t  personal[8];   // 32
+} blake2s_nway_param;
 
 typedef struct ALIGN( 64 ) __blake2s_4way_state
 {
@@ -61,13 +61,18 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
 int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
                               const void *input, uint64_t inlen );
 
+#define blake2s_4x32_state        blake2s_4way_state
+#define blake2s_4x32_init         blake2s_4way_init
+#define blake2s_4x32_update       blake2s_4way_update
+#define blake2s_4x32_final        blake2s_4way_final
+#define blake2s_4x32_full_blocks  blake2s_4way_full_blocks
+
 #if defined(__AVX2__)
 
 typedef struct ALIGN( 64 ) __blake2s_8way_state
 {
    __m256i h[8];
-   uint8_t buf[ 32 * 8 ];
+   uint8_t buf[ 64 * 8 ];
    uint32_t t[2];
    uint32_t f[2];
    size_t buflen;
@@ -81,14 +86,20 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
 int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
                               const void *input, uint64_t inlen );
 
+#define blake2s_8x32_state        blake2s_8way_state
+#define blake2s_8x32_init         blake2s_8way_init
+#define blake2s_8x32_update       blake2s_8way_update
+#define blake2s_8x32_final        blake2s_8way_final
+#define blake2s_8x32_full_blocks  blake2s_8way_full_blocks
+
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct ALIGN( 64 ) __blake2s_16way_state
 {
    __m512i h[8];
-   uint8_t buf[ 32 * 16 ];
+   uint8_t buf[ 64 * 16 ];
    uint32_t t[2];
    uint32_t f[2];
    size_t buflen;
@@ -100,6 +111,11 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
                           uint64_t inlen );
 int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
 
+#define blake2s_16x32_state   blake2s_16way_state
+#define blake2s_16x32_init    blake2s_16way_init
+#define blake2s_16x32_update  blake2s_16way_update
+#define blake2s_16x32_final   blake2s_16way_final
+
 #endif
 
 #if 0
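The buffer resizes above (32*8 to 64*8 and 32*16 to 64*16) appear to size the interleaved input buffer at one full 64-byte BLAKE2s block per lane; the old figure was only half a block. A sketch of the assumed sizing rule:

   enum { BLAKE2S_BLOCK_BYTES = 64 };

   // Presumed rationale: bytes needed to buffer one interleaved block
   // for N lanes.  8 lanes -> 512 (the new 64*8 above),
   //               16 lanes -> 1024 (the new 64*16 above).
   static inline unsigned lane_buf_bytes( unsigned lanes )
   {
      return BLAKE2S_BLOCK_BYTES * lanes;
   }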
@@ -3,7 +3,7 @@
 #include <string.h>
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   #define BLAKE2S_16WAY
 #elif defined(__AVX2__)
   #define BLAKE2S_8WAY
@@ -349,16 +349,16 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
    Va = v128_add64( Va, v128_add64( Vb, \
                        v128_set64( CBx( r, Sd ) ^ Mx( r, Sc ), \
                                    CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
-   Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
+   Vd = v128_ror64xor( Vd, Va, 32 ); \
    Vc = v128_add64( Vc, Vd ); \
-   Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
+   Vb = v128_ror64xor( Vb, Vc, 25 ); \
 \
    Va = v128_add64( Va, v128_add64( Vb, \
                        v128_set64( CBx( r, Sc ) ^ Mx( r, Sd ), \
                                    CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
-   Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
+   Vd = v128_ror64xor( Vd, Va, 16 ); \
    Vc = v128_add64( Vc, Vd ); \
-   Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
+   Vb = v128_ror64xor( Vb, Vc, 11 ); \
 }
 
 #define BLAKE512_ROUND( R ) \
@@ -465,6 +465,7 @@ void blake512_update(blake512_context *sc, const void *data, size_t len)
       {
          if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
            sc->T1 += 1;
+
         blake512_transform( sc->H, (uint64_t*)sc->buf, sc->T0, sc->T1 );
         sc->ptr = 0;
      }
@@ -479,6 +480,7 @@ void blake512_close( blake512_context *sc, void *dst )
    uint64_t th, tl;
 
    ptr = sc->ptr;
+   memcpy( buf, sc->buf, ptr );
    bit_len = ((unsigned)ptr << 3);
    buf[ptr] = 0x80;
    tl = sc->T0 + bit_len;
@@ -517,8 +519,6 @@ void blake512_close( blake512_context *sc, void *dst )
       *(uint64_t*)(buf + 120) = bswap_64( tl );
       blake512_update( sc, buf, 128 );
    }
 
-//TODO vectored bswap
-
    for ( k = 0; k < 8; k ++ )
       ((uint64_t*)dst)[k] = bswap_64( sc->H[k] );
@@ -559,7 +559,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
 
 #if defined(__AVX2__)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 ////////////////////////////////////
 //
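The recurring `if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 ) sc->T1 += 1;` idiom in these hunks maintains BLAKE-512's 128-bit message-bit counter split across two 64-bit words; the comparison detects unsigned wraparound of the low word. A scalar sketch of the same logic:

   #include <stdint.h>

   // Add one 128-byte (1024-bit) block to a 128-bit counter held as T1:T0.
   static inline void count_block_bits( uint64_t *T0, uint64_t *T1 )
   {
      *T0 += 1024;
      if ( *T0 < 1024 )   // low word wrapped past 2^64: carry into high word
         *T1 += 1;
   }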
@@ -617,24 +617,22 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
    VD = v512_64( CB5 ^ T0 ); \
    VE = v512_64( CB6 ^ T1 ); \
    VF = v512_64( CB7 ^ T1 ); \
-   const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64( \
-                                  0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
-   M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
-   M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
-   M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
-   M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
-   M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
-   M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
-   M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
-   M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
-   M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
-   M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
-   MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
-   MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
-   MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
-   MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
-   ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
-   MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
+   M0 = mm512_bswap_64( *(buf+ 0) ); \
+   M1 = mm512_bswap_64( *(buf+ 1) ); \
+   M2 = mm512_bswap_64( *(buf+ 2) ); \
+   M3 = mm512_bswap_64( *(buf+ 3) ); \
+   M4 = mm512_bswap_64( *(buf+ 4) ); \
+   M5 = mm512_bswap_64( *(buf+ 5) ); \
+   M6 = mm512_bswap_64( *(buf+ 6) ); \
+   M7 = mm512_bswap_64( *(buf+ 7) ); \
+   M8 = mm512_bswap_64( *(buf+ 8) ); \
+   M9 = mm512_bswap_64( *(buf+ 9) ); \
+   MA = mm512_bswap_64( *(buf+10) ); \
+   MB = mm512_bswap_64( *(buf+11) ); \
+   MC = mm512_bswap_64( *(buf+12) ); \
+   MD = mm512_bswap_64( *(buf+13) ); \
+   ME = mm512_bswap_64( *(buf+14) ); \
+   MF = mm512_bswap_64( *(buf+15) ); \
    ROUND_B_8WAY(0); \
    ROUND_B_8WAY(1); \
    ROUND_B_8WAY(2); \
@@ -661,7 +659,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
    H7 = mm512_xor3( VF, V7, H7 ); \
 }
 
-void blake512_8way_compress( blake_8way_big_context *sc )
+void blake512_8x64_compress( blake_8x64_big_context *sc )
 {
    __m512i M0, M1, M2, M3, M4, M5, M6, M7;
    __m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -685,25 +683,22 @@ void blake512_8way_compress( blake_8way_big_context *sc )
    VE = v512_64( CB6 ^ sc->T1 );
    VF = v512_64( CB7 ^ sc->T1 );
 
-   const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64(
-                                  0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
-
-   M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
-   M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
-   M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
-   M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
-   M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
-   M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
-   M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
-   M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
-   M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
-   M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
-   MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 );
-   MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 );
-   MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 );
-   MD = _mm512_shuffle_epi8( sc->buf[13], shuf_bswap64 );
-   ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 );
-   MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 );
+   M0 = mm512_bswap_64( sc->buf[ 0] );
+   M1 = mm512_bswap_64( sc->buf[ 1] );
+   M2 = mm512_bswap_64( sc->buf[ 2] );
+   M3 = mm512_bswap_64( sc->buf[ 3] );
+   M4 = mm512_bswap_64( sc->buf[ 4] );
+   M5 = mm512_bswap_64( sc->buf[ 5] );
+   M6 = mm512_bswap_64( sc->buf[ 6] );
+   M7 = mm512_bswap_64( sc->buf[ 7] );
+   M8 = mm512_bswap_64( sc->buf[ 8] );
+   M9 = mm512_bswap_64( sc->buf[ 9] );
+   MA = mm512_bswap_64( sc->buf[10] );
+   MB = mm512_bswap_64( sc->buf[11] );
+   MC = mm512_bswap_64( sc->buf[12] );
+   MD = mm512_bswap_64( sc->buf[13] );
+   ME = mm512_bswap_64( sc->buf[14] );
+   MF = mm512_bswap_64( sc->buf[15] );
 
    ROUND_B_8WAY(0);
    ROUND_B_8WAY(1);
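These hunks replace the hand-written `_mm512_shuffle_epi8` byte reversal with the `mm512_bswap_64` utility; the two forms should be equivalent. A sketch of the shuffle the removed lines spelled out, using only standard intrinsics (the mask below requires AVX512BW for the shuffle, which the SIMD512 guard implies):

   #include <immintrin.h>

   // Reverse the byte order of each 64-bit lane of a 512-bit vector.
   // The 16-byte pattern is the same mask the old code built with
   // mm512_bcast_m128( v128_set64( ... ) ).
   static inline __m512i bswap64_512( __m512i v )
   {
      const __m512i mask = _mm512_broadcast_i32x4( _mm_set_epi64x(
                              0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
      return _mm512_shuffle_epi8( v, mask );
   }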
@@ -733,7 +728,7 @@ void blake512_8way_compress( blake_8way_big_context *sc )
 }
 
 // won't be used after prehash implemented
-void blake512_8way_compress_le( blake_8x64_big_context *sc )
+void blake512_8x64_compress_le( blake_8x64_big_context *sc )
 {
    __m512i M0, M1, M2, M3, M4, M5, M6, M7;
    __m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1177,7 +1172,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
       {
          if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
            sc->T1 = sc->T1 + 1;
-         blake512_8way_compress( sc );
+         blake512_8x64_compress( sc );
         sc->ptr = 0;
      }
 
@@ -1213,7 +1208,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
    if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
      sc->T1 = sc->T1 + 1;
 
-   blake512_8way_compress( sc );
+   blake512_8x64_compress( sc );
 
    mm512_block_bswap_64( (__m512i*)dst, sc->H );
 }
@@ -1244,7 +1239,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
       {
          if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
            sc->T1 = sc->T1 + 1;
-         blake512_8way_compress_le( sc );
+         blake512_8x64_compress_le( sc );
         sc->ptr = 0;
      }
 
@@ -1280,7 +1275,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
    if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
      sc->T1 = sc->T1 + 1;
 
-   blake512_8way_compress_le( sc );
+   blake512_8x64_compress_le( sc );
 
    mm512_block_bswap_64( (__m512i*)dst, sc->H );
 }
@@ -1355,24 +1350,22 @@ blake512_8x64_close(void *cc, void *dst)
    VD = v256_64( CB5 ^ T0 ); \
    VE = v256_64( CB6 ^ T1 ); \
    VF = v256_64( CB7 ^ T1 ); \
-   const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64( \
-                                  0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
-   M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
-   M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
-   M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
-   M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
-   M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
-   M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
-   M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
-   M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
-   M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
-   M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
-   MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
-   MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
-   MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
-   MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
-   ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
-   MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
+   M0 = mm256_bswap_64( *(buf+ 0) ); \
+   M1 = mm256_bswap_64( *(buf+ 1) ); \
+   M2 = mm256_bswap_64( *(buf+ 2) ); \
+   M3 = mm256_bswap_64( *(buf+ 3) ); \
+   M4 = mm256_bswap_64( *(buf+ 4) ); \
+   M5 = mm256_bswap_64( *(buf+ 5) ); \
+   M6 = mm256_bswap_64( *(buf+ 6) ); \
+   M7 = mm256_bswap_64( *(buf+ 7) ); \
+   M8 = mm256_bswap_64( *(buf+ 8) ); \
+   M9 = mm256_bswap_64( *(buf+ 9) ); \
+   MA = mm256_bswap_64( *(buf+10) ); \
+   MB = mm256_bswap_64( *(buf+11) ); \
+   MC = mm256_bswap_64( *(buf+12) ); \
+   MD = mm256_bswap_64( *(buf+13) ); \
+   ME = mm256_bswap_64( *(buf+14) ); \
+   MF = mm256_bswap_64( *(buf+15) ); \
    ROUND_B_4WAY(0); \
    ROUND_B_4WAY(1); \
    ROUND_B_4WAY(2); \
@@ -1400,7 +1393,7 @@ blake512_8x64_close(void *cc, void *dst)
 }
 
 
-void blake512_4way_compress( blake_4x64_big_context *sc )
+void blake512_4x64_compress( blake_4x64_big_context *sc )
 {
    __m256i M0, M1, M2, M3, M4, M5, M6, M7;
    __m256i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1423,25 +1416,23 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
    VD = v256_64( CB5 ^ sc->T0 );
    VE = v256_64( CB6 ^ sc->T1 );
    VF = v256_64( CB7 ^ sc->T1 );
-   const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64(
-                                  0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
-
-   M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
-   M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
-   M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
-   M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
-   M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
-   M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
-   M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
-   M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
-   M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
-   M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
-   MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 );
-   MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 );
-   MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 );
-   MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 );
-   ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 );
-   MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 );
+   M0 = mm256_bswap_64( sc->buf[ 0] );
+   M1 = mm256_bswap_64( sc->buf[ 1] );
+   M2 = mm256_bswap_64( sc->buf[ 2] );
+   M3 = mm256_bswap_64( sc->buf[ 3] );
+   M4 = mm256_bswap_64( sc->buf[ 4] );
+   M5 = mm256_bswap_64( sc->buf[ 5] );
+   M6 = mm256_bswap_64( sc->buf[ 6] );
+   M7 = mm256_bswap_64( sc->buf[ 7] );
+   M8 = mm256_bswap_64( sc->buf[ 8] );
+   M9 = mm256_bswap_64( sc->buf[ 9] );
+   MA = mm256_bswap_64( sc->buf[10] );
+   MB = mm256_bswap_64( sc->buf[11] );
+   MC = mm256_bswap_64( sc->buf[12] );
+   MD = mm256_bswap_64( sc->buf[13] );
+   ME = mm256_bswap_64( sc->buf[14] );
+   MF = mm256_bswap_64( sc->buf[15] );
 
    ROUND_B_4WAY(0);
    ROUND_B_4WAY(1);
@@ -1470,7 +1461,7 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
    sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
 }
 
-void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
+void blake512_4x64_prehash_le( blake512_4x64_context *sc, __m256i *midstate,
                                const void *data )
 {
    __m256i V0, V1, V2, V3, V4, V5, V6, V7;
@@ -1562,7 +1553,7 @@ void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
    midstate[15] = VF;
 }
 
-void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
+void blake512_4x64_final_le( blake512_4x64_context *sc, void *hash,
                              const __m256i nonce, const __m256i *midstate )
 {
    __m256i M0, M1, M2, M3, M4, M5, M6, M7;
@@ -1685,7 +1676,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
 }
 
 
-void blake512_4x64_init( blake_4x64_big_context *sc )
+void blake512_4x64_init( blake512_4x64_context *sc )
 {
    casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
    casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
@@ -1779,13 +1770,11 @@ blake64_4way_close( blake_4x64_big_context *sc, void *dst )
                              v256_64( 0x0100000000000000ULL ) );
       buf[112>>3] = v256_64( bswap_64( th ) );
       buf[120>>3] = v256_64( bswap_64( tl ) );
-
       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
    }
    else
    {
       memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
-
       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
      sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
      sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
@@ -1793,14 +1782,14 @@ blake64_4way_close( blake_4x64_big_context *sc, void *dst )
      buf[104>>3] = v256_64( 0x0100000000000000ULL );
      buf[112>>3] = v256_64( bswap_64( th ) );
      buf[120>>3] = v256_64( bswap_64( tl ) );
 
      blake64_4way( sc, buf, 128 );
   }
 
    mm256_block_bswap_64( (__m256i*)dst, sc->H );
 }
 
 // init, update & close
-void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
+void blake512_4x64_full( blake512_4x64_context *sc, void * dst,
                          const void *data, size_t len )
 {
 
@@ -1826,7 +1815,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
       {
          if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
            sc->T1 = sc->T1 + 1;
-         blake512_4way_compress( sc );
+         blake512_4x64_compress( sc );
         sc->ptr = 0;
      }
 
@@ -1861,7 +1850,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
    if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
      sc->T1 = sc->T1 + 1;
 
-   blake512_4way_compress( sc );
+   blake512_4x64_compress( sc );
 
    mm256_block_bswap_64( (__m256i*)dst, sc->H );
 }
@@ -1889,13 +1878,13 @@ blake512_4x64_close(void *cc, void *dst)
 #define GB_2X64( m0, m1, c0, c1, a, b, c, d ) \
 { \
    a = v128_add64( v128_add64( v128_xor( v128_64( c1 ), m0 ), b ), a ); \
-   d = v128_ror64( v128_xor( d, a ), 32 ); \
+   d = v128_ror64xor( d, a, 32 ); \
    c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 25 ); \
+   b = v128_ror64xor( b, c, 25 ); \
    a = v128_add64( v128_add64( v128_xor( v128_64( c0 ), m1 ), b ), a ); \
-   d = v128_ror64( v128_xor( d, a ), 16 ); \
+   d = v128_ror64xor( d, a, 16 ); \
    c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 11 ); \
+   b = v128_ror64xor( b, c, 11 ); \
 }
 
 #define ROUND_B_2X64(r) \
@@ -1936,48 +1925,23 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
    VE = v128_64( CB6 ^ sc->T1 );
    VF = v128_64( CB7 ^ sc->T1 );
 
-#if defined(__SSSE3__)
-
-   const v128u64_t shuf_bswap64 = v128_set64(
-                         0x08090a0b0c0d0e0f, 0x0001020304050607 );
-   M0 = v128_shuffle8( sc->buf[ 0], shuf_bswap64 );
-   M1 = v128_shuffle8( sc->buf[ 1], shuf_bswap64 );
-   M2 = v128_shuffle8( sc->buf[ 2], shuf_bswap64 );
-   M3 = v128_shuffle8( sc->buf[ 3], shuf_bswap64 );
-   M4 = v128_shuffle8( sc->buf[ 4], shuf_bswap64 );
-   M5 = v128_shuffle8( sc->buf[ 5], shuf_bswap64 );
-   M6 = v128_shuffle8( sc->buf[ 6], shuf_bswap64 );
-   M7 = v128_shuffle8( sc->buf[ 7], shuf_bswap64 );
-   M8 = v128_shuffle8( sc->buf[ 8], shuf_bswap64 );
-   M9 = v128_shuffle8( sc->buf[ 9], shuf_bswap64 );
-   MA = v128_shuffle8( sc->buf[10], shuf_bswap64 );
-   MB = v128_shuffle8( sc->buf[11], shuf_bswap64 );
-   MC = v128_shuffle8( sc->buf[12], shuf_bswap64 );
-   MD = v128_shuffle8( sc->buf[13], shuf_bswap64 );
-   ME = v128_shuffle8( sc->buf[14], shuf_bswap64 );
-   MF = v128_shuffle8( sc->buf[15], shuf_bswap64 );
-
-#else   // SSE2 & NEON
-
    M0 = v128_bswap64( sc->buf[ 0] );
-   M1 = v128_bswap64( sc->buf[ 0] );
-   M2 = v128_bswap64( sc->buf[ 0] );
-   M3 = v128_bswap64( sc->buf[ 0] );
-   M4 = v128_bswap64( sc->buf[ 0] );
-   M5 = v128_bswap64( sc->buf[ 0] );
-   M6 = v128_bswap64( sc->buf[ 0] );
-   M7 = v128_bswap64( sc->buf[ 0] );
-   M8 = v128_bswap64( sc->buf[ 0] );
-   M9 = v128_bswap64( sc->buf[ 0] );
-   MA = v128_bswap64( sc->buf[ 0] );
-   MB = v128_bswap64( sc->buf[ 0] );
-   MC = v128_bswap64( sc->buf[ 0] );
-   MD = v128_bswap64( sc->buf[ 0] );
-   ME = v128_bswap64( sc->buf[ 0] );
-   MF = v128_bswap64( sc->buf[ 0] );
+   M1 = v128_bswap64( sc->buf[ 1] );
+   M2 = v128_bswap64( sc->buf[ 2] );
+   M3 = v128_bswap64( sc->buf[ 3] );
+   M4 = v128_bswap64( sc->buf[ 4] );
+   M5 = v128_bswap64( sc->buf[ 5] );
+   M6 = v128_bswap64( sc->buf[ 6] );
+   M7 = v128_bswap64( sc->buf[ 7] );
+   M8 = v128_bswap64( sc->buf[ 8] );
+   M9 = v128_bswap64( sc->buf[ 9] );
+   MA = v128_bswap64( sc->buf[10] );
+   MB = v128_bswap64( sc->buf[11] );
+   MC = v128_bswap64( sc->buf[12] );
+   MD = v128_bswap64( sc->buf[13] );
+   ME = v128_bswap64( sc->buf[14] );
+   MF = v128_bswap64( sc->buf[15] );
 
-#endif
-
    ROUND_B_2X64(0);
    ROUND_B_2X64(1);
    ROUND_B_2X64(2);
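Two things happen in the hunk above: the SSE2/NEON fallback is fixed (it previously byte-swapped `sc->buf[0]` into all sixteen message words), and, here as elsewhere, the rotate-after-XOR idiom is folded into `v128_ror64xor`. A scalar sketch of the fused helper's assumed semantics:

   #include <stdint.h>

   // Assumed equivalence: v128_ror64xor( v, w, c ) == ror64( v ^ w, c ),
   // applied independently to each 64-bit lane.
   static inline uint64_t ror64xor( uint64_t v, uint64_t w, unsigned c )
   {
      uint64_t x = v ^ w;                        // XOR first
      return ( x >> c ) | ( x << ( 64 - c ) );   // then rotate right by c
   }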
@@ -2056,9 +2020,9 @@ void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc,
    // G4 skip nonce
    V0 = v128_add64( v128_add64( v128_xor( v128_64( CB9 ), sc->buf[ 8] ), V5 ),
                     V0 );
-   VF = v128_ror64( v128_xor( VF, V0 ), 32 );
+   VF = v128_ror64xor( VF, V0, 32 );
    VA = v128_add64( VA, VF );
-   V5 = v128_ror64( v128_xor( V5, VA ), 25 );
+   V5 = v128_ror64xor( V5, VA, 25 );
    V0 = v128_add64( V0, V5 );
 
    GB_2X64( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
@@ -2139,9 +2103,9 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
 
    // finish round 0, with the nonce now available
    V0 = v128_add64( V0, v128_xor( v128_64( CB8 ), M9 ) );
-   VF = v128_ror64( v128_xor( VF, V0 ), 16 );
+   VF = v128_ror64xor( VF, V0, 16 );
    VA = v128_add64( VA, VF );
-   V5 = v128_ror64( v128_xor( V5, VA ), 11 );
+   V5 = v128_ror64xor( V5, VA, 11 );
 
    // Round 1
    // G0
@@ -2149,34 +2113,34 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
 
    // G1
    V1 = v128_add64( V1, V5 );
-   VD = v128_ror64( v128_xor( VD, V1 ), 32 );
+   VD = v128_ror64xor( VD, V1, 32 );
    V9 = v128_add64( V9, VD );
-   V5 = v128_ror64( v128_xor( V5, V9 ), 25 );
+   V5 = v128_ror64xor( V5, V9, 25 );
    V1 = v128_add64( V1, v128_add64( v128_xor( v128_64( CBx(1,2) ), Mx(1,3) ),
                     V5 ) );
-   VD = v128_ror64( v128_xor( VD, V1 ), 16 );
+   VD = v128_ror64xor( VD, V1, 16 );
    V9 = v128_add64( V9, VD );
-   V5 = v128_ror64( v128_xor( V5, V9 ), 11 );
+   V5 = v128_ror64xor( V5, V9, 11 );
 
    // G2
    V2 = v128_add64( V2, v128_xor( v128_64( CBF ), M9 ) );
-   VE = v128_ror64( v128_xor( VE, V2 ), 32 );
+   VE = v128_ror64xor( VE, V2, 32 );
    VA = v128_add64( VA, VE );
-   V6 = v128_ror64( v128_xor( V6, VA ), 25 );
+   V6 = v128_ror64xor( V6, VA, 25 );
    V2 = v128_add64( V2, v128_add64( v128_xor( v128_64( CB9 ), MF ), V6 ) );
-   VE = v128_ror64( v128_xor( VE, V2 ), 16 );
+   VE = v128_ror64xor( VE, V2, 16 );
    VA = v128_add64( VA, VE );
-   V6 = v128_ror64( v128_xor( V6, VA ), 11 );
+   V6 = v128_ror64xor( V6, VA, 11 );
 
    // G3
-   VF = v128_ror64( v128_xor( VF, V3 ), 32 );
+   VF = v128_ror64xor( VF, V3, 32 );
    VB = v128_add64( VB, VF );
-   V7 = v128_ror64( v128_xor( V7, VB ), 25 );
+   V7 = v128_ror64xor( V7, VB, 25 );
    V3 = v128_add64( V3, v128_add64( v128_xor( v128_64( CBx(1, 6) ), Mx(1, 7) ),
                     V7 ) );
-   VF = v128_ror64( v128_xor( VF, V3 ), 16 );
+   VF = v128_ror64xor( VF, V3, 16 );
    VB = v128_add64( VB, VF );
-   V7 = v128_ror64( v128_xor( V7, VB ), 11 );
+   V7 = v128_ror64xor( V7, VB, 11 );
 
    // G4, G5, G6, G7
    GB_2X64(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
@@ -2235,7 +2199,6 @@ blake64_2x64( blake_2x64_big_context *sc, const void *data, size_t len)
    v128u64_t *buf;
    size_t ptr;
    const int buf_size = 128;   // sizeof/8
-   DECL_STATE_2X64
 
    buf = sc->buf;
    ptr = sc->ptr;
@@ -2247,7 +2210,6 @@ blake64_2x64( blake_2x64_big_context *sc, const void *data, size_t len)
       return;
    }
 
-   READ_STATE64(sc);
    while ( len > 0 )
    {
       size_t clen;
@@ -2260,13 +2222,12 @@ blake64_2x64( blake_2x64_big_context *sc, const void *data, size_t len)
       len -= clen;
       if ( ptr == buf_size )
       {
-         if ( (T0 = T0 + 1024 ) < 1024 )
-            T1 = T1 + 1;
+         if ( (sc->T0 = sc->T0 + 1024 ) < 1024 )
+            sc->T1 = sc->T1 + 1;
         blake512_2x64_compress( sc );
        ptr = 0;
      }
   }
-   WRITE_STATE64(sc);
    sc->ptr = ptr;
 }
 
@@ -2280,37 +2241,35 @@ blake64_2x64_close( blake_2x64_big_context *sc, void *dst )
 
    ptr = sc->ptr;
    bit_len = ((unsigned)ptr << 3);
-   buf[ptr>>3] = v128_64( 0x80 );
+   sc->buf[ptr>>3] = v128_64( 0x80 );
    tl = sc->T0 + bit_len;
    th = sc->T1;
    if (ptr == 0 )
    {
-     sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
-     sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
+      sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
+      sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
    }
    else if ( sc->T0 == 0 )
    {
-     sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
-     sc->T1 = sc->T1 - 1;
+      sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
+      sc->T1 = sc->T1 - 1;
    }
    else
-   {
       sc->T0 -= 1024 - bit_len;
-   }
 
    if ( ptr <= 104 )
    {
-      v128_memset_zero( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
-      buf[104>>3] = v128_or( buf[104>>3], v128_64( 0x0100000000000000ULL ) );
-      buf[112>>3] = v128_64( bswap_64( th ) );
-      buf[120>>3] = v128_64( bswap_64( tl ) );
-
-      blake64_2x64( sc, buf + (ptr>>3), 128 - ptr );
+      v128_memset_zero( sc->buf + (ptr>>3) + 1, (104-ptr) >> 3 );
+      sc->buf[104>>3] = v128_or( sc->buf[104>>3],
+                                 v128_64( 0x0100000000000000ULL ) );
+      sc->buf[112>>3] = v128_64( bswap_64( th ) );
+      sc->buf[120>>3] = v128_64( bswap_64( tl ) );
+      blake64_2x64( sc, sc->buf + (ptr>>3), 128 - ptr );
    }
    else
    {
-      v128_memset_zero( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
-
-      blake64_2x64( sc, buf + (ptr>>3), 128 - ptr );
+      v128_memset_zero( sc->buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
+      blake64_2x64( sc, sc->buf + (ptr>>3), 128 - ptr );
      sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
      sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
      v128_memset_zero( buf, 112>>3 );
@@ -2319,6 +2278,7 @@ blake64_2x64_close( blake_2x64_big_context *sc, void *dst )
      buf[120>>3] = v128_64( bswap_64( tl ) );
      blake64_2x64( sc, buf, 128 );
   }
+
    v128_block_bswap64( (v128u64_t*)dst, sc->H );
 }
 
@@ -2326,7 +2286,6 @@ blake64_2x64_close( blake_2x64_big_context *sc, void *dst )
 void blake512_2x64_full( blake_2x64_big_context *sc, void * dst,
                          const void *data, size_t len )
 {
-
 // init
 
    casti_v128u64( sc->H, 0 ) = v128_64( 0x6A09E667F3BCC908 );
@@ -92,7 +92,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
 #define blake512_4way_prehash_le  blake512_4x64_prehash_le
 #define blake512_4way_final_le    blake512_4x64_final_le
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 ////////////////////////////
 //
@@ -54,10 +54,10 @@ int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
                            n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
 
    // Partialy prehash second block without touching nonces in block_buf[3].
-   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
 
    do {
-      blake256_16way_final_rounds_le( hash32, midstate_vars, block0_hash,
+      blake256_16x32_final_rounds_le( hash32, midstate_vars, block0_hash,
                                       block_buf, rounds );
      for ( int lane = 0; lane < 16; lane++ )
      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -123,10 +123,10 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
    block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
 
    // Partialy prehash second block without touching nonces in block_buf[3].
-   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
 
    do {
-      blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
+      blake256_8x32_final_rounds_le( hash32, midstate_vars, block0_hash,
                                      block_buf, rounds );
      for ( int lane = 0; lane < 8; lane++ )
      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -148,16 +148,16 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
 
 #elif defined (BLAKECOIN_4WAY)
 
-blake256r8_4way_context blakecoin_4w_ctx;
+blake256r8_4x32_context blakecoin_4w_ctx;
 
 void blakecoin_4way_hash(void *state, const void *input)
 {
    uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-   blake256r8_4way_context ctx;
+   blake256r8_4x32_context ctx;
 
    memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
-   blake256r8_4way_update( &ctx, input + (64<<2), 16 );
-   blake256r8_4way_close( &ctx, vhash );
+   blake256r8_4x32_update( &ctx, input + (64<<2), 16 );
+   blake256r8_4x32_close( &ctx, vhash );
 
    dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }
@@ -178,11 +178,11 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
      HTarget = 0x7f;
 
    v128_bswap32_intrlv80_4x32( vdata, pdata );
-   blake256r8_4way_init( &blakecoin_4w_ctx );
-   blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
+   blake256r8_4x32_init( &blakecoin_4w_ctx );
+   blake256r8_4x32_update( &blakecoin_4w_ctx, vdata, 64 );
 
    do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      pdata[19] = n;
      blakecoin_4way_hash( hash, vdata );
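For reference, the midstate pattern the scanhash functions above share, condensed into a hypothetical sketch (array sizes and loop bookkeeping are illustrative, not taken from the source): round 0 is prehashed once for everything except the nonce lanes, then only the final rounds run per batch.

   #include <stdint.h>

   // Sketch of the prehash/final-rounds split, 8-lane variant.
   static void scan_sketch( uint32_t *hash32, const uint32_t *block0_hash,
                            uint32_t *block_buf, int rounds, uint32_t iters )
   {
      uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
      blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
      for ( uint32_t i = 0; i < iters; i++ )
      {
         blake256_8x32_final_rounds_le( hash32, midstate_vars, block0_hash,
                                        block_buf, rounds );
         // the caller scans the 8 lanes of hash32, then refreshes the nonce
         // lanes in block_buf (not shown) before the next pass
      }
   }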
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   #define BLAKECOIN_16WAY
 #elif defined(__AVX2__)
   #define BLAKECOIN_8WAY
@@ -16,28 +16,27 @@ extern void pentablakehash_4way( void *output, const void *input )
    uint64_t hash2[8] __attribute__ ((aligned (64)));
    uint64_t hash3[8] __attribute__ ((aligned (64)));
    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-   blake512_4way_context ctx;
+   blake512_4x64_context ctx;
 
-   blake512_4way_init( &ctx );
-   blake512_4way_update( &ctx, input, 80 );
-   blake512_4way_close( &ctx, vhash );
+   blake512_4x64_init( &ctx );
+   blake512_4x64_update( &ctx, input, 80 );
+   blake512_4x64_close( &ctx, vhash );
 
-   blake512_4way_init( &ctx );
-   blake512_4way_update( &ctx, vhash, 64 );
-   blake512_4way_close( &ctx, vhash );
+   blake512_4x64_init( &ctx );
+   blake512_4x64_update( &ctx, vhash, 64 );
+   blake512_4x64_close( &ctx, vhash );
 
-   blake512_4way_init( &ctx );
-   blake512_4way_update( &ctx, vhash, 64 );
-   blake512_4way_close( &ctx, vhash );
+   blake512_4x64_init( &ctx );
+   blake512_4x64_update( &ctx, vhash, 64 );
+   blake512_4x64_close( &ctx, vhash );
 
-   blake512_4way_init( &ctx );
-   blake512_4way_update( &ctx, vhash, 64 );
-   blake512_4way_close( &ctx, vhash );
+   blake512_4x64_init( &ctx );
+   blake512_4x64_update( &ctx, vhash, 64 );
+   blake512_4x64_close( &ctx, vhash );
 
-   blake512_4way_init( &ctx );
-   blake512_4way_update( &ctx, vhash, 64 );
-   blake512_4way_close( &ctx, vhash );
+   blake512_4x64_init( &ctx );
+   blake512_4x64_update( &ctx, vhash, 64 );
+   blake512_4x64_close( &ctx, vhash );
 
    memcpy( output, hash0, 32 );
    memcpy( output+32, hash1, 32 );
@@ -227,7 +227,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
    v[14] = S->f[0] ^ blake2s_IV[6];
    v[15] = S->f[1] ^ blake2s_IV[7];
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__ARM_NEON)
 
    v128_t *V = (v128_t*)v;
 
@@ -263,19 +263,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
    V[3] = v128_swap64( V[3] ); \
    V[2] = v128_shufll32( V[2] )
 
-   BLAKE2S_ROUND(0);
-   BLAKE2S_ROUND(1);
-   BLAKE2S_ROUND(2);
-   BLAKE2S_ROUND(3);
-   BLAKE2S_ROUND(4);
-   BLAKE2S_ROUND(5);
-   BLAKE2S_ROUND(6);
-   BLAKE2S_ROUND(7);
-   BLAKE2S_ROUND(8);
-   BLAKE2S_ROUND(9);
-
-#undef BLAKE2S_ROUND
-
 #else
 
 #define G(r,i,a,b,c,d) \
@@ -290,7 +277,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
     b = SPH_ROTR32(b ^ c, 7); \
   } while(0)
 
-#define ROUND(r) \
+#define BLAKE2S_ROUND(r) \
   do { \
     G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
     G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
@@ -302,24 +289,25 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
     G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
   } while(0)
 
-  ROUND( 0 );
-  ROUND( 1 );
-  ROUND( 2 );
-  ROUND( 3 );
-  ROUND( 4 );
-  ROUND( 5 );
-  ROUND( 6 );
-  ROUND( 7 );
-  ROUND( 8 );
-  ROUND( 9 );
-
 #endif
 
+  BLAKE2S_ROUND(0);
+  BLAKE2S_ROUND(1);
+  BLAKE2S_ROUND(2);
+  BLAKE2S_ROUND(3);
+  BLAKE2S_ROUND(4);
+  BLAKE2S_ROUND(5);
+  BLAKE2S_ROUND(6);
+  BLAKE2S_ROUND(7);
+  BLAKE2S_ROUND(8);
+  BLAKE2S_ROUND(9);
+
   for( size_t i = 0; i < 8; ++i )
     S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
 
 #undef G
-#undef ROUND
+#undef BLAKE2S_ROUND
   return 0;
 }
@@ -101,15 +101,15 @@
|
||||
{ \
|
||||
Va = v128_add64( Va, v128_add64( Vb, \
|
||||
v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
|
||||
Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
|
||||
Vd = v128_ror64xor( Vd, Va, 32 ); \
|
||||
Vc = v128_add64( Vc, Vd ); \
|
||||
Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \
|
||||
Vb = v128_ror64xor( Vb, Vc, 24 ); \
|
||||
\
|
||||
Va = v128_add64( Va, v128_add64( Vb, \
|
||||
v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
|
||||
Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
|
||||
Vd = v128_ror64xor( Vd, Va, 16 ); \
|
||||
Vc = v128_add64( Vc, Vd ); \
|
||||
Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
|
||||
Vb = v128_ror64xor( Vb, Vc, 63 ); \
|
||||
}
|
||||
|
||||
#define BLAKE2B_ROUND( R ) \
|
||||
@@ -131,47 +131,7 @@
|
||||
V[7] = v128_alignr64( V6, V7, 1 ); \
|
||||
}
|
||||
|
||||
/*
|
||||
#elif defined(__SSE2__)
|
||||
// always true
|
||||
|
||||
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
|
||||
{ \
|
||||
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
|
||||
_mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
|
||||
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
|
||||
Vc = _mm_add_epi64( Vc, Vd ); \
|
||||
Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
|
||||
\
|
||||
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
|
||||
_mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
|
||||
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
|
||||
Vc = _mm_add_epi64( Vc, Vd ); \
|
||||
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
|
||||
}
|
||||
|
||||
#define BLAKE2B_ROUND( R ) \
|
||||
{ \
|
||||
v128_t *V = (v128_t*)v; \
|
||||
v128_t V2, V3, V6, V7; \
|
||||
const uint8_t *sigmaR = sigma[R]; \
|
||||
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
|
||||
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
|
||||
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
|
||||
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
|
||||
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
|
||||
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
|
||||
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
|
||||
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
|
||||
V[2] = mm128_alignr_64( V2, V3, 1 ); \
|
||||
V[3] = mm128_alignr_64( V3, V2, 1 ); \
|
||||
V[6] = mm128_alignr_64( V7, V6, 1 ); \
|
||||
V[7] = mm128_alignr_64( V6, V7, 1 ); \
|
||||
}
|
||||
*/
|
||||
|
||||
#else
|
||||
// never used, SSE2 is always available
|
||||
|
||||
#ifndef ROTR64
|
||||
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
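The v128_ror64xor helper introduced above evidently fuses the xor-then-rotate idiom that BLAKE2's G function uses four times per call. A plausible definition, shown here only as a sketch (the real one lives in simd-utils and may differ): a generic two-op fallback, with a single-instruction path on ARMv8.2-SHA3 targets, whose XAR instruction computes exactly ror(a ^ b, c).

    // Sketch only; assumes v128_xor / v128_ror64 as used elsewhere in this diff.
    #if defined(__ARM_FEATURE_SHA3)
    // XAR: rotate-right of the xor, one instruction per call.
    #define v128_ror64xor( v, w, c )   vxarq_u64( v, w, c )
    #else
    #define v128_ror64xor( v, w, c )   v128_ror64( v128_xor( v, w ), c )
    #endif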
@@ -39,16 +39,14 @@
#include <stddef.h>
#include "simd-utils.h"

#define SPH_SIZE_bmw256   256

#define SPH_SIZE_bmw512   512

// BMW-256 4 way 32

#if defined(__SSE2__) || defined(__ARM_NEON)

typedef struct
{
-   v128_t buf[64];
-   v128_t H[16];
+   v128u32_t buf[64];
+   v128u32_t H[16];
   size_t ptr;
   uint32_t bit_count;  // assume bit_count fits in 32 bits
} bmw_4way_small_context;
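The buf[64] / H[16] members hold four independent BMW-256 lanes interleaved word by word: element i of the array carries word i of every lane, which is what lets a single 128-bit operation advance all four hashes at once. A stand-alone sketch of that interleaved layout (w128 is a stand-in for v128u32_t; the function name is this writer's, not the repo's):

    #include <stdint.h>

    typedef struct { uint32_t lane[4]; } w128;   // stand-in for v128u32_t

    // Interleave four separate message buffers into the 4-way layout:
    // dst[i] holds word i of lanes 0..3 side by side.
    static void intrlv_4x32( w128 *dst, const uint32_t *s0, const uint32_t *s1,
                             const uint32_t *s2, const uint32_t *s3, int nwords )
    {
       for ( int i = 0; i < nwords; i++ )
       {
          dst[i].lane[0] = s0[i];
          dst[i].lane[1] = s1[i];
          dst[i].lane[2] = s2[i];
          dst[i].lane[3] = s3[i];
       }
    }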
@@ -58,13 +56,19 @@ typedef bmw_4way_small_context bmw256_4way_context;
void bmw256_4way_init( bmw256_4way_context *ctx );

void bmw256_4way_update(void *cc, const void *data, size_t len);
#define bmw256_4way bmw256_4way_update

void bmw256_4way_close(void *cc, void *dst);

void bmw256_4way_addbits_and_close(
   void *cc, unsigned ub, unsigned n, void *dst);

+#define bmw256_4x32_context bmw256_4way_context
+#define bmw256_4x32_init    bmw256_4way_init
+#define bmw256_4x32_update  bmw256_4way_update
+#define bmw256_4x32_close   bmw256_4way_close

#endif

#if defined(__AVX2__)

// BMW-256 8 way 32
@@ -85,9 +89,14 @@ void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
#define bmw256_8way bmw256_8way_update
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );

+#define bmw256_8x32_context bmw256_8way_context
+#define bmw256_8x32_init    bmw256_8way_init
+#define bmw256_8x32_update  bmw256_8way_update
+#define bmw256_8x32_close   bmw256_8way_close

#endif

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

// BMW-256 16 way 32

@@ -106,6 +115,11 @@ void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
                          size_t len );
void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );

+#define bmw256_16x32_context bmw256_16way_context
+#define bmw256_16x32_init    bmw256_16way_init
+#define bmw256_16x32_update  bmw256_16way_update
+#define bmw256_16x32_close   bmw256_16way_close

#endif

// BMW-512 2 way 64
@@ -157,7 +171,7 @@ void bmw512_4way_addbits_and_close(

#endif // __AVX2__

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

// BMW-512 64 bit 8 way
typedef struct
@@ -45,7 +45,7 @@ extern "C"{

#define LPAR   (

-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__ARM_NEON)

// BMW-256 4 way 32
/*
@@ -62,78 +62,78 @@ static const uint32_t IV256[] = {
*/

#define ss0(x) \
-   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
-                                 _mm_slli_epi32( (x), 3) ), \
-                  _mm_xor_si128( mm128_rol_32( (x), 4), \
-                                 mm128_rol_32( (x), 19) ) )
+   v128_xor( v128_xor( v128_sr32( (x), 1), \
+                       v128_sl32( (x), 3) ), \
+             v128_xor( v128_rol32( (x), 4), \
+                       v128_rol32( (x), 19) ) )

#define ss1(x) \
-   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
-                                 _mm_slli_epi32( (x), 2) ), \
-                  _mm_xor_si128( mm128_rol_32( (x), 8), \
-                                 mm128_rol_32( (x), 23) ) )
+   v128_xor( v128_xor( v128_sr32( (x), 1), \
+                       v128_sl32( (x), 2) ), \
+             v128_xor( v128_rol32( (x), 8), \
+                       v128_rol32( (x), 23) ) )

#define ss2(x) \
-   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
-                                 _mm_slli_epi32( (x), 1) ), \
-                  _mm_xor_si128( mm128_rol_32( (x), 12), \
-                                 mm128_rol_32( (x), 25) ) )
+   v128_xor( v128_xor( v128_sr32( (x), 2), \
+                       v128_sl32( (x), 1) ), \
+             v128_xor( v128_rol32( (x), 12), \
+                       v128_rol32( (x), 25) ) )

#define ss3(x) \
-   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
-                                 _mm_slli_epi32( (x), 2) ), \
-                  _mm_xor_si128( mm128_rol_32( (x), 15), \
-                                 mm128_rol_32( (x), 29) ) )
+   v128_xor( v128_xor( v128_sr32( (x), 2), \
+                       v128_sl32( (x), 2) ), \
+             v128_xor( v128_rol32( (x), 15), \
+                       v128_rol32( (x), 29) ) )

#define ss4(x) \
-   _mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
+   v128_xor( (x), v128_sr32( (x), 1 ) )

#define ss5(x) \
-   _mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
+   v128_xor( (x), v128_sr32( (x), 2 ) )

-#define rs1(x) mm128_rol_32( x, 3 )
-#define rs2(x) mm128_rol_32( x, 7 )
-#define rs3(x) mm128_rol_32( x, 13 )
-#define rs4(x) mm128_rol_32( x, 16 )
-#define rs5(x) mm128_rol_32( x, 19 )
-#define rs6(x) mm128_rol_32( x, 23 )
-#define rs7(x) mm128_rol_32( x, 27 )
+#define rs1(x) v128_rol32( x, 3 )
+#define rs2(x) v128_rol32( x, 7 )
+#define rs3(x) v128_rol32( x, 13 )
+#define rs4(x) v128_rol32( x, 16 )
+#define rs5(x) v128_rol32( x, 19 )
+#define rs6(x) v128_rol32( x, 23 )
+#define rs7(x) v128_rol32( x, 27 )
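For reference, the scalar formulas behind these macros, per the BMW specification; each v128 version simply applies the same expression to four 32-bit lanes at once:

    #include <stdint.h>

    static inline uint32_t rol32( uint32_t x, int c )
    {   return ( x << c ) | ( x >> ( 32 - c ) );   }

    // s0 of BMW-256; ss1..ss3 follow the same shape with the shift and
    // rotate constants visible in the macros above.
    static inline uint32_t s0( uint32_t x )
    {   return ( x >> 1 ) ^ ( x << 3 ) ^ rol32( x, 4 ) ^ rol32( x, 19 );   }

    static inline uint32_t s4( uint32_t x ) { return x ^ ( x >> 1 ); }
    static inline uint32_t s5( uint32_t x ) { return x ^ ( x >> 2 ); }
    static inline uint32_t r1( uint32_t x ) { return rol32( x, 3 ); }   // rs1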
#define rol_off_32( M, j, off ) \
-   mm128_rol_32( M[ ( (j) + (off) ) & 0xF ], \
+   v128_rol32( M[ ( (j) + (off) ) & 0xF ], \
               ( ( (j) + (off) ) & 0xF ) + 1 )

#define add_elt_s( M, H, j ) \
-   _mm_xor_si128( \
-      _mm_add_epi32( \
-         _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
+   v128_xor( \
+      v128_add32( \
+         v128_sub32( v128_add32( rol_off_32( M, j, 0 ), \
                                  rol_off_32( M, j, 3 ) ), \
                     rol_off_32( M, j, 10 ) ), \
-         _mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \
+         v128_32( ( (j)+16 ) * 0x05555555UL ) ), \
      H[ ( (j)+7 ) & 0xF ] )

#define expand1s( qt, M, H, i ) \
-   _mm_add_epi32( mm128_add4_32( \
-      mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
+   v128_add32( v128_add4_32( \
+      v128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
                    ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \
-      mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
+      v128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
                    ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \
-      mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
+      v128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
                    ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ), \
-      mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
+      v128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
                    ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \
      add_elt_s( M, H, (i)-16 ) )

#define expand2s( qt, M, H, i) \
-   _mm_add_epi32( mm128_add4_32( \
-      mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
+   v128_add32( v128_add4_32( \
+      v128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
                    qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \
-      mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
+      v128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
                    qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \
-      mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
+      v128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
                    qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \
-      mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
+      v128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
                    ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
      add_elt_s( M, H, (i)-16 ) )
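The shape of expand2s in one lane, written out as plain arithmetic; the v128_add4_32 calls above are just these sixteen additions batched four at a time. A self-contained sketch (helper names are this writer's scalar analogues of the macros above):

    #include <stdint.h>

    static inline uint32_t rol32( uint32_t x, int c )
    {   return ( x << c ) | ( x >> ( 32 - c ) );   }

    #define r1(x) rol32( x,  3 )
    #define r2(x) rol32( x,  7 )
    #define r3(x) rol32( x, 13 )
    #define r4(x) rol32( x, 16 )
    #define r5(x) rol32( x, 19 )
    #define r6(x) rol32( x, 23 )
    #define r7(x) rol32( x, 27 )
    #define s4(x) ( (x) ^ ( (x) >> 1 ) )
    #define s5(x) ( (x) ^ ( (x) >> 2 ) )

    // Scalar form of add_elt_s: three message rotations, the round
    // constant, then the xor with a hash word.
    static inline uint32_t add_element( const uint32_t *m, const uint32_t *h,
                                        int j )
    {
       return ( rol32( m[ j      & 15 ], (  j       & 15 ) + 1 )
              + rol32( m[ (j+3)  & 15 ], ( (j+3)  & 15 ) + 1 )
              - rol32( m[ (j+10) & 15 ], ( (j+10) & 15 ) + 1 )
              + (uint32_t)( ( j + 16 ) * 0x05555555UL ) )
              ^ h[ ( j + 7 ) & 15 ];
    }

    static inline uint32_t expand2_one( const uint32_t *q, const uint32_t *m,
                                        const uint32_t *h, int i )
    {
       return q[i-16] + r1( q[i-15] ) + q[i-14] + r2( q[i-13] )
            + q[i-12] + r3( q[i-11] ) + q[i-10] + r4( q[i- 9] )
            + q[i- 8] + r5( q[i- 7] ) + q[i- 6] + r6( q[i- 5] )
            + q[i- 4] + r7( q[i- 3] ) + s4( q[i- 2] ) + s5( q[i- 1] )
            + add_element( m, h, i - 16 );
    }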
@@ -141,169 +141,169 @@ static const uint32_t IV256[] = {
// resulting in some sign changes compared to the reference code.

#define Ws0 \
-   _mm_add_epi32( \
-      _mm_add_epi32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
-                        _mm_xor_si128( M[ 7], H[ 7] ) ), \
-         _mm_xor_si128( M[10], H[10] ) ), \
-      _mm_add_epi32( _mm_xor_si128( M[13], H[13] ), \
-                     _mm_xor_si128( M[14], H[14] ) ) )
+   v128_add32( \
+      v128_add32( \
+         v128_sub32( v128_xor( M[ 5], H[ 5] ), \
+                     v128_xor( M[ 7], H[ 7] ) ), \
+         v128_xor( M[10], H[10] ) ), \
+      v128_add32( v128_xor( M[13], H[13] ), \
+                  v128_xor( M[14], H[14] ) ) )

#define Ws1 \
-   _mm_add_epi32( \
-      _mm_add_epi32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
-                        _mm_xor_si128( M[ 8], H[ 8] ) ), \
-         _mm_xor_si128( M[11], H[11] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[14], H[14] ), \
-                     _mm_xor_si128( M[15], H[15] ) ) )
+   v128_add32( \
+      v128_add32( \
+         v128_sub32( v128_xor( M[ 6], H[ 6] ), \
+                     v128_xor( M[ 8], H[ 8] ) ), \
+         v128_xor( M[11], H[11] ) ), \
+      v128_sub32( v128_xor( M[14], H[14] ), \
+                  v128_xor( M[15], H[15] ) ) )

#define Ws2 \
-   _mm_sub_epi32( \
-      _mm_add_epi32( \
-         _mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
-                        _mm_xor_si128( M[ 7], H[ 7] ) ), \
-         _mm_xor_si128( M[ 9], H[ 9] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
-                     _mm_xor_si128( M[15], H[15] ) ) )
+   v128_sub32( \
+      v128_add32( \
+         v128_add32( v128_xor( M[ 0], H[ 0] ), \
+                     v128_xor( M[ 7], H[ 7] ) ), \
+         v128_xor( M[ 9], H[ 9] ) ), \
+      v128_sub32( v128_xor( M[12], H[12] ), \
+                  v128_xor( M[15], H[15] ) ) )

#define Ws3 \
-   _mm_sub_epi32( \
-      _mm_add_epi32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
-                        _mm_xor_si128( M[ 1], H[ 1] ) ), \
-         _mm_xor_si128( M[ 8], H[ 8] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[10], H[10] ), \
-                     _mm_xor_si128( M[13], H[13] ) ) )
+   v128_sub32( \
+      v128_add32( \
+         v128_sub32( v128_xor( M[ 0], H[ 0] ), \
+                     v128_xor( M[ 1], H[ 1] ) ), \
+         v128_xor( M[ 8], H[ 8] ) ), \
+      v128_sub32( v128_xor( M[10], H[10] ), \
+                  v128_xor( M[13], H[13] ) ) )

#define Ws4 \
-   _mm_sub_epi32( \
-      _mm_add_epi32( \
-         _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
-                        _mm_xor_si128( M[ 2], H[ 2] ) ), \
-         _mm_xor_si128( M[ 9], H[ 9] ) ), \
-      _mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
-                     _mm_xor_si128( M[14], H[14] ) ) )
+   v128_sub32( \
+      v128_add32( \
+         v128_add32( v128_xor( M[ 1], H[ 1] ), \
+                     v128_xor( M[ 2], H[ 2] ) ), \
+         v128_xor( M[ 9], H[ 9] ) ), \
+      v128_add32( v128_xor( M[11], H[11] ), \
+                  v128_xor( M[14], H[14] ) ) )

#define Ws5 \
-   _mm_sub_epi32( \
-      _mm_add_epi32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
-                        _mm_xor_si128( M[ 2], H[ 2] ) ), \
-         _mm_xor_si128( M[10], H[10] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
-                     _mm_xor_si128( M[15], H[15] ) ) )
+   v128_sub32( \
+      v128_add32( \
+         v128_sub32( v128_xor( M[ 3], H[ 3] ), \
+                     v128_xor( M[ 2], H[ 2] ) ), \
+         v128_xor( M[10], H[10] ) ), \
+      v128_sub32( v128_xor( M[12], H[12] ), \
+                  v128_xor( M[15], H[15] ) ) )

#define Ws6 \
-   _mm_sub_epi32( \
-      _mm_sub_epi32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
-                        _mm_xor_si128( M[ 0], H[ 0] ) ), \
-         _mm_xor_si128( M[ 3], H[ 3] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[11], H[11] ), \
-                     _mm_xor_si128( M[13], H[13] ) ) )
+   v128_sub32( \
+      v128_sub32( \
+         v128_sub32( v128_xor( M[ 4], H[ 4] ), \
+                     v128_xor( M[ 0], H[ 0] ) ), \
+         v128_xor( M[ 3], H[ 3] ) ), \
+      v128_sub32( v128_xor( M[11], H[11] ), \
+                  v128_xor( M[13], H[13] ) ) )

#define Ws7 \
-   _mm_sub_epi32( \
-      _mm_sub_epi32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
-                        _mm_xor_si128( M[ 4], H[ 4] ) ), \
-         _mm_xor_si128( M[ 5], H[ 5] ) ), \
-      _mm_add_epi32( _mm_xor_si128( M[12], H[12] ), \
-                     _mm_xor_si128( M[14], H[14] ) ) )
+   v128_sub32( \
+      v128_sub32( \
+         v128_sub32( v128_xor( M[ 1], H[ 1] ), \
+                     v128_xor( M[ 4], H[ 4] ) ), \
+         v128_xor( M[ 5], H[ 5] ) ), \
+      v128_add32( v128_xor( M[12], H[12] ), \
+                  v128_xor( M[14], H[14] ) ) )

#define Ws8 \
-   _mm_add_epi32( \
-      _mm_sub_epi32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
-                        _mm_xor_si128( M[ 5], H[ 5] ) ), \
-         _mm_xor_si128( M[ 6], H[ 6] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[13], H[13] ), \
-                     _mm_xor_si128( M[15], H[15] ) ) )
+   v128_add32( \
+      v128_sub32( \
+         v128_sub32( v128_xor( M[ 2], H[ 2] ), \
+                     v128_xor( M[ 5], H[ 5] ) ), \
+         v128_xor( M[ 6], H[ 6] ) ), \
+      v128_sub32( v128_xor( M[13], H[13] ), \
+                  v128_xor( M[15], H[15] ) ) )

#define Ws9 \
-   _mm_sub_epi32( \
-      _mm_add_epi32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
-                        _mm_xor_si128( M[ 3], H[ 3] ) ), \
-         _mm_xor_si128( M[ 6], H[ 6] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
-                     _mm_xor_si128( M[14], H[14] ) ) )
+   v128_sub32( \
+      v128_add32( \
+         v128_sub32( v128_xor( M[ 0], H[ 0] ), \
+                     v128_xor( M[ 3], H[ 3] ) ), \
+         v128_xor( M[ 6], H[ 6] ) ), \
+      v128_sub32( v128_xor( M[ 7], H[ 7] ), \
+                  v128_xor( M[14], H[14] ) ) )

#define Ws10 \
-   _mm_sub_epi32( \
-      _mm_sub_epi32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
-                        _mm_xor_si128( M[ 1], H[ 1] ) ), \
-         _mm_xor_si128( M[ 4], H[ 4] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
-                     _mm_xor_si128( M[15], H[15] ) ) )
+   v128_sub32( \
+      v128_sub32( \
+         v128_sub32( v128_xor( M[ 8], H[ 8] ), \
+                     v128_xor( M[ 1], H[ 1] ) ), \
+         v128_xor( M[ 4], H[ 4] ) ), \
+      v128_sub32( v128_xor( M[ 7], H[ 7] ), \
+                  v128_xor( M[15], H[15] ) ) )

#define Ws11 \
-   _mm_sub_epi32( \
-      _mm_sub_epi32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
-                        _mm_xor_si128( M[ 0], H[ 0] ) ), \
-         _mm_xor_si128( M[ 2], H[ 2] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
-                     _mm_xor_si128( M[ 9], H[ 9] ) ) )
+   v128_sub32( \
+      v128_sub32( \
+         v128_sub32( v128_xor( M[ 8], H[ 8] ), \
+                     v128_xor( M[ 0], H[ 0] ) ), \
+         v128_xor( M[ 2], H[ 2] ) ), \
+      v128_sub32( v128_xor( M[ 5], H[ 5] ), \
+                  v128_xor( M[ 9], H[ 9] ) ) )

#define Ws12 \
-   _mm_sub_epi32( \
-      _mm_sub_epi32( \
-         _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
-                        _mm_xor_si128( M[ 3], H[ 3] ) ), \
-         _mm_xor_si128( M[ 6], H[ 6] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
-                     _mm_xor_si128( M[10], H[10] ) ) )
+   v128_sub32( \
+      v128_sub32( \
+         v128_add32( v128_xor( M[ 1], H[ 1] ), \
+                     v128_xor( M[ 3], H[ 3] ) ), \
+         v128_xor( M[ 6], H[ 6] ) ), \
+      v128_sub32( v128_xor( M[ 9], H[ 9] ), \
+                  v128_xor( M[10], H[10] ) ) )

#define Ws13 \
-   _mm_add_epi32( \
-      _mm_add_epi32( \
-         _mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
-                        _mm_xor_si128( M[ 4], H[ 4] ) ), \
-         _mm_xor_si128( M[ 7], H[ 7] ) ), \
-      _mm_add_epi32( _mm_xor_si128( M[10], H[10] ), \
-                     _mm_xor_si128( M[11], H[11] ) ) )
+   v128_add32( \
+      v128_add32( \
+         v128_add32( v128_xor( M[ 2], H[ 2] ), \
+                     v128_xor( M[ 4], H[ 4] ) ), \
+         v128_xor( M[ 7], H[ 7] ) ), \
+      v128_add32( v128_xor( M[10], H[10] ), \
+                  v128_xor( M[11], H[11] ) ) )

#define Ws14 \
-   _mm_sub_epi32( \
-      _mm_add_epi32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
-                        _mm_xor_si128( M[ 5], H[ 5] ) ), \
-         _mm_xor_si128( M[ 8], H[ 8] ) ), \
-      _mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
-                     _mm_xor_si128( M[12], H[12] ) ) )
+   v128_sub32( \
+      v128_add32( \
+         v128_sub32( v128_xor( M[ 3], H[ 3] ), \
+                     v128_xor( M[ 5], H[ 5] ) ), \
+         v128_xor( M[ 8], H[ 8] ) ), \
+      v128_add32( v128_xor( M[11], H[11] ), \
+                  v128_xor( M[12], H[12] ) ) )

#define Ws15 \
-   _mm_sub_epi32( \
-      _mm_sub_epi32( \
-         _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
-                        _mm_xor_si128( M[ 4], H[4] ) ), \
-         _mm_xor_si128( M[ 6], H[ 6] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
-                     _mm_xor_si128( M[13], H[13] ) ) )
+   v128_sub32( \
+      v128_sub32( \
+         v128_sub32( v128_xor( M[12], H[12] ), \
+                     v128_xor( M[ 4], H[4] ) ), \
+         v128_xor( M[ 6], H[ 6] ) ), \
+      v128_sub32( v128_xor( M[ 9], H[ 9] ), \
+                  v128_xor( M[13], H[13] ) ) )

-void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
+void compress_small( const v128u32_t *M, const v128u32_t H[16], v128u32_t dH[16] )
{
-   __m128i qt[32], xl, xh; \
+   v128u32_t qt[32], xl, xh; \

-   qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
-   qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
-   qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
-   qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
-   qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
-   qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
-   qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
-   qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
-   qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
-   qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
-   qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
-   qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
-   qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
-   qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
-   qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
-   qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
+   qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
+   qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
+   qt[ 2] = v128_add32( ss2( Ws2 ), H[ 3] );
+   qt[ 3] = v128_add32( ss3( Ws3 ), H[ 4] );
+   qt[ 4] = v128_add32( ss4( Ws4 ), H[ 5] );
+   qt[ 5] = v128_add32( ss0( Ws5 ), H[ 6] );
+   qt[ 6] = v128_add32( ss1( Ws6 ), H[ 7] );
+   qt[ 7] = v128_add32( ss2( Ws7 ), H[ 8] );
+   qt[ 8] = v128_add32( ss3( Ws8 ), H[ 9] );
+   qt[ 9] = v128_add32( ss4( Ws9 ), H[10] );
+   qt[10] = v128_add32( ss0( Ws10), H[11] );
+   qt[11] = v128_add32( ss1( Ws11), H[12] );
+   qt[12] = v128_add32( ss2( Ws12), H[13] );
+   qt[13] = v128_add32( ss3( Ws13), H[14] );
+   qt[14] = v128_add32( ss4( Ws14), H[15] );
+   qt[15] = v128_add32( ss0( Ws15), H[ 0] );
   qt[16] = expand1s( qt, M, H, 16 );
   qt[17] = expand1s( qt, M, H, 17 );
   qt[18] = expand2s( qt, M, H, 18 );
@@ -321,92 +321,92 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
   qt[30] = expand2s( qt, M, H, 30 );
   qt[31] = expand2s( qt, M, H, 31 );

-   xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ),
-                       mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
-   xh = _mm_xor_si128( xl, _mm_xor_si128(
-                       mm128_xor4( qt[24], qt[25], qt[26], qt[27] ),
-                       mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
+   xl = v128_xor( v128_xor4( qt[16], qt[17], qt[18], qt[19] ),
+                  v128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
+   xh = v128_xor( xl, v128_xor(
+                  v128_xor4( qt[24], qt[25], qt[26], qt[27] ),
+                  v128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

-   dH[ 0] = _mm_add_epi32(
-               _mm_xor_si128( M[0],
-                  _mm_xor_si128( _mm_slli_epi32( xh, 5 ),
-                                 _mm_srli_epi32( qt[16], 5 ) ) ),
-               _mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ));
-   dH[ 1] = _mm_add_epi32(
-               _mm_xor_si128( M[1],
-                  _mm_xor_si128( _mm_srli_epi32( xh, 7 ),
-                                 _mm_slli_epi32( qt[17], 8 ) ) ),
-               _mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ));
-   dH[ 2] = _mm_add_epi32(
-               _mm_xor_si128( M[2],
-                  _mm_xor_si128( _mm_srli_epi32( xh, 5 ),
-                                 _mm_slli_epi32( qt[18], 5 ) ) ),
-               _mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ));
-   dH[ 3] = _mm_add_epi32(
-               _mm_xor_si128( M[3],
-                  _mm_xor_si128( _mm_srli_epi32( xh, 1 ),
-                                 _mm_slli_epi32( qt[19], 5 ) ) ),
-               _mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ));
-   dH[ 4] = _mm_add_epi32(
-               _mm_xor_si128( M[4],
-                  _mm_xor_si128( _mm_srli_epi32( xh, 3 ),
-                                 _mm_slli_epi32( qt[20], 0 ) ) ),
-               _mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ));
-   dH[ 5] = _mm_add_epi32(
-               _mm_xor_si128( M[5],
-                  _mm_xor_si128( _mm_slli_epi32( xh, 6 ),
-                                 _mm_srli_epi32( qt[21], 6 ) ) ),
-               _mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ));
-   dH[ 6] = _mm_add_epi32(
-               _mm_xor_si128( M[6],
-                  _mm_xor_si128( _mm_srli_epi32( xh, 4 ),
-                                 _mm_slli_epi32( qt[22], 6 ) ) ),
-               _mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ));
-   dH[ 7] = _mm_add_epi32(
-               _mm_xor_si128( M[7],
-                  _mm_xor_si128( _mm_srli_epi32( xh, 11 ),
-                                 _mm_slli_epi32( qt[23], 2 ) ) ),
-               _mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
-   dH[ 8] = _mm_add_epi32( _mm_add_epi32(
-               mm128_rol_32( dH[4], 9 ),
-               _mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
-               _mm_xor_si128( _mm_slli_epi32( xl, 8 ),
-                              _mm_xor_si128( qt[23], qt[ 8] ) ) );
-   dH[ 9] = _mm_add_epi32( _mm_add_epi32(
-               mm128_rol_32( dH[5], 10 ),
-               _mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
-               _mm_xor_si128( _mm_srli_epi32( xl, 6 ),
-                              _mm_xor_si128( qt[16], qt[ 9] ) ) );
-   dH[10] = _mm_add_epi32( _mm_add_epi32(
-               mm128_rol_32( dH[6], 11 ),
-               _mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
-               _mm_xor_si128( _mm_slli_epi32( xl, 6 ),
-                              _mm_xor_si128( qt[17], qt[10] ) ) );
-   dH[11] = _mm_add_epi32( _mm_add_epi32(
-               mm128_rol_32( dH[7], 12 ),
-               _mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
-               _mm_xor_si128( _mm_slli_epi32( xl, 4 ),
-                              _mm_xor_si128( qt[18], qt[11] ) ) );
-   dH[12] = _mm_add_epi32( _mm_add_epi32(
-               mm128_rol_32( dH[0], 13 ),
-               _mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
-               _mm_xor_si128( _mm_srli_epi32( xl, 3 ),
-                              _mm_xor_si128( qt[19], qt[12] ) ) );
-   dH[13] = _mm_add_epi32( _mm_add_epi32(
-               mm128_rol_32( dH[1], 14 ),
-               _mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
-               _mm_xor_si128( _mm_srli_epi32( xl, 4 ),
-                              _mm_xor_si128( qt[20], qt[13] ) ) );
-   dH[14] = _mm_add_epi32( _mm_add_epi32(
-               mm128_rol_32( dH[2], 15 ),
-               _mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
-               _mm_xor_si128( _mm_srli_epi32( xl, 7 ),
-                              _mm_xor_si128( qt[21], qt[14] ) ) );
-   dH[15] = _mm_add_epi32( _mm_add_epi32(
-               mm128_rol_32( dH[3], 16 ),
-               _mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
-               _mm_xor_si128( _mm_srli_epi32( xl, 2 ),
-                              _mm_xor_si128( qt[22], qt[15] ) ) );
+   dH[ 0] = v128_add32(
+               v128_xor( M[0],
+                  v128_xor( v128_sl32( xh, 5 ),
+                            v128_sr32( qt[16], 5 ) ) ),
+               v128_xor( v128_xor( xl, qt[24] ), qt[ 0] ));
+   dH[ 1] = v128_add32(
+               v128_xor( M[1],
+                  v128_xor( v128_sr32( xh, 7 ),
+                            v128_sl32( qt[17], 8 ) ) ),
+               v128_xor( v128_xor( xl, qt[25] ), qt[ 1] ));
+   dH[ 2] = v128_add32(
+               v128_xor( M[2],
+                  v128_xor( v128_sr32( xh, 5 ),
+                            v128_sl32( qt[18], 5 ) ) ),
+               v128_xor( v128_xor( xl, qt[26] ), qt[ 2] ));
+   dH[ 3] = v128_add32(
+               v128_xor( M[3],
+                  v128_xor( v128_sr32( xh, 1 ),
+                            v128_sl32( qt[19], 5 ) ) ),
+               v128_xor( v128_xor( xl, qt[27] ), qt[ 3] ));
+   dH[ 4] = v128_add32(
+               v128_xor( M[4],
+                  v128_xor( v128_sr32( xh, 3 ),
+                            v128_sl32( qt[20], 0 ) ) ),
+               v128_xor( v128_xor( xl, qt[28] ), qt[ 4] ));
+   dH[ 5] = v128_add32(
+               v128_xor( M[5],
+                  v128_xor( v128_sl32( xh, 6 ),
+                            v128_sr32( qt[21], 6 ) ) ),
+               v128_xor( v128_xor( xl, qt[29] ), qt[ 5] ));
+   dH[ 6] = v128_add32(
+               v128_xor( M[6],
+                  v128_xor( v128_sr32( xh, 4 ),
+                            v128_sl32( qt[22], 6 ) ) ),
+               v128_xor( v128_xor( xl, qt[30] ), qt[ 6] ));
+   dH[ 7] = v128_add32(
+               v128_xor( M[7],
+                  v128_xor( v128_sr32( xh, 11 ),
+                            v128_sl32( qt[23], 2 ) ) ),
+               v128_xor( v128_xor( xl, qt[31] ), qt[ 7] ));
+   dH[ 8] = v128_add32( v128_add32(
+               v128_rol32( dH[4], 9 ),
+               v128_xor( v128_xor( xh, qt[24] ), M[ 8] )),
+               v128_xor( v128_sl32( xl, 8 ),
+                         v128_xor( qt[23], qt[ 8] ) ) );
+   dH[ 9] = v128_add32( v128_add32(
+               v128_rol32( dH[5], 10 ),
+               v128_xor( v128_xor( xh, qt[25] ), M[ 9] )),
+               v128_xor( v128_sr32( xl, 6 ),
+                         v128_xor( qt[16], qt[ 9] ) ) );
+   dH[10] = v128_add32( v128_add32(
+               v128_rol32( dH[6], 11 ),
+               v128_xor( v128_xor( xh, qt[26] ), M[10] )),
+               v128_xor( v128_sl32( xl, 6 ),
+                         v128_xor( qt[17], qt[10] ) ) );
+   dH[11] = v128_add32( v128_add32(
+               v128_rol32( dH[7], 12 ),
+               v128_xor( v128_xor( xh, qt[27] ), M[11] )),
+               v128_xor( v128_sl32( xl, 4 ),
+                         v128_xor( qt[18], qt[11] ) ) );
+   dH[12] = v128_add32( v128_add32(
+               v128_rol32( dH[0], 13 ),
+               v128_xor( v128_xor( xh, qt[28] ), M[12] )),
+               v128_xor( v128_sr32( xl, 3 ),
+                         v128_xor( qt[19], qt[12] ) ) );
+   dH[13] = v128_add32( v128_add32(
+               v128_rol32( dH[1], 14 ),
+               v128_xor( v128_xor( xh, qt[29] ), M[13] )),
+               v128_xor( v128_sr32( xl, 4 ),
+                         v128_xor( qt[20], qt[13] ) ) );
+   dH[14] = v128_add32( v128_add32(
+               v128_rol32( dH[2], 15 ),
+               v128_xor( v128_xor( xh, qt[30] ), M[14] )),
+               v128_xor( v128_sr32( xl, 7 ),
+                         v128_xor( qt[21], qt[14] ) ) );
+   dH[15] = v128_add32( v128_add32(
+               v128_rol32( dH[3], 16 ),
+               v128_xor( v128_xor( xh, qt[31] ), M[15] )),
+               v128_xor( v128_sr32( xl, 2 ),
+                         v128_xor( qt[22], qt[15] ) ) );
}
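One lane of the first output-transform line above, as scalar arithmetic, which makes the shift-and-xor structure easier to check against the vector code:

    #include <stdint.h>

    // dH[0] for a single lane:
    // ( M0 ^ (XH << 5) ^ (Q16 >> 5) ) + ( XL ^ Q24 ^ Q0 ).
    static inline uint32_t bmw256_dh0( uint32_t m0, uint32_t xl, uint32_t xh,
                                       uint32_t q0, uint32_t q16, uint32_t q24 )
    {
       return ( m0 ^ ( xh << 5 ) ^ ( q16 >> 5 ) ) + ( xl ^ q24 ^ q0 );
    }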
static const uint32_t final_s[16][4] =
@@ -428,49 +428,25 @@ static const uint32_t final_s[16][4] =
   { 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae },
   { 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
};
-/*
-static const __m128i final_s[16] =
-{
-   { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
-   { 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
-   { 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 },
-   { 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 },
-   { 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 },
-   { 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 },
-   { 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 },
-   { 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 },
-   { 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 },
-   { 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 },
-   { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
-   { 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab },
-   { 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac },
-   { 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad },
-   { 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
-   { 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
-};
-*/

void bmw256_4way_init( bmw256_4way_context *ctx )
{
-   ctx->H[ 0] = _mm_set1_epi64x( 0x4041424340414243 );
-   ctx->H[ 1] = _mm_set1_epi64x( 0x4445464744454647 );
-   ctx->H[ 2] = _mm_set1_epi64x( 0x48494A4B48494A4B );
-   ctx->H[ 3] = _mm_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
-   ctx->H[ 4] = _mm_set1_epi64x( 0x5051525350515253 );
-   ctx->H[ 5] = _mm_set1_epi64x( 0x5455565754555657 );
-   ctx->H[ 6] = _mm_set1_epi64x( 0x58595A5B58595A5B );
-   ctx->H[ 7] = _mm_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
-   ctx->H[ 8] = _mm_set1_epi64x( 0x6061626360616263 );
-   ctx->H[ 9] = _mm_set1_epi64x( 0x6465666764656667 );
-   ctx->H[10] = _mm_set1_epi64x( 0x68696A6B68696A6B );
-   ctx->H[11] = _mm_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
-   ctx->H[12] = _mm_set1_epi64x( 0x7071727370717273 );
-   ctx->H[13] = _mm_set1_epi64x( 0x7475767774757677 );
-   ctx->H[14] = _mm_set1_epi64x( 0x78797A7B78797A7B );
-   ctx->H[15] = _mm_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
+// for ( int i = 0; i < 16; i++ )
+//    sc->H[i] = _mm_set1_epi32( iv[i] );
+   ctx->H[ 0] = v128_32( 0x40414243 );
+   ctx->H[ 1] = v128_32( 0x44454647 );
+   ctx->H[ 2] = v128_32( 0x48494A4B );
+   ctx->H[ 3] = v128_32( 0x4C4D4E4F );
+   ctx->H[ 4] = v128_32( 0x50515253 );
+   ctx->H[ 5] = v128_32( 0x54555657 );
+   ctx->H[ 6] = v128_32( 0x58595A5B );
+   ctx->H[ 7] = v128_32( 0x5C5D5E5F );
+   ctx->H[ 8] = v128_32( 0x60616263 );
+   ctx->H[ 9] = v128_32( 0x64656667 );
+   ctx->H[10] = v128_32( 0x68696A6B );
+   ctx->H[11] = v128_32( 0x6C6D6E6F );
+   ctx->H[12] = v128_32( 0x70717273 );
+   ctx->H[13] = v128_32( 0x74757677 );
+   ctx->H[14] = v128_32( 0x78797A7B );
+   ctx->H[15] = v128_32( 0x7C7D7E7F );
   ctx->ptr = 0;
   ctx->bit_count = 0;
}
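The IV rewrite above works because v128_32 broadcasts one 32-bit word to all four lanes; a broadcast of 0x40414243 fills the register with the same bytes the old 64-bit splat 0x4041424340414243 did. A sketch of how such a helper is presumably defined (the real definition lives in simd-utils):

    // Assumed shape of the broadcast helper; both intrinsics are standard.
    #if defined(__SSE2__)
    #define v128_32( x )   _mm_set1_epi32( x )
    #elif defined(__ARM_NEON)
    #define v128_32( x )   vdupq_n_u32( x )
    #endif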
@@ -478,10 +454,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
static void
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
{
-   __m128i *vdata = (__m128i*)data;
-   __m128i *buf;
-   __m128i htmp[16];
-   __m128i *h1, *h2;
+   v128u32_t *vdata = (v128u32_t*)data;
+   v128u32_t *buf;
+   v128u32_t htmp[16];
+   v128u32_t *h1, *h2;
   size_t ptr;
   const int buf_size = 64;  // bytes of one lane, compatible with len

@@ -497,13 +473,13 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
      clen = buf_size - ptr;
      if ( clen > len )
         clen = len;
-     memcpy_128( buf + (ptr>>2), vdata, clen >> 2 );
+     v128_memcpy( buf + (ptr>>2), vdata, clen >> 2 );
      vdata += ( clen >> 2 );
      len -= clen;
      ptr += clen;
      if ( ptr == buf_size )
      {
-        __m128i *ht;
+        v128u32_t *ht;
         compress_small( buf, h1, h2 );
         ht = h1;
         h1 = h2;
@@ -513,46 +489,45 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
   }
   sc->ptr = ptr;

   if ( h1 != sc->H )
-     memcpy_128( sc->H, h1, 16 );
+     v128_memcpy( sc->H, h1, 16 );
}

static void
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
                 void *dst, size_t out_size_w32)
{
-   __m128i *buf;
-   __m128i h1[16], h2[16], *h;
+   v128u32_t *buf;
+   v128u32_t h1[16], h2[16], *h;
   size_t ptr, u, v;
   const int buf_size = 64;  // bytes of one lane, compatible with len

   buf = sc->buf;
   ptr = sc->ptr;
-   buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
+   buf[ ptr>>2 ] = v128_32( 0x00000080 );
   ptr += 4;
   h = sc->H;

   // assume bit_count fits in 32 bits
   if ( ptr > buf_size - 4 )
   {
-      memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 );
+      v128_memset_zero( buf + (ptr>>2), (buf_size - ptr) >> 2 );
      compress_small( buf, h, h1 );
      ptr = 0;
      h = h1;
   }
-   memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
-   buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
-   buf[ (buf_size - 4) >> 2 ] = m128_zero;
+   v128_memset_zero( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
+   buf[ (buf_size - 8) >> 2 ] = v128_32( sc->bit_count + n );
+   buf[ (buf_size - 4) >> 2 ] = v128_zero;
   compress_small( buf, h, h2 );

   for ( u = 0; u < 16; u ++ )
      buf[u] = h2[u];

-   compress_small( buf, (__m128i*)final_s, h1 );
+   compress_small( buf, (v128u32_t*)final_s, h1 );

   for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
-      casti_m128i( dst, u ) = h1[v];
+      casti_v128( dst, u ) = h1[v];
}
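For one lane, the padding this close routine builds looks like this: a 0x80 marker byte, zero fill, the message length in bits in the word at offset 56 (only 32 bits, per the bit_count assumption noted above), and a zero top word. A scalar sketch of the simple case where the marker still fits in the current block:

    #include <stdint.h>
    #include <string.h>

    static void bmw_pad_lane( uint8_t buf[64], size_t ptr, uint32_t bit_count )
    {
       buf[ptr++] = 0x80;                    // padding marker
       memset( buf + ptr, 0, 56 - ptr );     // zero fill (assumes ptr <= 56;
                                             // the real code compresses an
                                             // extra block otherwise)
       memcpy( buf + 56, &bit_count, 4 );    // bit length, host-endian lane
       memset( buf + 60, 0, 4 );             // upper length word stays zero
    }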
/*
@@ -1058,7 +1033,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )

#endif // __AVX2__

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

// BMW-256 16 way 32

@@ -2,12 +2,11 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
-//#include "sph_keccak.h"
#include "bmw-hash-4way.h"

#if defined(BMW512_8WAY)

-void bmw512hash_8way(void *state, const void *input)
+void bmw512hash_8way( void *state, const void *input )
{
   bmw512_8way_context ctx;
   bmw512_8way_init( &ctx );
@@ -27,9 +26,9 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
-   __m512i *noncev = (__m512i*)vdata + 9;   // aligned
+   __m512i *noncev = (__m512i*)vdata + 9;
   const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;
+   const int thr_id = mythr->id;

   mm512_bswap32_intrlv80_8x64( vdata, pdata );
   do {
@@ -43,7 +42,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
      {
         extr_lane_8x64( lane_hash, hash, lane, 256 );
-        if ( fulltest( lane_hash, ptarget ) )
+        if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
         {
            pdata[19] = n + lane;
            submit_solution( work, lane_hash, mythr );
@@ -59,9 +58,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,

#elif defined(BMW512_4WAY)

-//#ifdef BMW512_4WAY
-
-void bmw512hash_4way(void *state, const void *input)
+void bmw512hash_4way( void *state, const void *input )
{
   bmw512_4way_context ctx;
   bmw512_4way_init( &ctx );
@@ -80,10 +77,10 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 4;
-   __m256i *noncev = (__m256i*)vdata + 9;   // aligned
+   __m256i *noncev = (__m256i*)vdata + 9;
   const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
   do {
@@ -96,7 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
      {
         extr_lane_4x64( lane_hash, hash, lane, 256 );
-        if ( fulltest( lane_hash, ptarget ) )
+        if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
         {
            pdata[19] = n + lane;
            submit_solution( work, lane_hash, mythr );
@@ -110,4 +107,55 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
   return 0;
}

+#elif defined(BMW512_2WAY)
+
+void bmw512hash_2x64( void *state, const void *input )
+{
+   bmw512_2x64_context ctx;
+   bmw512_2x64_init( &ctx );
+   bmw512_2x64_update( &ctx, input, 80 );
+   bmw512_2x64_close( &ctx, state );
+}
+
+int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
+   uint32_t hash[16*2] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[13]);   // 3*4+1
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 2;
+   v128_t *noncev = (v128_t*)vdata + 9;
+   const uint32_t Htarg = ptarget[7];
+   const int thr_id = mythr->id;
+
+   v128_bswap32_intrlv80_2x64( vdata, pdata );
+   do {
+      *noncev = v128_intrlv_blend_32( v128_bswap32(
+                                v128_set32( n+1, 0, n, 0 ) ), *noncev );
+
+      bmw512hash_2x64( hash, vdata );
+
+      for ( int lane = 0; lane < 2; lane++ )
+      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
+      {
+         extr_lane_2x64( lane_hash, hash, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
+         {
+            pdata[19] = n + lane;
+            submit_solution( work, lane_hash, mythr );
+         }
+      }
+      n += 2;
+
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+
+   *hashes_done = n - first_nonce;
+   return 0;
+}

#endif
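In the new 2-way scanhash, vdata is 2-lane 64-bit interleaved, so header word 19 (the nonce) of lane l should sit at 32-bit index (19>>1)*4 + l*2 + (19&1), i.e. words 37 and 39, which is exactly the v128 slot noncev points at. A scalar sketch of what the v128_intrlv_blend_32 line accomplishes (the index formula is this writer's reading of the layout, not taken from the source):

    #include <stdint.h>

    static void set_nonces_2x64( uint32_t *vdata, uint32_t n )
    {
       const int w = 19;                               // nonce word in header
       const int i0 = (w >> 1) * 4 + 0 * 2 + (w & 1);  // lane 0 -> vdata[37]
       const int i1 = (w >> 1) * 4 + 1 * 2 + (w & 1);  // lane 1 -> vdata[39]
       vdata[i0] = __builtin_bswap32( n );     // headers hash big-endian
       vdata[i1] = __builtin_bswap32( n + 1 );
    }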
@@ -2,7 +2,7 @@

bool register_bmw512_algo( algo_gate_t* gate )
{
-   gate->optimizations = AVX2_OPT | AVX512_OPT;
+   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
   opt_target_factor = 256.0;
#if defined (BMW512_8WAY)
   gate->scanhash = (void*)&scanhash_bmw512_8way;
@@ -10,6 +10,9 @@ bool register_bmw512_algo( algo_gate_t* gate )
#elif defined (BMW512_4WAY)
   gate->scanhash = (void*)&scanhash_bmw512_4way;
   gate->hash = (void*)&bmw512hash_4way;
+#elif defined (BMW512_2WAY)
+   gate->scanhash = (void*)&scanhash_bmw512_2x64;
+   gate->hash = (void*)&bmw512hash_2x64;
#else
   gate->scanhash = (void*)&scanhash_bmw512;
   gate->hash = (void*)&bmw512hash;

@@ -4,23 +4,31 @@
#include "algo-gate-api.h"
#include <stdint.h>

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
#define BMW512_8WAY 1
#elif defined(__AVX2__)
#define BMW512_4WAY 1
+#elif defined(__SSE2__) || defined(__ARM_NEON)
+#define BMW512_2WAY 1
#endif

#if defined(BMW512_8WAY)

void bmw512hash_8way( void *state, const void *input );
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );

#elif defined(BMW512_4WAY)

void bmw512hash_4way( void *state, const void *input );
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );

+#elif defined(BMW512_2WAY)
+
+void bmw512hash_2x64( void *state, const void *input );
+int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );

#else

@@ -950,7 +950,7 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

#endif // __AVX2__

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

// BMW-512 8 WAY

@@ -26,7 +26,7 @@ static const uint64_t IV512[] =
   0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

// 4 way 128 is handy to avoid reinterleaving in many algos.
// If reinterleaving is necessary it may be more efficient to use
@@ -6,7 +6,7 @@

#if defined(__AVX2__)

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

struct _cube_4way_context
{

@@ -13,7 +13,7 @@ static void transform( cubehashParam *sp )
   int r;
   const int rounds = sp->rounds;

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

   register __m512i x0, x1;

@@ -39,7 +39,7 @@ static void transform( cubehashParam *sp )

#elif defined(__AVX2__)

-   register __m256i x0, x1, x2, x3, y0, y1;
+   register __m256i x0, x1, x2, x3, t0;

   x0 = _mm256_load_si256( (__m256i*)sp->x );
   x1 = _mm256_load_si256( (__m256i*)sp->x + 1 );
@@ -50,10 +50,10 @@ static void transform( cubehashParam *sp )
   {
      x2 = _mm256_add_epi32( x0, x2 );
      x3 = _mm256_add_epi32( x1, x3 );
-     y0 = mm256_rol_32( x1, 7 );
-     y1 = mm256_rol_32( x0, 7 );
-     x0 = _mm256_xor_si256( y0, x2 );
-     x1 = _mm256_xor_si256( y1, x3 );
+     t0 = mm256_rol_32( x1, 7 );
+     x1 = mm256_rol_32( x0, 7 );
+     x0 = _mm256_xor_si256( t0, x2 );
+     x1 = _mm256_xor_si256( x1, x3 );
      x2 = mm256_swap128_64( x2 );
      x3 = mm256_swap128_64( x3 );
      x2 = _mm256_add_epi32( x0, x2 );
@@ -75,7 +75,7 @@ static void transform( cubehashParam *sp )

#else  // AVX, SSE2, NEON

-   v128_t x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
+   v128_t x0, x1, x2, x3, x4, x5, x6, x7, t0, t1;

   x0 = casti_v128( sp->x, 0 );
   x1 = casti_v128( sp->x, 1 );
@@ -92,16 +92,12 @@ static void transform( cubehashParam *sp )
      x5 = v128_add32( x1, x5 );
      x6 = v128_add32( x2, x6 );
      x7 = v128_add32( x3, x7 );
-     y0 = x2;
-     y1 = x3;
-     y2 = x0;
-     y3 = x1;
-     x0 = v128_rol32( y0, 7 );
-     x1 = v128_rol32( y1, 7 );
-     x2 = v128_rol32( y2, 7 );
-     x3 = v128_rol32( y3, 7 );
-     x0 = v128_xor( x0, x4 );
-     x1 = v128_xor( x1, x5 );
+     t0 = v128_rol32( x2, 7 );
+     t1 = v128_rol32( x3, 7 );
+     x2 = v128_rol32( x0, 7 );
+     x3 = v128_rol32( x1, 7 );
+     x0 = v128_xor( t0, x4 );
+     x1 = v128_xor( t1, x5 );
      x2 = v128_xor( x2, x6 );
      x3 = v128_xor( x3, x7 );
      x4 = v128_swap64( x4 );
@@ -112,19 +108,15 @@ static void transform( cubehashParam *sp )
      x5 = v128_add32( x1, x5 );
      x6 = v128_add32( x2, x6 );
      x7 = v128_add32( x3, x7 );
-     y0 = x1;
-     y1 = x0;
-     y2 = x3;
-     y3 = x2;
-     x0 = v128_rol32( y0, 11 );
-     x1 = v128_rol32( y1, 11 );
-     x2 = v128_rol32( y2, 11 );
-     x3 = v128_rol32( y3, 11 );
-     x0 = v128_xor( x0, x4 );
-     x1 = v128_xor( x1, x5 );
-     x2 = v128_xor( x2, x6 );
-     x3 = v128_xor( x3, x7 );
-     x4 = v128_swap64_32( x4 );
+     t0 = v128_rol32( x1, 11 );
+     x1 = v128_rol32( x0, 11 );
+     t1 = v128_rol32( x3, 11 );
+     x3 = v128_rol32( x2, 11 );
+     x0 = v128_xor( t0, x4 );
+     x1 = v128_xor( x1, x5 );
+     x2 = v128_xor( t1, x6 );
+     x3 = v128_xor( x3, x7 );
+     x4 = v128_swap64_32( x4 );
      x5 = v128_swap64_32( x5 );
      x6 = v128_swap64_32( x6 );
      x7 = v128_swap64_32( x7 );
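The y0..y3 to t0/t1 rewrite above is a register-pressure optimization: CubeHash's rotate step swaps register pairs while rotating, and a pairwise swap only needs one temporary per pair, not a full copy of all inputs. The scalar shape of the trick:

    #include <stdint.h>

    static inline uint32_t rol32( uint32_t x, int c )
    {   return ( x << c ) | ( x >> ( 32 - c ) );   }

    // Rotate-and-swap a pair with a single temporary: t parks rol(*a)
    // while rol(*b) is written straight into *a, then t completes the swap.
    static inline void rot_swap( uint32_t *a, uint32_t *b, int c )
    {
       uint32_t t = rol32( *a, c );
       *a = rol32( *b, c );
       *b = t;
    }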
@@ -21,112 +21,92 @@
#include "hash_api.h"
#include "simd-utils.h"

-MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
-MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
-MYALIGN const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1};
-MYALIGN const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C};
-MYALIGN const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1};
-MYALIGN const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8};
-MYALIGN const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09};
-MYALIGN const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79};
-MYALIGN const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
-MYALIGN const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
-MYALIGN const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
-MYALIGN const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
-MYALIGN const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
-MYALIGN const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
-MYALIGN const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
-MYALIGN const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
-MYALIGN const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
-MYALIGN const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
-MYALIGN const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
-MYALIGN const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
-MYALIGN const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};
+const uint32_t const1[] __attribute__ ((aligned (32))) =
+   { 0x00000001, 0x00000000, 0x00000000, 0x00000000 };
+const uint32_t mul2mask[] __attribute__ ((aligned (16))) =
+   { 0x00001b00, 0x00000000, 0x00000000, 0x00000000 };
+const uint32_t lsbmask[] __attribute__ ((aligned (16))) =
+   { 0x01010101, 0x01010101, 0x01010101, 0x01010101 };
+const uint32_t invshiftrows[] __attribute__ ((aligned (16))) =
+   { 0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c };

+#define ECHO_SUBBYTES4( state, j ) \
+   state[0][j] = v128_aesenc( state[0][j], k1 ); \
+   k1 = v128_add32( k1, cast_v128(const1) ); \
+   state[1][j] = v128_aesenc( state[1][j], k1 ); \
+   k1 = v128_add32( k1, cast_v128(const1) ); \
+   state[2][j] = v128_aesenc( state[2][j], k1 ); \
+   k1 = v128_add32( k1, cast_v128(const1) ); \
+   state[3][j] = v128_aesenc( state[3][j], k1 ); \
+   k1 = v128_add32( k1, cast_v128(const1) ); \
+   state[0][j] = v128_aesenc_nokey( state[0][j] ); \
+   state[1][j] = v128_aesenc_nokey( state[1][j] ); \
+   state[2][j] = v128_aesenc_nokey( state[2][j] ); \
+   state[3][j] = v128_aesenc_nokey( state[3][j] )
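v128_aesenc_nokey above replaces the explicit zero-key AES round. A plausible definition, offered as a sketch only (the real helper is in simd-utils): x86's AESENC folds the round-key xor into the instruction, so a zero vector is passed; NEON's AESE/AESMC pair takes the key before SubBytes, so zero goes there instead.

    // Sketch; the intrinsics shown are the standard ones.
    #if defined(__AES__)
    #define v128_aesenc_nokey( x ) \
       _mm_aesenc_si128( x, _mm_setzero_si128() )
    #elif defined(__ARM_FEATURE_AES)
    #define v128_aesenc_nokey( x ) \
       vreinterpretq_u32_u8( vaesmcq_u8( vaeseq_u8( \
                vreinterpretq_u8_u32( x ), vdupq_n_u8( 0 ) ) ) )
    #endif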
-MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101};
-MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
-MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};

+#define ECHO_SUBBYTES( state, i, j ) \
+   state[i][j] = v128_aesenc( state[i][j], k1 ); \
+   k1 = v128_add32( k1, cast_v128(const1) ); \
+   state[i][j] = v128_aesenc_nokey( state[i][j] )

-#define ECHO_SUBBYTES4(state, j) \
-   state[0][j] = v128_aesenc(state[0][j], k1);\
-   k1 = v128_add32(k1, cast_v128(const1));\
-   state[1][j] = v128_aesenc(state[1][j], k1);\
-   k1 = v128_add32(k1, cast_v128(const1));\
-   state[2][j] = v128_aesenc(state[2][j], k1);\
-   k1 = v128_add32(k1, cast_v128(const1));\
-   state[3][j] = v128_aesenc(state[3][j], k1);\
-   k1 = v128_add32(k1, cast_v128(const1));\
-   state[0][j] = v128_aesenc(state[0][j], v128_zero ); \
-   state[1][j] = v128_aesenc(state[1][j], v128_zero ); \
-   state[2][j] = v128_aesenc(state[2][j], v128_zero ); \
-   state[3][j] = v128_aesenc(state[3][j], v128_zero )

-#define ECHO_SUBBYTES(state, i, j) \
-   state[i][j] = v128_aesenc(state[i][j], k1);\
-   k1 = v128_add32(k1, cast_v128(const1));\
-   state[i][j] = v128_aesenc(state[i][j], cast_v128(zero))

-#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
-   s2 = v128_add8(state1[0][j], state1[0][j]);\
-   t1 = v128_sr16(state1[0][j], 7);\
-   t1 = v128_and(t1, cast_v128(lsbmask));\
-   t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
-   s2 = v128_xor(s2, t2);\
-   state2[0][j] = s2;\
-   state2[1][j] = state1[0][j];\
-   state2[2][j] = state1[0][j];\
-   state2[3][j] = v128_xor(s2, state1[0][j]);\
-   s2 = v128_add8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
-   t1 = v128_sr16(state1[1][(j + 1) & 3], 7);\
-   t1 = v128_and(t1, cast_v128(lsbmask));\
-   t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
-   s2 = v128_xor(s2, t2);\
-   state2[0][j] = v128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\
-   state2[1][j] = v128_xor(state2[1][j], s2);\
-   state2[2][j] = v128_xor(state2[2][j], state1[1][(j + 1) & 3]);\
-   state2[3][j] = v128_xor(state2[3][j], state1[1][(j + 1) & 3]);\
-   s2 = v128_add8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
-   t1 = v128_sr16(state1[2][(j + 2) & 3], 7);\
-   t1 = v128_and(t1, cast_v128(lsbmask));\
-   t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
-   s2 = v128_xor(s2, t2);\
-   state2[0][j] = v128_xor(state2[0][j], state1[2][(j + 2) & 3]);\
-   state2[1][j] = v128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
-   state2[2][j] = v128_xor(state2[2][j], s2);\
-   state2[3][j] = v128_xor(state2[3][j], state1[2][(j + 2) & 3]);\
-   s2 = v128_add8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
-   t1 = v128_sr16(state1[3][(j + 3) & 3], 7);\
-   t1 = v128_and(t1, cast_v128(lsbmask));\
-   t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
-   s2 = v128_xor(s2, t2);\
-   state2[0][j] = v128_xor(state2[0][j], state1[3][(j + 3) & 3]);\
-   state2[1][j] = v128_xor(state2[1][j], state1[3][(j + 3) & 3]);\
-   state2[2][j] = v128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
-   state2[3][j] = v128_xor(state2[3][j], s2)
+#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) \
+   s2 = v128_add8( state1[0][j], state1[0][j] ); \
+   t1 = v128_sr16( state1[0][j], 7 ); \
+   t1 = v128_and( t1, cast_v128(lsbmask) ); \
+   t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
+   s2 = v128_xor( s2, t2 ); \
+   state2[0][j] = s2; \
+   state2[1][j] = state1[0][j]; \
+   state2[2][j] = state1[0][j]; \
+   state2[3][j] = v128_xor(s2, state1[0][j] ); \
+   s2 = v128_add8( state1[1][(j + 1) & 3], state1[1][(j + 1) & 3] ); \
+   t1 = v128_sr16( state1[1][(j + 1) & 3], 7 ); \
+   t1 = v128_and( t1, cast_v128(lsbmask) ); \
+   t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
+   s2 = v128_xor( s2, t2 ); \
+   state2[0][j] = v128_xor3( state2[0][j], s2, state1[1][(j + 1) & 3] );\
+   state2[1][j] = v128_xor( state2[1][j], s2 ); \
+   state2[2][j] = v128_xor( state2[2][j], state1[1][(j + 1) & 3] ); \
+   state2[3][j] = v128_xor( state2[3][j], state1[1][(j + 1) & 3] ); \
+   s2 = v128_add8( state1[2][(j + 2) & 3], state1[2][(j + 2) & 3] ); \
+   t1 = v128_sr16( state1[2][(j + 2) & 3], 7 ); \
+   t1 = v128_and( t1, cast_v128(lsbmask) ); \
+   t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
+   s2 = v128_xor( s2, t2 ); \
+   state2[0][j] = v128_xor( state2[0][j], state1[2][(j + 2) & 3] ); \
+   state2[1][j] = v128_xor3( state2[1][j], s2, state1[2][(j + 2) & 3] ); \
+   state2[2][j] = v128_xor( state2[2][j], s2 ); \
+   state2[3][j] = v128_xor( state2[3][j], state1[2][(j + 2) & 3] ); \
+   s2 = v128_add8( state1[3][(j + 3) & 3], state1[3][(j + 3) & 3] ); \
+   t1 = v128_sr16( state1[3][(j + 3) & 3], 7 ); \
+   t1 = v128_and( t1, cast_v128(lsbmask) ); \
+   t2 = v128_shuffle8( cast_v128(mul2mask), t1 ); \
+   s2 = v128_xor( s2, t2 ); \
+   state2[0][j] = v128_xor( state2[0][j], state1[3][(j + 3) & 3] ); \
+   state2[1][j] = v128_xor( state2[1][j], state1[3][(j + 3) & 3] ); \
+   state2[2][j] = v128_xor3( state2[2][j], s2, state1[3][(j + 3) & 3] ); \
+   state2[3][j] = v128_xor( state2[3][j], s2 )
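The add8 / sr16 / lsbmask / mul2mask sequence in ECHO_MIXBYTES is a branch-free GF(2^8) doubling (the AES xtime operation) applied to sixteen bytes at once: adding a byte to itself shifts it left, and bytes whose top bit was set (extracted by the 16-bit-lane shift plus byte mask) get the reduction constant 0x1b folded back in via the shuffle-table lookup. The scalar reference:

    #include <stdint.h>

    // GF(2^8) multiply-by-2 with the AES polynomial x^8 + x^4 + x^3 + x + 1.
    static inline uint8_t gf256_mul2( uint8_t x )
    {
       return (uint8_t)( ( x << 1 ) ^ ( ( x >> 7 ) * 0x1b ) );
    }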
|
||||
|
||||
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES4(_state, 0);\
|
||||
ECHO_SUBBYTES4(_state, 1);\
|
||||
ECHO_SUBBYTES4(_state, 2);\
|
||||
ECHO_SUBBYTES4(_state, 3);\
|
||||
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES4(_state2, 0);\
|
||||
ECHO_SUBBYTES4(_state2, 1);\
|
||||
ECHO_SUBBYTES4(_state2, 2);\
|
||||
ECHO_SUBBYTES4(_state2, 3);\
|
||||
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
{ \
|
||||
ECHO_SUBBYTES4( _state, 0 ); \
|
||||
ECHO_SUBBYTES4( _state, 1 ); \
|
||||
ECHO_SUBBYTES4( _state, 2 ); \
|
||||
ECHO_SUBBYTES4( _state, 3 ); \
|
||||
ECHO_MIXBYTES( _state, _state2, 0, t1, t2, s2 ); \
|
||||
ECHO_MIXBYTES( _state, _state2, 1, t1, t2, s2 ); \
|
||||
ECHO_MIXBYTES( _state, _state2, 2, t1, t2, s2 ); \
|
||||
ECHO_MIXBYTES( _state, _state2, 3, t1, t2, s2 ); \
|
||||
ECHO_SUBBYTES4( _state2, 0 ); \
|
||||
ECHO_SUBBYTES4( _state2, 1 ); \
|
||||
ECHO_SUBBYTES4( _state2, 2 ); \
|
||||
ECHO_SUBBYTES4( _state2, 3 ); \
|
||||
ECHO_MIXBYTES( _state2, _state, 0, t1, t2, s2 ); \
|
||||
ECHO_MIXBYTES( _state2, _state, 1, t1, t2, s2 ); \
|
||||
ECHO_MIXBYTES( _state2, _state, 2, t1, t2, s2 ); \
|
||||
ECHO_MIXBYTES( _state2, _state, 3, t1, t2, s2 ); \
|
||||
}
|
||||
|
||||
/*
#define ECHO_ROUND_UNROLL2 \
@@ -256,9 +236,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
}


HashReturn init_echo(hashState_echo *ctx, int nHashSize)
HashReturn init_echo( hashState_echo *ctx, int nHashSize )
{
   int i, j;

@@ -300,7 +278,8 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
   return SUCCESS;
}

HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
HashReturn update_echo( hashState_echo *state, const void *data,
                        uint32_t databitlen )
{
   unsigned int uByteLength, uBlockCount, uRemainingBytes;

@@ -350,7 +329,7 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt
   return SUCCESS;
}

HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
HashReturn final_echo( hashState_echo *state, void *hashval)
{
   v128_t remainingbits;

@@ -427,8 +406,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
   return SUCCESS;
}

HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
                              const BitSequence *data, DataLength databitlen )
HashReturn update_final_echo( hashState_echo *state, void *hashval,
                              const void *data, uint32_t databitlen )
{
   unsigned int uByteLength, uBlockCount, uRemainingBytes;

@@ -550,8 +529,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
   return SUCCESS;
}

HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
                      int nHashSize, const BitSequence *data, DataLength datalen )
HashReturn echo_full( hashState_echo *state, void *hashval,
                      int nHashSize, const void *data, uint32_t datalen )
{
   int i, j;

@@ -598,7 +577,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
   {
      // Fill the buffer
      memcpy( state->buffer + state->uBufferBytes,
              (void*)data, state->uBlockLength - state->uBufferBytes );
              data, state->uBlockLength - state->uBufferBytes );

      // Process buffer
      Compress( state, state->buffer, 1 );

@@ -621,7 +600,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
   }

   if( uRemainingBytes > 0 )
      memcpy(state->buffer, (void*)data, uRemainingBytes);
      memcpy(state->buffer, data, uRemainingBytes);

   state->uBufferBytes = uRemainingBytes;
}

@@ -709,7 +688,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
}


#if 0
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
{
   HashReturn hRet;

@@ -766,5 +745,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
   return SUCCESS;
}
#endif

#endif
@@ -47,16 +47,16 @@ HashReturn init_echo(hashState_echo *state, int hashbitlen);

HashReturn reinit_echo(hashState_echo *state);

HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen);
HashReturn update_echo(hashState_echo *state, const void *data, uint32_t databitlen);

HashReturn final_echo(hashState_echo *state, BitSequence *hashval);
HashReturn final_echo(hashState_echo *state, void *hashval);

HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
HashReturn hash_echo(int hashbitlen, const void *data, uint32_t databitlen, void *hashval);

HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
                              const BitSequence *data, DataLength databitlen );
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
                      int nHashSize, const BitSequence *data, DataLength databitlen );
HashReturn update_final_echo( hashState_echo *state, void *hashval,
                              const void *data, uint32_t databitlen );
HashReturn echo_full( hashState_echo *state, void *hashval,
                      int nHashSize, const void *data, uint32_t databitlen );

#endif // HASH_API_H
@@ -11,7 +11,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
};
*/

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

#define ECHO_SUBBYTES4(state, j) \
   state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \
@@ -5,7 +5,7 @@

#include "simd-utils.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

typedef struct
{
@@ -36,7 +36,6 @@

#include "sph_echo.h"

#if !defined(__AES__)

#ifdef __cplusplus
extern "C"{
@@ -1031,4 +1030,3 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}
#endif
#endif // !AES
@@ -36,8 +36,6 @@
#ifndef SPH_ECHO_H__
#define SPH_ECHO_H__

#if !defined(__AES__)

#ifdef __cplusplus
extern "C"{
#endif
@@ -318,5 +316,4 @@ void sph_echo512_addbits_and_close(
#ifdef __cplusplus
}
#endif
#endif // !AES
#endif
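The recurring guard swap in these hunks collapses the four-macro AVX-512 feature test into a single SIMD512 symbol. The symbol is presumably defined once in simd-utils (an assumption; the defining hunk is not part of this view), along these lines:

   /* Hedged sketch of a central umbrella test, so per-file guards stay short. */
   #if defined(__AVX512F__) && defined(__AVX512VL__) && \
       defined(__AVX512DQ__) && defined(__AVX512BW__)
     #define SIMD512 1
   #endif
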
@@ -15,237 +15,176 @@
 *
 */

#if defined(__AES__)

#include <x86intrin.h>
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )

#include <memory.h>
#include "fugue-aesni.h"

static const v128u64_t _supermix1a __attribute__ ((aligned (16))) =
                { 0x0202010807020100, 0x0a05000f06010c0b };

MYALIGN const unsigned long long _supermix1a[] = {0x0202010807020100, 0x0a05000f06010c0b};
MYALIGN const unsigned long long _supermix1b[] = {0x0b0d080703060504, 0x0e0a090c050e0f0a};
MYALIGN const unsigned long long _supermix1c[] = {0x0402060c070d0003, 0x090a060580808080};
MYALIGN const unsigned long long _supermix1d[] = {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
MYALIGN const unsigned long long _supermix2a[] = {0x07020d0880808080, 0x0b06010c050e0f0a};
MYALIGN const unsigned long long _supermix4a[] = {0x000f0a050c0b0601, 0x0302020404030e09};
MYALIGN const unsigned long long _supermix4b[] = {0x07020d08080e0d0d, 0x07070908050e0f0a};
MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x0302000007060504};
MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104};
MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06};
//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
//MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
//MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
//MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
static const v128u64_t _supermix1b __attribute__ ((aligned (16))) =
                { 0x0b0d080703060504, 0x0e0a090c050e0f0a };

static const v128u64_t _supermix1c __attribute__ ((aligned (16))) =
                { 0x0402060c070d0003, 0x090a060580808080 };

MYALIGN const unsigned int _IV512[] = {
   0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
static const v128u64_t _supermix1d __attribute__ ((aligned (16))) =
                { 0x808080800f0e0d0c, 0x0f0e0d0c80808080 };

static const v128u64_t _supermix2a __attribute__ ((aligned (16))) =
                { 0x07020d0880808080, 0x0b06010c050e0f0a };

static const v128u64_t _supermix4a __attribute__ ((aligned (16))) =
                { 0x000f0a050c0b0601, 0x0302020404030e09 };

static const v128u64_t _supermix4b __attribute__ ((aligned (16))) =
                { 0x07020d08080e0d0d, 0x07070908050e0f0a };

static const v128u64_t _supermix4c __attribute__ ((aligned (16))) =
                { 0x0706050403020000, 0x0302000007060504 };

static const v128u64_t _supermix7a __attribute__ ((aligned (16))) =
                { 0x010c0b060d080702, 0x0904030e03000104 };

static const v128u64_t _supermix7b __attribute__ ((aligned (16))) =
                { 0x8080808080808080, 0x0504070605040f06 };

static const v128u64_t _inv_shift_rows __attribute__ ((aligned (16))) =
                { 0x0b0e0104070a0d00, 0x0306090c0f020508 };

static const v128u64_t _mul2mask __attribute__ ((aligned (16))) =
                { 0x000000001b1b0000, 0x0000000000000000 };

static const v128u64_t _mul4mask __attribute__ ((aligned (16))) =
                { 0x000000002d361b00, 0x0000000000000000 };

static const v128u64_t _lsbmask2 __attribute__ ((aligned (16))) =
                { 0x0303030303030303, 0x0303030303030303 };

static const uint32_t _IV512[] __attribute__ ((aligned (32))) =
{ 0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
  0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
  0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
  0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
  0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
  0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
  0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000
};
#if defined(__SSE4_1__)
#if defined(__ARM_NEON)

#define PACK_S0(s0, s1, t1)\
   s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
#define mask_1000(v) v128_put32( v, 0, 3 )

#define UNPACK_S0(s0, s1, t1)\
   s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
   s0 = mm128_mask_32( s0, 8 )
static const v128u32_t MASK_3321 __attribute__ ((aligned (16))) =
                { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x0f0e0d0c };

#define CMIX(s1, s2, r1, r2, t1, t2)\
   t1 = s1;\
   t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
   r1 = _mm_xor_si128(r1, t1);\
   r2 = _mm_xor_si128(r2, t1);
static const v128u32_t MASK_3033 __attribute__ ((aligned (16))) =
                { 0x0f0e0d0c, 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c };

#else // SSE2
static const v128u32_t MASK_3303 __attribute__ ((aligned (16))) =
                { 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c, 0x0f0e0d0c };

#define PACK_S0(s0, s1, t1)\
   t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
   s0 = _mm_xor_si128(s0, t1);
static const v128u32_t MASK_0321 __attribute__ ((aligned (16))) =
                { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100 };

#define UNPACK_S0(s0, s1, t1)\
   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
   s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
   s0 = mm128_mask_32( s0, 8 )
#define shuffle_3303(v) vqtbl1q_u8( v, MASK_3303 )
#define shuffle_0321(v) vqtbl1q_u8( v, MASK_0321 )

#define CMIX(s1, s2, r1, r2, t1, t2)\
   t1 = _mm_shuffle_epi32(s1, 0xf9);\
   t2 = _mm_shuffle_epi32(s2, 0xcf);\
   t1 = _mm_xor_si128(t1, t2);\
   r1 = _mm_xor_si128(r1, t1);\
   r2 = _mm_xor_si128(r2, t1)
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
   t1 = vqtbl1q_u8( s1, MASK_3321 ); \
   t2 = vqtbl1q_u8( s2, MASK_3033 ); \
   t1 = v128_xor( t1, t2 ); \
   r1 = v128_xor( r1, t1 ); \
   r2 = v128_xor( r2, t1 );

#elif defined(__SSE4_1__)

#define mask_1000(v) v128_mask32( v, 8 )

#define shuffle_3303(v) _mm_shuffle_epi32( v, 0xf3 )
#define shuffle_0321(v) _mm_shuffle_epi32( v, 0x39 )

#define CMIX( s1, s2, r1, r2, t1, t2 ) \
   t1 = s1; \
   t1 = v128_shuffle2_32( t1, s2, _MM_SHUFFLE( 3, 0, 2, 1 ) ); \
   r1 = v128_xor( r1, t1 ); \
   r2 = v128_xor( r2, t1 );

#endif
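NEON has no single-instruction 32-bit lane shuffle driven by an immediate, so the byte tables above spell out the lane permutation for vqtbl1q_u8. A hedged illustration of the correspondence (the byte listing below is my expansion of MASK_0321, not a line from the source):

   /* _mm_shuffle_epi32(v, 0x39) selects 32-bit lanes 1,2,3,0 (reading
      0x39 = 00 11 10 01 from the low field up); MASK_0321 encodes the
      same permutation byte by byte for the NEON table lookup. */
   static const uint8_t mask_0321_bytes[16] = {
       4, 5, 6, 7,      /* source lane 1 -> output lane 0 */
       8, 9, 10, 11,    /* source lane 2 -> output lane 1 */
       12, 13, 14, 15,  /* source lane 3 -> output lane 2 */
       0, 1, 2, 3       /* source lane 0 -> output lane 3 */
   };
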
#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
   s10 = _mm_xor_si128(s10, t1);\
   t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
   s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
   t1 = _mm_slli_si128(t1, 8);\
   s8 = _mm_xor_si128(s8, t1);\
   t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
   s0 = _mm_xor_si128(s0, t1)
#define PACK_S0( s0, s1, t1 ) \
   s0 = v128_movlane32( s0, 3, s1, 0 )

#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
   s16 = _mm_xor_si128(s16, t1);\
   t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
   s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
   t1 = _mm_slli_si128(t1, 8);\
   s8 = _mm_xor_si128(s8, t1);\
   t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
   s0 = _mm_xor_si128(s0, t1);\
   t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
   s4 = _mm_xor_si128(s4, t1)
#define UNPACK_S0( s0, s1, t1 ) \
   s1 = v128_movlane32( s1, 0, s0, 3 ); \
   s0 = mask_1000( s0 )

#define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
   s22 = _mm_xor_si128(s22, t1);\
   t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
   s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
   t1 = _mm_slli_si128(t1, 8);\
   s8 = _mm_xor_si128(s8, t1);\
   t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
   s0 = _mm_xor_si128(s0, t1);\
   t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
   s4 = _mm_xor_si128(s4, t1);\
   t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
   s7 = _mm_xor_si128(s7, t1)
   t1 = shuffle_3303( s0 ); \
   s22 = v128_xor(s22, t1);\
   t1 = v128_put32( v128_zero, *(uint32_t*)msg, 0 ); \
   s0 = v128_movlane32( s0, 0, t1, 0 ); \
   t1 = v128_alignr64( t1, v128_zero, 1 ); \
   s8 = v128_xor(s8, t1);\
   t1 = shuffle_3303( s24 ); \
   s0 = v128_xor(s0, t1);\
   t1 = shuffle_3303( s27 ); \
   s4 = v128_xor(s4, t1);\
   t1 = shuffle_3303( s30 ); \
   s7 = v128_xor(s7, t1)
#define PRESUPERMIX(t0, t1, t2, t3, t4)\
   t2 = t0;\
   t3 = _mm_add_epi8(t0, t0);\
   t4 = _mm_add_epi8(t3, t3);\
   t1 = _mm_srli_epi16(t0, 6);\
   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))

/*
#define PRESUPERMIX(x, t1, s1, s2, t2)\
   s1 = x;\
   s2 = _mm_add_epi8(x, x);\
   t2 = _mm_add_epi8(s2, s2);\
   t1 = _mm_srli_epi16(x, 6);\
   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
   s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
   x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
*/

#define SUBSTITUTE(r0, _t2 )\
   _t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
   _t2 = _mm_aesenclast_si128( _t2, m128_zero )
#define SUBSTITUTE( r0, _t2 ) \
   _t2 = v128_shuffle8( r0, _inv_shift_rows ); \
   _t2 = v128_aesenclast_nokey( _t2 )
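SUBSTITUTE wants only the AES SubBytes layer, but the hardware instruction also performs ShiftRows; pre-shuffling with the inverse ShiftRows table cancels that, and a zero round key makes the trailing AddRoundKey a no-op. A self-contained SSE sketch of the trick (the function name is mine, not from the source):

   #include <immintrin.h>

   /* aesenclast computes SubBytes(ShiftRows(x)) ^ key, so feeding
      ShiftRows^-1(x) and a zero key leaves just SubBytes(x);
      _inv_shift_rows above is exactly that byte permutation. */
   static inline __m128i aes_subbytes_only( __m128i x, __m128i inv_shift_rows )
   {
       x = _mm_shuffle_epi8( x, inv_shift_rows );
       return _mm_aesenclast_si128( x, _mm_setzero_si128() );
   }
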
#define SUPERMIX(t0, t1, t2, t3, t4)\
   t2 = t0;\
   t3 = _mm_add_epi8(t0, t0);\
   t4 = _mm_add_epi8(t3, t3);\
   t1 = _mm_srli_epi16(t0, 6);\
   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
   t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
   t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
   t4 = _mm_xor_si128(t4, t1);\
   t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
   t4 = _mm_xor_si128(t4, t1);\
   t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
   t2 = mm128_xor3(t2, t3, t0 );\
   t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
   t4 = mm128_xor3( t4, t1, t2 ); \
   t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
   t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
   t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
   t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
   t4 = mm128_xor3( t4, t2, t1 ); \
   t0 = _mm_xor_si128(t0, t3);\
   t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));

/*
#define SUPERMIX(t0, t1, t2, t3, t4)\
   PRESUPERMIX(t0, t1, t2, t3, t4);\
   POSTSUPERMIX(t0, t1, t2, t3, t4)
*/

#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
   t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
   t4 = t1;\
   t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
   t4 = _mm_xor_si128(t4, t1);\
   t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
   t4 = _mm_xor_si128(t4, t1);\
   t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
   t4 = _mm_xor_si128(t4, t1);\
   t2 = mm128_xor3(t2, t3, t0 );\
   t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
   t4 = _mm_xor_si128(t4, t2);\
   t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
   t4 = _mm_xor_si128(t4, t2);\
   t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
   t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
   t4 = _mm_xor_si128(t4, t1);\
   t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
   t0 = _mm_xor_si128(t0, t3);\
   t4 = _mm_xor_si128(t4, t0);\
   t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
   t4 = _mm_xor_si128(t4, t0)
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
   CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
   PACK_S0(r1c, r1a, _t0);\
   SUBSTITUTE(r1c, _t2 );\
   SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
   _t0 = _mm_shuffle_epi32(r1c, 0x39);\
   r2c = _mm_xor_si128(r2c, _t0);\
   _t0 = mm128_mask_32( _t0, 8 ); \
   r2d = _mm_xor_si128(r2d, _t0);\
   UNPACK_S0(r1c, r1a, _t3);\
   SUBSTITUTE(r2c, _t2 );\
   SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
   _t0 = _mm_shuffle_epi32(r2c, 0x39);\
   r3c = _mm_xor_si128(r3c, _t0);\
   _t0 = mm128_mask_32( _t0, 8 ); \
   r3d = _mm_xor_si128(r3d, _t0);\
   UNPACK_S0(r2c, r2a, _t3);\
   SUBSTITUTE(r3c, _t2 );\
   SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
   UNPACK_S0(r3c, r3a, _t3)
   t3 = v128_add8( t0, t0 ); \
   t4 = v128_add8( t3, t3 ); \
   t1 = v128_sr16( t0, 6 ); \
   t1 = v128_and( t1, _lsbmask2 ); \
   t0 = v128_xor( t4, v128_shuffle8( _mul4mask, t1 ) ); \
   t4 = v128_shuffle8( t2, _supermix1b ); \
   t3 = v128_xor( t3, v128_shuffle8( _mul2mask, t1 ) ); \
   t1 = v128_shuffle8( t4, _supermix1c ); \
   t4 = v128_xor( t4, t1 ); \
   t1 = v128_shuffle8( t4, _supermix1d ); \
   t4 = v128_xor( t4, t1 ); \
   t1 = v128_shuffle8( t2, _supermix1a ); \
   t2 = v128_xor3( t2, t3, t0 ); \
   t2 = v128_shuffle8( t2, _supermix7a ); \
   t4 = v128_xor3( t4, t1, t2 ); \
   t2 = v128_shuffle8( t2, _supermix7b ); \
   t3 = v128_shuffle8( t3, _supermix2a ); \
   t1 = v128_shuffle8( t0, _supermix4a ); \
   t0 = v128_shuffle8( t0, _supermix4b ); \
   t4 = v128_xor3( t4, t2, t1 ); \
   t0 = v128_xor( t0, t3 ); \
   t4 = v128_xor3( t4, t0, v128_shuffle8( t0, _supermix4c ) );
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
   CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
   PACK_S0(r1c, r1a, _t0);\
   SUBSTITUTE( r1c, _t2 );\
   SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
   _t0 = _mm_shuffle_epi32(r1c, 0x39);\
   r2c = _mm_xor_si128(r2c, _t0);\
   _t0 = mm128_mask_32( _t0, 8 ); \
   r2d = _mm_xor_si128(r2d, _t0);\
   _t0 = shuffle_0321( r1c ); \
   r2c = v128_xor(r2c, _t0);\
   _t0 = mask_1000( _t0 ); \
   r2d = v128_xor(r2d, _t0);\
   UNPACK_S0(r1c, r1a, _t3);\
   SUBSTITUTE(r2c, _t2 );\
   SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
   _t0 = _mm_shuffle_epi32(r2c, 0x39);\
   r3c = _mm_xor_si128(r3c, _t0);\
   _t0 = mm128_mask_32( _t0, 8 ); \
   r3d = _mm_xor_si128(r3d, _t0);\
   _t0 = shuffle_0321( r2c ); \
   r3c = v128_xor(r3c, _t0);\
   _t0 = mask_1000( _t0 ); \
   r3d = v128_xor(r3d, _t0);\
   UNPACK_S0(r2c, r2a, _t3);\
   SUBSTITUTE( r3c, _t2 );\
   SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
   _t0 = _mm_shuffle_epi32(r3c, 0x39);\
   r4c = _mm_xor_si128(r4c, _t0);\
   _t0 = mm128_mask_32( _t0, 8 ); \
   r4d = _mm_xor_si128(r4d, _t0);\
   _t0 = shuffle_0321( r3c ); \
   r4c = v128_xor(r4c, _t0);\
   _t0 = mask_1000( _t0 ); \
   r4d = v128_xor(r4d, _t0);\
   UNPACK_S0(r3c, r3a, _t3);\
   SUBSTITUTE( r4c, _t2 );\
   SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
@@ -256,18 +195,19 @@ MYALIGN const unsigned int _IV512[] = {
   block[1] = col[(base + a + 1) % s];\
   block[2] = col[(base + a + 2) % s];\
   block[3] = col[(base + a + 3) % s];\
   x = _mm_load_si128((__m128i*)block)
   x = v128_load( (v128_t*)block )

#define STORECOLUMN(x, s)\
   _mm_store_si128((__m128i*)block, x);\
   v128_store((v128_t*)block, x );\
   col[(base + 0) % s] = block[0];\
   col[(base + 1) % s] = block[1];\
   col[(base + 2) % s] = block[2];\
   col[(base + 3) % s] = block[3]
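LOADCOLUMN and STORECOLUMN stage four state words through block[] so a vector register can hold a column slice; the state itself is a ring that is never physically rotated. A hedged scalar model of the addressing (function name mine):

   /* The 36-word Fugue working state is addressed modulo 36 relative
      to a moving base, so "rotating" the state is free. */
   static inline void load_column( uint32_t x[4], const uint32_t col[36],
                                   unsigned base, unsigned a )
   {
       for ( unsigned k = 0; k < 4; k++ )
           x[k] = col[ (base + a + k) % 36 ];
   }
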
void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
void Compress512( hashState_fugue *ctx, const unsigned char *pmsg,
                  unsigned int uBlockCount )
{
   __m128i _t0, _t1, _t2, _t3;
   v128_t _t0, _t1, _t2, _t3;

   switch(ctx->base)
   {
@@ -346,134 +286,133 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
      pmsg += 4;
      uBlockCount--;
   }

}
void Final512(hashState_fugue *ctx, BitSequence *hashval)
void Final512( hashState_fugue *ctx, uint8_t *hashval )
{
   unsigned int block[4] __attribute__ ((aligned (32)));
   unsigned int col[36] __attribute__ ((aligned (16)));
   unsigned int i, base;
   __m128i r0, _t0, _t1, _t2, _t3;
   v128_t r0, _t0, _t1, _t2, _t3;

   for(i = 0; i < 12; i++)
   for( i = 0; i < 12; i++ )
   {
      _mm_store_si128((__m128i*)block, ctx->state[i]);
      v128_store( (v128_t*)block, ctx->state[i] );

      col[3 * i + 0] = block[0];
      col[3 * i + 1] = block[1];
      col[3 * i + 2] = block[2];
   }

   base = (36 - (12 * ctx->base)) % 36;
   base = ( 36 - (12 * ctx->base) ) % 36;

   for(i = 0; i < 32; i++)
   for( i = 0; i < 32; i++ )
   {
      // ROR3
      base = (base + 33) % 36;

      // CMIX
      col[(base + 0) % 36] ^= col[(base + 4) % 36];
      col[(base + 1) % 36] ^= col[(base + 5) % 36];
      col[(base + 2) % 36] ^= col[(base + 6) % 36];
      col[(base + 18) % 36] ^= col[(base + 4) % 36];
      col[(base + 19) % 36] ^= col[(base + 5) % 36];
      col[(base + 20) % 36] ^= col[(base + 6) % 36];
      col[ (base + 0) % 36 ] ^= col[ (base + 4) % 36 ];
      col[ (base + 1) % 36 ] ^= col[ (base + 5) % 36 ];
      col[ (base + 2) % 36 ] ^= col[ (base + 6) % 36 ];
      col[ (base + 18) % 36 ] ^= col[ (base + 4) % 36 ];
      col[ (base + 19) % 36 ] ^= col[ (base + 5) % 36 ];
      col[ (base + 20) % 36 ] ^= col[ (base + 6) % 36 ];

      // SMIX
      LOADCOLUMN(r0, 36, 0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
      LOADCOLUMN( r0, 36, 0 );
      SUBSTITUTE( r0, _t2 );
      SUPERMIX( _t2, _t3, _t0, _t1, r0 );
      STORECOLUMN( r0, 36 );
   }

   for(i = 0; i < 13; i++)
   for( i = 0; i < 13; i++ )
   {
      // S4 += S0; S9 += S0; S18 += S0; S27 += S0;
      col[(base + 4) % 36] ^= col[(base + 0) % 36];
      col[(base + 9) % 36] ^= col[(base + 0) % 36];
      col[(base + 18) % 36] ^= col[(base + 0) % 36];
      col[(base + 27) % 36] ^= col[(base + 0) % 36];
      col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
      col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
      col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
      col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

      // ROR9
      base = (base + 27) % 36;

      // SMIX
      LOADCOLUMN(r0, 36, 0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
      LOADCOLUMN( r0, 36, 0 );
      SUBSTITUTE( r0, _t2 );
      SUPERMIX( _t2, _t3, _t0, _t1, r0 );
      STORECOLUMN( r0, 36 );

      // S4 += S0; S10 += S0; S18 += S0; S27 += S0;
      col[(base + 4) % 36] ^= col[(base + 0) % 36];
      col[(base + 10) % 36] ^= col[(base + 0) % 36];
      col[(base + 18) % 36] ^= col[(base + 0) % 36];
      col[(base + 27) % 36] ^= col[(base + 0) % 36];
      col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
      col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
      col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
      col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

      // ROR9
      base = (base + 27) % 36;

      // SMIX
      LOADCOLUMN(r0, 36, 0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
      LOADCOLUMN( r0, 36, 0 );
      SUBSTITUTE( r0, _t2 );
      SUPERMIX( _t2, _t3, _t0, _t1, r0 );
      STORECOLUMN( r0, 36 );

      // S4 += S0; S10 += S0; S19 += S0; S27 += S0;
      col[(base + 4) % 36] ^= col[(base + 0) % 36];
      col[(base + 10) % 36] ^= col[(base + 0) % 36];
      col[(base + 19) % 36] ^= col[(base + 0) % 36];
      col[(base + 27) % 36] ^= col[(base + 0) % 36];
      col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
      col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
      col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
      col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

      // ROR9
      base = (base + 27) % 36;

      // SMIX
      LOADCOLUMN(r0, 36, 0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
      LOADCOLUMN( r0, 36, 0 );
      SUBSTITUTE( r0, _t2 );
      SUPERMIX( _t2, _t3, _t0, _t1, r0 );
      STORECOLUMN( r0, 36 );

      // S4 += S0; S10 += S0; S19 += S0; S28 += S0;
      col[(base + 4) % 36] ^= col[(base + 0) % 36];
      col[(base + 10) % 36] ^= col[(base + 0) % 36];
      col[(base + 19) % 36] ^= col[(base + 0) % 36];
      col[(base + 28) % 36] ^= col[(base + 0) % 36];
      col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
      col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
      col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
      col[ (base + 28) % 36 ] ^= col[ (base + 0) % 36 ];

      // ROR8
      base = (base + 28) % 36;

      // SMIX
      LOADCOLUMN(r0, 36, 0);
      SUBSTITUTE(r0, _t2);
      SUPERMIX(_t2, _t3, _t0, _t1, r0);
      STORECOLUMN(r0, 36);
      LOADCOLUMN( r0, 36, 0 );
      SUBSTITUTE( r0, _t2 );
      SUPERMIX( _t2, _t3, _t0, _t1, r0 );
      STORECOLUMN( r0, 36 );
   }

   // S4 += S0; S9 += S0; S18 += S0; S27 += S0;
   col[(base + 4) % 36] ^= col[(base + 0) % 36];
   col[(base + 9) % 36] ^= col[(base + 0) % 36];
   col[(base + 18) % 36] ^= col[(base + 0) % 36];
   col[(base + 27) % 36] ^= col[(base + 0) % 36];
   col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
   col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
   col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
   col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

   // Transform to the standard basis and store output; S1 || S2 || S3 || S4
   LOADCOLUMN(r0, 36, 1);
   _mm_store_si128((__m128i*)hashval, r0);
   LOADCOLUMN( r0, 36, 1 );
   v128_store( (v128_t*)hashval, r0 );

   // Transform to the standard basis and store output; S9 || S10 || S11 || S12
   LOADCOLUMN(r0, 36, 9);
   _mm_store_si128((__m128i*)hashval + 1, r0);
   LOADCOLUMN( r0, 36, 9 );
   v128_store( (v128_t*)hashval + 1, r0 );

   // Transform to the standard basis and store output; S18 || S19 || S20 || S21
   LOADCOLUMN(r0, 36, 18);
   _mm_store_si128((__m128i*)hashval + 2, r0);
   LOADCOLUMN( r0, 36, 18 );
   v128_store( (v128_t*)hashval + 2, r0 );

   // Transform to the standard basis and store output; S27 || S28 || S29 || S30
   LOADCOLUMN(r0, 36, 27);
   _mm_store_si128((__m128i*)hashval + 3, r0);
   LOADCOLUMN( r0, 36, 27 );
   v128_store( (v128_t*)hashval + 3, r0 );
}
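The ROR3/ROR9/ROR8 comments in Final512 are implemented purely as base updates: rotating the 36-column ring right by n is the same as advancing the base by 36 - n modulo 36, which is why ROR3 adds 33, ROR9 adds 27 and ROR8 adds 28. A hedged helper capturing that equivalence (name mine):

   static inline unsigned ror_base( unsigned base, unsigned n )
   {
       /* ROR3 -> +33, ROR8 -> +28, ROR9 -> +27 */
       return ( base + 36 - n ) % 36;
   }
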
HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
int fugue512_Init( hashState_fugue *ctx, int nHashSize )
{
   int i;
   ctx->processed_bits = 0;
@@ -485,20 +424,20 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
   ctx->uBlockLength = 4;

   for(i = 0; i < 6; i++)
      ctx->state[i] = m128_zero;
      ctx->state[i] = v128_zero;

   ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0);
   ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1);
   ctx->state[8] = _mm_load_si128((__m128i*)_IV512 + 2);
   ctx->state[9] = _mm_load_si128((__m128i*)_IV512 + 3);
   ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
   ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
   ctx->state[6] = casti_v128( _IV512, 0 );
   ctx->state[7] = casti_v128( _IV512, 1 );
   ctx->state[8] = casti_v128( _IV512, 2 );
   ctx->state[9] = casti_v128( _IV512, 3 );
   ctx->state[10] = casti_v128( _IV512, 4 );
   ctx->state[11] = casti_v128( _IV512, 5 );

   return SUCCESS;
   return 0;
}
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
int fugue512_Update( hashState_fugue *state, const void *data,
                     uint64_t databitlen )
{
   unsigned int uByteLength, uBlockCount, uRemainingBytes;

@@ -509,7 +448,8 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
   if(state->uBufferBytes != 0)
   {
      // Fill the buffer
      memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
      memcpy( state->buffer + state->uBufferBytes, (void*)data,
              state->uBlockLength - state->uBufferBytes );

      // Process the buffer
      Compress512(state, state->buffer, 1);
@@ -545,13 +485,13 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
      state->uBufferBytes += uByteLength;
   }

   return SUCCESS;
   return 0;
}

HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
int fugue512_Final( hashState_fugue *state, void *hashval )
{
   unsigned int i;
   BitSequence lengthbuf[8] __attribute__((aligned(64)));
   uint8_t lengthbuf[8] __attribute__((aligned(64)));

   // Update message bit count
   state->processed_bits += state->uBufferBytes * 8;
@@ -575,16 +515,17 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
   // Finalization
   Final512(state, hashval);

   return SUCCESS;
   return 0;
}


HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
                   uint64_t databitlen )
{
   fugue512_Init(hs, 512);
   fugue512_Update(hs, data, databitlen*8);
   fugue512_Final(hs, hashval);
   return SUCCESS;
   fugue512_Init( hs, 512 );
   fugue512_Update( hs, data, databitlen*8 );
   fugue512_Final( hs, hashval );
   return 0;
}

#endif // AES
@@ -14,37 +14,31 @@
#ifndef FUGUE_HASH_API_H
#define FUGUE_HASH_API_H

#if defined(__AES__)
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )

#if !defined(__SSE4_1__)
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
#endif

#include "compat/sha3_common.h"
#include "simd-utils.h"


typedef struct
{
   __m128i state[12];
   v128_t state[12];
   unsigned int base;

   unsigned int uHashSize;
   unsigned int uBlockLength;
   unsigned int uBufferBytes;
   DataLength processed_bits;
   BitSequence buffer[4];
   uint64_t processed_bits;
   uint8_t buffer[4];

} hashState_fugue __attribute__ ((aligned (64)));


// These functions are deprecated, use the lower case macro aliases that use
// the standard interface. This will be cleaned up at a later date.
HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
int fugue512_Init( hashState_fugue *state, int hashbitlen );

HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
int fugue512_Update( hashState_fugue *state, const void *data,
                     uint64_t databitlen );

HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
int fugue512_Final( hashState_fugue *state, void *hashval );

#define fugue512_init( state ) \
   fugue512_Init( state, 512 )
@@ -54,7 +48,8 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
   fugue512_Final


HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
                   uint64_t databitlen);

#endif // AES
#endif // HASH_API_H
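A hedged caller sketch for the interface declared above (the 64-byte digest size for Fugue-512 and the alignment are my assumptions). Note the Update length argument is in bits: fugue512_full() passes databitlen*8 internally, so a direct caller converts bytes to bits itself:

   uint8_t digest[64] __attribute__ ((aligned (16)));
   hashState_fugue hs;

   fugue512_Init( &hs, 512 );
   fugue512_Update( &hs, data, data_len * 8 );   /* data_len in bytes */
   fugue512_Final( &hs, digest );
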
@@ -696,7 +696,7 @@ static void AddModulo512(const void *a,const void *b,void *c)

static void AddXor512(const void *a,const void *b,void *c)
{
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
  casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
                                          casti_m512i( b, 0 ) );
#elif defined(__AVX2__)
@@ -704,15 +704,15 @@ static void AddXor512(const void *a,const void *b,void *c)
                                          casti_m256i( b, 0 ) );
  casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
                                          casti_m256i( b, 1 ) );
#elif defined(__SSE2__)
  casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
                                       casti_m128i( b, 0 ) );
  casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
                                       casti_m128i( b, 1 ) );
  casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
                                       casti_m128i( b, 2 ) );
  casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
                                       casti_m128i( b, 3 ) );
#elif defined(__SSE2__) || defined(__ARM_NEON)
  casti_v128( c, 0 ) = v128_xor( casti_v128( a, 0 ),
                                 casti_v128( b, 0 ) );
  casti_v128( c, 1 ) = v128_xor( casti_v128( a, 1 ),
                                 casti_v128( b, 1 ) );
  casti_v128( c, 2 ) = v128_xor( casti_v128( a, 2 ),
                                 casti_v128( b, 2 ) );
  casti_v128( c, 3 ) = v128_xor( casti_v128( a, 3 ),
                                 casti_v128( b, 3 ) );
#else
  const unsigned long long *A=a, *B=b;
  unsigned long long *C=c;
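The scalar #else branch is cut off by this view; with A, B and C declared as 64-bit pointers, the elided remainder is presumably a plain eight-lane XOR, along these lines (an assumption, not the source text):

   /* 512 bits = 8 x 64-bit words, XORed lane by lane. */
   for ( int i = 0; i < 8; i++ )
       C[i] = A[i] ^ B[i];
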
@@ -60,18 +60,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };

#if defined(__ARM_NEON)

// No fast shuffle on NEON
static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
                { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };

#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )

#else

#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )

#endif


#define tos(a) #a
#define tostr(a) tos(a)

@@ -104,7 +103,7 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
   This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
   K. Matusiewicz, 2011/05/29 */

#if defined(__AVX512VL__)
#if defined(VL256)

#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* t_i = a_i + a_{i+1} */\
@@ -298,17 +297,16 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
*/
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* SubBytes */\
  b0 = v128_xor(b0, b0);\
  a0 = v128_aesenclast(a0, b0);\
  a1 = v128_aesenclast(a1, b0);\
  a2 = v128_aesenclast(a2, b0);\
  a3 = v128_aesenclast(a3, b0);\
  a4 = v128_aesenclast(a4, b0);\
  a5 = v128_aesenclast(a5, b0);\
  a6 = v128_aesenclast(a6, b0);\
  a7 = v128_aesenclast(a7, b0);\
  a0 = v128_aesenclast_nokey( a0 ); \
  a1 = v128_aesenclast_nokey( a1 ); \
  a2 = v128_aesenclast_nokey( a2 ); \
  a3 = v128_aesenclast_nokey( a3 ); \
  a4 = v128_aesenclast_nokey( a4 ); \
  a5 = v128_aesenclast_nokey( a5 ); \
  a6 = v128_aesenclast_nokey( a6 ); \
  a7 = v128_aesenclast_nokey( a7 ); \
  /* MixBytes */\
  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
  MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7 ); \
}

#define ROUNDS_P(){\
@@ -326,10 +324,9 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
    xmm13 = v128_shuffle8( xmm13, SUBSH_MASK5 ); \
    xmm14 = v128_shuffle8( xmm14, SUBSH_MASK6 ); \
    xmm15 = v128_shuffle8( xmm15, SUBSH_MASK7 ); \
    /* SubBytes + MixBytes */\
    SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
            xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
\
    /* AddRoundConstant P1024 */\
    xmm0 = v128_xor( xmm0, \
                     casti_v128( round_const_p, round_counter+1 ) ); \
@@ -431,7 +428,6 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
  t1 = v128_unpackhi16(t1, i3);\
  i2 = v128_unpacklo16(i2, i3);\
  i0 = v128_unpacklo16(i0, i1);\
\
  /* shuffle with immediate */\
  t0 = gr_shuffle32( t0 ); \
  t1 = gr_shuffle32( t1 ); \
@@ -441,7 +437,6 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
  i2 = gr_shuffle32( i2 ); \
  i4 = gr_shuffle32( i4 ); \
  i6 = gr_shuffle32( i6 ); \
\
  /* continue with unpack */\
  t4 = i0;\
  i0 = v128_unpacklo32(i0, i2);\
@@ -548,7 +543,8 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
  /* transpose done */\
}/**/


#if 0
// not used
void INIT( v128_t* chaining )
{
     static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -577,6 +573,7 @@ void INIT( v128_t* chaining )
     chaining[6] = xmm14;
     chaining[7] = xmm15;
}
#endif

void TF1024( v128_t* chaining, const v128_t* message )
{
@@ -1,3 +1,6 @@
#if !defined GROESTL256_INTR_AES_H__
#define GROESTL256_INTR_AES_H__

/* groestl-intr-aes.h     Aug 2011
 *
 * Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -50,18 +53,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };

#if defined(__ARM_NEON)

// No fast shuffle on NEON
static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
                { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };

#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )

#else

#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )

#endif


#define tos(a) #a
#define tostr(a) tos(a)

@@ -93,7 +95,7 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
   This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
   K. Matusiewicz, 2011/05/29 */

#if defined(__AVX512VL__)
#if defined(VL256)

#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* t_i = a_i + a_{i+1} */\
@@ -598,4 +600,4 @@ void OF512( v128_t* chaining )
  chaining[3] = xmm11;
}

#endif
@@ -146,7 +146,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
   const int hash_offset = SIZE512 - hashlen_m128i;
   uint64_t blocks = len / SIZE512;
   v128_t* in = (v128_t*)input;

   // digest any full blocks, process directly from input
   for ( i = 0; i < blocks; i++ )
      TF1024( ctx->chaining, &in[ i * SIZE512 ] );
@@ -181,6 +181,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,

   // digest final padding block and do output transform
   TF1024( ctx->chaining, ctx->buffer );

   OF1024( ctx->chaining );

   // store hash result in output

@@ -87,6 +87,7 @@ int final_groestl( hashState_groestl*, void* );
int update_and_final_groestl( hashState_groestl*, void*, const void*, int );
int groestl512( hashState_groestl*, void*, const void*, uint64_t );
#define groestl512_full groestl512
#define groestl512_ctx groestl512

#endif /* __hash_h */
@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
  #define GROESTL_4WAY_VAES 1
#endif
@@ -17,7 +17,7 @@

#if defined(__AVX2__) && defined(__VAES__)

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)


int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )

@@ -43,7 +43,7 @@

#define SIZE256 (SIZE_512/16)

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

typedef struct {
   __attribute__ ((aligned (128))) __m512i chaining[SIZE256];
@@ -42,7 +42,7 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
  { 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
};

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
                                     0x1d1519111c141810, 0x1f171b131e161a12,
@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =

#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
  b1 = mm256_bcast_m128( mm128_mask_32( m128_neg1, 0x3 ) ); \
  b1 = mm256_bcast_m128( v128_mask32( v128_neg1, 0x3 ) ); \
  a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
  a1 = _mm256_xor_si256( a1, b1 );\
  a2 = _mm256_xor_si256( a2, b1 );\
@@ -17,7 +17,7 @@

#if defined(__AVX2__) && defined(__VAES__)

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{

@@ -33,7 +33,7 @@

#define SIZE512 (SIZE_1024/16)

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

typedef struct {
   __attribute__ ((aligned (128))) __m512i chaining[SIZE512];
@@ -50,7 +50,7 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
  { 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
};

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
                                     0x1d1519111c141810, 0x1f171b131e161a12,
@@ -239,7 +239,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
{ \
   /* AddRoundConstant P1024 */\
   xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
               casti_m128i( round_const_p, round_counter ) ) ); \
               casti_v128u32( round_const_p, round_counter ) ) ); \
   /* ShiftBytes P1024 + pre-AESENCLAST */\
   xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
   xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK1 );\
@@ -254,7 +254,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
\
   /* AddRoundConstant P1024 */\
   xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
               casti_m128i( round_const_p, round_counter+1 ) ) ); \
               casti_v128u32( round_const_p, round_counter+1 ) ) ); \
   /* ShiftBytes P1024 + pre-AESENCLAST */\
   xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
   xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
@@ -283,7 +283,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
   xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
   xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
   xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
               casti_m128i( round_const_q, round_counter ) ) ); \
               casti_v128u32( round_const_q, round_counter ) ) ); \
   /* ShiftBytes Q1024 + pre-AESENCLAST */\
   xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
   xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK3 );\
@@ -306,7 +306,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
   xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
   xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
   xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
               casti_m128i( round_const_q, round_counter+1 ) ) ); \
               casti_v128u32( round_const_q, round_counter+1 ) ) ); \
   /* ShiftBytes Q1024 + pre-AESENCLAST */\
   xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
   xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
@@ -812,7 +812,7 @@ static const __m256i SUBSH_MASK7_2WAY =
{ \
   /* AddRoundConstant P1024 */\
   xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
               casti_m128i( round_const_p, round_counter ) ) ); \
               casti_v128u32( round_const_p, round_counter ) ) ); \
   /* ShiftBytes P1024 + pre-AESENCLAST */\
   xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
   xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\
@@ -827,7 +827,7 @@ static const __m256i SUBSH_MASK7_2WAY =
\
   /* AddRoundConstant P1024 */\
   xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
               casti_m128i( round_const_p, round_counter+1 ) ) ); \
               casti_v128u32( round_const_p, round_counter+1 ) ) ); \
   /* ShiftBytes P1024 + pre-AESENCLAST */\
   xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
   xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
@@ -856,7 +856,7 @@ static const __m256i SUBSH_MASK7_2WAY =
   xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
   xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
   xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
               casti_m128i( round_const_q, round_counter ) ) ); \
               casti_v128u32( round_const_q, round_counter ) ) ); \
   /* ShiftBytes Q1024 + pre-AESENCLAST */\
   xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
   xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\
@@ -879,7 +879,7 @@ static const __m256i SUBSH_MASK7_2WAY =
   xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
   xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
   xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
               casti_m128i( round_const_q, round_counter+1 ) ) ); \
               casti_v128u32( round_const_q, round_counter+1 ) ) ); \
   /* ShiftBytes Q1024 + pre-AESENCLAST */\
   xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
   xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\
@@ -17,7 +17,7 @@ typedef struct {
#else
  hashState_groestl      groestl;
#endif
  sha256_8way_context    sha;
  sha256_8x32_context    sha;
} myrgr_8way_ctx_holder;

myrgr_8way_ctx_holder myrgr_8way_ctx;
@@ -29,7 +29,7 @@ void init_myrgr_8way_ctx()
#else
  init_groestl( &myrgr_8way_ctx.groestl, 64 );
#endif
  sha256_8way_init( &myrgr_8way_ctx.sha );
  sha256_8x32_init( &myrgr_8way_ctx.sha );
}

void myriad_8way_hash( void *output, const void *input )
@@ -96,8 +96,8 @@ void myriad_8way_hash( void *output, const void *input )
  intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
                   hash6, hash7 );

  sha256_8way_update( &ctx.sha, vhash, 64 );
  sha256_8way_close( &ctx.sha, output );
  sha256_8x32_update( &ctx.sha, vhash, 64 );
  sha256_8x32_close( &ctx.sha, output );
}

int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
@@ -156,7 +156,7 @@ int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,

typedef struct {
  hashState_groestl   groestl;
  sha256_4way_context sha;
  sha256_4x32_context sha;
} myrgr_4way_ctx_holder;

myrgr_4way_ctx_holder myrgr_4way_ctx;
@@ -164,7 +164,7 @@ myrgr_4way_ctx_holder myrgr_4way_ctx;
void init_myrgr_4way_ctx()
{
  init_groestl (&myrgr_4way_ctx.groestl, 64 );
  sha256_4way_init( &myrgr_4way_ctx.sha );
  sha256_4x32_init( &myrgr_4way_ctx.sha );
}

void myriad_4way_hash( void *output, const void *input )
@@ -189,8 +189,8 @@ void myriad_4way_hash( void *output, const void *input )

  intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

  sha256_4way_update( &ctx.sha, vhash, 64 );
  sha256_4way_close( &ctx.sha, output );
  sha256_4x32_update( &ctx.sha, vhash, 64 );
  sha256_4x32_close( &ctx.sha, output );
}
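Before the 4x32 SHA-256 stage, the four 512-bit Groestl outputs are word-interleaved so each SIMD lane sees its own message. A hedged scalar model of the layout assumed by intrlv_4x32 (the helper name below is mine):

   /* Four 16-word (512-bit) buffers become one lane-interleaved buffer:
      word w of lane l lands at v[w*4 + l]. */
   static void intrlv_4x32_sketch( uint32_t *v, const uint32_t *h[4] )
   {
       for ( int w = 0; w < 16; w++ )
           for ( int l = 0; l < 4; l++ )
               v[ w*4 + l ] = h[l][w];
   }
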
int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
@@ -213,7 +213,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,

   v128_bswap32_intrlv80_4x32( vdata, pdata );
   do {
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
      *noncev = v128_bswap32( _mm_set_epi32( n+3,n+2,n+1,n ) );

      myriad_4way_hash( hash, vdata );
      pdata[19] = n;

@@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate )
  init_myrgr_ctx();
  gate->scanhash = (void*)&scanhash_myriad;
  gate->hash = (void*)&myriad_hash;
  gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
  gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT;
#endif
  return true;
};
@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
  #define MYRGR_8WAY 1
#elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
  #define MYRGR_4WAY 1
@@ -35,8 +35,6 @@

#include "sph_groestl.h"

#if !defined(__AES__)

#ifdef __cplusplus
extern "C"{
#endif
@@ -3119,5 +3117,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}

#endif // !AES
#endif

@@ -42,7 +42,6 @@ extern "C"{
#include <stddef.h>
#include "compat/sph_types.h"

#if !defined(__AES__)
/**
 * Output size (in bits) for Groestl-224.
 */
@@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close(
}
#endif
#endif // !AES
#endif
@@ -35,7 +35,7 @@
#include <stdio.h>
#include "hamsi-hash-4way.h"

static const uint32_t HAMSI_IV512[] =
static const uint32_t HAMSI_IV512[] __attribute__ ((aligned (32))) =
{
  0x73746565, 0x6c706172, 0x6b204172, 0x656e6265,
  0x72672031, 0x302c2062, 0x75732032, 0x3434362c,
@@ -43,7 +43,8 @@ static const uint32_t HAMSI_IV512[] =
  0x65766572, 0x6c65652c, 0x2042656c, 0x6769756d
};

static const uint32_t alpha_n[] = {
static const uint32_t alpha_n[] __attribute__ ((aligned (32))) =
{
  0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa,
  0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00,
  0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0,
@@ -54,7 +55,8 @@ static const uint32_t alpha_n[] = {
  0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0
};

static const uint32_t alpha_f[] = {
static const uint32_t alpha_f[] __attribute__ ((aligned (32))) =
{
  0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0,
  0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9,
  0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c,
@@ -69,7 +71,8 @@ static const uint32_t alpha_f[] = {

/* Note: this table lists bits within each byte from least
   significant to most significant. */
static const uint32_t T512[64][16] = {
static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
{
  { 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000,
    0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a,
    0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000,
@@ -379,12 +382,12 @@ static const uint32_t T512[64][16] = {
#define S1F   MF


#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

// Hamsi 8 way AVX512

// Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have same
// timig. However, when tested hashing X13 on i9-9940x using cmplt with zero
// timing. However, testing hashing X13 on i9-9940x using cmplt with zero
// had a 3% faster overall hashrate than using movepi.

#define INPUT_BIG8 \
@@ -415,13 +418,11 @@ static const uint32_t T512[64][16] = {
  tb = mm512_xoror( b, d, a ); \
  a = _mm512_xor_si512( a, c ); \
  b = mm512_xoror( td, tb, a ); \
  td = mm512_xorand( a, td, tb ); \
  d = _mm512_ternarylogic_epi64( a, td, tb, 0x87 );/* not( xorand( a, td, tb ) ); */ \
  a = c; \
  c = mm512_xor3( tb, b, td ); \
  d = mm512_not( td ); \
  c = _mm512_ternarylogic_epi64( tb, b, d, 0x69 ); /* not( xor3( tb, b, d ) ); */ \
}
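The rewritten S-box folds a NOT into the preceding three-input op by adjusting the truth-table immediate: per the inline comments, 0x87 is NOT(a ^ (b & c)) and 0x69 is NOT(a ^ b ^ c). The immediate is simply the 8-entry truth table indexed by (a<<2 | b<<1 | c); a hedged helper for deriving such constants (name mine):

   static unsigned char ternlog_imm( int (*f)(int, int, int) )
   {
       /* Bit (a<<2 | b<<1 | c) of the immediate is f(a,b,c).
          f = !(a ^ (b & c)) yields 0x87; f = !(a ^ b ^ c) yields 0x69. */
       unsigned char imm = 0;
       for ( int a = 0; a < 2; a++ )
           for ( int b = 0; b < 2; b++ )
               for ( int c = 0; c < 2; c++ )
                   imm |= (unsigned char)( f(a,b,c) & 1 ) << ( a<<2 | b<<1 | c );
       return imm;
   }
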
/*
#define SBOX8( a, b, c, d ) \
do { \
@@ -1058,7 +1059,7 @@ void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
   WRITE_STATE_BIG8( sc );
}

void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
void hamsi_8way_big_final( hamsi512_8x64_context *sc, __m512i *buf )
{
   __m512i m0, m1, m2, m3, m4, m5, m6, m7;

@@ -1070,7 +1071,7 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
   WRITE_STATE_BIG8( sc );
}

void hamsi512_8way_init( hamsi_8way_big_context *sc )
void hamsi512_8x64_init( hamsi512_8x64_context *sc )
{
   sc->partial_len = 0;
   sc->count_high = sc->count_low = 0;
@@ -1086,7 +1087,7 @@ void hamsi512_8way_init( hamsi_8way_big_context *sc )
   sc->h[7] = v512_64( iv[7] );
}

void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data,
                           size_t len )
{
   __m512i *vdata = (__m512i*)data;
@@ -1098,7 +1099,7 @@ void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
   sc->partial_len = len;
}

void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst )
{
   __m512i pad[1];
   uint32_t ch, cl;
@@ -1119,7 +1120,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )

// Hamsi 4 way AVX2

#if defined(__AVX512VL__)
#if defined(VL256)

#define INPUT_BIG \
do { \
@@ -1152,11 +1153,99 @@ do { \
  b = mm256_xoror( td, tb, a ); \
  d = _mm256_ternarylogic_epi64( a, td, tb, 0x87 );/* mm256_not( mm256_xorand( a, td, tb ) ); */ \
  a = c; \
  c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /*mm256_not( mm256_xor3( tb, b, d ) );*/ \
  c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /* mm256_not( mm256_xor3( tb, b, d ) ); */ \
}

#else

#define INPUT_BIG_sub( db_i ) \
{ \
  const __m256i dm = _mm256_cmpgt_epi64( zero, db_i ); \
  m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
  m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
  m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
  m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
  m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
  m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
  m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
  m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
  tp += 8; \
}
#define INPUT_BIG \
|
||||
{ \
|
||||
const __m256i db = *buf; \
|
||||
const __m256i zero = m256_zero; \
|
||||
const uint64_t *tp = (const uint64_t*)T512; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,63 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,62 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,61 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,60 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,59 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,58 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,57 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,56 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,55 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,54 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,53 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,52 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,51 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,50 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,49 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,48 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,47 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,46 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,45 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,44 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,43 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,42 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,41 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,40 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,39 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,38 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,37 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,36 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,35 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,34 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,33 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,32 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,31 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,30 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,29 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,28 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,27 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,26 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,25 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,24 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,23 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,22 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,21 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,20 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,19 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,18 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,17 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,16 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,15 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,14 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,13 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,12 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,11 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,10 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 9 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 8 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 7 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 6 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 5 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 4 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 3 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 2 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 1 ) ); \
|
||||
INPUT_BIG_sub( db ); \
|
||||
}
|
||||
|
||||
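A scalar, single-lane sketch of what the unrolled INPUT_BIG above computes (reference only; T512_rows is the T512 table viewed as 64 rows of 8 uint64_t, an assumption matching the (const uint64_t*)T512 cast in the macro):

#include <stdint.h>

// For each bit b of the 64-bit message word db (row order matches the
// macro: the first row pairs with bit 0), XOR the 8-word table row into
// the expanded message m[0..7] when the bit is set.
static void input_big_ref( uint64_t m[8], uint64_t db,
                           const uint64_t T512_rows[64][8] )
{
    for ( int i = 0; i < 8; i++ ) m[i] = 0;
    for ( int b = 0; b < 64; b++ )
    {
        const uint64_t dm = ( ( db >> b ) & 1 ) ? ~0ull : 0ull;
        for ( int i = 0; i < 8; i++ )
            m[i] ^= dm & T512_rows[b][i];
    }
}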
#if 0
// dependent on the compiler unrolling the loop
#define INPUT_BIG \
do { \
__m256i db = *buf; \
@@ -1177,6 +1266,7 @@ do { \
tp += 8; \
} \
} while (0)
#endif

// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX( a, b, c, d ) \
@@ -1216,7 +1306,7 @@ do { \
do { \
a = mm256_rol_32( a, 13 ); \
c = mm256_rol_32( c, 3 ); \
b = mm256_xor3( a, b, c ); \
b = mm256_xor3( b, a, c ); \
d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
b = mm256_rol_32( b, 1 ); \
d = mm256_rol_32( d, 7 ); \
@@ -1498,7 +1588,7 @@ do { /* order is important */ \
sc->h[14] = CE; \
sc->h[15] = CF;

#if defined(__AVX512VL__)
#if defined(VL256)

#define INPUT_8X32 \
{ \
@@ -1854,7 +1944,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void * dst,

////////////

void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
void hamsi_big( hamsi512_4x64_context *sc, __m256i *buf, size_t num )
{
DECL_STATE_BIG
uint32_t tmp;
@@ -1878,7 +1968,7 @@ void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
WRITE_STATE_BIG( sc );
}

void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
void hamsi_big_final( hamsi512_4x64_context *sc, __m256i *buf )
{
__m256i m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_BIG
@@ -1889,7 +1979,7 @@ void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
WRITE_STATE_BIG( sc );
}

void hamsi512_4way_init( hamsi_4way_big_context *sc )
void hamsi512_4x64_init( hamsi512_4x64_context *sc )
{
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
@@ -1904,7 +1994,7 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
sc->h[7] = v256_64( iv[7] );
}

void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
@@ -1916,7 +2006,7 @@ void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
sc->partial_len = len;
}

void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst )
{
__m256i pad[1];
uint32_t ch, cl;
@@ -1936,7 +2026,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
#if defined(__SSE4_2__) || defined(__ARM_NEON)

#define DECL_STATE_2x64 \
v128_t c0, c1, c2, c3, c4, c5, c6, c7; \
v128u64_t c0, c1, c2, c3, c4, c5, c6, c7; \

#define READ_STATE_2x64(sc) \
c0 = sc->h[0]; \
@@ -1958,15 +2048,103 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
sc->h[6] = c6; \
sc->h[7] = c7;

#define INPUT_2x64_sub( db_i ) \
{ \
const v128u64_t dm = v128_cmpgt64( zero, db_i ); \
m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
tp += 8; \
}

#define INPUT_2x64 \
{ \
v128_t db = *buf; \
const v128_t zero = v128_zero; \
const v128u64_t db = *buf; \
const v128u64_t zero = v128_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
INPUT_2x64_sub( v128_sl64( db,63 ) ); \
INPUT_2x64_sub( v128_sl64( db,62 ) ); \
INPUT_2x64_sub( v128_sl64( db,61 ) ); \
INPUT_2x64_sub( v128_sl64( db,60 ) ); \
INPUT_2x64_sub( v128_sl64( db,59 ) ); \
INPUT_2x64_sub( v128_sl64( db,58 ) ); \
INPUT_2x64_sub( v128_sl64( db,57 ) ); \
INPUT_2x64_sub( v128_sl64( db,56 ) ); \
INPUT_2x64_sub( v128_sl64( db,55 ) ); \
INPUT_2x64_sub( v128_sl64( db,54 ) ); \
INPUT_2x64_sub( v128_sl64( db,53 ) ); \
INPUT_2x64_sub( v128_sl64( db,52 ) ); \
INPUT_2x64_sub( v128_sl64( db,51 ) ); \
INPUT_2x64_sub( v128_sl64( db,50 ) ); \
INPUT_2x64_sub( v128_sl64( db,49 ) ); \
INPUT_2x64_sub( v128_sl64( db,48 ) ); \
INPUT_2x64_sub( v128_sl64( db,47 ) ); \
INPUT_2x64_sub( v128_sl64( db,46 ) ); \
INPUT_2x64_sub( v128_sl64( db,45 ) ); \
INPUT_2x64_sub( v128_sl64( db,44 ) ); \
INPUT_2x64_sub( v128_sl64( db,43 ) ); \
INPUT_2x64_sub( v128_sl64( db,42 ) ); \
INPUT_2x64_sub( v128_sl64( db,41 ) ); \
INPUT_2x64_sub( v128_sl64( db,40 ) ); \
INPUT_2x64_sub( v128_sl64( db,39 ) ); \
INPUT_2x64_sub( v128_sl64( db,38 ) ); \
INPUT_2x64_sub( v128_sl64( db,37 ) ); \
INPUT_2x64_sub( v128_sl64( db,36 ) ); \
INPUT_2x64_sub( v128_sl64( db,35 ) ); \
INPUT_2x64_sub( v128_sl64( db,34 ) ); \
INPUT_2x64_sub( v128_sl64( db,33 ) ); \
INPUT_2x64_sub( v128_sl64( db,32 ) ); \
INPUT_2x64_sub( v128_sl64( db,31 ) ); \
INPUT_2x64_sub( v128_sl64( db,30 ) ); \
INPUT_2x64_sub( v128_sl64( db,29 ) ); \
INPUT_2x64_sub( v128_sl64( db,28 ) ); \
INPUT_2x64_sub( v128_sl64( db,27 ) ); \
INPUT_2x64_sub( v128_sl64( db,26 ) ); \
INPUT_2x64_sub( v128_sl64( db,25 ) ); \
INPUT_2x64_sub( v128_sl64( db,24 ) ); \
INPUT_2x64_sub( v128_sl64( db,23 ) ); \
INPUT_2x64_sub( v128_sl64( db,22 ) ); \
INPUT_2x64_sub( v128_sl64( db,21 ) ); \
INPUT_2x64_sub( v128_sl64( db,20 ) ); \
INPUT_2x64_sub( v128_sl64( db,19 ) ); \
INPUT_2x64_sub( v128_sl64( db,18 ) ); \
INPUT_2x64_sub( v128_sl64( db,17 ) ); \
INPUT_2x64_sub( v128_sl64( db,16 ) ); \
INPUT_2x64_sub( v128_sl64( db,15 ) ); \
INPUT_2x64_sub( v128_sl64( db,14 ) ); \
INPUT_2x64_sub( v128_sl64( db,13 ) ); \
INPUT_2x64_sub( v128_sl64( db,12 ) ); \
INPUT_2x64_sub( v128_sl64( db,11 ) ); \
INPUT_2x64_sub( v128_sl64( db,10 ) ); \
INPUT_2x64_sub( v128_sl64( db, 9 ) ); \
INPUT_2x64_sub( v128_sl64( db, 8 ) ); \
INPUT_2x64_sub( v128_sl64( db, 7 ) ); \
INPUT_2x64_sub( v128_sl64( db, 6 ) ); \
INPUT_2x64_sub( v128_sl64( db, 5 ) ); \
INPUT_2x64_sub( v128_sl64( db, 4 ) ); \
INPUT_2x64_sub( v128_sl64( db, 3 ) ); \
INPUT_2x64_sub( v128_sl64( db, 2 ) ); \
INPUT_2x64_sub( v128_sl64( db, 1 ) ); \
INPUT_2x64_sub( db ); \
}
#if 0
// Dependent on the compiler unrolling the loop.
#define INPUT_2x64 \
{ \
v128u64_t db = *buf; \
const v128u64_t zero = v128_64( 0ull ); \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int i = 63; i >= 0; i-- ) \
{ \
v128_t dm = v128_cmpgt64( zero, v128_sl64( db, i ) ); \
v128u64_t dm = v128_cmpgt64( zero, v128_sl64( db, i ) ); \
m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
@@ -1978,11 +2156,12 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
tp += 8; \
} \
}
#endif

// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX_2x64( a, b, c, d ) \
{ \
v128_t tb, td; \
v128u64_t tb, td; \
td = v128_xorand( d, a, c ); \
tb = v128_xoror( b, d, a ); \
c = v128_xor3( c, td, b ); \
@@ -1998,7 +2177,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
{ \
a = v128_rol32( a, 13 ); \
c = v128_rol32( c, 3 ); \
b = v128_xor3( a, b, c ); \
b = v128_xor3( c, a, b ); \
d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
b = v128_rol32( b, 1 ); \
d = v128_rol32( d, 7 ); \
@@ -2010,7 +2189,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )

#define ROUND_2x64( alpha ) \
{ \
v128_t t0, t1, t2, t3, t4, t5; \
v128u64_t t0, t1, t2, t3, t4, t5; \
const v128_t mask = v128_64( 0x00000000ffffffff ); \
s0 = v128_xor( s0, alpha[ 0] ); \
s1 = v128_xor( s1, alpha[ 1] ); \
@@ -2107,7 +2286,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )

#define P_2x64 \
{ \
v128_t alpha[16]; \
v128u64_t alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = v128_64( ( (uint64_t*)alpha_n )[i] ); \
@@ -2126,7 +2305,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )

#define PF_2x64 \
{ \
v128_t alpha[16]; \
v128u64_t alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = v128_64( ( (uint64_t*)alpha_f )[i] ); \
@@ -2193,7 +2372,7 @@ void hamsi64_big( hamsi_2x64_context *sc, v128_t *buf, size_t num )

void hamsi64_big_final( hamsi_2x64_context *sc, v128_t *buf )
{
v128_t m0, m1, m2, m3, m4, m5, m6, m7;
v128u64_t m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_2x64;
READ_STATE_2x64( sc );
INPUT_2x64;
@@ -2231,15 +2410,15 @@ void hamsi512_2x64_update( hamsi_2x64_context *sc, const void *data,

void hamsi512_2x64_close( hamsi_2x64_context *sc, void *dst )
{
v128_t pad[1];
v128u32_t pad;
uint32_t ch, cl;

ch = bswap_32( sc->count_high );
cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) );
pad[0] = v128_64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
pad = v128_64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
sc->buf[0] = v128_64( 0x80 );
hamsi64_big( sc, sc->buf, 1 );
hamsi64_big_final( sc, pad );
hamsi64_big_final( sc, &pad );

v128_block_bswap32( (v128_t*)dst, sc->h );
}
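For orientation, a scalar sketch of the two final 8-byte blocks the close routine above appears to feed to the compression function (an illustration of the layout inferred from the bswap_32 calls, not a repo API; Hamsi-512 consumes the message in 64-bit blocks):

#include <stdint.h>
#include <string.h>

// Illustrative only: a 0x80 terminator block, then a block carrying the
// message bit count with each 32-bit half byte-swapped to big-endian.
static void hamsi_final_blocks( uint8_t pad_blk[8], uint8_t len_blk[8],
                                uint32_t count_high, uint32_t count_low )
{
    memset( pad_blk, 0, 8 );
    pad_blk[0] = 0x80;                        // terminator block
    for ( int i = 0; i < 4; i++ )             // big-endian bit count
    {
        len_blk[ i ]     = (uint8_t)( count_high >> ( 24 - 8*i ) );
        len_blk[ 4 + i ] = (uint8_t)( count_low  >> ( 24 - 8*i ) );
    }
}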
@@ -2260,4 +2439,4 @@ void hamsi512_2x64( void *dst, const void *data, size_t len )
hamsi512_2x64_close( &sc, dst );
}

#endif // SSE4.1 or NEON
#endif // SSE4.2 or NEON
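The guard comment fix above tracks the actual ISA requirement: the 64-bit compare used by INPUT_2x64 (_mm_cmpgt_epi64, i.e. PCMPGTQ) is an SSE4.2 instruction, not SSE4.1. A minimal sketch of the dependency (wrapper name is illustrative):

#include <nmmintrin.h>   // SSE4.2: provides _mm_cmpgt_epi64 (PCMPGTQ)

// Broadcast each 64-bit lane's sign bit to a full-lane mask; this is
// what v128_cmpgt64( zero, x ) computes in the 2x64 code path.
static inline __m128i sign_mask_2x64( __m128i x )
{
    return _mm_cmpgt_epi64( _mm_setzero_si128(), x );
}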
@@ -38,7 +38,7 @@
#include <stddef.h>
#include "simd-utils.h"

// SSE2 or NEON Hamsi-512 2x64
#if defined(__SSE4_2__) || defined(__ARM_NEON)

typedef struct
{
@@ -57,6 +57,8 @@ void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data,
size_t len );
void hamsi512_2x64( void *dst, const void *data, size_t len );

#endif

#if defined (__AVX2__)

// Hamsi-512 4x64
@@ -70,17 +72,17 @@ typedef struct
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_4way_big_context;
typedef hamsi_4way_big_context hamsi512_4way_context;
typedef hamsi_4way_big_context hamsi512_4x64_context;

void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
void hamsi512_4x64_init( hamsi512_4x64_context *sc );
void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data,
size_t len );
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst );

#define hamsi512_4x64_context hamsi512_4way_context
#define hamsi512_4x64_init hamsi512_4way_init
#define hamsi512_4x64_update hamsi512_4way_update
#define hamsi512_4x64_close hamsi512_4way_close
#define hamsi512_4way_context hamsi512_4x64_context
#define hamsi512_4way_init hamsi512_4x64_init
#define hamsi512_4way_update hamsi512_4x64_update
#define hamsi512_4way_close hamsi512_4x64_close

// Hamsi-512 8x32

@@ -102,7 +104,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,

#endif

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

// Hamsi-512 8x64

@@ -113,17 +115,17 @@ typedef struct
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_8way_big_context;
typedef hamsi_8way_big_context hamsi512_8way_context;
typedef hamsi_8way_big_context hamsi512_8x64_context;

void hamsi512_8way_init( hamsi512_8way_context *sc );
void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
void hamsi512_8x64_init( hamsi512_8x64_context *sc );
void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data,
size_t len );
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst );

#define hamsi512_8x64_context hamsi512_8way_context
#define hamsi512_8x64_init hamsi512_8way_init
#define hamsi512_8x64_update hamsi512_8way_update
#define hamsi512_8x64_close hamsi512_8way_close
#define hamsi512_8way_context hamsi512_8x64_context
#define hamsi512_8way_init hamsi512_8x64_init
#define hamsi512_8way_update hamsi512_8x64_update
#define hamsi512_8way_close hamsi512_8x64_close

// Hamsi-512 16x32

@@ -53,7 +53,7 @@ extern "C"{
#define SPH_SMALL_FOOTPRINT_HAVAL 1
//#endif

#if defined(__AVX512VL__)
#if defined(VL256)

// ( ~( a ^ b ) ) & c
#define v128_andnotxor( a, b, c ) \
@@ -583,7 +583,7 @@ do { \

// Haval-256 8 way 32 bit avx2

#if defined (__AVX512VL__)
#if defined (VL256)

// ( ~( a ^ b ) ) & c
#define mm256_andnotxor( a, b, c ) \
@@ -882,7 +882,7 @@ do { \

#endif // AVX2

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

// ( ~( a ^ b ) ) & c
#define mm512_andnotxor( a, b, c ) \
@@ -82,12 +82,15 @@ typedef struct {
typedef haval_4way_context haval256_5_4way_context;

void haval256_5_4way_init( void *cc );

void haval256_5_4way_update( void *cc, const void *data, size_t len );
//#define haval256_5_4way haval256_5_4way_update

void haval256_5_4way_close( void *cc, void *dst );

#define haval256_4x32_context haval256_5_4way_context
#define haval256_4x32_init haval256_5_4way_init
#define haval256_4x32_update haval256_5_4way_update
#define haval256_4x32_close haval256_5_4way_close

#if defined(__AVX2__)

typedef struct {
@@ -100,14 +103,17 @@ typedef struct {
typedef haval_8way_context haval256_5_8way_context;

void haval256_5_8way_init( void *cc );

void haval256_5_8way_update( void *cc, const void *data, size_t len );

void haval256_5_8way_close( void *cc, void *dst );

#define haval256_8x32_context haval256_5_8way_context
#define haval256_8x32_init haval256_5_8way_init
#define haval256_8x32_update haval256_5_8way_update
#define haval256_8x32_close haval256_5_8way_close

#endif // AVX2

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

typedef struct {
__m512i buf[32];
@@ -119,11 +125,14 @@ typedef struct {
typedef haval_16way_context haval256_5_16way_context;

void haval256_5_16way_init( void *cc );

void haval256_5_16way_update( void *cc, const void *data, size_t len );

void haval256_5_16way_close( void *cc, void *dst );

#define haval256_16x32_context haval256_5_16way_context
#define haval256_16x32_init haval256_5_16way_init
#define haval256_16x32_update haval256_5_16way_update
#define haval256_16x32_close haval256_5_16way_close

#endif // AVX512

#ifdef __cplusplus
algo/hodl/aes.c
@@ -1,183 +0,0 @@
#include <stdint.h>
#include "miner.h"

#if defined(__AES__)

#include <x86intrin.h>
#include "wolf-aes.h"

static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
{
__m128i tmp4;
*tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);
tmp4 = _mm_slli_si128(*tmp1, 0x04);
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
*tmp1 = _mm_xor_si128(*tmp1, *tmp2);
}

static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
{
__m128i tmp2, tmp4;

tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
tmp2 = _mm_shuffle_epi32(tmp4, 0xAA);
tmp4 = _mm_slli_si128(*tmp3, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
*tmp3 = _mm_xor_si128(*tmp3, tmp2);
}

// Special thanks to Intel for helping me
// with ExpandAESKey256() and its subroutines
void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf)
{
__m128i tmp1, tmp2, tmp3;

tmp1 = keys[0] = KeyBuf[0];
tmp3 = keys[1] = KeyBuf[1];

tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x01);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[2] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[3] = tmp3;

tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x02);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[4] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[5] = tmp3;

tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x04);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[6] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[7] = tmp3;

tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x08);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[8] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[9] = tmp3;

tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[10] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[11] = tmp3;

tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[12] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[13] = tmp3;

tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[14] = tmp1;
}

#if defined(__SSE4_2__)
//#ifdef __AVX__

#define AESENC(i,j) \
State[j] = _mm_aesenc_si128(State[j], ExpandedKey[j][i]);

#define AESENC_N(i) \
AESENC(i,0) \
AESENC(i,1) \
AESENC(i,2) \
AESENC(i,3) \
AESENC(i,4) \
AESENC(i,5) \
AESENC(i,6) \
AESENC(i,7) \


static inline void AES256Core(__m128i* State, __m128i ExpandedKey[][16])
{
const uint32_t N = AES_PARALLEL_N;

for(int j=0; j<N; ++j) {
State[j] = _mm_xor_si128(State[j], ExpandedKey[j][0]);
}

AESENC_N(1)
AESENC_N(2)
AESENC_N(3)
AESENC_N(4)
AESENC_N(5)
AESENC_N(6)
AESENC_N(7)
AESENC_N(8)
AESENC_N(9)
AESENC_N(10)
AESENC_N(11)
AESENC_N(12)
AESENC_N(13)

for(int j=0; j<N; ++j) {
State[j] = _mm_aesenclast_si128(State[j], ExpandedKey[j][14]);
}
}

void AES256CBC(__m128i** data, const __m128i** next, __m128i ExpandedKey[][16], __m128i* IV)
{
const uint32_t N = AES_PARALLEL_N;
__m128i State[N];
for(int j=0; j<N; ++j) {
State[j] = _mm_xor_si128( _mm_xor_si128(data[j][0], next[j][0]), IV[j]);
}

AES256Core(State, ExpandedKey);
for(int j=0; j<N; ++j) {
data[j][0] = State[j];
}

for(int i = 1; i < BLOCK_COUNT; ++i) {
for(int j=0; j<N; ++j) {
State[j] = _mm_xor_si128( _mm_xor_si128(data[j][i], next[j][i]), data[j][i - 1]);
}
AES256Core(State, ExpandedKey);
for(int j=0; j<N; ++j) {
data[j][i] = State[j];
}
}
}

#else // NO AVX

static inline __m128i AES256Core(__m128i State, const __m128i *ExpandedKey)
{
State = _mm_xor_si128(State, ExpandedKey[0]);

for(int i = 1; i < 14; ++i) State = _mm_aesenc_si128(State, ExpandedKey[i]);

return(_mm_aesenclast_si128(State, ExpandedKey[14]));
}

void AES256CBC(__m128i *Ciphertext, const __m128i *Plaintext, const __m128i *ExpandedKey, __m128i IV, uint32_t BlockCount)
{
__m128i State = _mm_xor_si128(Plaintext[0], IV);
State = AES256Core(State, ExpandedKey);
Ciphertext[0] = State;

for(int i = 1; i < BlockCount; ++i)
{
State = _mm_xor_si128(Plaintext[i], Ciphertext[i - 1]);
State = AES256Core(State, ExpandedKey);
Ciphertext[i] = State;
}
}

#endif

#endif
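A hedged caller-side sketch of the serial AES256CBC fallback in the file deleted above (a block count of 4 is arbitrary; the two prototypes are repeated here for self-containment and match the deleted definitions):

#include <stdint.h>
#include <x86intrin.h>

void ExpandAESKey256( __m128i *keys, const __m128i *KeyBuf );
void AES256CBC( __m128i *Ciphertext, const __m128i *Plaintext,
                const __m128i *ExpandedKey, __m128i IV, uint32_t BlockCount );

// Encrypt four 16-byte blocks in CBC mode: expand the 32-byte key
// (two __m128i words) into 15 round keys, then chain block by block.
static void example_cbc( const __m128i key[2], const __m128i plain[4],
                         __m128i cipher[4], __m128i iv )
{
    __m128i round_keys[15];
    ExpandAESKey256( round_keys, key );
    AES256CBC( cipher, plain, round_keys, iv, 4 );
}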
@@ -1,75 +0,0 @@
#ifndef HODL_BYTESWAP_H
#define HODL_BYTESWAP_H 1

#define __bswap_constant_16(x) \
((unsigned short int) ((((x) >> 8) & 0xff) | (((x) & 0xff) << 8)))

static __inline unsigned short int
__bswap_16 (unsigned short int __bsx)
{
return __bswap_constant_16 (__bsx);
}

// LE
# define htobe16(x) __bswap_16 (x)
# define htole16(x) (x)
# define be16toh(x) __bswap_16 (x)
# define le16toh(x) (x)

// BE
//# define htole16(x) __bswap_16 (x)
//# define htobe16(x) (x)
//# define le16toh(x) __bswap_16 (x)
//# define be16toh(x) (x)

#define __bswap_constant_32(x) \
((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \
(((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24))

static __inline unsigned int
__bswap_32 (unsigned int __bsx)
{
return __builtin_bswap32 (__bsx);
}

// LE
# define htobe32(x) __bswap_32 (x)
# define htole32(x) (x)
# define be32toh(x) __bswap_32 (x)
# define le32toh(x) (x)

// BE
//# define htole32(x) __bswap_32 (x)
//# define htobe32(x) (x)
//# define le32toh(x) __bswap_32 (x)
//# define be32toh(x) (x)

# define __bswap_constant_64(x) \
((((x) & 0xff00000000000000ull) >> 56) \
| (((x) & 0x00ff000000000000ull) >> 40) \
| (((x) & 0x0000ff0000000000ull) >> 24) \
| (((x) & 0x000000ff00000000ull) >> 8) \
| (((x) & 0x00000000ff000000ull) << 8) \
| (((x) & 0x0000000000ff0000ull) << 24) \
| (((x) & 0x000000000000ff00ull) << 40) \
| (((x) & 0x00000000000000ffull) << 56))

static __inline uint64_t
__bswap_64 (uint64_t __bsx)
{
return __bswap_constant_64 (__bsx);
}

// LE
# define htobe64(x) __bswap_64 (x)
# define htole64(x) (x)
# define be64toh(x) __bswap_64 (x)
# define le64toh(x) (x)

// BE
//# define htole64(x) __bswap_64 (x)
//# define htobe64(x) (x)
//# define le64toh(x) __bswap_64 (x)
//# define be64toh(x) (x)

#endif
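The deleted header above hardcodes the little-endian ("LE") mapping and carries the big-endian variants only as comments; a sketch of selecting the same mapping at compile time with the GCC/Clang byte-order macros (32-bit case shown):

#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#  define htole32(x) __builtin_bswap32(x)
#  define le32toh(x) __builtin_bswap32(x)
#  define htobe32(x) (x)
#  define be32toh(x) (x)
#else   // little-endian hosts, the only case the deleted header handled
#  define htobe32(x) __builtin_bswap32(x)
#  define be32toh(x) __builtin_bswap32(x)
#  define htole32(x) (x)
#  define le32toh(x) (x)
#endif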
@@ -1,185 +0,0 @@
#include <memory.h>
//#include <mm_malloc.h>
#include <stdlib.h>

#include "hodl-gate.h"
#include "hodl-wolf.h"

#define HODL_NSTARTLOC_INDEX 20
#define HODL_NFINALCALC_INDEX 21

static struct work hodl_work;

pthread_barrier_t hodl_barrier;

// All references to this buffer are local to this file, so no args
// need to be passed.
unsigned char *hodl_scratchbuf = NULL;

void hodl_le_build_stratum_request( char* req, struct work* work,
struct stratum_ctx *sctx )
{
uint32_t ntime, nonce, nstartloc, nfinalcalc;
char ntimestr[9], noncestr[9], nstartlocstr[9], nfinalcalcstr[9];
unsigned char *xnonce2str;

le32enc( &ntime, work->data[ algo_gate.ntime_index ] );
le32enc( &nonce, work->data[ algo_gate.nonce_index ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex(work->xnonce2, work->xnonce2_len );
le32enc( &nstartloc, work->data[ HODL_NSTARTLOC_INDEX ] );
le32enc( &nfinalcalc, work->data[ HODL_NFINALCALC_INDEX ] );
bin2hex( nstartlocstr, (char*)(&nstartloc), sizeof(uint32_t) );
bin2hex( nfinalcalcstr, (char*)(&nfinalcalc), sizeof(uint32_t) );
sprintf( req, "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr,
nstartlocstr, nfinalcalcstr );
free( xnonce2str );
}

char* hodl_malloc_txs_request( struct work *work )
{
char* req;
json_t *val;
char data_str[2 * sizeof(work->data) + 1];
int i;

for ( i = 0; i < ARRAY_SIZE(work->data); i++ )
be32enc( work->data + i, work->data[i] );

bin2hex( data_str, (unsigned char *)work->data, 88 );
if ( work->workid )
{
char *params;
val = json_object();
json_object_set_new( val, "workid", json_string( work->workid ) );
params = json_dumps( val, 0 );
json_decref( val );
req = malloc( 128 + 2*88 + strlen( work->txs ) + strlen( params ) );
sprintf( req,
"{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], \"id\":1}\r\n",
data_str, work->txs, params);
free( params );
}
else
{
req = malloc( 128 + 2*88 + strlen(work->txs));
sprintf( req,
"{\"method\": \"submitblock\", \"params\": [\"%s%s\"], \"id\":1}\r\n",
data_str, work->txs);
}
return req;
}

void hodl_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_tree,
uint32_t ntime, uint32_t nbits )
{
int i;

memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = version;

if ( have_stratum )
for ( i = 0; i < 8; i++ )
g_work->data[ 1+i ] = le32dec( prevhash + i );
else
for (i = 0; i < 8; i++)
g_work->data[ 8-i ] = le32dec( prevhash + i );

for ( i = 0; i < 8; i++ )
g_work->data[ 9+i ] = be32dec( merkle_tree + i );

g_work->data[ algo_gate.ntime_index ] = ntime;
g_work->data[ algo_gate.nbits_index ] = nbits;
g_work->data[22] = 0x80000000;
g_work->data[31] = 0x00000280;
}

// called only by thread 0, saves a backup of g_work
void hodl_get_new_work( struct work* work, struct work* g_work)
{
// pthread_rwlock_rdlock( &g_work_lock );

work_free( &hodl_work );
work_copy( &hodl_work, g_work );
hodl_work.data[ algo_gate.nonce_index ] = ( clock() + rand() ) % 9999;

// pthread_rwlock_unlock( &g_work_lock );
}

json_t *hodl_longpoll_rpc_call( CURL *curl, int *err, char* lp_url )
{
json_t *val;
char *req = NULL;

if ( have_gbt )
{
req = malloc( strlen( gbt_lp_req ) + strlen( lp_id ) + 1 );
sprintf( req, gbt_lp_req, lp_id );
}
val = json_rpc_call( curl, lp_url, rpc_userpass,
req ? req : getwork_req, err, JSON_RPC_LONGPOLL );
free( req );
return val;
}

// called by every thread, copies the backup to each thread's work.
void hodl_resync_threads( int thr_id, struct work* work )
{
int nonce_index = algo_gate.nonce_index;
pthread_barrier_wait( &hodl_barrier );
if ( memcmp( work->data, hodl_work.data, algo_gate.work_cmp_size ) )
{
work_free( work );
work_copy( work, &hodl_work );
}
work->data[ nonce_index ] = swab32( hodl_work.data[ nonce_index ] );
work_restart[thr_id].restart = 0;
}

bool hodl_do_this_thread( int thr_id )
{
return ( thr_id == 0 );
}

int hodl_scanhash( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
#if defined(__AES__)
GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, mythr->id );
pthread_barrier_wait( &hodl_barrier );
return scanhash_hodl_wolf( work, max_nonce, hashes_done, mythr );
#endif
return false;
}

bool register_hodl_algo( algo_gate_t* gate )
{
#if !defined(__AES__)
applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
return false;
#endif

if ( GARBAGE_SIZE % opt_n_threads )
applog( LOG_WARNING,"WARNING: Thread count must be power of 2. Miner may crash or produce invalid hash!" );

pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
gate->optimizations = SSE42_OPT | AES_OPT | AVX2_OPT;
gate->scanhash = (void*)&hodl_scanhash;
gate->get_new_work = (void*)&hodl_get_new_work;
gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call;
gate->build_stratum_request = (void*)&hodl_le_build_stratum_request;
gate->malloc_txs_request = (void*)&hodl_malloc_txs_request;
gate->build_block_header = (void*)&hodl_build_block_header;
gate->resync_threads = (void*)&hodl_resync_threads;
gate->do_this_thread = (void*)&hodl_do_this_thread;
gate->work_cmp_size = 76;
hodl_scratchbuf = (unsigned char*)mm_malloc( 1 << 30, 64 );
allow_getwork = false;
opt_target_factor = 8388608.0;
return ( hodl_scratchbuf != NULL );
}
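For reference, the submit request built by hodl_le_build_stratum_request above has this shape (all values below are made-up placeholders, not real session data):

/* Illustrative output of the sprintf format string above:

   {"method": "mining.submit",
    "params": ["rpc_user", "job_id", "xnonce2", "ntime", "nonce",
               "nstartloc", "nfinalcalc"],
    "id":4}

   The two extra parameters, nstartloc and nfinalcalc, come from
   work->data[20] and work->data[21] and distinguish the Hodl submit
   from the standard 5-parameter stratum submit. */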
@@ -1,6 +0,0 @@
#include "algo-gate-api.h"

extern unsigned char *hodl_scratchbuf;

bool register_hodl_algo ( algo_gate_t* gate );
@@ -1,225 +0,0 @@
#include <string.h>
#include <openssl/evp.h>
#include <openssl/sha.h>
#include "simd-utils.h"
#include "sha512-avx.h"
#include "wolf-aes.h"
#include "hodl-gate.h"
#include "hodl-wolf.h"
#include "miner.h"
#include "algo/sha/sha256d.h"

#if defined(__AES__)

void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
void *MidHash )
{
const int Chunk = TOTAL_CHUNKS / ThreadCount;
const uint32_t StartChunk = ThreadID * Chunk;
const uint32_t EndChunk = StartChunk + Chunk;

#if defined(__SSE4_2__)
//#ifdef __AVX__
uint64_t* TempBufs[ SHA512_PARALLEL_N ] ;
uint64_t* desination[ SHA512_PARALLEL_N ];

for ( int i=0; i < SHA512_PARALLEL_N; ++i )
{
TempBufs[i] = (uint64_t*)malloc( 32 );
memcpy( TempBufs[i], MidHash, 32 );
}

for ( uint32_t i = StartChunk; i < EndChunk; i += SHA512_PARALLEL_N )
{
for ( int j = 0; j < SHA512_PARALLEL_N; ++j )
{
( (uint32_t*)TempBufs[j] )[0] = i + j;
desination[j] = (uint64_t*)( (uint8_t *)Garbage + ( (i+j)
* GARBAGE_CHUNK_SIZE ) );
}
sha512Compute32b_parallel( TempBufs, desination );
}

for ( int i = 0; i < SHA512_PARALLEL_N; ++i )
free( TempBufs[i] );
#else
uint32_t TempBuf[8];
memcpy( TempBuf, MidHash, 32 );

for ( uint32_t i = StartChunk; i < EndChunk; ++i )
{
TempBuf[0] = i;
SHA512( ( uint8_t *)TempBuf, 32,
( (uint8_t *)Garbage ) + ( i * GARBAGE_CHUNK_SIZE ) );
}
#endif
}

/*
void Rev256(uint32_t *Dest, const uint32_t *Src)
{
for(int i = 0; i < 8; ++i) Dest[i] = swab32(Src[i]);
}
*/

int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
#if defined(__SSE4_2__)
//#ifdef __AVX__
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int threadNumber = mythr->id;
CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
CacheEntry Cache[AES_PARALLEL_N] __attribute__ ((aligned (64)));
__m128i* data[AES_PARALLEL_N];
const __m128i* next[AES_PARALLEL_N];
uint32_t CollisionCount = 0;

for ( int n=0; n<AES_PARALLEL_N; ++n )
{
data[n] = Cache[n].dqwords;
}

// Search for pattern in pseudorandom data
int searchNumber = COMPARE_SIZE / opt_n_threads;
int startLoc = threadNumber * searchNumber;

for ( int32_t k = startLoc; k < startLoc + searchNumber && !work_restart[threadNumber].restart; k += AES_PARALLEL_N )
{
// copy data to first l2 cache
for ( int n=0; n<AES_PARALLEL_N; ++n )
{
memcpy(Cache[n].dwords, Garbage + k + n, GARBAGE_SLICE_SIZE);
}

for(int j = 0; j < AES_ITERATIONS; ++j)
{
__m128i ExpKey[AES_PARALLEL_N][16];
__m128i ivs[AES_PARALLEL_N];

// use last 4 bytes of first cache as next location
for(int n=0; n<AES_PARALLEL_N; ++n) {
uint32_t nextLocation = Cache[n].dwords[(GARBAGE_SLICE_SIZE >> 2) - 1] & (COMPARE_SIZE - 1); //% COMPARE_SIZE;
next[n] = Garbage[nextLocation].dqwords;

__m128i last[2];
last[0] = _mm_xor_si128(Cache[n].dqwords[254], next[n][254]);
last[1] = _mm_xor_si128(Cache[n].dqwords[255], next[n][255]);

// Key is last 32b of Cache
// IV is last 16b of Cache
ExpandAESKey256(ExpKey[n], last);
ivs[n] = last[1];
}
AES256CBC(data, next, ExpKey, ivs);
}

for(int n=0; n<AES_PARALLEL_N; ++n)
if((Cache[n].dwords[(GARBAGE_SLICE_SIZE >> 2) - 1] & (COMPARE_SIZE - 1)) < 1000)
{
uint32_t BlockHdr[22], FinalPoW[8];

swab32_array( BlockHdr, pdata, 20 );

BlockHdr[20] = k + n;
BlockHdr[21] = Cache[n].dwords[(GARBAGE_SLICE_SIZE >> 2) - 2];

sha256d( (uint8_t *)FinalPoW, (uint8_t *)BlockHdr, 88 );
CollisionCount++;
if( FinalPoW[7] <= ptarget[7] )
{
pdata[20] = swab32( BlockHdr[20] );
pdata[21] = swab32( BlockHdr[21] );
*hashes_done = CollisionCount;
submit_solution( work, FinalPoW, mythr );
return(0);
}
}
}

*hashes_done = CollisionCount;
return(0);


#else // no AVX

uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t BlockHdr[22], FinalPoW[8];
CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
CacheEntry Cache;
uint32_t CollisionCount = 0;
int threadNumber = mythr->id;

swab32_array( BlockHdr, pdata, 20 );
// Search for pattern in pseudorandom data
int searchNumber = COMPARE_SIZE / opt_n_threads;
int startLoc = threadNumber * searchNumber;

if ( opt_debug )
applog( LOG_DEBUG,"Hash target= %08lx", ptarget[7] );

for(int32_t k = startLoc; k < startLoc + searchNumber && !work_restart[threadNumber].restart; k++)
{
// copy data to first l2 cache
memcpy(Cache.dwords, Garbage + k, GARBAGE_SLICE_SIZE);
for(int j = 0; j < AES_ITERATIONS; j++)
{
CacheEntry TmpXOR;
__m128i ExpKey[16];

// use last 4 bytes of first cache as next location
uint32_t nextLocation = Cache.dwords[(GARBAGE_SLICE_SIZE >> 2)
- 1] & (COMPARE_SIZE - 1); //% COMPARE_SIZE;

// Copy data from indicated location to second l2 cache -
memcpy(&TmpXOR, Garbage + nextLocation, GARBAGE_SLICE_SIZE);
//XOR location data into second cache
for( int i = 0; i < (GARBAGE_SLICE_SIZE >> 4); ++i )
TmpXOR.dqwords[i] = _mm_xor_si128( Cache.dqwords[i],
TmpXOR.dqwords[i] );
// Key is last 32b of TmpXOR
// IV is last 16b of TmpXOR

ExpandAESKey256( ExpKey, TmpXOR.dqwords +
(GARBAGE_SLICE_SIZE / sizeof(__m128i)) - 2 );
AES256CBC( Cache.dqwords, TmpXOR.dqwords, ExpKey,
TmpXOR.dqwords[ (GARBAGE_SLICE_SIZE / sizeof(__m128i))
- 1 ], 256 );
}
// use last X bits as solution
if( ( Cache.dwords[ (GARBAGE_SLICE_SIZE >> 2) - 1 ]
& (COMPARE_SIZE - 1) ) < 1000 )
{
BlockHdr[20] = k;
BlockHdr[21] = Cache.dwords[ (GARBAGE_SLICE_SIZE >> 2) - 2 ];
sha256d( (uint8_t *)FinalPoW, (uint8_t *)BlockHdr, 88 );
CollisionCount++;
if( FinalPoW[7] <= ptarget[7] )
{
pdata[20] = swab32( BlockHdr[20] );
pdata[21] = swab32( BlockHdr[21] );
*hashes_done = CollisionCount;
submit_solution( work, FinalPoW, mythr );
return(0);
}
}
}

*hashes_done = CollisionCount;
return(0);

#endif // AVX else

}

void GenRandomGarbage(CacheEntry *Garbage, uint32_t *pdata, int thr_id)
{
uint32_t BlockHdr[20], MidHash[8];
swab32_array( BlockHdr, pdata, 20 );
sha256d((uint8_t *)MidHash, (uint8_t *)BlockHdr, 80);
GenerateGarbageCore(Garbage, thr_id, opt_n_threads, MidHash);
}

#endif // AES
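One recurring idiom in the deleted scanner above: slice selection uses a mask in place of the commented-out modulo, which is valid only because COMPARE_SIZE (defined in the header deleted below) is a power of two. A sketch (function name is mine):

#include <stdint.h>

// x % size == x & (size - 1) holds only for power-of-two sizes.
// COMPARE_SIZE is (1 << 18), so the last dword of a slice picks the
// next slice index with a single AND.
static inline uint32_t next_location( uint32_t last_dword, uint32_t size_pow2 )
{
    return last_dword & ( size_pow2 - 1 );
}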
@@ -1,27 +0,0 @@
#ifndef __HODL_H
#define __HODL_H

#include <stdint.h>
#include "simd-utils.h"
#include "miner.h"

#define AES_ITERATIONS 15

#define GARBAGE_SIZE (1 << 30)
#define GARBAGE_CHUNK_SIZE (1 << 6)
#define GARBAGE_SLICE_SIZE (1 << 12)
#define TOTAL_CHUNKS (1 << 24) // GARBAGE_SIZE / GARBAGE_CHUNK_SIZE
#define COMPARE_SIZE (1 << 18) // GARBAGE_SIZE / GARBAGE_SLICE_SIZE

typedef union _CacheEntry
{
uint32_t dwords[GARBAGE_SLICE_SIZE >> 2] __attribute__((aligned(16)));
v128_t dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16)));
} CacheEntry;

int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

void GenRandomGarbage( CacheEntry *Garbage, uint32_t *pdata, int thr_id);

#endif // __HODL_H
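A quick compile-time check of the derived constants in the deleted header (a sketch; the static_asserts are mine, not the repo's):

#include <assert.h>

// GARBAGE_SIZE (1<<30) split into 64-byte chunks and 4 KiB slices:
static_assert( (1 << 30) / (1 << 6)  == (1 << 24), "TOTAL_CHUNKS" );
static_assert( (1 << 30) / (1 << 12) == (1 << 18), "COMPARE_SIZE" );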
@@ -1,208 +0,0 @@
.TH MINERD 1 "March 2016" "cpuminer 2.4.3"
.SH NAME
hodlminer \- CPU miner for Hodlcoin
.SH SYNOPSIS
.B hodlminer
[\fIOPTION\fR]...
.SH DESCRIPTION
.B hodlminer
is a multi-threaded CPU miner for Hodlcoin.
It supports the getwork and getblocktemplate (BIP 22) methods,
as well as the Stratum mining protocol.
.PP
In its normal mode of operation, \fBhodlminer\fR connects to a mining server
(specified with the \fB\-o\fR option), receives work from it and starts hashing.
As soon as a solution is found, it is submitted to the same mining server,
which can accept or reject it.
When using getwork or getblocktemplate,
\fBhodlminer\fR can take advantage of long polling, if the server supports it;
in any case, fresh work is fetched as needed.
When using the Stratum protocol this is not possible,
and the server is responsible for sending fresh work at least every minute;
if it fails to do so,
\fBhodlminer\fR may drop the connection and try to reconnect.
.PP
By default, \fBhodlminer\fR writes all its messages to standard error.
On systems that have a syslog, the \fB\-\-syslog\fR option can be used
to write to it instead.
.PP
On start, the nice value of all miner threads is set to 19.
On Linux, the scheduling policy is also changed to SCHED_IDLE,
or to SCHED_BATCH if that fails.
On multiprocessor systems, \fBhodlminer\fR
automatically sets the CPU affinity of miner threads
if the number of threads is a multiple of the number of processors.
.SH EXAMPLES
To connect to the Hodlcoin mining pool that provides a Stratum server
at hodl.blockquarry.com on port 8332, authenticating as worker "user.worker" with password "x":
.PP
.nf
.RS
hodlminer \-o stratum+tcp://hodl.blockquarry.com:8332 \-u user.worker -p x -q
.RE
.fi
.PP
To mine to a local Hodlcoin instance running on port 18332,
authenticating with username "rpcuser" and password "rpcpass":
.PP
.nf
.RS
hodlminer \-a hodl \-o http://localhost:18332 \-O rpcuser:rpcpass \\
\-\-coinbase\-addr=mpXwg4jMtRhuSpVq4xS3HFHmCmWp9NyGKt
.RE
.fi
.PP
.SH OPTIONS
.TP
\fB\-a\fR, \fB\-\-algo\fR=\fIALGORITHM\fR
Set the hashing algorithm to use.
Default is hodl.
Possible values are:
.RS 11
.TP 10
.B hodl
.TP
\fB\-\-benchmark\fR
Run in offline benchmark mode.
.TP
\fB\-B\fR, \fB\-\-background\fR
Run in the background as a daemon.
.TP
\fB\-\-cert\fR=\fIFILE\fR
Set an SSL certificate to use with the mining server.
Only supported when using the HTTPS protocol.
.TP
\fB\-\-coinbase\-addr\fR=\fIADDRESS\fR
Set a payout address for solo mining.
This is only used in getblocktemplate mode,
and only if the server does not provide a coinbase transaction.
.TP
\fB\-\-coinbase\-sig\fR=\fITEXT\fR
Set a string to be included in the coinbase (if allowed by the server).
This is only used in getblocktemplate mode.
.TP
\fB\-c\fR, \fB\-\-config\fR=\fIFILE\fR
Load options from a configuration file.
\fIFILE\fR must contain a JSON object
mapping long options to their arguments (as strings),
or to \fBtrue\fR if no argument is required.
Sample configuration file:

.nf
{
"url": "stratum+tcp://hodl.blockquarry.com:8332",
"userpass": "foo:bar",
"retry-pause": "10",
"quiet": true
}
.fi
.TP
\fB\-D\fR, \fB\-\-debug\fR
Enable debug output.
.TP
\fB\-h\fR, \fB\-\-help\fR
Print a help message and exit.
.TP
\fB\-\-no\-gbt\fR
Do not use the getblocktemplate RPC method.
.TP
\fB\-\-no\-getwork\fR
Do not use the getwork RPC method.
.TP
\fB\-\-no\-longpoll\fR
Do not use long polling.
.TP
\fB\-\-no\-redirect\fR
Ignore requests from the server to switch to a different URL.
.TP
\fB\-\-no\-stratum\fR
Do not switch to Stratum, even if the server advertises support for it.
.TP
\fB\-o\fR, \fB\-\-url\fR=[\fISCHEME\fR://][\fIUSERNAME\fR[:\fIPASSWORD\fR]@]\fIHOST\fR:\fIPORT\fR[/\fIPATH\fR]
Set the URL of the mining server to connect to.
Supported schemes are \fBhttp\fR, \fBhttps\fR, \fBstratum+tcp\fR
and \fBstratum+tcps\fR.
If no scheme is specified, http is assumed.
Specifying a \fIPATH\fR is only supported for HTTP and HTTPS.
Specifying credentials has the same effect as using the \fB\-O\fR option.

By default, on HTTP and HTTPS,
the miner tries to use the getblocktemplate RPC method,
and falls back to using getwork if getblocktemplate is unavailable.
This behavior can be modified by using the \fB\-\-no\-gbt\fR
and \fB\-\-no\-getwork\fR options.
.TP
\fB\-O\fR, \fB\-\-userpass\fR=\fIUSERNAME\fR:\fIPASSWORD\fR
Set the credentials to use for connecting to the mining server.
Any value previously set with \fB\-u\fR or \fB\-p\fR is discarded.
.TP
\fB\-p\fR, \fB\-\-pass\fR=\fIPASSWORD\fR
Set the password to use for connecting to the mining server.
Any password previously set with \fB\-O\fR is discarded.
.TP
\fB\-P\fR, \fB\-\-protocol\-dump\fR
Enable output of all protocol-level activities.
.TP
\fB\-q\fR, \fB\-\-quiet\fR
Disable per-thread hashmeter output.
.TP
\fB\-r\fR, \fB\-\-retries\fR=\fIN\fR
Set the maximum number of times to retry if a network call fails.
If not specified, the miner will retry indefinitely.
.TP
\fB\-R\fR, \fB\-\-retry\-pause\fR=\fISECONDS\fR
Set how long to wait between retries. Default is 30 seconds.
.TP
\fB\-s\fR, \fB\-\-scantime\fR=\fISECONDS\fR
Set an upper bound on the time the miner can go without fetching fresh work.
This setting has no effect in Stratum mode or when long polling is activated.
Default is 5 seconds.
.TP
\fB\-S\fR, \fB\-\-syslog\fR
Log to the syslog facility instead of standard error.
.TP
\fB\-t\fR, \fB\-\-threads\fR=\fIN\fR
Set the number of miner threads.
If not specified, the miner will try to detect the number of available processors
and use that.
.TP
\fB\-T\fR, \fB\-\-timeout\fR=\fISECONDS\fR
Set a timeout for long polling.
.TP
\fB\-u\fR, \fB\-\-user\fR=\fIUSERNAME\fR
Set the username to use for connecting to the mining server.
Any username previously set with \fB\-O\fR is discarded.
.TP
\fB\-V\fR, \fB\-\-version\fR
Display version information and quit.
.TP
\fB\-x\fR, \fB\-\-proxy\fR=[\fISCHEME\fR://][\fIUSERNAME\fR:\fIPASSWORD\fR@]\fIHOST\fR:\fIPORT\fR
Connect to the mining server through a proxy.
Supported schemes are: \fBhttp\fR, \fBsocks4\fR, \fBsocks5\fR.
Since libcurl 7.18.0, the following are also supported:
\fBsocks4a\fR, \fBsocks5h\fR (SOCKS5 with remote name resolving).
If no scheme is specified, the proxy is treated as an HTTP proxy.
.SH ENVIRONMENT
The following environment variables can be specified in lower case or upper case;
the lower-case version has precedence. \fBhttp_proxy\fR is an exception
as it is only available in lower case.
.PP
.RS
.TP
\fBhttp_proxy\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
Sets the proxy server to use for HTTP.
.TP
\fBHTTPS_PROXY\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
Sets the proxy server to use for HTTPS.
.TP
\fBALL_PROXY\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
Sets the proxy server to use if no protocol-specific proxy is set.
.RE
.PP
Using an environment variable to set the proxy has the same effect as
using the \fB\-x\fR option.
.SH AUTHOR
Most of the code in the current version of minerd was written by
Pooler <pooler@litecoinpool.org> with contributions from others.

The original minerd was written by Jeff Garzik <jeff@garzik.org>.
@@ -1,50 +0,0 @@
#ifndef _SHA512_H
#define _SHA512_H

#include <stdint.h>
#include "simd-utils.h"

//SHA-512 block size
#define SHA512_BLOCK_SIZE 128
//SHA-512 digest size
#define SHA512_DIGEST_SIZE 64

/*
#ifndef __AVX2__
#ifndef __AVX__
#error "Either AVX or AVX2 support needed"
#endif // __AVX__
#endif // __AVX2__
*/

typedef struct
{
#ifdef __AVX2__
__m256i h[8];
__m256i w[80];
#elif defined(__SSE4_2__)
//#elif defined(__AVX__)
v128_t h[8];
v128_t w[80];
#else
int dummy;
#endif
} Sha512Context;

#ifdef __AVX2__
#define SHA512_PARALLEL_N 8
#elif defined(__SSE4_2__)
//#elif defined(__AVX__)
#define SHA512_PARALLEL_N 4
#else
#define SHA512_PARALLEL_N 1 // dummy value
#endif

//SHA-512 related functions
void sha512Compute32b_parallel(
uint64_t *data[SHA512_PARALLEL_N],
uint64_t *digest[SHA512_PARALLEL_N]);

void sha512ProcessBlock(Sha512Context contexti[2] );

#endif
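A hedged usage sketch for the 4-lane SSE build of the interface above (SHA512_PARALLEL_N == 4; array shapes follow the 32-byte-input, 64-byte-digest contract of sha512Compute32b_parallel, and the wrapper name is mine):

#include <stdint.h>

// Hash four independent 32-byte seeds in one call. Each input is 4
// uint64_t words (32 bytes); each output is 8 uint64_t words (64 bytes).
static void hash4_seeds( uint64_t seeds[4][4], uint64_t out[4][8] )
{
    uint64_t *data[4]   = { seeds[0], seeds[1], seeds[2], seeds[3] };
    uint64_t *digest[4] = { out[0],   out[1],   out[2],   out[3]   };
    sha512Compute32b_parallel( data, digest );
}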
@@ -1,235 +0,0 @@
#ifndef __AVX2__

#if defined(__SSE4_2__)
//#ifdef __AVX__

//Dependencies
#include <string.h>
#include <stdlib.h>

#ifdef __FreeBSD__
#include <sys/endian.h>
#endif

#if defined(__CYGWIN__)
#include <endian.h>
#endif

#include "tmmintrin.h"
#include "smmintrin.h"

#include "sha512-avx.h"
#if ((defined(_WIN64) || defined(__WINDOWS__)))
#include "hodl-endian.h"
#endif

//SHA-512 auxiliary functions
#define Ch(x, y, z) (((x) & (y)) | (~(x) & (z)))
#define Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
#define SIGMA1(x) (ROR64(x, 28) ^ ROR64(x, 34) ^ ROR64(x, 39))
#define SIGMA2(x) (ROR64(x, 14) ^ ROR64(x, 18) ^ ROR64(x, 41))
#define SIGMA3(x) (ROR64(x, 1) ^ ROR64(x, 8) ^ SHR64(x, 7))
#define SIGMA4(x) (ROR64(x, 19) ^ ROR64(x, 61) ^ SHR64(x, 6))

//Rotate right operation
#define ROR64(a, n) _mm_or_si128(_mm_srli_epi64(a, n), _mm_slli_epi64(a, 64 - n))

//Shift right operation
#define SHR64(a, n) _mm_srli_epi64(a, n)

__m128i mm_htobe_epi64(__m128i a) {
   __m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
   return _mm_shuffle_epi8(a, mask);
}

__m128i mm_betoh_epi64(__m128i a) {
   return mm_htobe_epi64(a);
}

//SHA-512 padding
static const uint8_t padding[128] =
{
   0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

//SHA-512 constants
static const uint64_t k[80] =
{
   0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
   0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
   0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
   0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
   0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
   0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
   0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
   0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
   0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
   0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
   0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
   0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
   0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
   0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
   0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
   0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
   0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
   0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
   0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
   0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};


void sha512Compute32b_parallel(uint64_t *data[SHA512_PARALLEL_N], uint64_t *digest[SHA512_PARALLEL_N]) {
   Sha512Context context[2];
   context[0].h[0] = _mm_set1_epi64x(0x6A09E667F3BCC908);
   context[0].h[1] = _mm_set1_epi64x(0xBB67AE8584CAA73B);
   context[0].h[2] = _mm_set1_epi64x(0x3C6EF372FE94F82B);
   context[0].h[3] = _mm_set1_epi64x(0xA54FF53A5F1D36F1);
   context[0].h[4] = _mm_set1_epi64x(0x510E527FADE682D1);
   context[0].h[5] = _mm_set1_epi64x(0x9B05688C2B3E6C1F);
   context[0].h[6] = _mm_set1_epi64x(0x1F83D9ABFB41BD6B);
   context[0].h[7] = _mm_set1_epi64x(0x5BE0CD19137E2179);

   context[1].h[0] = _mm_set1_epi64x(0x6A09E667F3BCC908);
   context[1].h[1] = _mm_set1_epi64x(0xBB67AE8584CAA73B);
   context[1].h[2] = _mm_set1_epi64x(0x3C6EF372FE94F82B);
   context[1].h[3] = _mm_set1_epi64x(0xA54FF53A5F1D36F1);
   context[1].h[4] = _mm_set1_epi64x(0x510E527FADE682D1);
   context[1].h[5] = _mm_set1_epi64x(0x9B05688C2B3E6C1F);
   context[1].h[6] = _mm_set1_epi64x(0x1F83D9ABFB41BD6B);
   context[1].h[7] = _mm_set1_epi64x(0x5BE0CD19137E2179);

   for(int i=0; i<4; ++i) {
      context[0].w[i] = _mm_set_epi64x ( data[1][i], data[0][i] );
      context[1].w[i] = _mm_set_epi64x ( data[3][i], data[2][i] );
   }
   for(int i=0; i<10; ++i) {
      context[0].w[i+4] = _mm_set1_epi64x( ((uint64_t*)padding)[i] );
      context[1].w[i+4] = _mm_set1_epi64x( ((uint64_t*)padding)[i] );
   }

   //Length of the original message (before padding)
   uint64_t totalSize = 32 * 8;

   //Append the length of the original message
   context[0].w[14] = _mm_set1_epi64x(0);
   context[0].w[15] = _mm_set1_epi64x(htobe64(totalSize));

   context[1].w[14] = _mm_set1_epi64x(0);
   context[1].w[15] = _mm_set1_epi64x(htobe64(totalSize));

   //Calculate the message digest
   sha512ProcessBlock(context);

   //Convert from host byte order to big-endian byte order
   for (int i = 0; i < 8; i++) {
      context[0].h[i] = mm_htobe_epi64(context[0].h[i]);
      context[1].h[i] = mm_htobe_epi64(context[1].h[i]);
   }

   //Copy the resulting digest
   for(int i=0; i<8; ++i) {
      digest[0][i] = _mm_extract_epi64(context[0].h[i], 0);
      digest[1][i] = _mm_extract_epi64(context[0].h[i], 1);
      digest[2][i] = _mm_extract_epi64(context[1].h[i], 0);
      digest[3][i] = _mm_extract_epi64(context[1].h[i], 1);
   }
}

#define blk0(n, i) (block[n][i] = mm_betoh_epi64(block[n][i]))
#define blk(n, i) (block[n][i] = block[n][i - 16] + SIGMA3(block[n][i - 15]) + \
   SIGMA4(block[n][i - 2]) + block[n][i - 7])

#define ROUND512(a,b,c,d,e,f,g,h) \
   T0 += (h[0]) + SIGMA2(e[0]) + Ch((e[0]), (f[0]), (g[0])) + k[i]; \
   T1 += (h[1]) + SIGMA2(e[1]) + Ch((e[1]), (f[1]), (g[1])) + k[i]; \
   (d[0]) += T0; \
   (d[1]) += T1; \
   (h[0]) = T0 + SIGMA1(a[0]) + Maj((a[0]), (b[0]), (c[0])); \
   (h[1]) = T1 + SIGMA1(a[1]) + Maj((a[1]), (b[1]), (c[1])); \
   i++

#define ROUND512_0_TO_15(a,b,c,d,e,f,g,h) \
   T0 = blk0(0, i); \
   T1 = blk0(1, i); \
   ROUND512(a,b,c,d,e,f,g,h)

#define ROUND512_16_TO_80(a,b,c,d,e,f,g,h) \
   T0 = blk(0, i); \
   T1 = blk(1, i); \
   ROUND512(a,b,c,d,e,f,g,h)

#define R512_0 \
   ROUND512_0_TO_15(a, b, c, d, e, f, g, h); \
   ROUND512_0_TO_15(h, a, b, c, d, e, f, g); \
   ROUND512_0_TO_15(g, h, a, b, c, d, e, f); \
   ROUND512_0_TO_15(f, g, h, a, b, c, d, e); \
   ROUND512_0_TO_15(e, f, g, h, a, b, c, d); \
   ROUND512_0_TO_15(d, e, f, g, h, a, b, c); \
   ROUND512_0_TO_15(c, d, e, f, g, h, a, b); \
   ROUND512_0_TO_15(b, c, d, e, f, g, h, a)

#define R512_16 \
   ROUND512_16_TO_80(a, b, c, d, e, f, g, h); \
   ROUND512_16_TO_80(h, a, b, c, d, e, f, g); \
   ROUND512_16_TO_80(g, h, a, b, c, d, e, f); \
   ROUND512_16_TO_80(f, g, h, a, b, c, d, e); \
   ROUND512_16_TO_80(e, f, g, h, a, b, c, d); \
   ROUND512_16_TO_80(d, e, f, g, h, a, b, c); \
   ROUND512_16_TO_80(c, d, e, f, g, h, a, b); \
   ROUND512_16_TO_80(b, c, d, e, f, g, h, a)

#define INIT(x,n) \
   x[0] = context[0].h[n]; \
   x[1] = context[1].h[n]; \

void sha512ProcessBlock(Sha512Context context[2])
{
   __m128i* block[2];
   block[0] = context[0].w;
   block[1] = context[1].w;

   __m128i T0, T1;
   __m128i a[2], b[2], c[2], d[2], e[2], f[2], g[2], h[2];
   INIT(a, 0)
   INIT(b, 1)
   INIT(c, 2)
   INIT(d, 3)
   INIT(e, 4)
   INIT(f, 5)
   INIT(g, 6)
   INIT(h, 7)

   int i = 0;
   R512_0; R512_0;
   for(int j=0; j<8; ++j) {
      R512_16;
   }

   context[0].h[0] += a[0];
   context[0].h[1] += b[0];
   context[0].h[2] += c[0];
   context[0].h[3] += d[0];
   context[0].h[4] += e[0];
   context[0].h[5] += f[0];
   context[0].h[6] += g[0];
   context[0].h[7] += h[0];

   context[1].h[0] += a[1];
   context[1].h[1] += b[1];
   context[1].h[2] += c[1];
   context[1].h[3] += d[1];
   context[1].h[4] += e[1];
   context[1].h[5] += f[1];
   context[1].h[6] += g[1];
   context[1].h[7] += h[1];
}

#endif // __AVX__
#endif // __AVX2__
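Note: pre-AVX-512 SSE/AVX have no 64-bit rotate instruction, so the ROR64 macro in the deleted file composes one from a right shift, a compensating left shift, and an OR. A scalar model of the same operation, for reference (valid for 0 < n < 64):

#include <stdint.h>

/* Scalar equivalent of the vector ROR64 macro above: rotate right by n. */
static inline uint64_t ror64( uint64_t x, unsigned n )
{
   return (x >> n) | (x << (64 - n));
}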
@@ -1,241 +0,0 @@
#ifdef __AVX2__

//Dependencies
#include <string.h>
#include <stdlib.h>

#ifdef __FreeBSD__
#include <sys/endian.h>
#endif

#if defined(__CYGWIN__)
#include <endian.h>
#endif

#include "tmmintrin.h"
#include "smmintrin.h"
#include "immintrin.h"

#include "sha512-avx.h"
#if ((defined(_WIN64) || defined(__WINDOWS__)))
#include "hodl-endian.h"
#endif

//SHA-512 auxiliary functions
#define Ch(x, y, z) (((x) & (y)) | (~(x) & (z)))
#define Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
#define SIGMA1(x) (ROR64(x, 28) ^ ROR64(x, 34) ^ ROR64(x, 39))
#define SIGMA2(x) (ROR64(x, 14) ^ ROR64(x, 18) ^ ROR64(x, 41))
#define SIGMA3(x) (ROR64(x, 1) ^ ROR64(x, 8) ^ SHR64(x, 7))
#define SIGMA4(x) (ROR64(x, 19) ^ ROR64(x, 61) ^ SHR64(x, 6))

//Rotate right operation
#define ROR64(a, n) _mm256_or_si256(_mm256_srli_epi64(a, n), _mm256_slli_epi64(a, 64 - n))

//Shift right operation
#define SHR64(a, n) _mm256_srli_epi64(a, n)

__m256i mm256_htobe_epi64(__m256i a) {
   __m256i mask = _mm256_set_epi8(
      24,25,26,27,28,29,30,31,
      16,17,18,19,20,21,22,23,
      8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7);
   return _mm256_shuffle_epi8(a, mask);
}

__m256i mm256_betoh_epi64(__m256i a) {
   return mm256_htobe_epi64(a);
}

//SHA-512 padding
static const uint8_t padding[128] =
{
   0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

//SHA-512 constants
static const uint64_t k[80] =
{
   0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
   0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
   0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
   0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
   0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
   0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
   0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
   0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
   0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
   0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
   0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
   0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
   0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
   0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
   0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
   0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
   0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
   0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
   0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
   0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};


void sha512Compute32b_parallel(uint64_t *data[SHA512_PARALLEL_N], uint64_t *digest[SHA512_PARALLEL_N]) {
   Sha512Context context[2];
   context[0].h[0] = _mm256_set1_epi64x(0x6A09E667F3BCC908);
   context[0].h[1] = _mm256_set1_epi64x(0xBB67AE8584CAA73B);
   context[0].h[2] = _mm256_set1_epi64x(0x3C6EF372FE94F82B);
   context[0].h[3] = _mm256_set1_epi64x(0xA54FF53A5F1D36F1);
   context[0].h[4] = _mm256_set1_epi64x(0x510E527FADE682D1);
   context[0].h[5] = _mm256_set1_epi64x(0x9B05688C2B3E6C1F);
   context[0].h[6] = _mm256_set1_epi64x(0x1F83D9ABFB41BD6B);
   context[0].h[7] = _mm256_set1_epi64x(0x5BE0CD19137E2179);

   context[1].h[0] = _mm256_set1_epi64x(0x6A09E667F3BCC908);
   context[1].h[1] = _mm256_set1_epi64x(0xBB67AE8584CAA73B);
   context[1].h[2] = _mm256_set1_epi64x(0x3C6EF372FE94F82B);
   context[1].h[3] = _mm256_set1_epi64x(0xA54FF53A5F1D36F1);
   context[1].h[4] = _mm256_set1_epi64x(0x510E527FADE682D1);
   context[1].h[5] = _mm256_set1_epi64x(0x9B05688C2B3E6C1F);
   context[1].h[6] = _mm256_set1_epi64x(0x1F83D9ABFB41BD6B);
   context[1].h[7] = _mm256_set1_epi64x(0x5BE0CD19137E2179);

   for(int i=0; i<4; ++i) {
      context[0].w[i] = _mm256_set_epi64x ( data[3][i], data[2][i], data[1][i], data[0][i] );
      context[1].w[i] = _mm256_set_epi64x ( data[7][i], data[6][i], data[5][i], data[4][i] );
   }
   for(int i=0; i<10; ++i) {
      context[0].w[i+4] = _mm256_set1_epi64x( ((uint64_t*)padding)[i] );
      context[1].w[i+4] = _mm256_set1_epi64x( ((uint64_t*)padding)[i] );
   }

   //Length of the original message (before padding)
   uint64_t totalSize = 32 * 8;

   //Append the length of the original message
   context[0].w[14] = _mm256_set1_epi64x(0);
   context[0].w[15] = _mm256_set1_epi64x(htobe64(totalSize));

   context[1].w[14] = _mm256_set1_epi64x(0);
   context[1].w[15] = _mm256_set1_epi64x(htobe64(totalSize));

   //Calculate the message digest
   sha512ProcessBlock(context);

   //Convert from host byte order to big-endian byte order
   for (int i = 0; i < 8; i++) {
      context[0].h[i] = mm256_htobe_epi64(context[0].h[i]);
      context[1].h[i] = mm256_htobe_epi64(context[1].h[i]);
   }

   //Copy the resulting digest
   for(int i=0; i<8; ++i) {
      digest[0][i] = _mm256_extract_epi64(context[0].h[i], 0);
      digest[1][i] = _mm256_extract_epi64(context[0].h[i], 1);
      digest[2][i] = _mm256_extract_epi64(context[0].h[i], 2);
      digest[3][i] = _mm256_extract_epi64(context[0].h[i], 3);

      digest[4][i] = _mm256_extract_epi64(context[1].h[i], 0);
      digest[5][i] = _mm256_extract_epi64(context[1].h[i], 1);
      digest[6][i] = _mm256_extract_epi64(context[1].h[i], 2);
      digest[7][i] = _mm256_extract_epi64(context[1].h[i], 3);
   }
}

#define blk0(n, i) (block[n][i] = mm256_betoh_epi64(block[n][i]))
#define blk(n, i) (block[n][i] = block[n][i - 16] + SIGMA3(block[n][i - 15]) + \
   SIGMA4(block[n][i - 2]) + block[n][i - 7])

#define ROUND512(a,b,c,d,e,f,g,h) \
   T0 += (h[0]) + SIGMA2(e[0]) + Ch((e[0]), (f[0]), (g[0])) + k[i]; \
   T1 += (h[1]) + SIGMA2(e[1]) + Ch((e[1]), (f[1]), (g[1])) + k[i]; \
   (d[0]) += T0; \
   (d[1]) += T1; \
   (h[0]) = T0 + SIGMA1(a[0]) + Maj((a[0]), (b[0]), (c[0])); \
   (h[1]) = T1 + SIGMA1(a[1]) + Maj((a[1]), (b[1]), (c[1])); \
   i++

#define ROUND512_0_TO_15(a,b,c,d,e,f,g,h) \
   T0 = blk0(0, i); \
   T1 = blk0(1, i); \
   ROUND512(a,b,c,d,e,f,g,h)

#define ROUND512_16_TO_80(a,b,c,d,e,f,g,h) \
   T0 = blk(0, i); \
   T1 = blk(1, i); \
   ROUND512(a,b,c,d,e,f,g,h)

#define R512_0 \
   ROUND512_0_TO_15(a, b, c, d, e, f, g, h); \
   ROUND512_0_TO_15(h, a, b, c, d, e, f, g); \
   ROUND512_0_TO_15(g, h, a, b, c, d, e, f); \
   ROUND512_0_TO_15(f, g, h, a, b, c, d, e); \
   ROUND512_0_TO_15(e, f, g, h, a, b, c, d); \
   ROUND512_0_TO_15(d, e, f, g, h, a, b, c); \
   ROUND512_0_TO_15(c, d, e, f, g, h, a, b); \
   ROUND512_0_TO_15(b, c, d, e, f, g, h, a)

#define R512_16 \
   ROUND512_16_TO_80(a, b, c, d, e, f, g, h); \
   ROUND512_16_TO_80(h, a, b, c, d, e, f, g); \
   ROUND512_16_TO_80(g, h, a, b, c, d, e, f); \
   ROUND512_16_TO_80(f, g, h, a, b, c, d, e); \
   ROUND512_16_TO_80(e, f, g, h, a, b, c, d); \
   ROUND512_16_TO_80(d, e, f, g, h, a, b, c); \
   ROUND512_16_TO_80(c, d, e, f, g, h, a, b); \
   ROUND512_16_TO_80(b, c, d, e, f, g, h, a)

#define INIT(x,n) \
   x[0] = context[0].h[n]; \
   x[1] = context[1].h[n]; \

void sha512ProcessBlock(Sha512Context context[2])
{
   __m256i* block[2];
   block[0] = context[0].w;
   block[1] = context[1].w;

   __m256i T0, T1;
   __m256i a[2], b[2], c[2], d[2], e[2], f[2], g[2], h[2];
   INIT(a, 0)
   INIT(b, 1)
   INIT(c, 2)
   INIT(d, 3)
   INIT(e, 4)
   INIT(f, 5)
   INIT(g, 6)
   INIT(h, 7)

   int i = 0;
   R512_0; R512_0;
   for(int j=0; j<8; ++j) {
      R512_16;
   }

   context[0].h[0] += a[0];
   context[0].h[1] += b[0];
   context[0].h[2] += c[0];
   context[0].h[3] += d[0];
   context[0].h[4] += e[0];
   context[0].h[5] += f[0];
   context[0].h[6] += g[0];
   context[0].h[7] += h[0];

   context[1].h[0] += a[1];
   context[1].h[1] += b[1];
   context[1].h[2] += c[1];
   context[1].h[3] += d[1];
   context[1].h[4] += e[1];
   context[1].h[5] += f[1];
   context[1].h[6] += g[1];
   context[1].h[7] += h[1];
}

#endif // __AVX2__
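Note: mm256_htobe_epi64 above byte-swaps each 64-bit word via a shuffle mask (host little-endian to big-endian). The scalar equivalent, for reference:

#include <stdint.h>

/* Scalar model of mm_htobe_epi64 / mm256_htobe_epi64: reverse the bytes of
   each 64-bit word by swapping bytes, then 16-bit pairs, then 32-bit halves. */
static inline uint64_t bswap64( uint64_t x )
{
   x = ((x & 0x00ff00ff00ff00ffULL) << 8)  | ((x >> 8)  & 0x00ff00ff00ff00ffULL);
   x = ((x & 0x0000ffff0000ffffULL) << 16) | ((x >> 16) & 0x0000ffff0000ffffULL);
   return (x << 32) | (x >> 32);
}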
@@ -1,25 +0,0 @@
#ifndef __WOLF_AES_H
#define __WOLF_AES_H

#include <stdint.h>
#include "simd-utils.h"

void ExpandAESKey256(v128_t *keys, const v128_t *KeyBuf);

#if defined(__SSE4_2__)
//#ifdef __AVX__

#define AES_PARALLEL_N 8
#define BLOCK_COUNT 256

void AES256CBC( v128_t** data, const v128_t** next, v128_t ExpandedKey[][16],
                v128_t* IV );

#else

void AES256CBC( v128_t *Ciphertext, const v128_t *Plaintext,
                const v128_t *ExpandedKey, v128_t IV, uint32_t BlockCount );

#endif

#endif // __WOLF_AES_H
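Note: both deleted AES256CBC prototypes encrypt in CBC mode — each ciphertext block is XORed into the next plaintext block before encryption. A minimal scalar sketch of that chaining, assuming a hypothetical aes256_encrypt_block helper in place of the AES-NI round loop:

#include <stdint.h>
#include <stddef.h>

typedef struct { uint8_t b[16]; } aes_block_t;

/* Hypothetical stand-in for the AES-NI round sequence over the expanded key. */
extern void aes256_encrypt_block( aes_block_t *out, const aes_block_t *in,
                                  const void *expanded_key );

/* CBC chaining: C[0] = E(P[0] ^ IV), C[i] = E(P[i] ^ C[i-1]). */
static void cbc256_encrypt( aes_block_t *c, const aes_block_t *p, size_t n,
                            const void *expanded_key, aes_block_t iv )
{
   for ( size_t i = 0; i < n; i++ )
   {
      aes_block_t x = p[i];
      for ( int j = 0; j < 16; j++ ) x.b[j] ^= iv.b[j];   // XOR in the chain value
      aes256_encrypt_block( &c[i], &x, expanded_key );
      iv = c[i];                                          // next block chains on C[i]
   }
}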
@@ -204,7 +204,7 @@ static const uint64_t IV512[] =
   (state)->H[15] = h7l; \
} while (0)

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

#define Sb_8W(x0, x1, x2, x3, c) \
{ \
@@ -364,8 +364,7 @@ static const uint64_t IV512[] =

#if defined(__AVX2__)

#if defined(__AVX512VL__)
//TODO enable for AVX10_256, not used with AVX512VL
#if defined(VL256)

#define notxorandnot( a, b, c ) \
   _mm256_ternarylogic_epi64( a, b, c, 0x2d )
@@ -522,7 +521,7 @@ static const uint64_t IV512[] =

#endif // AVX2

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

void jh256_8x64_init( jh_8x64_context *sc )
{
@@ -852,48 +851,10 @@ void jh512_4x64_ctx( jh_4x64_context *cc, void *dst, const void *data, size_t le

// SSE2 & NEON

#if defined(__AVX512VL__)
//TODO enable for AVX10_256, not used with AVX512VL

#define v128_notxorandnot( a, b, c ) \
   _mm_ternarylogic_epi64( a, b, c, 0x2d )

#else

#define v128_notxorandnot( a, b, c ) \
   v128_xor( v128_not( a ), v128_andnot( b, c ) )

#endif

#define Sb(x0, x1, x2, x3, c) \
{ \
   v128u64_t cc = v128_64( c ); \
   x3 = v128_not( x3 ); \
   x0 = v128_xor( x0, v128_andnot( x2, cc ) ); \
   tmp = v128_xor( cc, v128_and( x0, x1 ) ); \
   x0 = v128_xor( x0, v128_and( x2, x3 ) ); \
   x3 = v128_xor( x3, v128_andnot( x1, x2 ) ); \
   x1 = v128_xor( x1, v128_and( x0, x2 ) ); \
   x2 = v128_xor( x2, v128_andnot( x3, x0 ) ); \
   x0 = v128_xor( x0, v128_or( x1, x3 ) ); \
   x3 = v128_xor( x3, v128_and( x1, x2 ) ); \
   x1 = v128_xor( x1, v128_and( tmp, x0 ) ); \
   x2 = v128_xor( x2, tmp ); \
}

#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \
{ \
   x4 = v128_xor( x4, x1 ); \
   x5 = v128_xor( x5, x2 ); \
   x6 = v128_xor( x6, v128_xor( x3, x0 ) ); \
   x7 = v128_xor( x7, x0 ); \
   x0 = v128_xor( x0, x5 ); \
   x1 = v128_xor( x1, x6 ); \
   x2 = v128_xor( x2, v128_xor( x7, x4 ) ); \
   x3 = v128_xor( x3, x4 ); \
}

/*
#define Sb(x0, x1, x2, x3, c) \
{ \
   const v128u64_t cc = v128_64( c ); \
@@ -920,7 +881,6 @@ void jh512_4x64_ctx( jh_4x64_context *cc, void *dst, const void *data, size_t le
   x2 = v128_xor3( x2, x7, x4 ); \
   x3 = v128_xor( x3, x4 ); \
}
*/

#undef Wz
#define Wz(x, c, n) \
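Note: the 0x2d immediate used by notxorandnot / v128_notxorandnot is the truth table of the fallback expression, so the AVX-512VL and generic paths compute the same function. A scalar model:

#include <stdint.h>

/* Scalar model of notxorandnot(a,b,c) = ~a ^ (~b & c); enumerating the
   3-input truth table gives 0b00101101 = 0x2d, the ternarylogic immediate. */
static inline uint64_t notxorandnot64( uint64_t a, uint64_t b, uint64_t c )
{
   return ~a ^ ( ~b & c );
}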
@@ -55,7 +55,7 @@
 * <code>memcpy()</code>).
 */

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

typedef struct
{
@@ -78,7 +78,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   __m256i *noncev = (__m256i*)vdata + 9; // aligned
   __m256i *noncev = (__m256i*)vdata + 9; // aligned
   const uint32_t Htarg = ptarget[7];
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
@@ -108,4 +108,53 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
   return 0;
}

#elif defined(KECCAK_2WAY)

void keccakhash_2x64(void *state, const void *input)
{
   keccak256_2x64_context ctx;
   keccak256_2x64_init( &ctx );
   keccak256_2x64_update( &ctx, input, 80 );
   keccak256_2x64_close( &ctx, state );
}

int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
   uint32_t hash[16*2] __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[13]); // 3*4+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   v128_t *noncev = (v128_t*)vdata + 9;
   const uint32_t Htarg = ptarget[7];
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;

   v128_bswap32_intrlv80_2x64( vdata, pdata );
   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
   do {
      keccakhash_2x64( hash, vdata );

      for ( int lane = 0; lane < 2; lane++ )
      if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
      {
         extr_lane_2x64( lane_hash, hash, lane, 256 );
         if ( valid_hash( lane_hash, ptarget ))
         {
            pdata[19] = bswap_32( n + lane );
            submit_solution( work, lane_hash, mythr );
         }
      }
      *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
      n += 2;
   } while ( (n < max_nonce-2) && !work_restart[thr_id].restart);
   pdata[19] = n;
   *hashes_done = n - first_nonce + 1;
   return 0;
}

#endif
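Note: in the 2x64 scan loop each 64-bit lane keeps its nonce in the high 32 bits, so a single vector add of the constant 0x0000000200000000 (32-bit lanes {0, 2, 0, 2}) advances both lanes' nonces by 2 per pass. A scalar model of one lane:

#include <stdint.h>

/* Scalar model of the nonce bump: the low 32 bits are untouched and, because
   the vector code uses a 32-bit add, the zero low word can never carry. */
static inline uint64_t bump_nonce_lane( uint64_t lane )
{
   return lane + 0x0000000200000000ULL;
}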
@@ -17,6 +17,9 @@ bool register_keccak_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
   gate->scanhash = (void*)&scanhash_keccak_4way;
   gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
   gate->scanhash = (void*)&scanhash_keccak_2x64;
   gate->hash = (void*)&keccakhash_2x64;
#else
   gate->scanhash = (void*)&scanhash_keccak;
   gate->hash = (void*)&keccakhash;
@@ -37,6 +40,9 @@ bool register_keccakc_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
   gate->scanhash = (void*)&scanhash_keccak_4way;
   gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
   gate->scanhash = (void*)&scanhash_keccak_2x64;
   gate->hash = (void*)&keccakhash_2x64;
#else
   gate->scanhash = (void*)&scanhash_keccak;
   gate->hash = (void*)&keccakhash;
@@ -75,15 +81,17 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
bool register_sha3d_algo( algo_gate_t* gate )
{
   hard_coded_eb = 6;
// opt_extranonce = false;
   gate->optimizations = AVX2_OPT | AVX512_OPT;
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
   gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
#if defined (KECCAK_8WAY)
#if defined (SHA3D_8WAY)
   gate->scanhash = (void*)&scanhash_sha3d_8way;
   gate->hash = (void*)&sha3d_hash_8way;
#elif defined (KECCAK_4WAY)
#elif defined (SHA3D_4WAY)
   gate->scanhash = (void*)&scanhash_sha3d_4way;
   gate->hash = (void*)&sha3d_hash_4way;
#elif defined (SHA3D_2WAY)
   gate->scanhash = (void*)&scanhash_sha3d_2x64;
   gate->hash = (void*)&sha3d_hash_2x64;
#else
   gate->scanhash = (void*)&scanhash_sha3d;
   gate->hash = (void*)&sha3d_hash;
@@ -4,10 +4,20 @@
#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define KECCAK_8WAY 1
#elif defined(__AVX2__)
#define KECCAK_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define KECCAK_2WAY 1
#endif

#if defined(SIMD512)
#define SHA3D_8WAY 1
#elif defined(__AVX2__)
#define SHA3D_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA3D_2WAY 1
#endif

extern int hard_coded_eb;
@@ -16,27 +26,47 @@ extern int hard_coded_eb;

void keccakhash_8way( void *state, const void *input );
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );

void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
                         uint64_t *hashes_done, struct thr_info *mythr );

#elif defined(KECCAK_4WAY)

void keccakhash_4way( void *state, const void *input );
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
                          uint64_t *hashes_done, struct thr_info *mythr );

void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_2WAY)

void keccakhash_2x64( void *state, const void *input );
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );

#else

void keccakhash( void *state, const void *input );
int scanhash_keccak( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
                     uint64_t *hashes_done, struct thr_info *mythr );

#endif

#if defined(SHA3D_8WAY)

void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );

#elif defined(SHA3D_4WAY)

void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );

#elif defined(SHA3D_2WAY)

void sha3d_hash_2x64( void *state, const void *input );
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );

#else

void sha3d_hash( void *state, const void *input );
int scanhash_sha3d( struct work *work, uint32_t max_nonce,
@@ -57,7 +57,7 @@ static const uint64_t RC[] = {

#define DO(x) x

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

#define INPUT_BUF(size) do { \
   size_t j; \
@@ -161,29 +161,25 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
                                 size_t byte_len, size_t lim )
{
   unsigned eb;
   union {
      __m512i tmp[lim + 1];
      uint64_t dummy;   /* for alignment */
   } u;
   __m512i tmp[lim + 1] __attribute__ ((aligned (64)));
   size_t j;
   size_t m512_len = byte_len >> 3;
   const unsigned eb = hard_coded_eb;

   eb = hard_coded_eb;
   if ( kc->ptr == (lim - 8) )
   {
      const uint64_t t = eb | 0x8000000000000000;
      u.tmp[0] = _mm512_set1_epi64( t );
      tmp[0] = _mm512_set1_epi64( t );
      j = 8;
   }
   else
   {
      j = lim - kc->ptr;
      u.tmp[0] = _mm512_set1_epi64( eb );
      memset_zero_512( u.tmp + 1, (j>>3) - 2 );
      u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
      tmp[0] = _mm512_set1_epi64( eb );
      memset_zero_512( tmp + 1, (j>>3) - 2 );
      tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
   }
   keccak64_8way_core( kc, u.tmp, j, lim );
   keccak64_8way_core( kc, tmp, j, lim );
   /* Finalize the "lane complement" */
   NOT64( kc->w[ 1], kc->w[ 1] );
   NOT64( kc->w[ 2], kc->w[ 2] );
@@ -194,7 +190,7 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
   memcpy_512( dst, kc->w, m512_len );
}

void keccak256_8way_init( void *kc )
void keccak256_8x64_init( void *kc )
{
   keccak64_8way_init( kc, 256 );
}
@@ -361,29 +357,25 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
                            size_t lim )
{
   unsigned eb;
   union {
      __m256i tmp[lim + 1];
      uint64_t dummy;   /* for alignment */
   } u;
   __m256i tmp[lim + 1] __attribute__ ((aligned (32)));
   size_t j;
   size_t m256_len = byte_len >> 3;
   const unsigned eb = hard_coded_eb;

   eb = hard_coded_eb;
   if ( kc->ptr == (lim - 8) )
   {
      const uint64_t t = eb | 0x8000000000000000;
      u.tmp[0] = _mm256_set1_epi64x( t );
      tmp[0] = _mm256_set1_epi64x( t );
      j = 8;
   }
   else
   {
      j = lim - kc->ptr;
      u.tmp[0] = _mm256_set1_epi64x( eb );
      memset_zero_256( u.tmp + 1, (j>>3) - 2 );
      u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
      tmp[0] = _mm256_set1_epi64x( eb );
      memset_zero_256( tmp + 1, (j>>3) - 2 );
      tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
   }
   keccak64_core( kc, u.tmp, j, lim );
   keccak64_core( kc, tmp, j, lim );
   /* Finalize the "lane complement" */
   NOT64( kc->w[ 1], kc->w[ 1] );
   NOT64( kc->w[ 2], kc->w[ 2] );
@@ -563,7 +555,7 @@ static void keccak64x2_close( keccak64_ctx_v128 *kc, void *dst,
{
   unsigned eb;
   union {
      v128_t tmp[lim + 1];
      v128_t tmp[140];
      uint64_t dummy;   /* for alignment */
   } u;
   size_t j;
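Note: the hunks above drop a variable-length array from inside a union (not valid C — a VLA cannot be a struct/union member) in favour of a plain aligned VLA or a fixed bound. The padding they build is Keccak's pad10*1 plus the domain byte; a scalar model of the close() logic:

#include <stdint.h>
#include <string.h>

/* Scalar model of keccak64_*_close padding: write the end-marker byte eb in
   the first free word, zero the rest, then set the top bit of the last word
   of the rate block. When only one word remains the two markers share it. */
static void keccak_pad_words( uint64_t *buf, size_t ptr_words,
                              size_t lim_words, unsigned eb )
{
   size_t n = lim_words - ptr_words;        // words left in this block
   memset( buf, 0, n * sizeof(uint64_t) );
   buf[0] = eb;                             // domain bits + first pad bit
   buf[n-1] |= 0x8000000000000000ULL;       // final '1' bit of pad10*1
}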
@@ -4,7 +4,7 @@
#include <stddef.h>
#include "simd-utils.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

typedef struct
{
@@ -4,12 +4,12 @@
#include <stdint.h>
#include "keccak-hash-4way.h"

#if defined(KECCAK_8WAY)
#if defined(SHA3D_8WAY)

void sha3d_hash_8way(void *state, const void *input)
{
   uint32_t buffer[16*8] __attribute__ ((aligned (128)));
   keccak256_8way_context ctx;
   keccak256_8x64_context ctx;

   keccak256_8x64_init( &ctx );
   keccak256_8x64_update( &ctx, input, 80 );
@@ -64,12 +64,12 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
   return 0;
}

#elif defined(KECCAK_4WAY)
#elif defined(SHA3D_4WAY)

void sha3d_hash_4way(void *state, const void *input)
{
   uint32_t buffer[16*4] __attribute__ ((aligned (64)));
   keccak256_4way_context ctx;
   keccak256_4x64_context ctx;

   keccak256_4x64_init( &ctx );
   keccak256_4x64_update( &ctx, input, 80 );
@@ -122,4 +122,60 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
   return 0;
}

#elif defined(SHA3D_2WAY)

void sha3d_hash_2x64(void *state, const void *input)
{
   uint32_t buffer[16*4] __attribute__ ((aligned (64)));
   keccak256_2x64_context ctx;

   keccak256_2x64_init( &ctx );
   keccak256_2x64_update( &ctx, input, 80 );
   keccak256_2x64_close( &ctx, buffer );

   keccak256_2x64_init( &ctx );
   keccak256_2x64_update( &ctx, buffer, 32 );
   keccak256_2x64_close( &ctx, state );
}

int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
   uint32_t hash[16*2] __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[13]); // 3*4+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 2;
   v128_t *noncev = (v128_t*)vdata + 9;
   const uint32_t Htarg = ptarget[7];
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;

   v128_bswap32_intrlv80_2x64( vdata, pdata );
   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
   do {
      sha3d_hash_2x64( hash, vdata );

      for ( int lane = 0; lane < 2; lane++ )
      if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
      {
         extr_lane_2x64( lane_hash, hash, lane, 256 );
         if ( valid_hash( lane_hash, ptarget ) )
         {
            pdata[19] = bswap_32( n + lane );
            submit_solution( work, lane_hash, mythr );
         }
      }
      *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
      n += 2;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
}

#endif
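Note: as the new 2x64 path above shows, sha3d is simply Keccak-256 applied twice — once over the 80-byte header, once over the resulting 32-byte digest. A single-lane outline, assuming a hypothetical scalar keccak256() helper:

#include <stddef.h>
#include <stdint.h>

extern void keccak256( void *digest32, const void *data, size_t len ); /* hypothetical */

void sha3d( void *out32, const void *in, size_t len )
{
   uint8_t mid[32];
   keccak256( mid, in, len );     // first Keccak-256 pass over the header
   keccak256( out32, mid, 32 );   // second pass over the 32-byte digest
}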
@@ -59,7 +59,7 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
};


#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

#define cns4w(i) mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] )

@@ -273,8 +273,6 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
   uint32_t hash[8*4] __attribute((aligned(128)));
   __m512i* chainv = state->chainv;
   __m512i t[2];
   const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

   /*---- blank round with m=0 ----*/
   rnd512_4way( state, NULL );
@@ -289,10 +287,8 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
   _mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
   _mm512_store_si512( (__m512i*)&hash[16], t[1] );

   casti_m512i( b,0 ) = _mm512_shuffle_epi8(
                        casti_m512i( hash,0 ), shuff_bswap32 );
   casti_m512i( b,1 ) = _mm512_shuffle_epi8(
                        casti_m512i( hash,1 ), shuff_bswap32 );
   casti_m512i( b,0 ) = mm512_bswap_32( casti_m512i( hash,0 ) );
   casti_m512i( b,1 ) = mm512_bswap_32( casti_m512i( hash,1 ) );

   rnd512_4way( state, NULL );

@@ -306,10 +302,8 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
   _mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
   _mm512_store_si512( (__m512i*)&hash[16], t[1] );

   casti_m512i( b,2 ) = _mm512_shuffle_epi8(
                        casti_m512i( hash,0 ), shuff_bswap32 );
   casti_m512i( b,3 ) = _mm512_shuffle_epi8(
                        casti_m512i( hash,1 ), shuff_bswap32 );
   casti_m512i( b,2 ) = mm512_bswap_32( casti_m512i( hash,0 ) );
   casti_m512i( b,3 ) = mm512_bswap_32( casti_m512i( hash,1 ) );
}

int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
@@ -349,16 +343,14 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
   __m512i msg[2];
   int i;
   int blocks = (int)len >> 5;
   const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

   state->rembytes = (int)len & 0x1F;

   // full blocks
   for ( i = 0; i < blocks; i++, vdata+=2 )
   {
      msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
      msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
      msg[0] = mm512_bswap_32( vdata[ 0 ] );
      msg[1] = mm512_bswap_32( vdata[ 1 ] );
      rnd512_4way( state, msg );
   }

@@ -367,7 +359,7 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
   if ( state->rembytes )
   {
      // remaining data bytes
      buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
      buffer[0] = mm512_bswap_32( vdata[0] );
      buffer[1] = mm512_bcast128lo_64( 0x0000000080000000 );
   }
   return 0;
@@ -434,16 +426,14 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
   __m512i msg[2];
   int i;
   const int blocks = (int)( inlen >> 5 );
   const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

   state->rembytes = inlen & 0x1F;

   // full blocks
   for ( i = 0; i < blocks; i++, vdata+=2 )
   {
      msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
      msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
      msg[0] = mm512_bswap_32( vdata[ 0 ] );
      msg[1] = mm512_bswap_32( vdata[ 1 ] );
      rnd512_4way( state, msg );
   }

@@ -451,7 +441,7 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
   if ( state->rembytes )
   {
      // padding of partial block
      msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
      msg[0] = mm512_bswap_32( vdata[ 0 ] );
      msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
      rnd512_4way( state, msg );
   }
@@ -479,16 +469,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
   __m512i msg[2];
   int i;
   const int blocks = (int)( inlen >> 5 );
   const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

   state->rembytes = inlen & 0x1F;

   // full blocks
   for ( i = 0; i < blocks; i++, vdata+=2 )
   {
      msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
      msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
      msg[0] = mm512_bswap_32( vdata[ 0 ] );
      msg[1] = mm512_bswap_32( vdata[ 1 ] );
      rnd512_4way( state, msg );
   }

@@ -496,7 +484,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
   if ( state->rembytes )
   {
      // padding of partial block
      msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
      msg[0] = mm512_bswap_32( vdata[ 0 ] );
      msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
      rnd512_4way( state, msg );
   }
@@ -524,8 +512,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
   a = _mm256_xor_si256( a, c0 ); \
   b = _mm256_xor_si256( b, c1 );

//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512
#if defined(__AVX512VL__)
#if defined(VL256)

#define MULT2( a0, a1 ) \
{ \
@@ -776,8 +763,6 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
   uint32 hash[8*2] __attribute((aligned(64)));
   __m256i* chainv = state->chainv;
   __m256i t0, t1;
   const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
                                                0x0405060700010203 );
   /*---- blank round with m=0 ----*/
   rnd512_2way( state, NULL );

@@ -792,10 +777,8 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
   _mm256_store_si256( (__m256i*)&hash[0], t0 );
   _mm256_store_si256( (__m256i*)&hash[8], t1 );

   casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
                         casti_m256i( hash, 0 ), shuff_bswap32 );
   casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
                         casti_m256i( hash, 1 ), shuff_bswap32 );
   casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
   casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );

   rnd512_2way( state, NULL );

@@ -810,10 +793,8 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
   _mm256_store_si256( (__m256i*)&hash[0], t0 );
   _mm256_store_si256( (__m256i*)&hash[8], t1 );

   casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
                         casti_m256i( hash, 0 ), shuff_bswap32 );
   casti_m256i( b, 3 ) = _mm256_shuffle_epi8(
                         casti_m256i( hash, 1 ), shuff_bswap32 );
   casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
   casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
}

int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
@@ -848,15 +829,13 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
   __m256i msg[2];
   int i;
   int blocks = (int)len >> 5;
   const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
                                                0x0405060700010203 );
   state-> rembytes = (int)len & 0x1F;

   // full blocks
   for ( i = 0; i < blocks; i++, vdata+=2 )
   {
      msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
      msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
      msg[0] = mm256_bswap_32( vdata[ 0 ] );
      msg[1] = mm256_bswap_32( vdata[ 1 ] );
      rnd512_2way( state, msg );
   }

@@ -865,7 +844,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
   if ( state->rembytes )
   {
      // remaining data bytes
      buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
      buffer[0] = mm256_bswap_32( vdata[0] );
      buffer[1] = mm256_bcast128lo_64( 0x0000000080000000 );
   }
   return 0;
@@ -917,16 +896,14 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
   __m256i msg[2];
   int i;
   const int blocks = (int)( inlen >> 5 );
   const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
                                                0x0405060700010203 );

   state->rembytes = inlen & 0x1F;

   // full blocks
   for ( i = 0; i < blocks; i++, vdata+=2 )
   {
      msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
      msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
      msg[0] = mm256_bswap_32( vdata[ 0 ] );
      msg[1] = mm256_bswap_32( vdata[ 1 ] );
      rnd512_2way( state, msg );
   }

@@ -934,7 +911,7 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
   if ( state->rembytes )
   {
      // padding of partial block
      msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
      msg[0] = mm256_bswap_32( vdata[ 0 ] );
      msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
      rnd512_2way( state, msg );
   }
@@ -962,16 +939,14 @@ int luffa_2way_update_close( luffa_2way_context *state,
   __m256i msg[2];
   int i;
   const int blocks = (int)( inlen >> 5 );
   const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
                                                0x0405060700010203 );

   state->rembytes = inlen & 0x1F;

   // full blocks
   for ( i = 0; i < blocks; i++, vdata+=2 )
   {
      msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
      msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
      msg[0] = mm256_bswap_32( vdata[ 0 ] );
      msg[1] = mm256_bswap_32( vdata[ 1 ] );
      rnd512_2way( state, msg );
   }

@@ -979,7 +954,7 @@ int luffa_2way_update_close( luffa_2way_context *state,
   if ( state->rembytes )
   {
      // padding of partial block
      msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
      msg[0] = mm256_bswap_32( vdata[ 0 ] );
      msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
      rnd512_2way( state, msg );
   }
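Note: the hunks above replace every explicit _mm512/_mm256_shuffle_epi8 with an explicit mask constant by the mm512_bswap_32 / mm256_bswap_32 wrappers, which byte-swap each 32-bit word. The scalar equivalent, for reference:

#include <stdint.h>

/* Scalar model of mm256_bswap_32 / mm512_bswap_32: reverse the four bytes
   of each 32-bit word, as the old shuffle mask 0x0c0d0e0f... spelled out. */
static inline uint32_t bswap32( uint32_t x )
{
   return (x << 24) | ((x & 0x0000ff00u) << 8)
        | ((x >> 8) & 0x0000ff00u) | (x >> 24);
}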
@@ -51,7 +51,7 @@
#define LIMIT_512 128
/*********************************/

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

typedef struct {
   uint32_t buffer[8*4];
@@ -28,67 +28,55 @@
   a = v128_xor( a, c0 ); \
   b = v128_xor( b, c1 ); \

#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#if defined(VL256)

#define MULT2( a0, a1 ) \
{ \
   v128_t b = v128_xor( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
   v128_t b = v128_xor( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0 ) ); \
   a0 = _mm_alignr_epi8( a1, b, 4 ); \
   a1 = _mm_alignr_epi8( b, a1, 4 ); \
}

#elif defined(__SSE4_1__)

#define MULT2( a0, a1 ) do \
#define MULT2( a0, a1 ) \
{ \
   v128_t b = v128_xor( a0, \
                        _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
   v128_t b = _mm_shuffle_epi32( a1, 0 ); \
   b = v128_xor( a0, v128_mask32( b, 0x4 ) ); \
   a0 = _mm_alignr_epi8( a1, b, 4 ); \
   a1 = _mm_alignr_epi8( b, a1, 4 ); \
} while(0)
}

#elif defined(__ARM_NEON)

const uint32x4_t mask = { 0xffffffff, 0, 0xffffffff, 0xffffffff };
#elif defined(__ARM_NEON) || defined(__SSE2__)

// { a1_0, 0, a1_0, a1_0 }
#define MULT2( a0, a1 ) \
{ \
   v128_t b = v128_xor( a0, \
                        v128_and( v128_32( vgetq_lane_u32( a1, 0 ) ), mask ) ); \
   v128_t b = v128_xor( a0, v128_and( v128_bcast32( a1 ), MASK ) ); \
   a0 = v128_alignr32( a1, b, 1 ); \
   a1 = v128_alignr32( b, a1, 1 ); \
}

#else // assume SSE2

#define MULT2( a0, a1 ) do \
{ \
   v128_t b = v128_xor( a0, \
                        _mm_shuffle_epi32( v128_and( a1, MASK ), 0x10 ) ); \
   a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
   a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)

#else
#warning __FILE__ ":" __LINE__ " Unknown or unsupported CPU architecture."
#endif

#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#if defined(VL256)

#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
   v128_t t = a0; \
   a0 = mm128_xoror( a3, a0, a1 ); \
   a0 = v128_xoror( a3, a0, a1 ); \
   a2 = v128_xor( a2, a3 ); \
   a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
   a3 = mm128_xorand( a2, a3, t ); \
   a2 = mm128_xorand( a1, a2, a0 ); \
   a3 = v128_xorand( a2, a3, t ); \
   a2 = v128_xorand( a1, a2, a0 ); \
   a1 = v128_or( a1, a3 ); \
   a3 = v128_xor( a3, a2 ); \
   t  = v128_xor( t, a1 ); \
   a2 = v128_and( a2, a1 ); \
   a1 = mm128_xnor( a1, a0 ); \
   a1 = v128_xnor( a1, a0 ); \
   a0 = t; \
}

@@ -137,8 +125,8 @@ const uint32x4_t mask = { 0xffffffff, 0, 0xffffffff, 0xffffffff };
   t0 = v128_shufll32( a1 ); \
   a1 = v128_unpacklo32( t0, a0 ); \
   t0 = v128_unpackhi32( t0, a0 ); \
   t1 = v128_swap64( t0 ); \
   a0 = v128_swap64( a1 ); \
   t1 = v128_rev64( t0 ); \
   a0 = v128_rev64( a1 ); \
   SUBCRUMB( t1, t0, a0, a1 ); \
   t0 = v128_unpacklo32( t0, t1 ); \
   a1 = v128_unpacklo32( a1, a0 ); \
@@ -224,9 +212,10 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(16))) = {
};


v128_t CNS128[32];
static v128_t CNS128[32];

#if !defined(__SSE4_1__)
v128_t MASK;
static v128_t MASK;
#endif

int init_luffa(hashState_luffa *state, int hashbitlen)
@@ -235,13 +224,13 @@ int init_luffa(hashState_luffa *state, int hashbitlen)
   state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
   /* set the lower 32 bits to '1' */
   MASK = v128_set32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
   MASK = v128_set32( 0xffffffff, 0, 0xffffffff, 0xffffffff );
#endif
   /* set the 32-bit round constant values to the 128-bit data field */
   for ( i=0; i<32; i++ )
      CNS128[i] = v128_load( (v128_t*)&CNS_INIT[i*4] );
   for ( i=0; i<10; i++ )
      state->chainv[i] = v128_load( (v128_t*)&IV[i*4] );
      state->chainv[i] = v128_load( (v128_t*)&IV[i*4] );
   memset(state->buffer, 0, sizeof state->buffer );
   return 0;
}
@@ -268,7 +257,7 @@ int update_luffa( hashState_luffa *state, const void *data,
      // remaining data bytes
      casti_v128( state->buffer, 0 ) = v128_bswap32( cast_v128( data ) );
      // padding of partial block
      casti_v128( state->buffer, 1 ) = v128_set32( 0, 0, 0, 0x80000000 );
      casti_v128( state->buffer, 1 ) = v128_set32( 0, 0, 0, 0x80000000 );
   }

   return 0;
@@ -327,7 +316,6 @@ int update_and_final_luffa( hashState_luffa *state, void* output,
   return 0;
}


int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
                const void* data, size_t inlen )
{
@@ -336,13 +324,13 @@ int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
   state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
   /* set the lower 32 bits to '1' */
   MASK= v128_set64( 0, 0x00000000ffffffff );
   MASK= v128_set32( 0xffffffff, 0, 0xffffffff, 0xffffffff );
#endif
   /* set the 32-bit round constant values to the 128-bit data field */
   for ( i=0; i<32; i++ )
      CNS128[i] = v128_load( (v128_t*)&CNS_INIT[i*4] );
      CNS128[i] = casti_v128( CNS_INIT, i );
   for ( i=0; i<10; i++ )
      state->chainv[i] = v128_load( (v128_t*)&IV[i*4] );
      state->chainv[i] = casti_v128( IV, i );
   memset(state->buffer, 0, sizeof state->buffer );

   // update
@@ -376,16 +364,15 @@ int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
   return 0;
}


/***************************************************/
/* Round function         */
/* state: hash context    */

static void rnd512( hashState_luffa *state, v128_t msg1, v128_t msg0 )
{
   v128_t t0, t1;
   v128_t *chainv = state->chainv;
   v128_t x0, x1, x2, x3, x4, x5, x6, x7;
   v128u32_t t0, t1;
   v128u32_t *chainv = state->chainv;
   v128u32_t x0, x1, x2, x3, x4, x5, x6, x7;

   t0 = v128_xor3( chainv[0], chainv[2], chainv[4] );
   t1 = v128_xor3( chainv[1], chainv[3], chainv[5] );
@@ -472,7 +459,7 @@ static void rnd512( hashState_luffa *state, v128_t msg1, v128_t msg0 )
   chainv[5] = v128_rol32( chainv[5], 2 );
   chainv[7] = v128_rol32( chainv[7], 3 );
   chainv[9] = v128_rol32( chainv[9], 4 );

   NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
               chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );
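Note: all of the MULT2 variants above (masked shuffle, SSE4.1 alignr, NEON/SSE2 broadcast-and-mask) implement Luffa's "multiply by 2" message injection with different instruction mixes. The reference scalar form, as found in sph_luffa, for comparison:

#include <stdint.h>

/* Reference scalar form of Luffa's multiply-by-2 over the 8-word half-state:
   rotate the words up by one and fold the old top word into three positions. */
static void mult2_ref( uint32_t a[8] )
{
   uint32_t tmp = a[7];
   a[7] = a[6];
   a[6] = a[5];
   a[5] = a[4];
   a[4] = a[3] ^ tmp;
   a[3] = a[2] ^ tmp;
   a[2] = a[1];
   a[1] = a[0] ^ tmp;
   a[0] = tmp;
}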
@@ -68,4 +68,4 @@ int update_and_final_luffa( hashState_luffa *state, void* output,

int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
                const void* data, size_t inlen );
#endif // LUFFA_FOR_SSE2_H___
#endif // LUFFA_FOR_SSE2_H__
@@ -11,26 +11,26 @@
|
||||
#endif
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
|
||||
#if !defined(__AES__) // && !defined(__ARM_FEATURE_AES) )
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if defined(SIMD512)
|
||||
#define ALLIUM_16WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define ALLIUM_8WAY 1
|
||||
#elif #defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#define ALLIUM_4WAY 1
|
||||
#endif
|
||||
|
||||
#if defined (ALLIUM_16WAY)
|
||||
|
||||
typedef union {
|
||||
keccak256_8way_context keccak;
|
||||
keccak256_8x64_context keccak;
|
||||
cube_4way_2buf_context cube;
|
||||
skein256_8way_context skein;
|
||||
skein256_8x64_context skein;
|
||||
#if defined(__VAES__)
|
||||
groestl256_4way_context groestl;
|
||||
groestl256_4way_context groestl;
|
||||
#else
|
||||
hashState_groestl256 groestl;
|
||||
#endif
|
||||
@@ -60,7 +60,7 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
|
||||
uint32_t hash15[8] __attribute__ ((aligned (32)));
|
||||
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
|
||||
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
||||
blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
||||
|
||||
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
|
||||
@@ -70,12 +70,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
|
||||
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
||||
hash15, 256 );
|
||||
|
||||
keccak256_8way_init( &ctx.keccak );
|
||||
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
|
||||
keccak256_8way_close( &ctx.keccak, vhashA);
|
||||
keccak256_8way_init( &ctx.keccak );
|
||||
keccak256_8way_update( &ctx.keccak, vhashB, 32 );
|
||||
keccak256_8way_close( &ctx.keccak, vhashB);
|
||||
keccak256_8x64_init( &ctx.keccak );
|
||||
keccak256_8x64_update( &ctx.keccak, vhashA, 32 );
|
||||
keccak256_8x64_close( &ctx.keccak, vhashA);
|
||||
keccak256_8x64_init( &ctx.keccak );
|
||||
keccak256_8x64_update( &ctx.keccak, vhashB, 32 );
|
||||
keccak256_8x64_close( &ctx.keccak, vhashB);
|
||||
|
||||
    dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                  vhashA, 256 );
@@ -153,12 +153,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
    intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
                 hash15, 256 );

-   skein256_8way_init( &ctx.skein );
-   skein256_8way_update( &ctx.skein, vhashA, 32 );
-   skein256_8way_close( &ctx.skein, vhashA );
-   skein256_8way_init( &ctx.skein );
-   skein256_8way_update( &ctx.skein, vhashB, 32 );
-   skein256_8way_close( &ctx.skein, vhashB );
+   skein256_8x64_init( &ctx.skein );
+   skein256_8x64_update( &ctx.skein, vhashA, 32 );
+   skein256_8x64_close( &ctx.skein, vhashA );
+   skein256_8x64_init( &ctx.skein );
+   skein256_8x64_update( &ctx.skein, vhashB, 32 );
+   skein256_8x64_close( &ctx.skein, vhashB );

    dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                  vhashA, 256 );
@@ -251,7 +251,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
                          n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );

    // Partially prehash second block without touching nonces in block_buf[3].
-   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

    do {
       allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -273,9 +273,9 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
 #elif defined (ALLIUM_8WAY)

 typedef union {
-   keccak256_4way_context keccak;
+   keccak256_4x64_context keccak;
    cube_2way_context cube;
-   skein256_4way_context skein;
+   skein256_4x64_context skein;
 #if defined(__VAES__)
    groestl256_2way_context groestl;
 #else
@@ -298,19 +298,19 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
    uint64_t *hash7 = (uint64_t*)hash+28;
    allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));

-   blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
+   blake256_8x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );

    dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                  vhashA, 256 );
    intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
    intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );

-   keccak256_4way_init( &ctx.keccak );
-   keccak256_4way_update( &ctx.keccak, vhashA, 32 );
-   keccak256_4way_close( &ctx.keccak, vhashA );
-   keccak256_4way_init( &ctx.keccak );
-   keccak256_4way_update( &ctx.keccak, vhashB, 32 );
-   keccak256_4way_close( &ctx.keccak, vhashB );
+   keccak256_4x64_init( &ctx.keccak );
+   keccak256_4x64_update( &ctx.keccak, vhashA, 32 );
+   keccak256_4x64_close( &ctx.keccak, vhashA );
+   keccak256_4x64_init( &ctx.keccak );
+   keccak256_4x64_update( &ctx.keccak, vhashB, 32 );
+   keccak256_4x64_close( &ctx.keccak, vhashB );

    dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
    dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
@@ -350,12 +350,12 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
    intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
    intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );

-   skein256_4way_init( &ctx.skein );
-   skein256_4way_update( &ctx.skein, vhashA, 32 );
-   skein256_4way_close( &ctx.skein, vhashA );
-   skein256_4way_init( &ctx.skein );
-   skein256_4way_update( &ctx.skein, vhashB, 32 );
-   skein256_4way_close( &ctx.skein, vhashB );
+   skein256_4x64_init( &ctx.skein );
+   skein256_4x64_update( &ctx.skein, vhashA, 32 );
+   skein256_4x64_close( &ctx.skein, vhashA );
+   skein256_4x64_init( &ctx.skein );
+   skein256_4x64_update( &ctx.skein, vhashB, 32 );
+   skein256_4x64_close( &ctx.skein, vhashB );

 #if defined(__VAES__)

@@ -433,7 +433,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                           n+ 3, n+ 2, n+ 1, n );

    // Partially prehash second block without touching nonces
-   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

    do {
       allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -465,11 +465,7 @@ typedef union
 {
    keccak256_2x64_context keccak;
    cubehashParam cube;
-#if defined(__x86_64__)
    skein256_2x64_context skein;
-#else
-   sph_skein512_context skein;
-#endif
 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
    hashState_groestl256 groestl;
 #else
@@ -487,7 +483,7 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
    uint64_t *hash3 = (uint64_t*)hash+12;
    allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));

-   blake256_4way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
+   blake256_4x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
    dintrlv_4x32( hash0, hash1, hash2, hash3, vhashA, 256 );

    intrlv_2x64( vhashA, hash0, hash1, 256 );
@@ -516,7 +512,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
    LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
    LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );

-#if defined(__x86_64__)
    intrlv_2x64( vhashA, hash0, hash1, 256 );
    skein256_2x64_init( &ctx.skein );
    skein256_2x64_update( &ctx.skein, vhashA, 32 );
@@ -527,20 +522,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
    skein256_2x64_update( &ctx.skein, vhashA, 32 );
    skein256_2x64_close( &ctx.skein, vhashA );
    dintrlv_2x64( hash2, hash3, vhashA, 256 );
-#else
-   sph_skein256_init( &ctx.skein );
-   sph_skein256( &ctx.skein, hash0, 32 );
-   sph_skein256_close( &ctx.skein, hash0 );
-   sph_skein256_init( &ctx.skein );
-   sph_skein256( &ctx.skein, hash1, 32 );
-   sph_skein256_close( &ctx.skein, hash1 );
-   sph_skein256_init( &ctx.skein );
-   sph_skein256( &ctx.skein, hash2, 32 );
-   sph_skein256_close( &ctx.skein, hash2 );
-   sph_skein256_init( &ctx.skein );
-   sph_skein256( &ctx.skein, hash3, 32 );
-   sph_skein256_close( &ctx.skein, hash3 );
-#endif

 #if defined(__AES__) || defined(__ARM_FEATURE_AES)
    groestl256_full( &ctx.groestl, hash0, hash0, 256 );
@@ -607,7 +588,7 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
    block_buf[15] = v128_32( 640 );

    // Partially prehash second block without touching nonces
-   blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

    do {
       allium_4way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -635,7 +616,6 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
 //
 // 1 way
-

 typedef struct
 {
    blake256_context blake;
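A note on the intrlv_*/dintrlv_* calls that bracket every stage above: each lane's 256-bit result lives in its own hashN buffer, and these helpers transpose between that layout and the word-interleaved layout the NxW primitives consume. Conceptually, for the 4x64 case (a scalar sketch under assumed semantics, not the optimized library code):

// Sketch: word w of lane i lands at dst[w*4 + i], so one SIMD register
// can hold word w of all four lanes. De-interleaving is the inverse copy.
static void intrlv_4x64_sketch( uint64_t *dst, const uint64_t *const src[4],
                                int bitlen )
{
   const int words = bitlen / 64;
   for ( int w = 0; w < words; w++ )
      for ( int i = 0; i < 4; i++ )
         dst[ w*4 + i ] = src[i][w];
}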
@@ -5,7 +5,7 @@
 #include <stdint.h>
 #include "lyra2.h"

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   #define LYRA2REV3_16WAY 1
 #elif defined(__AVX2__)
   #define LYRA2REV3_8WAY 1
@@ -49,7 +49,7 @@ bool init_lyra2rev3_ctx();

 //////////////////////////////////

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   #define LYRA2REV2_16WAY 1
 #elif defined(__AVX2__)
   #define LYRA2REV2_8WAY 1
@@ -108,7 +108,7 @@ bool lyra2h_thread_init();

 /////////////////////////////////////////

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   #define PHI2_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
   #define PHI2_4WAY 1

@@ -41,7 +41,7 @@
 // lyra2z330, lyra2h,


-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 /**
  * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
@@ -59,7 +59,7 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,

 int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
                   uint64_t timeCost, uint64_t nRows, uint64_t nCols );
@@ -14,12 +14,12 @@ bool lyra2h_4way_thread_init()
    return ( lyra2h_4way_matrix = mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
 }

-static __thread blake256_4way_context l2h_4way_blake_mid;
+static __thread blake256_4x32_context l2h_4way_blake_mid;

 void lyra2h_4way_midstate( const void* input )
 {
-   blake256_4way_init( &l2h_4way_blake_mid );
-   blake256_4way_update( &l2h_4way_blake_mid, input, 64 );
+   blake256_4x32_init( &l2h_4way_blake_mid );
+   blake256_4x32_update( &l2h_4way_blake_mid, input, 64 );
 }

 void lyra2h_4way_hash( void *state, const void *input )
@@ -29,11 +29,11 @@ void lyra2h_4way_hash( void *state, const void *input )
    uint32_t hash2[8] __attribute__ ((aligned (64)));
    uint32_t hash3[8] __attribute__ ((aligned (64)));
    uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-   blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
+   blake256_4x32_context ctx_blake __attribute__ ((aligned (64)));

    memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
-   blake256_4way_update( &ctx_blake, input + (64*4), 16 );
-   blake256_4way_close( &ctx_blake, vhash );
+   blake256_4x32_update( &ctx_blake, input + (64*4), 16 );
+   blake256_4x32_close( &ctx_blake, vhash );

    dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

@@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
    lyra2h_4way_midstate( vdata );

    do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
       lyra2h_4way_hash( hash, vdata );

       for ( int i = 0; i < 4; i++ )
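For readers new to the midstate trick used by lyra2h_4way above: the first 64 bytes of the 80-byte block header are nonce-independent, so their Blake-256 state is computed once per work unit and merely copied per scan iteration; only the final 16 bytes, which carry the nonces, are re-absorbed. A minimal sketch of the pattern, reusing the blake256_4x32 API shown in this hunk (the wrapper names are hypothetical):

static blake256_4x32_context mid;                  // saved after 64 bytes

static void midstate_once( const void *input )     // once per work unit
{
   blake256_4x32_init( &mid );
   blake256_4x32_update( &mid, input, 64 );        // nonce-free prefix
}

static void hash_per_nonce( void *out, const void *input )
{
   blake256_4x32_context c = mid;                  // cheap state copy
   blake256_4x32_update( &c, input + (64*4), 16 ); // nonce-bearing tail
   blake256_4x32_close( &c, out );
}

The *_round0_prehash_le / *_final_rounds_le pairs in the allium and lyra2z scan loops push the same idea one level deeper, precomputing the nonce-independent part of round 0 of the second block as well.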
@@ -7,25 +7,24 @@
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/cubehash/cube-hash-2way.h"
-

 #if defined (LYRA2REV2_16WAY)

 typedef struct {
-   blake256_16way_context blake;
-   keccak256_8way_context keccak;
+   blake256_16x32_context blake;
+   keccak256_8x64_context keccak;
    cubehashParam cube;
-   skein256_8way_context skein;
-   bmw256_16way_context bmw;
+   skein256_8x64_context skein;
+   bmw256_16x32_context bmw;
 } lyra2v2_16way_ctx_holder __attribute__ ((aligned (64)));

 static lyra2v2_16way_ctx_holder l2v2_16way_ctx;

 bool init_lyra2rev2_16way_ctx()
 {
-   keccak256_8way_init( &l2v2_16way_ctx.keccak );
+   keccak256_8x64_init( &l2v2_16way_ctx.keccak );
    cubehashInit( &l2v2_16way_ctx.cube, 256, 16, 32 );
-   skein256_8way_init( &l2v2_16way_ctx.skein );
-   bmw256_16way_init( &l2v2_16way_ctx.bmw );
+   skein256_8x64_init( &l2v2_16way_ctx.skein );
+   bmw256_16x32_init( &l2v2_16way_ctx.bmw );
    return true;
 }

@@ -51,8 +50,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
    lyra2v2_16way_ctx_holder ctx __attribute__ ((aligned (64)));
    memcpy( &ctx, &l2v2_16way_ctx, sizeof(l2v2_16way_ctx) );

-   blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
-   blake256_16way_close( &ctx.blake, vhash );
+   blake256_16x32_update( &ctx.blake, input + (64<<4), 16 );
+   blake256_16x32_close( &ctx.blake, vhash );

    dintrlv_16x32( hash0, hash1, hash2, hash3,
                   hash4, hash5, hash6, hash7,
@@ -62,17 +61,17 @@ void lyra2rev2_16way_hash( void *state, const void *input )
    intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
                 hash4, hash5, hash6, hash7, 256 );

-   keccak256_8way_update( &ctx.keccak, vhash, 32 );
-   keccak256_8way_close( &ctx.keccak, vhash );
+   keccak256_8x64_update( &ctx.keccak, vhash, 32 );
+   keccak256_8x64_close( &ctx.keccak, vhash );

    dintrlv_8x64( hash0, hash1, hash2, hash3,
                  hash4, hash5, hash6, hash7, vhash, 256 );
    intrlv_8x64( vhash, hash8, hash9, hash10, hash11,
                 hash12, hash13, hash14, hash15, 256 );

-   keccak256_8way_init( &ctx.keccak );
-   keccak256_8way_update( &ctx.keccak, vhash, 32 );
-   keccak256_8way_close( &ctx.keccak, vhash );
+   keccak256_8x64_init( &ctx.keccak );
+   keccak256_8x64_update( &ctx.keccak, vhash, 32 );
+   keccak256_8x64_close( &ctx.keccak, vhash );

    dintrlv_8x64( hash8, hash9, hash10, hash11,
                  hash12, hash13, hash14, hash15, vhash, 256 );
@@ -122,21 +121,20 @@ void lyra2rev2_16way_hash( void *state, const void *input )

    intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
                 hash4, hash5, hash6, hash7, 256 );
-   skein256_8way_update( &ctx.skein, vhash, 32 );
-   skein256_8way_close( &ctx.skein, vhash );
+   skein256_8x64_update( &ctx.skein, vhash, 32 );
+   skein256_8x64_close( &ctx.skein, vhash );

    dintrlv_8x64( hash0, hash1, hash2, hash3,
                  hash4, hash5, hash6, hash7, vhash, 256 );
    intrlv_8x64( vhash, hash8, hash9, hash10, hash11, hash12,
                 hash13, hash14, hash15, 256 );

-   skein256_8way_init( &ctx.skein );
-   skein256_8way_update( &ctx.skein, vhash, 32 );
-   skein256_8way_close( &ctx.skein, vhash );
+   skein256_8x64_init( &ctx.skein );
+   skein256_8x64_update( &ctx.skein, vhash, 32 );
+   skein256_8x64_close( &ctx.skein, vhash );

    dintrlv_8x64( hash8, hash9, hash10, hash11,
                  hash12, hash13, hash14, hash15, vhash, 256 );
-

    cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
    cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
@@ -160,8 +158,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
                   hash8, hash9, hash10, hash11,
                   hash12, hash13, hash14, hash15, 256 );

-   bmw256_16way_update( &ctx.bmw, vhash, 32 );
-   bmw256_16way_close( &ctx.bmw, state );
+   bmw256_16x32_update( &ctx.bmw, vhash, 32 );
+   bmw256_16x32_close( &ctx.bmw, state );
 }

 int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
@@ -186,8 +184,8 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
    mm512_bswap32_intrlv80_16x32( vdata, pdata );
    *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
                                n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
-   blake256_16way_init( &l2v2_16way_ctx.blake );
-   blake256_16way_update( &l2v2_16way_ctx.blake, vdata, 64 );
+   blake256_16x32_init( &l2v2_16way_ctx.blake );
+   blake256_16x32_update( &l2v2_16way_ctx.blake, vdata, 64 );

    do
    {
@@ -214,21 +212,21 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
 #elif defined (LYRA2REV2_8WAY)

 typedef struct {
-   blake256_8way_context blake;
-   keccak256_4way_context keccak;
+   blake256_8x32_context blake;
+   keccak256_4x64_context keccak;
    cubehashParam cube;
-   skein256_4way_context skein;
-   bmw256_8way_context bmw;
+   skein256_4x64_context skein;
+   bmw256_8x32_context bmw;
 } lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));

 static lyra2v2_8way_ctx_holder l2v2_8way_ctx;

 bool init_lyra2rev2_8way_ctx()
 {
-   keccak256_4way_init( &l2v2_8way_ctx.keccak );
+   keccak256_4x64_init( &l2v2_8way_ctx.keccak );
    cubehashInit( &l2v2_8way_ctx.cube, 256, 16, 32 );
-   skein256_4way_init( &l2v2_8way_ctx.skein );
-   bmw256_8way_init( &l2v2_8way_ctx.bmw );
+   skein256_4x64_init( &l2v2_8way_ctx.skein );
+   bmw256_8x32_init( &l2v2_8way_ctx.bmw );
    return true;
 }

@@ -246,20 +244,20 @@ void lyra2rev2_8way_hash( void *state, const void *input )
    lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
    memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );

-   blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
-   blake256_8way_close( &ctx.blake, vhash );
+   blake256_8x32_update( &ctx.blake, input + (64<<3), 16 );
+   blake256_8x32_close( &ctx.blake, vhash );

    dintrlv_8x32( hash0, hash1, hash2, hash3,
                  hash4, hash5, hash6, hash7, vhash, 256 );

    intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
-   keccak256_4way_update( &ctx.keccak, vhash, 32 );
-   keccak256_4way_close( &ctx.keccak, vhash );
+   keccak256_4x64_update( &ctx.keccak, vhash, 32 );
+   keccak256_4x64_close( &ctx.keccak, vhash );
    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
    intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
-   keccak256_4way_init( &ctx.keccak );
-   keccak256_4way_update( &ctx.keccak, vhash, 32 );
-   keccak256_4way_close( &ctx.keccak, vhash );
+   keccak256_4x64_init( &ctx.keccak );
+   keccak256_4x64_update( &ctx.keccak, vhash, 32 );
+   keccak256_4x64_close( &ctx.keccak, vhash );
    dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );

    cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
@@ -282,13 +280,13 @@ void lyra2rev2_8way_hash( void *state, const void *input )
    LYRA2REV2( l2v2_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );

    intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
-   skein256_4way_update( &ctx.skein, vhash, 32 );
-   skein256_4way_close( &ctx.skein, vhash );
+   skein256_4x64_update( &ctx.skein, vhash, 32 );
+   skein256_4x64_close( &ctx.skein, vhash );
    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
    intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
-   skein256_4way_init( &ctx.skein );
-   skein256_4way_update( &ctx.skein, vhash, 32 );
-   skein256_4way_close( &ctx.skein, vhash );
+   skein256_4x64_init( &ctx.skein );
+   skein256_4x64_update( &ctx.skein, vhash, 32 );
+   skein256_4x64_close( &ctx.skein, vhash );
    dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );

    cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
@@ -303,8 +301,8 @@ void lyra2rev2_8way_hash( void *state, const void *input )
    intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
                 hash4, hash5, hash6, hash7, 256 );

-   bmw256_8way_update( &ctx.bmw, vhash, 32 );
-   bmw256_8way_close( &ctx.bmw, state );
+   bmw256_8x32_update( &ctx.bmw, vhash, 32 );
+   bmw256_8x32_close( &ctx.bmw, state );
 }

 int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
@@ -328,8 +326,8 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,

    mm256_bswap32_intrlv80_8x32( vdata, pdata );
    *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
-   blake256_8way_init( &l2v2_8way_ctx.blake );
-   blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
+   blake256_8x32_init( &l2v2_8way_ctx.blake );
+   blake256_8x32_update( &l2v2_8way_ctx.blake, vdata, 64 );

    do
    {
@@ -356,21 +354,21 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
 #elif defined (LYRA2REV2_4WAY)

 typedef struct {
-   blake256_4way_context blake;
-   keccak256_4way_context keccak;
+   blake256_4x32_context blake;
+   keccak256_4x64_context keccak;
    cubehashParam cube;
-   skein256_4way_context skein;
-   bmw256_4way_context bmw;
+   skein256_4x64_context skein;
+   bmw256_4x32_context bmw;
 } lyra2v2_4way_ctx_holder;

 static lyra2v2_4way_ctx_holder l2v2_4way_ctx;

 bool init_lyra2rev2_4way_ctx()
 {
-   keccak256_4way_init( &l2v2_4way_ctx.keccak );
+   keccak256_4x64_init( &l2v2_4way_ctx.keccak );
    cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
-   skein256_4way_init( &l2v2_4way_ctx.skein );
-   bmw256_4way_init( &l2v2_4way_ctx.bmw );
+   skein256_4x64_init( &l2v2_4way_ctx.skein );
+   bmw256_4x32_init( &l2v2_4way_ctx.bmw );
    return true;
 }

@@ -385,13 +383,13 @@ void lyra2rev2_4way_hash( void *state, const void *input )
    lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
    memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );

-   blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
-   blake256_4way_close( &ctx.blake, vhash );
+   blake256_4x32_update( &ctx.blake, input + (64<<2), 16 );
+   blake256_4x32_close( &ctx.blake, vhash );

    rintrlv_4x32_4x64( vhash64, vhash, 256 );

-   keccak256_4way_update( &ctx.keccak, vhash64, 32 );
-   keccak256_4way_close( &ctx.keccak, vhash64 );
+   keccak256_4x64_update( &ctx.keccak, vhash64, 32 );
+   keccak256_4x64_close( &ctx.keccak, vhash64 );

    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

@@ -410,8 +408,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )

    intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );

-   skein256_4way_update( &ctx.skein, vhash64, 32 );
-   skein256_4way_close( &ctx.skein, vhash64 );
+   skein256_4x64_update( &ctx.skein, vhash64, 32 );
+   skein256_4x64_close( &ctx.skein, vhash64 );

    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

@@ -426,8 +424,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )

    intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );

-   bmw256_4way_update( &ctx.bmw, vhash, 32 );
-   bmw256_4way_close( &ctx.bmw, state );
+   bmw256_4x32_update( &ctx.bmw, vhash, 32 );
+   bmw256_4x32_close( &ctx.bmw, state );
 }

 int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
@@ -451,12 +449,12 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,

    v128_bswap32_intrlv80_4x32( vdata, pdata );

-   blake256_4way_init( &l2v2_4way_ctx.blake );
-   blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 );
+   blake256_4x32_init( &l2v2_4way_ctx.blake );
+   blake256_4x32_update( &l2v2_4way_ctx.blake, vdata, 64 );

    do
    {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );

       lyra2rev2_4way_hash( hash, vdata );

@@ -9,18 +9,18 @@
 #if defined (LYRA2REV3_16WAY)

 typedef struct {
-   blake256_16way_context blake;
+   blake256_16x32_context blake;
    cube_4way_context cube;
-   bmw256_16way_context bmw;
+   bmw256_16x32_context bmw;
 } lyra2v3_16way_ctx_holder;

 static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;

 bool init_lyra2rev3_16way_ctx()
 {
-   blake256_16way_init( &l2v3_16way_ctx.blake );
+   blake256_16x32_init( &l2v3_16way_ctx.blake );
    cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
-   bmw256_16way_init( &l2v3_16way_ctx.bmw );
+   bmw256_16x32_init( &l2v3_16way_ctx.bmw );
    return true;
 }

@@ -46,8 +46,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
    lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
    memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );

-   blake256_16way_update( &ctx.blake, input + (64*16), 16 );
-   blake256_16way_close( &ctx.blake, vhash );
+   blake256_16x32_update( &ctx.blake, input + (64*16), 16 );
+   blake256_16x32_close( &ctx.blake, vhash );

    dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
         hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -120,8 +120,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
         hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
         hash15, 256 );

-   bmw256_16way_update( &ctx.bmw, vhash, 32 );
-   bmw256_16way_close( &ctx.bmw, state );
+   bmw256_16x32_update( &ctx.bmw, vhash, 32 );
+   bmw256_16x32_close( &ctx.bmw, state );
 }


@@ -145,8 +145,8 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,

    mm512_bswap32_intrlv80_16x32( vdata, pdata );

-   blake256_16way_init( &l2v3_16way_ctx.blake );
-   blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
+   blake256_16x32_init( &l2v3_16way_ctx.blake );
+   blake256_16x32_update( &l2v3_16way_ctx.blake, vdata, 64 );

    do
    {
@@ -178,18 +178,18 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
 #elif defined (LYRA2REV3_8WAY)

 typedef struct {
-   blake256_8way_context blake;
+   blake256_8x32_context blake;
    cubehashParam cube;
-   bmw256_8way_context bmw;
+   bmw256_8x32_context bmw;
 } lyra2v3_8way_ctx_holder;

 static __thread lyra2v3_8way_ctx_holder l2v3_8way_ctx;

 bool init_lyra2rev3_8way_ctx()
 {
-   blake256_8way_init( &l2v3_8way_ctx.blake );
+   blake256_8x32_init( &l2v3_8way_ctx.blake );
    cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 );
-   bmw256_8way_init( &l2v3_8way_ctx.bmw );
+   bmw256_8x32_init( &l2v3_8way_ctx.bmw );
    return true;
 }

@@ -207,8 +207,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
    lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
    memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );

-   blake256_8way_update( &ctx.blake, input + (64*8), 16 );
-   blake256_8way_close( &ctx.blake, vhash );
+   blake256_8x32_update( &ctx.blake, input + (64*8), 16 );
+   blake256_8x32_close( &ctx.blake, vhash );

    dintrlv_8x32( hash0, hash1, hash2, hash3,
                  hash4, hash5, hash6, hash7, vhash, 256 );
@@ -243,8 +243,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
    intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
                 hash4, hash5, hash6, hash7, 256 );

-   bmw256_8way_update( &ctx.bmw, vhash, 32 );
-   bmw256_8way_close( &ctx.bmw, state );
+   bmw256_8x32_update( &ctx.bmw, vhash, 32 );
+   bmw256_8x32_close( &ctx.bmw, state );

 }

@@ -269,8 +269,8 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,

    mm256_bswap32_intrlv80_8x32( vdata, pdata );
    *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
-   blake256_8way_init( &l2v3_8way_ctx.blake );
-   blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );
+   blake256_8x32_init( &l2v3_8way_ctx.blake );
+   blake256_8x32_update( &l2v3_8way_ctx.blake, vdata, 64 );

    do
    {
@@ -300,19 +300,18 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
 #if defined (LYRA2REV3_4WAY)

 typedef struct {
-   blake256_4way_context blake;
+   blake256_4x32_context blake;
    cubehashParam cube;
-   bmw256_4way_context bmw;
+   bmw256_4x32_context bmw;
 } lyra2v3_4way_ctx_holder;

-//static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
 static __thread lyra2v3_4way_ctx_holder l2v3_4way_ctx;

 bool init_lyra2rev3_4way_ctx()
 {
-   blake256_4way_init( &l2v3_4way_ctx.blake );
+   blake256_4x32_init( &l2v3_4way_ctx.blake );
    cubehashInit( &l2v3_4way_ctx.cube, 256, 16, 32 );
-   bmw256_4way_init( &l2v3_4way_ctx.bmw );
+   bmw256_4x32_init( &l2v3_4way_ctx.bmw );
    return true;
 }

@@ -326,8 +325,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
    lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64)));
    memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );

-   blake256_4way_update( &ctx.blake, input + (64*4), 16 );
-   blake256_4way_close( &ctx.blake, vhash );
+   blake256_4x32_update( &ctx.blake, input + (64*4), 16 );
+   blake256_4x32_close( &ctx.blake, vhash );
    dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

    LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -349,8 +348,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
    LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );

    intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
-   bmw256_4way_update( &ctx.bmw, vhash, 32 );
-   bmw256_4way_close( &ctx.bmw, state );
+   bmw256_4x32_update( &ctx.bmw, vhash, 32 );
+   bmw256_4x32_close( &ctx.bmw, state );
 }

 int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
@@ -374,8 +373,8 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
    v128_bswap32_intrlv80_4x32( vdata, pdata );
    *noncev = _mm_set_epi32( n+3, n+2, n+1, n );

-   blake256_4way_init( &l2v3_4way_ctx.blake );
-   blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 );
+   blake256_4x32_init( &l2v3_4way_ctx.blake );
+   blake256_4x32_update( &l2v3_4way_ctx.blake, vdata, 64 );

    do
    {
@@ -3,7 +3,7 @@
 #include "lyra2.h"
 #include "algo/blake/blake256-hash.h"

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   #define LYRA2Z_16WAY 1
 #elif defined(__AVX2__)
   #define LYRA2Z_8WAY 1
@@ -45,7 +45,7 @@ static void lyra2z_16way_hash( void *state, const void *midstate_vars,
    uint32_t hash14[8] __attribute__ ((aligned (32)));
    uint32_t hash15[8] __attribute__ ((aligned (32)));

-   blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
+   blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

    dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
         hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -139,7 +139,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
                          n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );

    // Partially prehash second block without touching nonces in block_buf[3].
-   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

    do {
       lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -180,7 +180,7 @@ static void lyra2z_8way_hash( void *state, const void *midstate_vars,
    uint32_t hash7[8] __attribute__ ((aligned (32)));
    uint32_t vhash[8*8] __attribute__ ((aligned (64)));

-   blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
+   blake256_8x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

    dintrlv_8x32( hash0, hash1, hash2, hash3,
                  hash4, hash5, hash6, hash7, vhash, 256 );
@@ -246,7 +246,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
                _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );

    // Partially prehash second block without touching nonces
-   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

    do {
       lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -279,12 +279,12 @@ bool lyra2z_4way_thread_init()
    return ( lyra2z_4way_matrix = mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
 }

-static __thread blake256_4way_context l2z_4way_blake_mid;
+static __thread blake256_4x32_context l2z_4way_blake_mid;

 void lyra2z_4way_midstate( const void* input )
 {
-   blake256_4way_init( &l2z_4way_blake_mid );
-   blake256_4way_update( &l2z_4way_blake_mid, input, 64 );
+   blake256_4x32_init( &l2z_4way_blake_mid );
+   blake256_4x32_update( &l2z_4way_blake_mid, input, 64 );
 }

 void lyra2z_4way_hash( void *hash, const void *midstate_vars,
@@ -295,15 +295,8 @@ void lyra2z_4way_hash( void *hash, const void *midstate_vars,
    uint32_t hash2[8] __attribute__ ((aligned (64)));
    uint32_t hash3[8] __attribute__ ((aligned (64)));
    uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-// blake256_4way_context ctx_blake __attribute__ ((aligned (64)));

-   blake256_4way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
-
-/*
-   memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
-   blake256_4way_update( &ctx_blake, input + (64*4), 16 );
-   blake256_4way_close( &ctx_blake, vhash );
-*/
+   blake256_4x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

    dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

@@ -357,7 +350,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
    block_buf[15] = v128_32( 640 );

    // Partially prehash second block without touching nonces
-   blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

    do {
       lyra2z_4way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -454,11 +447,9 @@ bool register_lyra2z_algo( algo_gate_t* gate )
 #if defined(LYRA2Z_16WAY)
   gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
   gate->scanhash = (void*)&scanhash_lyra2z_16way;
-//  gate->hash = (void*)&lyra2z_16way_hash;
 #elif defined(LYRA2Z_8WAY)
   gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
   gate->scanhash = (void*)&scanhash_lyra2z_8way;
-//  gate->hash = (void*)&lyra2z_8way_hash;
 #elif defined(LYRA2Z_4WAY)
   gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
   gate->scanhash = (void*)&scanhash_lyra2z_4way;
@@ -4,7 +4,7 @@
 #include "algo/gost/sph_gost.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "lyra2.h"
-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(__VAES__) && defined(SIMD512)
   #include "algo/echo/echo-hash-4way.h"
 #elif defined(__AES__)
   #include "algo/echo/aes_ni/hash_api.h"

@@ -27,7 +27,7 @@
 #include "lyra2.h"
 #include "simd-utils.h"

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
 {
@@ -43,9 +43,9 @@ static const uint64_t blake2b_IV[8] =
    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
 };

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

-#define G2W_4X64(a,b,c,d) \
+#define G2W(a,b,c,d) \
    a = _mm512_add_epi64( a, b ); \
    d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 32 ); \
    c = _mm512_add_epi64( c, d ); \
@@ -56,27 +56,15 @@ static const uint64_t blake2b_IV[8] =
    b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 63 );

 #define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
-   G2W_4X64( s0, s1, s2, s3 ); \
+   G2W( s0, s1, s2, s3 ); \
    s0 = mm512_shufll256_64( s0 ); \
-   s3 = mm512_swap256_128( s3); \
+   s3 = mm512_swap256_128( s3 ); \
    s2 = mm512_shuflr256_64( s2 ); \
-   G2W_4X64( s0, s1, s2, s3 ); \
+   G2W( s0, s1, s2, s3 ); \
    s0 = mm512_shuflr256_64( s0 ); \
    s3 = mm512_swap256_128( s3 ); \
    s2 = mm512_shufll256_64( s2 );

-/*
-#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
-   G2W_4X64( s0, s1, s2, s3 ); \
-   s3 = mm512_shufll256_64( s3 ); \
-   s1 = mm512_shuflr256_64( s1 ); \
-   s2 = mm512_swap256_128( s2 ); \
-   G2W_4X64( s0, s1, s2, s3 ); \
-   s3 = mm512_shuflr256_64( s3 ); \
-   s1 = mm512_shufll256_64( s1 ); \
-   s2 = mm512_swap256_128( s2 );
-*/

 #define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
    LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
    LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -95,7 +83,7 @@ static const uint64_t blake2b_IV[8] =

 #if defined(__AVX2__)

-#define G_4X64(a,b,c,d) \
+#define G_AVX2(a,b,c,d) \
    a = _mm256_add_epi64( a, b ); \
    d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
    c = _mm256_add_epi64( c, d ); \
@@ -107,27 +95,15 @@ static const uint64_t blake2b_IV[8] =

 // Pivot about s1 instead of s0 reduces latency.
 #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
-   G_4X64( s0, s1, s2, s3 ); \
+   G_AVX2( s0, s1, s2, s3 ); \
    s0 = mm256_shufll_64( s0 ); \
-   s3 = mm256_swap_128( s3); \
+   s3 = mm256_swap_128( s3 ); \
    s2 = mm256_shuflr_64( s2 ); \
-   G_4X64( s0, s1, s2, s3 ); \
+   G_AVX2( s0, s1, s2, s3 ); \
    s0 = mm256_shuflr_64( s0 ); \
    s3 = mm256_swap_128( s3 ); \
    s2 = mm256_shufll_64( s2 );

-/*
-#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
-   G_4X64( s0, s1, s2, s3 ); \
-   s3 = mm256_shufll_64( s3 ); \
-   s1 = mm256_shuflr_64( s1 ); \
-   s2 = mm256_swap_128( s2 ); \
-   G_4X64( s0, s1, s2, s3 ); \
-   s3 = mm256_shuflr_64( s3 ); \
-   s1 = mm256_shufll_64( s1 ); \
-   s2 = mm256_swap_128( s2 );
-*/

 #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
    LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
    LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -148,29 +124,29 @@ static const uint64_t blake2b_IV[8] =

 // process 2 columns in parallel
 // returns void, all args updated
-#define G_2X64(a,b,c,d) \
+#define G_128(a,b,c,d) \
    a = v128_add64( a, b ); \
-   d = v128_ror64( v128_xor( d, a), 32 ); \
+   d = v128_ror64xor( d, a, 32 ); \
    c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 24 ); \
+   b = v128_ror64xor( b, c, 24 ); \
    a = v128_add64( a, b ); \
-   d = v128_ror64( v128_xor( d, a ), 16 ); \
+   d = v128_ror64xor( d, a, 16 ); \
    c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 63 );
+   b = v128_ror64xor( b, c, 63 );

 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
 { \
    v128u64_t t; \
-   G_2X64( s0, s2, s4, s6 ); \
-   G_2X64( s1, s3, s5, s7 ); \
+   G_128( s0, s2, s4, s6 ); \
+   G_128( s1, s3, s5, s7 ); \
    t = v128_alignr64( s7, s6, 1 ); \
    s6 = v128_alignr64( s6, s7, 1 ); \
    s7 = t; \
    t = v128_alignr64( s2, s3, 1 ); \
    s2 = v128_alignr64( s3, s2, 1 ); \
    s3 = t; \
-   G_2X64( s0, s2, s5, s6 ); \
-   G_2X64( s1, s3, s4, s7 ); \
+   G_128( s0, s2, s5, s6 ); \
+   G_128( s1, s3, s4, s7 ); \
    t = v128_alignr64( s6, s7, 1 ); \
    s6 = v128_alignr64( s7, s6, 1 ); \
    s7 = t; \
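The G_128 rewrite above folds every rotate-right-of-xor pair into one v128_ror64xor helper. Its definition is not part of this compare; a plausible fallback shape is the two-op form it replaces (an assumption, shown only to document the contract):

// Assumed contract of the helper (defined in simd-utils, not shown here):
// rotate the xor of two vectors right by c bits. Giving the pattern its
// own name lets targets with a fused instruction (e.g. the ARMv8.2-SHA3
// XAR) emit one op instead of the generic xor-then-rotate sequence.
#define v128_ror64xor( v, w, c )   v128_ror64( v128_xor( v, w ), c )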
@@ -195,10 +171,6 @@ static const uint64_t blake2b_IV[8] =

 #endif // AVX2 else SSE2

-static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
-   return ( w >> c ) | ( w << ( 64 - c ) );
-}

 #define G( r, i, a, b, c, d ) \
 { \
    a = a + b; \
@@ -222,7 +194,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    G( r, 7, v[ 3], v[ 4], v[ 9], v[14] );


-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)

 union _ovly_512
 {
@@ -19,7 +19,7 @@
 #define EPS1 DBL_EPSILON
 #define EPS2 3.0e-11

-inline double exp_n( double xt )
+static inline double exp_n( double xt )
 {
    if ( xt < -700.0 )
        return 0;
@@ -31,7 +31,8 @@ inline double exp_n( double xt )
    return exp( xt );
 }

-inline double exp_n2( double x1, double x2 )
+/*
+static inline double exp_n2( double x1, double x2 )
 {
    double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
           p5 = 37., p6 = 700.;
@@ -51,6 +52,7 @@ inline double exp_n2( double x1, double x2 )
    else if ( xt > p6 - 1.e-200 )
       return 0.;
 }
+*/

 double swit2_( double wvnmb )
 {
@@ -298,7 +300,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,

 bool register_m7m_algo( algo_gate_t *gate )
 {
-    gate->optimizations = SHA_OPT;
+    gate->optimizations = SHA256_OPT;
     init_m7m_ctx();
     gate->scanhash = (void*)&scanhash_m7m_hash;
     gate->build_stratum_request = (void*)&std_be_build_stratum_request;
@@ -309,4 +311,3 @@ bool register_m7m_algo( algo_gate_t *gate )
     return true;
 }
-

@@ -1,75 +0,0 @@
-// Copyright (c) 2014 The Magi developers
-// Distributed under the MIT/X11 software license, see the accompanying
-// file COPYING or http://www.opensource.org/licenses/mit-license.php.
-
-#include <iostream>
-#include <cfloat>
-#include <limits>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-
-#include "magimath.h"
-
-#define EPS1 (std::numeric_limits<double>::epsilon())
-#define EPS2 3.0e-11
-
-static void gauleg(double x1, double x2, double x[], double w[], const int n)
-{
-    int m,j,i;
-    double z1, z, xm, xl, pp, p3, p2, p1;
-    m=(n+1)/2;
-    xm=0.5*(x2+x1);
-    xl=0.5*(x2-x1);
-    for (i=1;i<=m;i++) {
-        z=cos(3.141592654*(i-0.25)/(n+0.5));
-        do {
-            p1=1.0;
-            p2=0.0;
-            for (j=1;j<=n;j++) {
-                p3=p2;
-                p2=p1;
-                p1=((2.0*j-1.0)*z*p2-(j-1.0)*p3)/j;
-            }
-            pp=n*(z*p1-p2)/(z*z-1.0);
-            z1=z;
-            z=z1-p1/pp;
-        } while (fabs(z-z1) > EPS2);
-        x[i]=xm-xl*z;
-        x[n+1-i]=xm+xl*z;
-        w[i]=2.0*xl/((1.0-z*z)*pp*pp);
-        w[n+1-i]=w[i];
-    }
-}
-
-static double GaussianQuad_N(double func(const double), const double a2, const double b2, const int NptGQ)
-{
-    double s=0.0;
-#ifdef _MSC_VER
-#define SW_DIVS 23
-    double x[SW_DIVS+1], w[SW_DIVS+1];
-#else
-    double x[NptGQ+1], w[NptGQ+1];
-#endif
-
-    gauleg(a2, b2, x, w, NptGQ);
-
-    for (int j=1; j<=NptGQ; j++) {
-        s += w[j]*func(x[j]);
-    }
-
-    return s;
-}
-
-static double swit_(double wvnmb)
-{
-    return pow( (5.55243*(exp_n(-0.3*wvnmb/15.762) - exp_n(-0.6*wvnmb/15.762)))*wvnmb, 0.5)
-    / 1034.66 * pow(sin(wvnmb/65.), 2.);
-}
-
-uint32_t sw_(int nnounce, int divs)
-{
-    double wmax = ((sqrt((double)(nnounce))*(1.+EPS1))/450+100);
-    return ((uint32_t)(GaussianQuad_N(swit_, 0., wmax, divs)*(1.+EPS1)*1.e6));
-}
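For context on what this deleted file implemented: GaussianQuad_N was an n-point Gauss-Legendre quadrature,

$$\int_a^b f(x)\,dx \;\approx\; \sum_{j=1}^{n} w_j\, f(x_j), \qquad w_j = \frac{2\,x_l}{(1 - z_j^2)\,P_n'(z_j)^2},$$

where the nodes x_j = x_m - x_l z_j map the roots z_j of the Legendre polynomial P_n onto [a,b] (with x_m = (a+b)/2, x_l = (b-a)/2). Those roots are exactly what gauleg() found by Newton iteration on the three-term recurrence for P_n, matching the w[i]=2.0*xl/((1.0-z*z)*pp*pp) line above.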
@@ -1,54 +0,0 @@
-// Copyright (c) 2014 The Magi developers
-// Distributed under the MIT/X11 software license, see the accompanying
-// file COPYING or http://www.opensource.org/licenses/mit-license.php.
-#ifndef MAGI_MATH_H
-#define MAGI_MATH_H
-
-#include <math.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-uint32_t sw_(int nnounce, int divs);
-
-#ifdef __cplusplus
-}
-#endif
-
-
-inline double exp_n(double xt)
-{
-    double p1 = -700.0, p3 = -0.8e-8, p4 = 0.8e-8, p6 = 700.0;
-    if(xt < p1)
-        return 0;
-    else if(xt > p6)
-        return 1e200;
-    else if(xt > p3 && xt < p4)
-        return (1.0 + xt);
-    else
-        return exp(xt);
-}
-
-// 1 / (1 + exp(x1-x2))
-inline double exp_n2(double x1, double x2)
-{
-    double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.;
-    double xt = x1 - x2;
-    if (xt < p1+1.e-200)
-        return 1.;
-    else if (xt > p1 && xt < p2 + 1.e-200)
-        return ( 1. - exp(xt) );
-    else if (xt > p2 && xt < p3 + 1.e-200)
-        return ( 1. / (1. + exp(xt)) );
-    else if (xt > p3 && xt < p4)
-        return ( 1. / (2. + xt) );
-    else if (xt > p4 - 1.e-200 && xt < p5)
-        return ( exp(-xt) / (1. + exp(-xt)) );
-    else if (xt > p5 - 1.e-200 && xt < p6)
-        return ( exp(-xt) );
-    else //if (xt > p6 - 1.e-200)
-        return 0.;
-}
-
-#endif
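The deleted exp_n2() was a piecewise evaluation of the logistic value 1/(1+exp(x1-x2)): each branch picks the algebraic form that stays accurate in its range of xt = x1-x2 (for example 1 - exp(xt) near the underflow edge, or exp(-xt)/(1+exp(-xt)) for large positive xt) instead of relying on exp() saturating. A direct reference version, for comparison only (hypothetical name, not from the tree):

#include <math.h>
// Numerically naive form: equal to exp_n2() in exact arithmetic, but for
// large |x1-x2| it leans on IEEE overflow/underflow behavior where the
// deleted code switched to the stable expansions noted above.
static double exp_n2_ref( double x1, double x2 )
{
   return 1.0 / ( 1.0 + exp( x1 - x2 ) );
}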
Some files were not shown because too many files have changed in this diff.