Compare commits

...

18 Commits

Author SHA1 Message Date
Jay D Dee
12480a3ea5 v25.6 2025-07-20 19:43:10 -04:00
Jay D Dee
aa47e880d5 v25.5 2025-07-09 01:32:38 -04:00
Jay D Dee
66191db93c v25.4 2025-06-20 20:31:41 -04:00
Jay D Dee
dd99580a4c v25.3 2025-01-16 12:31:53 -05:00
Jay D Dee
1ed18bf22e v25.2 2025-01-12 18:58:21 -05:00
Jay D Dee
1d9341ee92 v25.1 2024-12-30 21:33:04 -05:00
Jay D Dee
a45a333b40 v24.8 2024-12-25 23:12:29 -05:00
Jay D Dee
2b1037a7c7 v24.7 2024-12-16 19:17:19 -05:00
Jay D Dee
06624a0ff2 v24.6 2024-12-08 11:14:08 -05:00
Jay D Dee
8e91bfbe19 v24.5 2024-09-13 14:14:57 -04:00
Jay D Dee
47e24b50e8 v24.4 2024-07-01 00:33:19 -04:00
Jay D Dee
c47c4a8885 v24.3 2024-05-28 18:20:19 -04:00
Jay D Dee
042d13d1e1 v24.2 2024-05-20 23:08:50 -04:00
Jay D Dee
4f930574cc v24.1 2024-04-16 21:31:35 -04:00
Jay D Dee
9d3a46c355 v23.15 2023-11-30 14:36:47 -05:00
Jay D Dee
4e3f1b926f v23.14 2023-11-28 00:58:43 -05:00
Jay D Dee
045b42babf v23.13 2023-11-21 14:18:15 -05:00
Jay D Dee
fc696dbbe5 v23.12 2023-11-20 11:51:57 -05:00
253 changed files with 10654 additions and 12912 deletions

View File

@@ -1,27 +1,49 @@
if WANT_JANSSON
JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson
if HAVE_APPLE
# MacOS uses Homebrew to install needed packages but they aren't linked for
# the jansson test in configure. Ignore the failed test & link them now,
# different path for different CPU arch.
if ARCH_ARM64
EXTRA_INCLUDES = -I/opt/homebrew/include
EXTRA_LIBS = -L/opt/homebrew/lib
else
JANSSON_INCLUDES=
EXTRA_INCLUDES = -I/usr/local/include
EXTRA_LIBS = -L/usr/local/lib
endif
EXTRA_DIST = example-cfg.json nomacro.pl
else
SUBDIRS = compat
if WANT_JANSSON
# Can't find jansson libraries, compile the included source code.
EXTRA_INCLUDES = -I$(top_srcdir)/compat/jansson
EXTRA_LIBS = -L$(top_srcdir)/compat/jansson
else
EXTRA_INCLUDES =
EXTRA_LIBS =
endif
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) -I.
endif
bin_PROGRAMS = cpuminer
EXTRA_DIST = example-cfg.json nomacro.pl
dist_man_MANS = cpuminer.1
SUBDIRS = compat
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(EXTRA_INCLUDES) -I.
bin_PROGRAMS = cpuminer
dist_man_MANS = cpuminer.1
cpuminer_SOURCES = \
dummy.cpp \
cpu-miner.c \
util.c \
api.c \
sysinfos.c \
algo-gate-api.c\
malloc-huge.c \
simd-utils/simd-constants.c \
algo/argon2d/argon2d-gate.c \
algo/argon2d/blake2/blake2b.c \
algo/argon2d/argon2d/argon2.c \
@@ -113,7 +135,6 @@ cpuminer_SOURCES = \
algo/lyra2/phi2-4way.c \
algo/lyra2/phi2.c \
algo/m7m/m7m.c \
algo/m7m/magimath.cpp \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
@@ -166,9 +187,6 @@ cpuminer_SOURCES = \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite-hash-2way.c \
algo/shavite/shavite-hash-4way.c \
algo/shavite/shavite.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/sph_simd.c \
algo/simd/simd-hash-2way.c \
algo/skein/sph_skein.c \
@@ -250,6 +268,7 @@ cpuminer_SOURCES = \
algo/x16/x16rt.c \
algo/x16/x16rt-4way.c \
algo/x16/hex.c \
algo/x16/x20r.c \
algo/x16/x21s-4way.c \
algo/x16/x21s.c \
algo/x16/minotaur.c \
@@ -274,29 +293,29 @@ cpuminer_SOURCES = \
algo/yespower/yespower-opt.c \
algo/yespower/yespower-ref.c \
algo/yespower/yespower-blake2b-ref.c
disable_flags =
if USE_ASM
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags += -DNOASM
endif
if HAVE_WINDOWS
cpuminer_SOURCES += compat/winansi.c
endif
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
if USE_ASM
disable_flags =
cpuminer_SOURCES += asm/neoscrypt_asm.S
else
disable_flags = -DNOASM
endif
if HAVE_WINDOWS
cpuminer_CFLAGS += -Wl,--stack,10485760
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = $(EXTRA_LIBS) @LIBCURL@ -ljansson @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
if ARCH_ARM64
cpuminer_CFLAGS += -flax-vector-conversions
endif
if HAVE_WINDOWS
# use to profile an object
# gprof_cflags = -pg -g3
# cpuminer_LDFLAGS += -pg
@@ -310,5 +329,4 @@ cpuminer-neoscrypt.o: neoscrypt.c
@echo "CUSTOM ${@}: ${filter %.o,${^}} ${filter %.c,${^}}"
$(CC) $(common_ccflags) -g -O3 $(gprof_cflags) -MT $@ -MD -MP -c -o $@ $<
endif

View File

@@ -36,44 +36,28 @@ for compile instructions.
Requirements
------------
1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. Further optimizations are available
on some algoritms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.
32 bit CPUs are not supported.
Other CPU architectures such as ARM, Raspberry Pi, RISC-V, Xeon Phi, etc,
are not supported.
1. A 64 bit CPU supporting x86_64 (Intel or AMD) or aarch64 (ARM).
x86_64 requires SSE2, aarch64 requires armv8 & NEON.
Mobile CPUs like laptop computers are not recommended because they aren't
designed for extreme heat of operating at full load for extended periods of
time.
Older CPUs and ARM architecture may be supported by cpuminer-multi by TPruvot.
2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
including Mint and Centos, are known to work and have all dependencies
in their repositories. Others may work but may require more effort. Older
versions such as Centos 6 don't work due to missing features.
Windows 7 or newer is supported with mingw_w64 and msys or using the pre-built
binaries. WindowsXP 64 bit is YMMV.
FreeBSD is not actively tested but should work, YMMV.
MacOS, OSx and Android are not supported.
2. 64 bit operating system including Linux, Windows, MacOS, or BSD.
Android, IOS and alt OSs like Haiku & ReactOS are not supported.
3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
RPC getwork using http:// or https://.
GBT is YMMV.
RPC getblocktemplate using http:// or https://.
Supported Algorithms
--------------------
allium Garlicoin
anime Animecoin
argon2 Argon2 coin (AR2)
argon2d250 argon2d-crds, Credits (CRDS)
argon2d500 argon2d-dyn, Dynamic (DYN)
argon2d4096 argon2d-uis, Unitus, (UIS)
argon2d250
argon2d500
argon2d1000
argon2d4096
blake Blake-256
blake2b Blake2-512
blake2s Blake2-256
@@ -87,7 +71,6 @@ Supported Algorithms
groestl Groestl coin
hex x16r-hex
hmq1725
hodl Hodlcoin
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
@@ -115,9 +98,11 @@ Supported Algorithms
scrypt:N scrypt(N, 1, 1)
scryptn2 scrypt(1048576, 1, 1)
sha256d Double SHA-256
sha256dt
sha256q Quad SHA-256
sha256t Triple SHA-256
sha3d Double keccak256 (BSHA3)
sha512256d
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
skunk Signatum (SIGT)
@@ -145,6 +130,7 @@ Supported Algorithms
x16rt-veil veil
x16s
x17
x20r
x21s
x22i
x25x

View File

@@ -27,17 +27,19 @@ See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
Requirements
------------
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
- A x86_64 architecture CPU with a minimum of SSE2 support. This includes Intel Core2 and newer and AMD equivalents.
- Arm CPU supporting AArch64 and NEON.
64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
are not supported. FreeBSD YMMV.
32 bit CPUs are not supported.
ARM requirements (Beta):
Older CPUs are supported by open source cpuminer-multi by TPruvot but at reduced performance.
CPU: Armv8 and NEON, SHA2 & AES are optional
OS: Linux distribution built for AArch64.
Packages: source code only.
Mining on mobile devices that meet the requirements is not recommended due to the risk of
overheating and damaging the battery. Mining has unlimited demand, it will push any device
to or beyond its limits. There is also a fire risk with overheated lithium batteries.
Beware of apps claiming "mobile only mining". There is no such thing, they aren't miners.
If a mobile CPU can mine it any CPU can.
See wiki for details.
@@ -73,12 +75,136 @@ If not what makes it happen or not happen?
Change Log
----------
v25.6
Added argon2d1000, argon2d16000 algos.
Target specific AES optimizations improve shavite for ARM64 & x86_64.
v25.5
x86_64: Fixed an insidious bug in sha256 early rejection optimization for AVX2 & AVX512.
x86_64: Faster sha256d, sha256dt for AVX2 & AVX512.
Other small bug fixes.
v25.4
x86_64: improved handling of vector constants used for byte permutations.
x86_64: removed hooks for cancelled AVX10-256.
Minor bug fixes & improvements.
More code cleanup.
v25.3
#442, #443: Fixed a regression in Makefile.am.
Removed algo features log display.
Some code cleanup.
v25.2
ARM: Fixed regression from v25.1 that could cause build fail.
BSD: FreeBSD is now supported. Other BSDs may also work.
MacOS: build with installed jansson library instead of compiling the included source code.
Windows: remove "_WIN32_WINNT=0x0601" which was a downgrade on Win11.
Changed build.sh shell from bash to sh.
v25.1
MacOS ARM64: m7m algo is now working.
MacOS ARM64: can now be compiled with GCC.
MacOS x86_64: is now working compiled with GCC.
Fixed some minor bugs & removed some obsolete code.
v24.8
ARM: Apple MacOS on M series CPU is now supported compiled from source
code, see Wiki for details.
ARM: Fix incorrect compiler version display when using clang.
build.sh can now be used to compile all targets, arm_build.sh & build_msys2.sh
have been removed.
Windows: MSys2 build now enables CPU groups by default, prebuilt binaries
continue to be compiled with CPU groups disabled.
v24.7
ARM: compile works for Windows using MSys2 & MingW, see wiki for details.
v24.6
ARM: Fixed scryptn2, x16*, broken in v24.2.
ARM: Small improvement to interleaving.
Eliminated some potential compile errors in code that was dependent on
compiler optimisations.
x86_64: improved support for AVX10 compilation, needs GCC-14 or higher.
v24.5
Fix MinGW compile error after MSys2 upgrade to GCC-14.2.
#427: GBT: Improved handling of new work.
Removed shavite3 algo.
v24.4
x86_64: fixed a bug in ornot macro for AVX2 which broke some algos in v24.2.
x86_64: fixed a bug in alignr macros for SSE2.
ARM: CPU feature reporting enhancements.
Some code cleanup.
v24.3
ARM: CPU feature detection and reporting is now working.
ARM: Verthash is now working.
ARM: Small speedup for yescrypt, yespower & argon2d.
Code cleanup.
v24.2
x86_64: Fixed blakes2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4.
x86_64: Initial support for CPUs with AVX10, needs GCC-14.
ARM NEON: Various code optimisations.
v24.1
#414: fix bug in merkle error handling.
#416: change $nproc to $(nproc) in build scripts.
#420: change some inline function definitions to static inline.
#413: Fix formatting error for share result log when using no-color.
Faster 2 way interleaving.
Cleanup sha256 architecture targetting.
v23.15
Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
ARM: Fugue AES optimizations enabled.
ARM: quark, qubit, x11gost algos optimized with NEON & AES.
v23.14
ARM: Groestl AES optimizations enabled.
All: Small optimization to Shabal 4way.
x86_64: Extend Shabal 4way support to SSE2 from SSE4.1.
All: deleted some unused files.
v23.13
Added x20r algo.
Eliminated redundant hash order calculations for x16r family.
v23.12
Several bugs fixes and speed improvements for x16r family for all CPU architectures.
v23.11
This is a release candidate for full AArch64 support, marking the end of the Beta phase.
Fixed hmq1725 & x25x algos, SSE2 & NEON, broken in v3.23.4.
Most CPU-mineable SHA3 algos (X*) upgraded to 2-way SSE2 & NEON.
v23.10
x86_64: Fixed scrypt, scryptn2 algos SSE2.
Fixed sha512d256d algo AVX2, SSE2, NEON.
Fixed sha512256d algo AVX2, SSE2, NEON.
Fixed a bug in Skein N-way that reduced performance.
ARM: Skein algo optimized for NEON & SHA2.
ARM: Skein optimized for NEON, SHA2 & SSE2.
Skein2 algo 2-way optimized for NEON & SSE2.
v23.9

View File

@@ -184,7 +184,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
//int scanhash_8way_64_64( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr )
@@ -263,8 +263,8 @@ static void init_algo_gate( algo_gate_t* gate )
gate->build_block_header = (void*)&std_build_block_header;
gate->build_extraheader = (void*)&std_build_extraheader;
gate->set_work_data_endian = (void*)&do_nothing;
gate->resync_threads = (void*)&do_nothing;
gate->do_this_thread = (void*)&return_true;
// gate->resync_threads = (void*)&do_nothing;
// gate->do_this_thread = (void*)&return_true;
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
gate->get_work_data_size = (void*)&std_get_work_data_size;
gate->optimizations = EMPTY_SET;
@@ -295,8 +295,10 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
{
case ALGO_ALLIUM: rc = register_allium_algo ( gate ); break;
case ALGO_ANIME: rc = register_anime_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d_crds_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d_dyn_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d250_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d500_algo ( gate ); break;
case ALGO_ARGON2D1000: rc = register_argon2d1000_algo ( gate ); break;
case ALGO_ARGON2D16000: rc = register_argon2d16000_algo ( gate ); break;
case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break;
case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break;
case ALGO_BLAKE: rc = register_blake_algo ( gate ); break;
@@ -340,7 +342,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
case ALGO_SHA512256D: rc = register_sha512256d_algo ( gate ); break;
case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
case ALGO_SKUNK: rc = register_skunk_algo ( gate ); break;
@@ -368,6 +369,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X16RT_VEIL: rc = register_x16rt_veil_algo ( gate ); break;
case ALGO_X16S: rc = register_x16s_algo ( gate ); break;
case ALGO_X17: rc = register_x17_algo ( gate ); break;
case ALGO_X20R: rc = register_x20r_algo ( gate ); break;
case ALGO_X21S: rc = register_x21s_algo ( gate ); break;
case ALGO_X22I: rc = register_x22i_algo ( gate ); break;
case ALGO_X25X: rc = register_x25x_algo ( gate ); break;
@@ -416,8 +418,6 @@ void exec_hash_function( int algo, void *output, const void *pdata )
const char* const algo_alias_map[][2] =
{
// alias proper
{ "argon2d-dyn", "argon2d500" },
{ "argon2d-uis", "argon2d4096" },
{ "bcd", "x13bcd" },
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },

View File

@@ -98,25 +98,27 @@ typedef uint32_t set_t;
#define AVX512_OPT 1 << 6 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
#define VAES_OPT 1 << 8 // Icelake, Zen3
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // AArch64
#define SHA256_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
#define NEON_OPT 1 << 11 // AArch64
#define AVX10_256 1 << 12
#define AVX10_512 1 << 13
// AVX10 does not have explicit algo features:
// AVX10_512 is compatible with AVX512 + VAES
// AVX10_256 is compatible with AVX2 + VAES
// return set containing all elements from sets a & b
inline set_t set_union ( set_t a, set_t b ) { return a | b; }
static inline set_t set_union ( set_t a, set_t b ) { return a | b; }
// return set contained common elements from sets a & b
inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
static inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
// all elements in set a are included in set b
inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
static inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
// no elements in set a are included in set b
inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
static inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
typedef struct
{
@@ -163,15 +165,18 @@ char* ( *malloc_txs_request ) ( struct work* );
void ( *set_work_data_endian ) ( struct work* );
// Diverge mining threads
bool ( *do_this_thread ) ( int );
//bool ( *do_this_thread ) ( int );
// After do_this_thread
void ( *resync_threads ) ( int, struct work* );
//void ( *resync_threads ) ( int, struct work* );
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
// Deprecated
set_t optimizations;
int ( *get_work_data_size ) ();
int ntime_index;
int nbits_index;
int nonce_index; // use with caution, see warning below
@@ -246,7 +251,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
//int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr );
@@ -272,8 +277,6 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
// OpenSSL sha256 deprecated
//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
bool std_le_work_decode( struct work *work );
bool std_be_work_decode( struct work *work );

View File

@@ -6,9 +6,39 @@ static const size_t INPUT_BYTES = 80; // Lenth of a block header in bytes. Inpu
static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS
// Credits
// generic, works with most variations of argon2d
int scanhash_argon2d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const int thr_id = mythr->id;
const uint32_t first_nonce = (const uint32_t)pdata[19];
const uint32_t last_nonce = (const uint32_t)max_nonce;
uint32_t nonce = first_nonce;
const bool bench = opt_benchmark;
void argon2d_crds_hash( void *output, const void *input )
v128_bswap32_80( edata, pdata );
do
{
edata[19] = nonce;
algo_gate.hash( hash, edata, thr_id );
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( nonce );
submit_solution( work, hash, mythr );
}
nonce++;
} while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 0;
}
void argon2d250_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
@@ -34,48 +64,15 @@ void argon2d_crds_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
bool register_argon2d250_algo( algo_gate_t* gate )
{
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;
swab32_array( edata, pdata, 20 );
do {
be32enc(&edata[19], nonce);
argon2d_crds_hash( hash, edata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash, mythr );
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
bool register_argon2d_crds_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d_crds;
gate->hash = (void*)&argon2d_crds_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_argon2d;
gate->hash = (void*)&argon2d250_hash;
opt_target_factor = 65536.0;
return true;
}
// Dynamic
void argon2d_dyn_hash( void *output, const void *input )
void argon2d500_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
@@ -101,48 +98,81 @@ void argon2d_dyn_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
bool register_argon2d500_algo( algo_gate_t* gate )
{
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const int thr_id = mythr->id;
const uint32_t first_nonce = (const uint32_t)pdata[19];
const uint32_t last_nonce = (const uint32_t)max_nonce;
uint32_t nonce = first_nonce;
const bool bench = opt_benchmark;
v128_bswap32_80( edata, pdata );
do
{
edata[19] = nonce;
argon2d_dyn_hash( hash, edata );
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
&& !bench ) )
{
pdata[19] = bswap_32( nonce );;
submit_solution( work, hash, mythr );
}
nonce++;
} while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 0;
}
bool register_argon2d_dyn_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d_dyn;
gate->hash = (void*)&argon2d_dyn_hash;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
gate->scanhash = (void*)&scanhash_argon2d;
gate->hash = (void*)&argon2d500_hash;
opt_target_factor = 65536.0;
return true;
}
// Unitus
void argon2d1000_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
context.outlen = (uint32_t)OUTPUT_BYTES;
context.pwd = (uint8_t *)input;
context.pwdlen = (uint32_t)INPUT_BYTES;
context.salt = (uint8_t *)input; //salt = input
context.saltlen = (uint32_t)INPUT_BYTES;
context.secret = NULL;
context.secretlen = 0;
context.ad = NULL;
context.adlen = 0;
context.allocate_cbk = NULL;
context.free_cbk = NULL;
context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
// main configurable Argon2 hash parameters
context.m_cost = 1000; // Memory in KiB (1MB)
context.lanes = 8; // Degree of Parallelism
context.threads = 1; // Threads
context.t_cost = 2; // Iterations
context.version = ARGON2_VERSION_10;
argon2_ctx( &context, Argon2_d );
}
bool register_argon2d1000_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d;
gate->hash = (void*)&argon2d1000_hash;
opt_target_factor = 65536.0;
return true;
}
void argon2d16000_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
context.outlen = (uint32_t)OUTPUT_BYTES;
context.pwd = (uint8_t *)input;
context.pwdlen = (uint32_t)INPUT_BYTES;
context.salt = (uint8_t *)input; //salt = input
context.saltlen = (uint32_t)INPUT_BYTES;
context.secret = NULL;
context.secretlen = 0;
context.ad = NULL;
context.adlen = 0;
context.allocate_cbk = NULL;
context.free_cbk = NULL;
context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
// main configurable Argon2 hash parameters
context.m_cost = 16000; // Memory in KiB (~16384KB)
context.lanes = 1; // Degree of Parallelism
context.threads = 1; // Threads
context.t_cost = 1; // Iterations
context.version = ARGON2_VERSION_10;
argon2_ctx( &context, Argon2_d );
}
bool register_argon2d16000_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d;
gate->hash = (void*)&argon2d16000_hash;
opt_target_factor = 65536.0;
return true;
}
int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
@@ -154,7 +184,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = (const uint32_t)max_nonce;
uint32_t n = first_nonce;
const int thr_id = mythr->id; // thr_id arg is deprecated
const int thr_id = mythr->id;
uint32_t t_cost = 1; // 1 iteration
uint32_t m_cost = 4096; // use 4MB
uint32_t parallelism = 1; // 1 thread, 2 lanes
@@ -182,7 +212,6 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
bool register_argon2d4096_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_argon2d4096;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT |NEON_OPT;
opt_target_factor = 65536.0;
return true;
}

View File

@@ -4,22 +4,27 @@
#include "algo-gate-api.h"
#include <stdint.h>
// Credits: version = 0x10, m_cost = 250.
bool register_argon2d_crds_algo( algo_gate_t* gate );
void argon2d_crds_hash( void *state, const void *input );
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
int scanhash_argon2d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Credits: version = 0x10, m_cost = 250.
bool register_argon2d250_algo( algo_gate_t* gate );
void argon2d250_hash( void *state, const void *input );
// Dynamic: version = 0x10, m_cost = 500.
bool register_argon2d_dyn_algo( algo_gate_t* gate );
bool register_argon2d500_algo( algo_gate_t* gate );
void argon2d_dyn_hash( void *state, const void *input );
void argon2d500_hash( void *state, const void *input );
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Zero Dynamics Cash: version = 0x10, m_cost = 1000.
bool register_argon2d1000_algo( algo_gate_t* gate );
void argon2d1000_hash( void *state, const void *input );
bool register_argon2d16000_algo( algo_gate_t* gate );
void argon2d16000_hash( void *state, const void *input );
// Unitus: version = 0x13, m_cost = 4096.
bool register_argon2d4096_algo( algo_gate_t* gate );

View File

@@ -35,7 +35,7 @@
* @pre all block pointers must be valid
*/
#if defined(__AVX512F__)
#if defined(SIMD512)
static inline __m512i blamka( __m512i x, __m512i y )
{
@@ -237,7 +237,7 @@ void fill_segment(const argon2_instance_t *instance,
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index, i;
#if defined(__AVX512F__)
#if defined(SIMD512)
__m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
#elif defined(__AVX2__)
__m256i state[ARGON2_HWORDS_IN_BLOCK];

View File

@@ -21,7 +21,7 @@
#include "blake2-impl.h"
#include "simd-utils.h"
#if !defined(__AVX512F__)
#if !defined(SIMD512)
#if !defined(__AVX2__)

View File

@@ -6,15 +6,15 @@
#if defined (BLAKE_4WAY)
blake256r14_4way_context blake_4w_ctx;
blake256r14_4x32_context blake_4w_ctx;
void blakehash_4way(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r14_4way_context ctx;
blake256r14_4x32_context ctx;
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
blake256r14_4way_update( &ctx, input + (64<<2), 16 );
blake256r14_4way_close( &ctx, vhash );
blake256r14_4x32_update( &ctx, input + (64<<2), 16 );
blake256r14_4x32_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
@@ -35,11 +35,11 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
HTarget = 0x7f;
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r14_4way_init( &blake_4w_ctx );
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
blake256r14_4x32_init( &blake_4w_ctx );
blake256r14_4x32_update( &blake_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
blakehash_4way( hash, vdata );
@@ -61,15 +61,15 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
#if defined(BLAKE_8WAY)
blake256r14_8way_context blake_8w_ctx;
blake256r14_8x32_context blake_8w_ctx;
void blakehash_8way( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256r14_8way_context ctx;
blake256r14_8x32_context ctx;
memcpy( &ctx, &blake_8w_ctx, sizeof ctx );
blake256r14_8way( &ctx, input + (64<<3), 16 );
blake256r14_8way_close( &ctx, vhash );
blake256r14_8x32( &ctx, input + (64<<3), 16 );
blake256r14_8x32_close( &ctx, vhash );
_dintrlv_8x32( state, state+ 32, state+ 64, state+ 96,
state+128, state+160, state+192, state+224,
vhash, 256 );
@@ -93,8 +93,8 @@ int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256r14_8way_init( &blake_8w_ctx );
blake256r14_8way( &blake_8w_ctx, vdata, 64 );
blake256r14_8x32_init( &blake_8w_ctx );
blake256r14_8x32( &blake_8w_ctx, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,

View File

@@ -423,33 +423,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
(state)->T1 = T1; \
} while (0)
#if defined(__SSSE3__)
#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ); \
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
M3 = _mm_shuffle_epi8( buf[ 3], shuf_bswap32 ); \
M4 = _mm_shuffle_epi8( buf[ 4], shuf_bswap32 ); \
M5 = _mm_shuffle_epi8( buf[ 5], shuf_bswap32 ); \
M6 = _mm_shuffle_epi8( buf[ 6], shuf_bswap32 ); \
M7 = _mm_shuffle_epi8( buf[ 7], shuf_bswap32 ); \
M8 = _mm_shuffle_epi8( buf[ 8], shuf_bswap32 ); \
M9 = _mm_shuffle_epi8( buf[ 9], shuf_bswap32 ); \
MA = _mm_shuffle_epi8( buf[10], shuf_bswap32 ); \
MB = _mm_shuffle_epi8( buf[11], shuf_bswap32 ); \
MC = _mm_shuffle_epi8( buf[12], shuf_bswap32 ); \
MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
}
#else // SSE2
#define BLAKE256_4X32_BLOCK_BSWAP32 \
{ \
M0 = v128_bswap32( buf[0] ); \
@@ -470,8 +443,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
MF = v128_bswap32( buf[15] ); \
}
#endif // SSSE3 else SSE2
#define COMPRESS32_4X32( rounds ) \
{ \
v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
@@ -926,22 +897,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
ROUND_S_4X32_3;
}
#if defined(__SSSE3__)
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );
#else
H[0] = v128_bswap32( v128_xor3( V8, V0, h[0] ) );
H[1] = v128_bswap32( v128_xor3( V9, V1, h[1] ) );
H[2] = v128_bswap32( v128_xor3( VA, V2, h[2] ) );
@@ -950,8 +905,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
H[5] = v128_bswap32( v128_xor3( VD, V5, h[5] ) );
H[6] = v128_bswap32( v128_xor3( VE, V6, h[6] ) );
H[7] = v128_bswap32( v128_xor3( VF, V7, h[7] ) );
#endif
}
#if defined (__AVX2__)
@@ -1291,24 +1244,22 @@ do { \
VD = v256_32( T0 ^ 0x299F31D0 ); \
VE = v256_32( T1 ^ 0x082EFA98 ); \
VF = v256_32( T1 ^ 0xEC4E6C89 ); \
const __m256i shuf_bswap32 = mm256_set2_64( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
M0 = mm256_bswap_32( * buf ); \
M1 = mm256_bswap_32( *(buf+ 1) ); \
M2 = mm256_bswap_32( *(buf+ 2) ); \
M3 = mm256_bswap_32( *(buf+ 3) ); \
M4 = mm256_bswap_32( *(buf+ 4) ); \
M5 = mm256_bswap_32( *(buf+ 5) ); \
M6 = mm256_bswap_32( *(buf+ 6) ); \
M7 = mm256_bswap_32( *(buf+ 7) ); \
M8 = mm256_bswap_32( *(buf+ 8) ); \
M9 = mm256_bswap_32( *(buf+ 9) ); \
MA = mm256_bswap_32( *(buf+10) ); \
MB = mm256_bswap_32( *(buf+11) ); \
MC = mm256_bswap_32( *(buf+12) ); \
MD = mm256_bswap_32( *(buf+13) ); \
ME = mm256_bswap_32( *(buf+14) ); \
MF = mm256_bswap_32( *(buf+15) ); \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
@@ -1401,7 +1352,7 @@ do { \
H7 = mm256_xor3( VF, V7, H7 ); \
}
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
void *data )
{
__m256i *M = (__m256i*)data;
@@ -1491,7 +1442,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
_mm256_xor_si256( v256_32( CSE ), M[15] ) );
}
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds )
{
__m256i *H = (__m256i*)final_hash;
@@ -1596,22 +1547,19 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
ROUND256_8WAY_3;
}
const __m256i shuf_bswap32 =
mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 );
H[0] = mm256_bswap_32( mm256_xor3( V8, V0, h[0] ) );
H[1] = mm256_bswap_32( mm256_xor3( V9, V1, h[1] ) );
H[2] = mm256_bswap_32( mm256_xor3( VA, V2, h[2] ) );
H[3] = mm256_bswap_32( mm256_xor3( VB, V3, h[3] ) );
H[4] = mm256_bswap_32( mm256_xor3( VC, V4, h[4] ) );
H[5] = mm256_bswap_32( mm256_xor3( VD, V5, h[5] ) );
H[6] = mm256_bswap_32( mm256_xor3( VE, V6, h[6] ) );
H[7] = mm256_bswap_32( mm256_xor3( VF, V7, h[7] ) );
}
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
///////////////////////////////////////
//
@@ -1933,8 +1881,6 @@ do { \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64( \
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -1951,22 +1897,22 @@ do { \
VD = v512_32( T0 ^ 0x299F31D0 ); \
VE = v512_32( T1 ^ 0x082EFA98 ); \
VF = v512_32( T1 ^ 0xEC4E6C89 ); \
M0 = _mm512_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
M0 = mm512_bswap_32( * buf ); \
M1 = mm512_bswap_32( *(buf+ 1) ); \
M2 = mm512_bswap_32( *(buf+ 2) ); \
M3 = mm512_bswap_32( *(buf+ 3) ); \
M4 = mm512_bswap_32( *(buf+ 4) ); \
M5 = mm512_bswap_32( *(buf+ 5) ); \
M6 = mm512_bswap_32( *(buf+ 6) ); \
M7 = mm512_bswap_32( *(buf+ 7) ); \
M8 = mm512_bswap_32( *(buf+ 8) ); \
M9 = mm512_bswap_32( *(buf+ 9) ); \
MA = mm512_bswap_32( *(buf+10) ); \
MB = mm512_bswap_32( *(buf+11) ); \
MC = mm512_bswap_32( *(buf+12) ); \
MD = mm512_bswap_32( *(buf+13) ); \
ME = mm512_bswap_32( *(buf+14) ); \
MF = mm512_bswap_32( *(buf+15) ); \
ROUND_S_16WAY(0); \
ROUND_S_16WAY(1); \
ROUND_S_16WAY(2); \
@@ -2063,7 +2009,7 @@ do { \
// is constant for every nonce and only needs to be run once per job. The
// second part is run for each nonce using the precalculated midstate and the
// hash from the first block.
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
void *data )
{
__m512i *M = (__m512i*)data;
@@ -2157,7 +2103,7 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
}
// Dfault is 14 rounds, blakecoin & vanilla are 8.
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds )
{
__m512i *H = (__m512i*)final_hash;
@@ -2274,27 +2220,23 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
}
// Byte swap final hash
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 );
H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 );
H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 );
H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 );
H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 );
H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 );
H[0] = mm512_bswap_32( mm512_xor3( V8, V0, h[0] ) );
H[1] = mm512_bswap_32( mm512_xor3( V9, V1, h[1] ) );
H[2] = mm512_bswap_32( mm512_xor3( VA, V2, h[2] ) );
H[3] = mm512_bswap_32( mm512_xor3( VB, V3, h[3] ) );
H[4] = mm512_bswap_32( mm512_xor3( VC, V4, h[4] ) );
H[5] = mm512_bswap_32( mm512_xor3( VD, V5, h[5] ) );
H[6] = mm512_bswap_32( mm512_xor3( VE, V6, h[6] ) );
H[7] = mm512_bswap_32( mm512_xor3( VF, V7, h[7] ) );
}
#endif
// Blake-256 4 way
static const uint32_t salt_zero_4x32_small[4] = { 0, 0, 0, 0 };
static void
blake32_4x32_init( blake_4x32_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
int rounds )
{
casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
@@ -2404,11 +2346,10 @@ blake32_4x32_close( blake_4x32_small_context *ctx, unsigned ub, unsigned n,
// Blake-256 8 way
static const uint32_t salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
static void
blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
const uint32_t *salt, int rounds )
blake32_8way_init( blake256_8x32_context *sc, const uint32_t *iv,
int rounds )
{
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E6676A09E667 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE85BB67AE85 );
@@ -2424,7 +2365,7 @@ blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
}
static void
blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
blake32_8way( blake256_8x32_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
@@ -2466,7 +2407,7 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
}
static void
blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
blake32_8way_close( blake256_8x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m256i buf[16];
@@ -2520,7 +2461,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
}
static void
blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
blake32_8way_le( blake256_8x32_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
@@ -2562,7 +2503,7 @@ blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
}
static void
blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
blake32_8way_close_le( blake256_8x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m256i buf[16];
@@ -2617,13 +2558,13 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
//Blake-256 16 way AVX512
static void
blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
const uint32_t *salt, int rounds )
blake32_16way_init( blake256_16x32_context *sc, const uint32_t *iv,
int rounds )
{
casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E6676A09E667 );
casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE85BB67AE85 );
@@ -2639,7 +2580,7 @@ blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
}
static void
blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
blake32_16way( blake256_16x32_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
@@ -2679,7 +2620,7 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
sc->ptr = ptr;
}
static void
blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
blake32_16way_close( blake256_16x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m512i buf[16];
@@ -2733,7 +2674,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
}
static void
blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
blake32_16way_le( blake256_16x32_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf;
@@ -2776,7 +2717,7 @@ blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
}
static void
blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
blake32_16way_close_le( blake256_16x32_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m512i buf[16];
@@ -2827,65 +2768,65 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
}
void
blake256_16way_init(void *cc)
blake256_16x32_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_16way_init( cc, IV256, 14 );
}
void
blake256_16way_update(void *cc, const void *data, size_t len)
blake256_16x32_update(void *cc, const void *data, size_t len)
{
blake32_16way(cc, data, len);
}
void
blake256_16way_close(void *cc, void *dst)
blake256_16x32_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
void
blake256_16way_update_le(void *cc, const void *data, size_t len)
blake256_16x32_update_le(void *cc, const void *data, size_t len)
{
blake32_16way_le(cc, data, len);
}
void
blake256_16way_close_le(void *cc, void *dst)
blake256_16x32_close_le(void *cc, void *dst)
{
blake32_16way_close_le(cc, 0, 0, dst, 8);
}
void blake256r14_16way_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_16way_init( cc, IV256, 14 );
}
void
blake256r14_16way_update(void *cc, const void *data, size_t len)
blake256r14_16x32_update(void *cc, const void *data, size_t len)
{
blake32_16way(cc, data, len);
}
void
blake256r14_16way_close(void *cc, void *dst)
blake256r14_16x32_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
void blake256r8_16way_init(void *cc)
{
blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 );
blake32_16way_init( cc, IV256, 8 );
}
void
blake256r8_16way_update(void *cc, const void *data, size_t len)
blake256r8_16x32_update(void *cc, const void *data, size_t len)
{
blake32_16way(cc, data, len);
}
void
blake256r8_16way_close(void *cc, void *dst)
blake256r8_16x32_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
@@ -2898,7 +2839,7 @@ blake256r8_16way_close(void *cc, void *dst)
void
blake256_4x32_init(void *ctx)
{
blake32_4x32_init( ctx, IV256, salt_zero_4x32_small, 14 );
blake32_4x32_init( ctx, IV256, 14 );
}
void
@@ -2918,31 +2859,31 @@ blake256_4x32_close(void *ctx, void *dst)
// Blake-256 8 way
void
blake256_8way_init(void *cc)
blake256_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_8way_init( cc, IV256, 14 );
}
void
blake256_8way_update(void *cc, const void *data, size_t len)
blake256_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
void
blake256_8way_close(void *cc, void *dst)
blake256_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}
void
blake256_8way_update_le(void *cc, const void *data, size_t len)
blake256_8x32_update_le(void *cc, const void *data, size_t len)
{
blake32_8way_le(cc, data, len);
}
void
blake256_8way_close_le(void *cc, void *dst)
blake256_8x32_close_le(void *cc, void *dst)
{
blake32_8way_close_le(cc, 0, 0, dst, 8);
}
@@ -2952,7 +2893,7 @@ blake256_8way_close_le(void *cc, void *dst)
// 14 rounds Blake, Decred
void blake256r14_4x32_init(void *cc)
{
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 14 );
blake32_4x32_init( cc, IV256, 14 );
}
void
@@ -2969,19 +2910,19 @@ blake256r14_4x32_close(void *cc, void *dst)
#if defined(__AVX2__)
void blake256r14_8way_init(void *cc)
void blake256r14_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
blake32_8way_init( cc, IV256, 14 );
}
void
blake256r14_8way_update(void *cc, const void *data, size_t len)
blake256r14_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
void
blake256r14_8way_close(void *cc, void *dst)
blake256r14_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}
@@ -2991,7 +2932,7 @@ blake256r14_8way_close(void *cc, void *dst)
// 8 rounds Blakecoin, Vanilla
void blake256r8_4x32_init(void *cc)
{
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 8 );
blake32_4x32_init( cc, IV256, 8 );
}
void
@@ -3008,19 +2949,19 @@ blake256r8_4x32_close(void *cc, void *dst)
#if defined (__AVX2__)
void blake256r8_8way_init(void *cc)
void blake256r8_8x32_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 8 );
blake32_8way_init( cc, IV256, 8 );
}
void
blake256r8_8way_update(void *cc, const void *data, size_t len)
blake256r8_8x32_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
void
blake256r8_8way_close(void *cc, void *dst)
blake256r8_8x32_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}

View File

@@ -29,13 +29,6 @@ typedef struct
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1, int rounds );
/*
void blake256_init( blake256_context *sc );
void blake256_update( blake256_context *sc, const void *data, size_t len );
void blake256_close( blake256_context *sc, void *dst );
void blake256_full( blake256_context *sc, void *dst, const void *data,
size_t len );
*/
//////////////////////////////////
//
@@ -55,6 +48,10 @@ typedef blake_4x32_small_context blake256_4x32_context;
void blake256_4x32_init(void *ctx);
void blake256_4x32_update(void *ctx, const void *data, size_t len);
void blake256_4x32_close(void *ctx, void *dst);
void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
// 14 rounds
typedef blake_4x32_small_context blake256r14_4x32_context;
@@ -68,29 +65,6 @@ void blake256r8_4x32_init(void *cc);
void blake256r8_4x32_update(void *cc, const void *data, size_t len);
void blake256r8_4x32_close(void *cc, void *dst);
void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
#define blake_4way_small_context blake256_4x32_context
#define blake256_4way_context blake256_4x32_context
#define blake256_4way_init blake256_4x32_init
#define blake256_4way_update blake256_4x32_update
#define blake256_4way_close blake256_4x32_close
#define blake256_4way_update_le blake256_4x32_update_le
#define blake256_4way_close_le blake256_4x32_close_le
#define blake256_4way_round0_prehash_le blake256_4x32_round0_prehash_le
#define blake256_4way_final_rounds_le blake256_4x32_final_rounds_le
#define blake256r14_4way_context blake256r14_4x32_context
#define blake256r14_4way_init blake256r14_4x32_init
#define blake256r14_4way_update blake256r14_4x32_update
#define blake256r14_4way_close blake256r14_4x32_close
#define blake256r8_4way_context blake256r14_4x32_context
#define blake256r8_4way_init blake256r14_4x32_init
#define blake256r8_4way_update blake256r14_4x32_update
#define blake256r8_4way_close blake256r14_4x32_close
#ifdef __AVX2__
//////////////////////////////
@@ -107,47 +81,30 @@ typedef struct
} blake_8way_small_context;
// Default 14 rounds
typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way_update(void *cc, const void *data, size_t len);
void blake256_8way_close(void *cc, void *dst);
void blake256_8way_update_le(void *cc, const void *data, size_t len);
void blake256_8way_close_le(void *cc, void *dst);
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
typedef blake_8way_small_context blake256_8x32_context;
void blake256_8x32_init(void *cc);
void blake256_8x32_update(void *cc, const void *data, size_t len);
void blake256_8x32_close(void *cc, void *dst);
void blake256_8x32_update_le(void *cc, const void *data, size_t len);
void blake256_8x32_close_le(void *cc, void *dst);
void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
// 14 rounds, blake, decred
typedef blake_8way_small_context blake256r14_8way_context;
void blake256r14_8way_init(void *cc);
void blake256r14_8way_update(void *cc, const void *data, size_t len);
void blake256r14_8way_close(void *cc, void *dst);
typedef blake_8way_small_context blake256r14_8x32_context;
void blake256r14_8x32_init(void *cc);
void blake256r14_8x32_update(void *cc, const void *data, size_t len);
void blake256r14_8x32_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_8way_small_context blake256r8_8way_context;
void blake256r8_8way_init(void *cc);
void blake256r8_8way_update(void *cc, const void *data, size_t len);
void blake256r8_8way_close(void *cc, void *dst);
typedef blake_8way_small_context blake256r8_8x32_context;
void blake256r8_8x32_init(void *cc);
void blake256r8_8x32_update(void *cc, const void *data, size_t len);
void blake256r8_8x32_close(void *cc, void *dst);
#define blake_8x32_small_context blake256_8way_context
#define blake_8x32_init blake256_8way_init
#define blake_8x32_update blake256_8way_update
#define blake_8x32_close blake256_8way_close
#define blake_8x32_update_le blake256_8way_update_le
#define blake_8x32_close_le blake256_8way_close_le
#define blake_8x32_round0_prehash_le blake256_8way_round0_prehash
#define blake_8x32_final_rounds_le blake256_8way_final_rounds_le
#define blake256r14_8x32_context blake256r14_8way_context
#define blake256r14_8x32_init blake256r14_8way_init
#define blake256r14_8x32_update blake256r14_8way_update
#define blake256r14_8x32_close blake256r14_8way_close
#define blake256r8_8x32_context blake256r14_8way_context
#define blake256r8_8x32_init blake256r14_8way_init
#define blake256r8_8x32_update blake256r14_8way_update
#define blake256r8_8x32_close blake256r14_8way_close
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
///////////////////////////////////
//
@@ -163,46 +120,29 @@ typedef struct
} blake_16way_small_context __attribute__ ((aligned (128)));
// Default 14 rounds
typedef blake_16way_small_context blake256_16way_context;
void blake256_16way_init(void *cc);
void blake256_16way_update(void *cc, const void *data, size_t len);
void blake256_16way_close(void *cc, void *dst);
typedef blake_16way_small_context blake256_16x32_context;
void blake256_16x32_init(void *cc);
void blake256_16x32_update(void *cc, const void *data, size_t len);
void blake256_16x32_close(void *cc, void *dst);
// Expects data in little endian order, no byte swap needed
void blake256_16way_update_le(void *cc, const void *data, size_t len);
void blake256_16way_close_le(void *cc, void *dst);
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
void blake256_16x32_update_le(void *cc, const void *data, size_t len);
void blake256_16x32_close_le(void *cc, void *dst);
void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data, const int rounds );
// 14 rounds, blake, decred
typedef blake_16way_small_context blake256r14_16way_context;
void blake256r14_16way_init(void *cc);
void blake256r14_16way_update(void *cc, const void *data, size_t len);
void blake256r14_16way_close(void *cc, void *dst);
typedef blake_16way_small_context blake256r14_16x32_context;
void blake256r14_16x32_init(void *cc);
void blake256r14_16x32_update(void *cc, const void *data, size_t len);
void blake256r14_16x32_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_16way_small_context blake256r8_16way_context;
void blake256r8_16way_init(void *cc);
void blake256r8_16way_update(void *cc, const void *data, size_t len);
void blake256r8_16way_close(void *cc, void *dst);
#define blake_16x32_small_context blake256_16way_context
#define blake_16x32_init blake256_16way_init
#define blake_16x32_update blake256_16way_update
#define blake_16x32_close blake256_16way_close
#define blake_16x32_update_le blake256_16way_update_le
#define blake_16x32_close_le blake256_16way_close_le
#define blake_16x32_round0_prehash_le blake256_16way_round0_prehash
#define blake_16x32_final_rounds_le blake256_16way_final_rounds_le
#define blake256r14_16x32_context blake256r14_16way_context
#define blake256r14_16x32_init blake256r14_16way_init
#define blake256r14_16x32_update blake256r14_16way_update
#define blake256r14_16x32_close blake256r14_16way_close
#define blake256r8_16x32_context blake256r8_16way_context
#define blake256r8_16x32_init blake256r8_16way_init
#define blake256r8_16x32_update blake256r8_16way_update
#define blake256r8_16x32_close blake256r8_16way_close
typedef blake_16way_small_context blake256r8_16x32_context;
void blake256r8_16x32_init(void *cc);
void blake256r8_16x32_update(void *cc, const void *data, size_t len);
void blake256r8_16x32_close(void *cc, void *dst);
#endif // AVX512
#endif // AVX2

View File

@@ -226,7 +226,7 @@ static const uint8_t sigma[12][16] =
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define B2B8W_G(a, b, c, d, x, y) \
{ \
@@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] =
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
}
static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last )
{
__m512i v[16], m[16];
@@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
}
int blake2b_8way_init( blake2b_8way_ctx *ctx )
int blake2b_8x64_init( blake2b_8x64_ctx *ctx )
{
size_t i;
@@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
}
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen )
{
__m512i* in =(__m512i*)input;
@@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_8way_compress( ctx, 0 );
blake2b_8x64_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
@@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
}
}
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
@@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
ctx->c += 8;
}
blake2b_8way_compress( ctx, 1 ); // final block flag = 1
blake2b_8x64_compress( ctx, 1 ); // final block flag = 1
casti_m512i( out, 0 ) = ctx->h[0];
casti_m512i( out, 1 ) = ctx->h[1];
@@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = {
};
*/
static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last )
{
__m256i v[16], m[16];
@@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
}
int blake2b_4way_init( blake2b_4way_ctx *ctx )
int blake2b_4x64_init( blake2b_4x64_ctx *ctx )
{
size_t i;
@@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
return 0;
}
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen )
{
__m256i* in =(__m256i*)input;
@@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_4way_compress( ctx, 0 );
blake2b_4x64_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
@@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
}
}
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
@@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
ctx->c += 8;
}
blake2b_4way_compress( ctx, 1 ); // final block flag = 1
blake2b_4x64_compress( ctx, 1 ); // final block flag = 1
casti_m256i( out, 0 ) = ctx->h[0];
casti_m256i( out, 1 ) = ctx->h[1];

View File

@@ -1,6 +1,6 @@
#pragma once
#ifndef __BLAKE2B_HASH_4WAY_H__
#define __BLAKE2B_HASH_4WAY_H__
#ifndef BLAKE2B_HASH_4WAY_H__
#define BLAKE2B_HASH_4WAY_H__
#include "simd-utils.h"
#include <stddef.h>
@@ -14,8 +14,7 @@
#define ALIGN(x) __attribute__((aligned(x)))
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct ALIGN( 64 ) {
__m512i b[16]; // input buffer
@@ -23,12 +22,12 @@ typedef struct ALIGN( 64 ) {
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_8way_ctx;
} blake2b_8x64_ctx;
int blake2b_8way_init( blake2b_8way_ctx *ctx );
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
int blake2b_8x64_init( blake2b_8x64_ctx *ctx );
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
#endif
@@ -41,12 +40,12 @@ typedef struct ALIGN( 64 ) {
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_4way_ctx;
} blake2b_4x64_ctx;
int blake2b_4way_init( blake2b_4way_ctx *ctx );
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
int blake2b_4x64_init( blake2b_4x64_ctx *ctx );
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
#endif

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include "blake2b-hash.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKE2B_8WAY
#elif defined(__AVX2__)
#define BLAKE2B_4WAY
@@ -17,7 +17,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
blake2b_8x64_ctx ctx __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]); // 3*16+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -35,9 +35,9 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
blake2b_8way_init( &ctx );
blake2b_8way_update( &ctx, vdata, 80 );
blake2b_8way_final( &ctx, hash );
blake2b_8x64_init( &ctx );
blake2b_8x64_update( &ctx, vdata, 80 );
blake2b_8x64_final( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )
@@ -61,10 +61,10 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
// Function not used, code inlined.
void blake2b_4way_hash(void *output, const void *input)
{
blake2b_4way_ctx ctx;
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, input, 80 );
blake2b_4way_final( &ctx, output );
blake2b_4x64_ctx ctx;
blake2b_4x64_init( &ctx );
blake2b_4x64_update( &ctx, input, 80 );
blake2b_4x64_final( &ctx, output );
}
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
@@ -73,7 +73,7 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
blake2b_4x64_ctx ctx __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -90,9 +90,9 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
blake2b_4way_init( &ctx );
blake2b_4way_update( &ctx, vdata, 80 );
blake2b_4way_final( &ctx, hash );
blake2b_4x64_init( &ctx );
blake2b_4x64_update( &ctx, vdata, 80 );
blake2b_4x64_final( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] <= Htarg )

View File

@@ -497,7 +497,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Blake2s-256 16 way

View File

@@ -11,8 +11,8 @@
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
//#pragma once
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
#ifndef BLAKE2S_HASH_4WAY_H__
#define BLAKE2S_HASH_4WAY_H__ 1
#if defined(__SSE2__) || defined(__ARM_NEON)
@@ -29,20 +29,20 @@
#define ALIGN(x) __attribute__((aligned(x)))
#endif
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_nway_param;
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_nway_param;
typedef struct ALIGN( 64 ) __blake2s_4way_state
{
@@ -61,13 +61,18 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
const void *input, uint64_t inlen );
#define blake2s_4x32_state blake2s_4way_state
#define blake2s_4x32_init blake2s_4way_init
#define blake2s_4x32_update blake2s_4way_update
#define blake2s_4x32_final blake2s_4way_final
#define blake2s_4x32_full_blocks blake2s_4way_full_blocks
#if defined(__AVX2__)
typedef struct ALIGN( 64 ) __blake2s_8way_state
{
__m256i h[8];
uint8_t buf[ 32 * 8 ];
uint8_t buf[ 64 * 8 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
@@ -81,14 +86,20 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
const void *input, uint64_t inlen );
#define blake2s_8x32_state blake2s_8way_state
#define blake2s_8x32_init blake2s_8way_init
#define blake2s_8x32_update blake2s_8way_update
#define blake2s_8x32_final blake2s_8way_final
#define blake2s_8x32_full_blocks blake2s_8way_full_blocks
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct ALIGN( 64 ) __blake2s_16way_state
{
__m512i h[8];
uint8_t buf[ 32 * 16 ];
uint8_t buf[ 64 * 16 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
@@ -100,6 +111,11 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
uint64_t inlen );
int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
#define blake2s_16x32_state blake2s_16way_state
#define blake2s_16x32_init blake2s_16way_init
#define blake2s_16x32_update blake2s_16way_update
#define blake2s_16x32_final blake2s_16way_final
#endif
#if 0

View File

@@ -3,7 +3,7 @@
#include <string.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKE2S_16WAY
#elif defined(__AVX2__)
#define BLAKE2S_8WAY

View File

@@ -349,16 +349,16 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( CBx( r, Sd ) ^ Mx( r, Sc ), \
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
Vd = v128_ror64xor( Vd, Va, 32 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
Vb = v128_ror64xor( Vb, Vc, 25 ); \
\
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( CBx( r, Sc ) ^ Mx( r, Sd ), \
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
Vd = v128_ror64xor( Vd, Va, 16 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
Vb = v128_ror64xor( Vb, Vc, 11 ); \
}
#define BLAKE512_ROUND( R ) \
@@ -559,7 +559,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
#if defined(__AVX2__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
////////////////////////////////////
//
@@ -617,24 +617,22 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
VD = v512_64( CB5 ^ T0 ); \
VE = v512_64( CB6 ^ T1 ); \
VF = v512_64( CB7 ^ T1 ); \
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
M0 = mm512_bswap_64( *(buf+ 0) ); \
M1 = mm512_bswap_64( *(buf+ 1) ); \
M2 = mm512_bswap_64( *(buf+ 2) ); \
M3 = mm512_bswap_64( *(buf+ 3) ); \
M4 = mm512_bswap_64( *(buf+ 4) ); \
M5 = mm512_bswap_64( *(buf+ 5) ); \
M6 = mm512_bswap_64( *(buf+ 6) ); \
M7 = mm512_bswap_64( *(buf+ 7) ); \
M8 = mm512_bswap_64( *(buf+ 8) ); \
M9 = mm512_bswap_64( *(buf+ 9) ); \
MA = mm512_bswap_64( *(buf+10) ); \
MB = mm512_bswap_64( *(buf+11) ); \
MC = mm512_bswap_64( *(buf+12) ); \
MD = mm512_bswap_64( *(buf+13) ); \
ME = mm512_bswap_64( *(buf+14) ); \
MF = mm512_bswap_64( *(buf+15) ); \
ROUND_B_8WAY(0); \
ROUND_B_8WAY(1); \
ROUND_B_8WAY(2); \
@@ -661,7 +659,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
H7 = mm512_xor3( VF, V7, H7 ); \
}
void blake512_8way_compress( blake_8way_big_context *sc )
void blake512_8x64_compress( blake_8x64_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -685,25 +683,22 @@ void blake512_8way_compress( blake_8way_big_context *sc )
VE = v512_64( CB6 ^ sc->T1 );
VF = v512_64( CB7 ^ sc->T1 );
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 );
MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 );
MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 );
MD = _mm512_shuffle_epi8( sc->buf[13], shuf_bswap64 );
ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 );
MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 );
M0 = mm512_bswap_64( sc->buf[ 0] );
M1 = mm512_bswap_64( sc->buf[ 1] );
M2 = mm512_bswap_64( sc->buf[ 2] );
M3 = mm512_bswap_64( sc->buf[ 3] );
M4 = mm512_bswap_64( sc->buf[ 4] );
M5 = mm512_bswap_64( sc->buf[ 5] );
M6 = mm512_bswap_64( sc->buf[ 6] );
M7 = mm512_bswap_64( sc->buf[ 7] );
M8 = mm512_bswap_64( sc->buf[ 8] );
M9 = mm512_bswap_64( sc->buf[ 9] );
MA = mm512_bswap_64( sc->buf[10] );
MB = mm512_bswap_64( sc->buf[11] );
MC = mm512_bswap_64( sc->buf[12] );
MD = mm512_bswap_64( sc->buf[13] );
ME = mm512_bswap_64( sc->buf[14] );
MF = mm512_bswap_64( sc->buf[15] );
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
@@ -733,7 +728,7 @@ void blake512_8way_compress( blake_8way_big_context *sc )
}
// won't be used after prehash implemented
void blake512_8way_compress_le( blake_8x64_big_context *sc )
void blake512_8x64_compress_le( blake_8x64_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1177,7 +1172,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress( sc );
blake512_8x64_compress( sc );
sc->ptr = 0;
}
@@ -1213,7 +1208,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress( sc );
blake512_8x64_compress( sc );
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
@@ -1244,7 +1239,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
blake512_8x64_compress_le( sc );
sc->ptr = 0;
}
@@ -1280,7 +1275,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress_le( sc );
blake512_8x64_compress_le( sc );
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
@@ -1355,24 +1350,22 @@ blake512_8x64_close(void *cc, void *dst)
VD = v256_64( CB5 ^ T0 ); \
VE = v256_64( CB6 ^ T1 ); \
VF = v256_64( CB7 ^ T1 ); \
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
M0 = mm256_bswap_64( *(buf+ 0) ); \
M1 = mm256_bswap_64( *(buf+ 1) ); \
M2 = mm256_bswap_64( *(buf+ 2) ); \
M3 = mm256_bswap_64( *(buf+ 3) ); \
M4 = mm256_bswap_64( *(buf+ 4) ); \
M5 = mm256_bswap_64( *(buf+ 5) ); \
M6 = mm256_bswap_64( *(buf+ 6) ); \
M7 = mm256_bswap_64( *(buf+ 7) ); \
M8 = mm256_bswap_64( *(buf+ 8) ); \
M9 = mm256_bswap_64( *(buf+ 9) ); \
MA = mm256_bswap_64( *(buf+10) ); \
MB = mm256_bswap_64( *(buf+11) ); \
MC = mm256_bswap_64( *(buf+12) ); \
MD = mm256_bswap_64( *(buf+13) ); \
ME = mm256_bswap_64( *(buf+14) ); \
MF = mm256_bswap_64( *(buf+15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
@@ -1400,7 +1393,7 @@ blake512_8x64_close(void *cc, void *dst)
}
void blake512_4way_compress( blake_4x64_big_context *sc )
void blake512_4x64_compress( blake_4x64_big_context *sc )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1423,25 +1416,23 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
VD = v256_64( CB5 ^ sc->T0 );
VE = v256_64( CB6 ^ sc->T1 );
VF = v256_64( CB7 ^ sc->T1 );
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 );
MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 );
MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 );
MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 );
ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 );
MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 );
M0 = mm256_bswap_64( sc->buf[ 0] );
M1 = mm256_bswap_64( sc->buf[ 1] );
M2 = mm256_bswap_64( sc->buf[ 2] );
M3 = mm256_bswap_64( sc->buf[ 3] );
M4 = mm256_bswap_64( sc->buf[ 4] );
M5 = mm256_bswap_64( sc->buf[ 5] );
M6 = mm256_bswap_64( sc->buf[ 6] );
M7 = mm256_bswap_64( sc->buf[ 7] );
M8 = mm256_bswap_64( sc->buf[ 8] );
M9 = mm256_bswap_64( sc->buf[ 9] );
MA = mm256_bswap_64( sc->buf[10] );
MB = mm256_bswap_64( sc->buf[11] );
MC = mm256_bswap_64( sc->buf[12] );
MD = mm256_bswap_64( sc->buf[13] );
ME = mm256_bswap_64( sc->buf[14] );
MF = mm256_bswap_64( sc->buf[15] );
ROUND_B_4WAY(0);
ROUND_B_4WAY(1);
@@ -1470,7 +1461,7 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
}
void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
void blake512_4x64_prehash_le( blake512_4x64_context *sc, __m256i *midstate,
const void *data )
{
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
@@ -1562,7 +1553,7 @@ void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
midstate[15] = VF;
}
void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
void blake512_4x64_final_le( blake512_4x64_context *sc, void *hash,
const __m256i nonce, const __m256i *midstate )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
@@ -1685,7 +1676,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
}
void blake512_4x64_init( blake_4x64_big_context *sc )
void blake512_4x64_init( blake512_4x64_context *sc )
{
casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
@@ -1798,7 +1789,7 @@ blake64_4way_close( blake_4x64_big_context *sc, void *dst )
}
// init, update & close
void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
void blake512_4x64_full( blake512_4x64_context *sc, void * dst,
const void *data, size_t len )
{
@@ -1824,7 +1815,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_4way_compress( sc );
blake512_4x64_compress( sc );
sc->ptr = 0;
}
@@ -1859,7 +1850,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_4way_compress( sc );
blake512_4x64_compress( sc );
mm256_block_bswap_64( (__m256i*)dst, sc->H );
}
@@ -1887,13 +1878,13 @@ blake512_4x64_close(void *cc, void *dst)
#define GB_2X64( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = v128_add64( v128_add64( v128_xor( v128_64( c1 ), m0 ), b ), a ); \
d = v128_ror64( v128_xor( d, a ), 32 ); \
d = v128_ror64xor( d, a, 32 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 25 ); \
b = v128_ror64xor( b, c, 25 ); \
a = v128_add64( v128_add64( v128_xor( v128_64( c0 ), m1 ), b ), a ); \
d = v128_ror64( v128_xor( d, a ), 16 ); \
d = v128_ror64xor( d, a, 16 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 11 ); \
b = v128_ror64xor( b, c, 11 ); \
}
#define ROUND_B_2X64(r) \
@@ -1934,29 +1925,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
VE = v128_64( CB6 ^ sc->T1 );
VF = v128_64( CB7 ^ sc->T1 );
#if defined(__SSSE3__)
const v128u64_t shuf_bswap64 = v128_set64(
0x08090a0b0c0d0e0f, 0x0001020304050607 );
M0 = v128_shuffle8( sc->buf[ 0], shuf_bswap64 );
M1 = v128_shuffle8( sc->buf[ 1], shuf_bswap64 );
M2 = v128_shuffle8( sc->buf[ 2], shuf_bswap64 );
M3 = v128_shuffle8( sc->buf[ 3], shuf_bswap64 );
M4 = v128_shuffle8( sc->buf[ 4], shuf_bswap64 );
M5 = v128_shuffle8( sc->buf[ 5], shuf_bswap64 );
M6 = v128_shuffle8( sc->buf[ 6], shuf_bswap64 );
M7 = v128_shuffle8( sc->buf[ 7], shuf_bswap64 );
M8 = v128_shuffle8( sc->buf[ 8], shuf_bswap64 );
M9 = v128_shuffle8( sc->buf[ 9], shuf_bswap64 );
MA = v128_shuffle8( sc->buf[10], shuf_bswap64 );
MB = v128_shuffle8( sc->buf[11], shuf_bswap64 );
MC = v128_shuffle8( sc->buf[12], shuf_bswap64 );
MD = v128_shuffle8( sc->buf[13], shuf_bswap64 );
ME = v128_shuffle8( sc->buf[14], shuf_bswap64 );
MF = v128_shuffle8( sc->buf[15], shuf_bswap64 );
#else // SSE2 & NEON
M0 = v128_bswap64( sc->buf[ 0] );
M1 = v128_bswap64( sc->buf[ 1] );
M2 = v128_bswap64( sc->buf[ 2] );
@@ -1974,8 +1942,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
ME = v128_bswap64( sc->buf[14] );
MF = v128_bswap64( sc->buf[15] );
#endif
ROUND_B_2X64(0);
ROUND_B_2X64(1);
ROUND_B_2X64(2);
@@ -2054,9 +2020,9 @@ void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc,
// G4 skip nonce
V0 = v128_add64( v128_add64( v128_xor( v128_64( CB9 ), sc->buf[ 8] ), V5 ),
V0 );
VF = v128_ror64( v128_xor( VF, V0 ), 32 );
VF = v128_ror64xor( VF, V0, 32 );
VA = v128_add64( VA, VF );
V5 = v128_ror64( v128_xor( V5, VA ), 25 );
V5 = v128_ror64xor( V5, VA, 25 );
V0 = v128_add64( V0, V5 );
GB_2X64( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
@@ -2137,9 +2103,9 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
// finish round 0, with the nonce now available
V0 = v128_add64( V0, v128_xor( v128_64( CB8 ), M9 ) );
VF = v128_ror64( v128_xor( VF, V0 ), 16 );
VF = v128_ror64xor( VF, V0, 16 );
VA = v128_add64( VA, VF );
V5 = v128_ror64( v128_xor( V5, VA ), 11 );
V5 = v128_ror64xor( V5, VA, 11 );
// Round 1
// G0
@@ -2147,34 +2113,34 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
// G1
V1 = v128_add64( V1, V5 );
VD = v128_ror64( v128_xor( VD, V1 ), 32 );
VD = v128_ror64xor( VD, V1, 32 );
V9 = v128_add64( V9, VD );
V5 = v128_ror64( v128_xor( V5, V9 ), 25 );
V5 = v128_ror64xor( V5, V9, 25 );
V1 = v128_add64( V1, v128_add64( v128_xor( v128_64( CBx(1,2) ), Mx(1,3) ),
V5 ) );
VD = v128_ror64( v128_xor( VD, V1 ), 16 );
VD = v128_ror64xor( VD, V1, 16 );
V9 = v128_add64( V9, VD );
V5 = v128_ror64( v128_xor( V5, V9 ), 11 );
V5 = v128_ror64xor( V5, V9, 11 );
// G2
V2 = v128_add64( V2, v128_xor( v128_64( CBF ), M9 ) );
VE = v128_ror64( v128_xor( VE, V2 ), 32 );
VE = v128_ror64xor( VE, V2, 32 );
VA = v128_add64( VA, VE );
V6 = v128_ror64( v128_xor( V6, VA ), 25 );
V6 = v128_ror64xor( V6, VA, 25 );
V2 = v128_add64( V2, v128_add64( v128_xor( v128_64( CB9 ), MF ), V6 ) );
VE = v128_ror64( v128_xor( VE, V2 ), 16 );
VE = v128_ror64xor( VE, V2, 16 );
VA = v128_add64( VA, VE );
V6 = v128_ror64( v128_xor( V6, VA ), 11 );
V6 = v128_ror64xor( V6, VA, 11 );
// G3
VF = v128_ror64( v128_xor( VF, V3 ), 32 );
VF = v128_ror64xor( VF, V3, 32 );
VB = v128_add64( VB, VF );
V7 = v128_ror64( v128_xor( V7, VB ), 25 );
V7 = v128_ror64xor( V7, VB, 25 );
V3 = v128_add64( V3, v128_add64( v128_xor( v128_64( CBx(1, 6) ), Mx(1, 7) ),
V7 ) );
VF = v128_ror64( v128_xor( VF, V3 ), 16 );
VF = v128_ror64xor( VF, V3, 16 );
VB = v128_add64( VB, VF );
V7 = v128_ror64( v128_xor( V7, VB ), 11 );
V7 = v128_ror64xor( V7, VB, 11 );
// G4, G5, G6, G7
GB_2X64(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);

View File

@@ -92,7 +92,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
#define blake512_4way_prehash_le blake512_4x64_prehash_le
#define blake512_4way_final_le blake512_4x64_final_le
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
////////////////////////////
//

View File

@@ -54,10 +54,10 @@ int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
blake256_16way_final_rounds_le( hash32, midstate_vars, block0_hash,
blake256_16x32_final_rounds_le( hash32, midstate_vars, block0_hash,
block_buf, rounds );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -123,10 +123,10 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
blake256_8x32_final_rounds_le( hash32, midstate_vars, block0_hash,
block_buf, rounds );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -148,16 +148,16 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
#elif defined (BLAKECOIN_4WAY)
blake256r8_4way_context blakecoin_4w_ctx;
blake256r8_4x32_context blakecoin_4w_ctx;
void blakecoin_4way_hash(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r8_4way_context ctx;
blake256r8_4x32_context ctx;
memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
blake256r8_4way_update( &ctx, input + (64<<2), 16 );
blake256r8_4way_close( &ctx, vhash );
blake256r8_4x32_update( &ctx, input + (64<<2), 16 );
blake256r8_4x32_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
@@ -178,11 +178,11 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
HTarget = 0x7f;
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r8_4way_init( &blakecoin_4w_ctx );
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
blake256r8_4x32_init( &blakecoin_4w_ctx );
blake256r8_4x32_update( &blakecoin_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
pdata[19] = n;
blakecoin_4way_hash( hash, vdata );

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKECOIN_16WAY
#elif defined(__AVX2__)
#define BLAKECOIN_8WAY

View File

@@ -16,28 +16,27 @@ extern void pentablakehash_4way( void *output, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
blake512_4way_context ctx;
blake512_4x64_context ctx;
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, input, 80 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, input, 80 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4x64_init( &ctx );
blake512_4x64_update( &ctx, vhash, 64 );
blake512_4x64_close( &ctx, vhash );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );

View File

@@ -227,7 +227,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
v[14] = S->f[0] ^ blake2s_IV[6];
v[15] = S->f[1] ^ blake2s_IV[7];
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
v128_t *V = (v128_t*)v;
@@ -263,19 +263,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
V[3] = v128_swap64( V[3] ); \
V[2] = v128_shufll32( V[2] )
BLAKE2S_ROUND(0);
BLAKE2S_ROUND(1);
BLAKE2S_ROUND(2);
BLAKE2S_ROUND(3);
BLAKE2S_ROUND(4);
BLAKE2S_ROUND(5);
BLAKE2S_ROUND(6);
BLAKE2S_ROUND(7);
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);
#undef BLAKE2S_ROUND
#else
#define G(r,i,a,b,c,d) \
@@ -290,7 +277,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
b = SPH_ROTR32(b ^ c, 7); \
} while(0)
#define ROUND(r) \
#define BLAKE2S_ROUND(r) \
do { \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
@@ -302,24 +289,25 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
} while(0)
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );
ROUND( 4 );
ROUND( 5 );
ROUND( 6 );
ROUND( 7 );
ROUND( 8 );
ROUND( 9 );
#endif
BLAKE2S_ROUND(0);
BLAKE2S_ROUND(1);
BLAKE2S_ROUND(2);
BLAKE2S_ROUND(3);
BLAKE2S_ROUND(4);
BLAKE2S_ROUND(5);
BLAKE2S_ROUND(6);
BLAKE2S_ROUND(7);
BLAKE2S_ROUND(8);
BLAKE2S_ROUND(9);
for( size_t i = 0; i < 8; ++i )
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
#undef G
#undef ROUND
#undef BLAKE2S_ROUND
return 0;
}

View File

@@ -101,15 +101,15 @@
{ \
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
Vd = v128_ror64xor( Vd, Va, 32 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \
Vb = v128_ror64xor( Vb, Vc, 24 ); \
\
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
Vd = v128_ror64xor( Vd, Va, 16 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
Vb = v128_ror64xor( Vb, Vc, 63 ); \
}
#define BLAKE2B_ROUND( R ) \

View File

@@ -39,16 +39,14 @@
#include <stddef.h>
#include "simd-utils.h"
#define SPH_SIZE_bmw256 256
#define SPH_SIZE_bmw512 512
// BMW-256 4 way 32
#if defined(__SSE2__) || defined(__ARM_NEON)
typedef struct
{
v128_t buf[64];
v128_t H[16];
v128u32_t buf[64];
v128u32_t H[16];
size_t ptr;
uint32_t bit_count; // assume bit_count fits in 32 bits
} bmw_4way_small_context;
@@ -58,13 +56,19 @@ typedef bmw_4way_small_context bmw256_4way_context;
void bmw256_4way_init( bmw256_4way_context *ctx );
void bmw256_4way_update(void *cc, const void *data, size_t len);
#define bmw256_4way bmw256_4way_update
void bmw256_4way_close(void *cc, void *dst);
void bmw256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#define bmw256_4x32_context bmw256_4way_context
#define bmw256_4x32_init bmw256_4way_init
#define bmw256_4x32_update bmw256_4way_update
#define bmw256_4x32_close bmw256_4way_close
#endif
#if defined(__AVX2__)
// BMW-256 8 way 32
@@ -85,9 +89,14 @@ void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
#define bmw256_8way bmw256_8way_update
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
#define bmw256_8x32_context bmw256_8way_context
#define bmw256_8x32_init bmw256_8way_init
#define bmw256_8x32_update bmw256_8way_update
#define bmw256_8x32_close bmw256_8way_close
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-256 16 way 32
@@ -106,6 +115,11 @@ void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
size_t len );
void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );
#define bmw256_16x32_context bmw256_16way_context
#define bmw256_16x32_init bmw256_16way_init
#define bmw256_16x32_update bmw256_16way_update
#define bmw256_16x32_close bmw256_16way_close
#endif
// BMW-512 2 way 64
@@ -157,7 +171,7 @@ void bmw512_4way_addbits_and_close(
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-512 64 bit 8 way
typedef struct

View File

@@ -45,7 +45,7 @@ extern "C"{
#define LPAR (
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
// BMW-256 4 way 32
/*
@@ -284,9 +284,9 @@ static const uint32_t IV256[] = {
v128_xor( M[13], H[13] ) ) )
void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
void compress_small( const v128u32_t *M, const v128u32_t H[16], v128u32_t dH[16] )
{
v128u64_t qt[32], xl, xh; \
v128u32_t qt[32], xl, xh; \
qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
@@ -428,49 +428,25 @@ static const uint32_t final_s[16][4] =
{ 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae },
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
};
/*
static const v128u64_t final_s[16] =
{
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
{ 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 },
{ 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 },
{ 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 },
{ 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 },
{ 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 },
{ 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 },
{ 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 },
{ 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 },
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
{ 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab },
{ 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac },
{ 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad },
{ 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
{ 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
};
*/
void bmw256_4way_init( bmw256_4way_context *ctx )
{
ctx->H[ 0] = v128_64( 0x4041424340414243 );
ctx->H[ 1] = v128_64( 0x4445464744454647 );
ctx->H[ 2] = v128_64( 0x48494A4B48494A4B );
ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = v128_64( 0x5051525350515253 );
ctx->H[ 5] = v128_64( 0x5455565754555657 );
ctx->H[ 6] = v128_64( 0x58595A5B58595A5B );
ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = v128_64( 0x6061626360616263 );
ctx->H[ 9] = v128_64( 0x6465666764656667 );
ctx->H[10] = v128_64( 0x68696A6B68696A6B );
ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = v128_64( 0x7071727370717273 );
ctx->H[13] = v128_64( 0x7475767774757677 );
ctx->H[14] = v128_64( 0x78797A7B78797A7B );
ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F );
// for ( int i = 0; i < 16; i++ )
// sc->H[i] = v128_32( iv[i] );
ctx->H[ 0] = v128_32( 0x40414243 );
ctx->H[ 1] = v128_32( 0x44454647 );
ctx->H[ 2] = v128_32( 0x48494A4B );
ctx->H[ 3] = v128_32( 0x4C4D4E4F );
ctx->H[ 4] = v128_32( 0x50515253 );
ctx->H[ 5] = v128_32( 0x54555657 );
ctx->H[ 6] = v128_32( 0x58595A5B );
ctx->H[ 7] = v128_32( 0x5C5D5E5F );
ctx->H[ 8] = v128_32( 0x60616263 );
ctx->H[ 9] = v128_32( 0x64656667 );
ctx->H[10] = v128_32( 0x68696A6B );
ctx->H[11] = v128_32( 0x6C6D6E6F );
ctx->H[12] = v128_32( 0x70717273 );
ctx->H[13] = v128_32( 0x74757677 );
ctx->H[14] = v128_32( 0x78797A7B );
ctx->H[15] = v128_32( 0x7C7D7E7F );
ctx->ptr = 0;
ctx->bit_count = 0;
}
@@ -478,10 +454,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
static void
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
{
v128u64_t *vdata = (v128u64_t*)data;
v128u64_t *buf;
v128u64_t htmp[16];
v128u64_t *h1, *h2;
v128u32_t *vdata = (v128u32_t*)data;
v128u32_t *buf;
v128u32_t htmp[16];
v128u32_t *h1, *h2;
size_t ptr;
const int buf_size = 64; // bytes of one lane, compatible with len
@@ -503,7 +479,7 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
ptr += clen;
if ( ptr == buf_size )
{
v128u64_t *ht;
v128u32_t *ht;
compress_small( buf, h1, h2 );
ht = h1;
h1 = h2;
@@ -521,14 +497,14 @@ static void
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
v128u64_t *buf;
v128u64_t h1[16], h2[16], *h;
v128u32_t *buf;
v128u32_t h1[16], h2[16], *h;
size_t ptr, u, v;
const int buf_size = 64; // bytes of one lane, compatible with len
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
buf[ ptr>>2 ] = v128_32( 0x00000080 );
ptr += 4;
h = sc->H;
@@ -548,7 +524,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_small( buf, (v128u64_t*)final_s, h1 );
compress_small( buf, (v128u32_t*)final_s, h1 );
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
casti_v128( dst, u ) = h1[v];
@@ -1057,7 +1033,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-256 16 way 32

View File

@@ -2,12 +2,11 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
//#include "sph_keccak.h"
#include "bmw-hash-4way.h"
#if defined(BMW512_8WAY)
void bmw512hash_8way(void *state, const void *input)
void bmw512hash_8way( void *state, const void *input )
{
bmw512_8way_context ctx;
bmw512_8way_init( &ctx );
@@ -27,9 +26,9 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
__m512i *noncev = (__m512i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
const int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
@@ -43,7 +42,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
@@ -59,9 +58,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
#elif defined(BMW512_4WAY)
//#ifdef BMW512_4WAY
void bmw512hash_4way(void *state, const void *input)
void bmw512hash_4way( void *state, const void *input )
{
bmw512_4way_context ctx;
bmw512_4way_init( &ctx );
@@ -80,10 +77,10 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
const int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
do {
@@ -96,7 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
@@ -110,4 +107,55 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(BMW512_2WAY)
void bmw512hash_2x64( void *state, const void *input )
{
bmw512_2x64_context ctx;
bmw512_2x64_init( &ctx );
bmw512_2x64_update( &ctx, input, 80 );
bmw512_2x64_close( &ctx, state );
}
int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 3*4+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
v128_bswap32_intrlv80_2x64( vdata, pdata );
do {
*noncev = v128_intrlv_blend_32( v128_bswap32(
v128_set32( n+1, 0, n, 0 ) ), *noncev );
bmw512hash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -2,7 +2,7 @@
bool register_bmw512_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 256.0;
#if defined (BMW512_8WAY)
gate->scanhash = (void*)&scanhash_bmw512_8way;
@@ -10,6 +10,9 @@ bool register_bmw512_algo( algo_gate_t* gate )
#elif defined (BMW512_4WAY)
gate->scanhash = (void*)&scanhash_bmw512_4way;
gate->hash = (void*)&bmw512hash_4way;
#elif defined (BMW512_2WAY)
gate->scanhash = (void*)&scanhash_bmw512_2x64;
gate->hash = (void*)&bmw512hash_2x64;
#else
gate->scanhash = (void*)&scanhash_bmw512;
gate->hash = (void*)&bmw512hash;

View File

@@ -4,23 +4,31 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BMW512_8WAY 1
#elif defined(__AVX2__)
#define BMW512_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define BMW512_2WAY 1
#endif
#if defined(BMW512_8WAY)
void bmw512hash_8way( void *state, const void *input );
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(BMW512_4WAY)
void bmw512hash_4way( void *state, const void *input );
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(BMW512_2WAY)
void bmw512hash_2x64( void *state, const void *input );
int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else

View File

@@ -950,7 +950,7 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-512 8 WAY

View File

@@ -26,7 +26,7 @@ static const uint64_t IV512[] =
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// 4 way 128 is handy to avoid reinterleaving in many algos.
// If reinterleaving is necessary it may be more efficient to use

View File

@@ -6,7 +6,7 @@
#if defined(__AVX2__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
struct _cube_4way_context
{

View File

@@ -13,7 +13,7 @@ static void transform( cubehashParam *sp )
int r;
const int rounds = sp->rounds;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
register __m512i x0, x1;
@@ -39,7 +39,7 @@ static void transform( cubehashParam *sp )
#elif defined(__AVX2__)
register __m256i x0, x1, x2, x3, y0, y1;
register __m256i x0, x1, x2, x3, t0;
x0 = _mm256_load_si256( (__m256i*)sp->x );
x1 = _mm256_load_si256( (__m256i*)sp->x + 1 );
@@ -50,10 +50,10 @@ static void transform( cubehashParam *sp )
{
x2 = _mm256_add_epi32( x0, x2 );
x3 = _mm256_add_epi32( x1, x3 );
y0 = mm256_rol_32( x1, 7 );
y1 = mm256_rol_32( x0, 7 );
x0 = _mm256_xor_si256( y0, x2 );
x1 = _mm256_xor_si256( y1, x3 );
t0 = mm256_rol_32( x1, 7 );
x1 = mm256_rol_32( x0, 7 );
x0 = _mm256_xor_si256( t0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = mm256_swap128_64( x2 );
x3 = mm256_swap128_64( x3 );
x2 = _mm256_add_epi32( x0, x2 );
@@ -75,7 +75,7 @@ static void transform( cubehashParam *sp )
#else // AVX, SSE2, NEON
v128_t x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
v128_t x0, x1, x2, x3, x4, x5, x6, x7, t0, t1;
x0 = casti_v128( sp->x, 0 );
x1 = casti_v128( sp->x, 1 );
@@ -92,16 +92,12 @@ static void transform( cubehashParam *sp )
x5 = v128_add32( x1, x5 );
x6 = v128_add32( x2, x6 );
x7 = v128_add32( x3, x7 );
y0 = x2;
y1 = x3;
y2 = x0;
y3 = x1;
x0 = v128_rol32( y0, 7 );
x1 = v128_rol32( y1, 7 );
x2 = v128_rol32( y2, 7 );
x3 = v128_rol32( y3, 7 );
x0 = v128_xor( x0, x4 );
x1 = v128_xor( x1, x5 );
t0 = v128_rol32( x2, 7 );
t1 = v128_rol32( x3, 7 );
x2 = v128_rol32( x0, 7 );
x3 = v128_rol32( x1, 7 );
x0 = v128_xor( t0, x4 );
x1 = v128_xor( t1, x5 );
x2 = v128_xor( x2, x6 );
x3 = v128_xor( x3, x7 );
x4 = v128_swap64( x4 );
@@ -112,19 +108,15 @@ static void transform( cubehashParam *sp )
x5 = v128_add32( x1, x5 );
x6 = v128_add32( x2, x6 );
x7 = v128_add32( x3, x7 );
y0 = x1;
y1 = x0;
y2 = x3;
y3 = x2;
x0 = v128_rol32( y0, 11 );
x1 = v128_rol32( y1, 11 );
x2 = v128_rol32( y2, 11 );
x3 = v128_rol32( y3, 11 );
x0 = v128_xor( x0, x4 );
x1 = v128_xor( x1, x5 );
x2 = v128_xor( x2, x6 );
x3 = v128_xor( x3, x7 );
x4 = v128_swap64_32( x4 );
t0 = v128_rol32( x1, 11 );
x1 = v128_rol32( x0, 11 );
t1 = v128_rol32( x3, 11 );
x3 = v128_rol32( x2, 11 );
x0 = v128_xor( t0, x4 );
x1 = v128_xor( x1, x5 );
x2 = v128_xor( t1, x6 );
x3 = v128_xor( x3, x7 );
x4 = v128_swap64_32( x4 );
x5 = v128_swap64_32( x5 );
x6 = v128_swap64_32( x6 );
x7 = v128_swap64_32( x7 );

View File

@@ -236,9 +236,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
}
HashReturn init_echo(hashState_echo *ctx, int nHashSize)
HashReturn init_echo( hashState_echo *ctx, int nHashSize )
{
int i, j;
@@ -280,7 +278,8 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
return SUCCESS;
}
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
HashReturn update_echo( hashState_echo *state, const void *data,
uint32_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -330,7 +329,7 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt
return SUCCESS;
}
HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
HashReturn final_echo( hashState_echo *state, void *hashval)
{
v128_t remainingbits;
@@ -407,8 +406,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
return SUCCESS;
}
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
HashReturn update_final_echo( hashState_echo *state, void *hashval,
const void *data, uint32_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -530,8 +529,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
return SUCCESS;
}
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength datalen )
HashReturn echo_full( hashState_echo *state, void *hashval,
int nHashSize, const void *data, uint32_t datalen )
{
int i, j;
@@ -578,7 +577,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
{
// Fill the buffer
memcpy( state->buffer + state->uBufferBytes,
(void*)data, state->uBlockLength - state->uBufferBytes );
data, state->uBlockLength - state->uBufferBytes );
// Process buffer
Compress( state, state->buffer, 1 );
@@ -601,7 +600,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
}
if( uRemainingBytes > 0 )
memcpy(state->buffer, (void*)data, uRemainingBytes);
memcpy(state->buffer, data, uRemainingBytes);
state->uBufferBytes = uRemainingBytes;
}
@@ -689,7 +688,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
}
#if 0
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
{
HashReturn hRet;
@@ -746,5 +745,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
return SUCCESS;
}
#endif
#endif

View File

@@ -47,16 +47,16 @@ HashReturn init_echo(hashState_echo *state, int hashbitlen);
HashReturn reinit_echo(hashState_echo *state);
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen);
HashReturn update_echo(hashState_echo *state, const void *data, uint32_t databitlen);
HashReturn final_echo(hashState_echo *state, BitSequence *hashval);
HashReturn final_echo(hashState_echo *state, void *hashval);
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
HashReturn hash_echo(int hashbitlen, const void *data, uint32_t databitlen, void *hashval);
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength databitlen );
HashReturn update_final_echo( hashState_echo *state, void *hashval,
const void *data, uint32_t databitlen );
HashReturn echo_full( hashState_echo *state, void *hashval,
int nHashSize, const void *data, uint32_t databitlen );
#endif // HASH_API_H

View File

@@ -11,7 +11,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
};
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ECHO_SUBBYTES4(state, j) \
state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \

View File

@@ -5,7 +5,7 @@
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -36,7 +36,6 @@
#include "sph_echo.h"
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
@@ -1031,4 +1030,3 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}
#endif
#endif // !AES

View File

@@ -36,8 +36,6 @@
#ifndef SPH_ECHO_H__
#define SPH_ECHO_H__
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
#endif
@@ -318,5 +316,4 @@ void sph_echo512_addbits_and_close(
#ifdef __cplusplus
}
#endif
#endif // !AES
#endif

View File

@@ -15,237 +15,176 @@
*
*/
#if defined(__AES__)
#include <x86intrin.h>
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#include <memory.h>
#include "fugue-aesni.h"
static const v128u64_t _supermix1a __attribute__ ((aligned (16))) =
{ 0x0202010807020100, 0x0a05000f06010c0b };
MYALIGN const unsigned long long _supermix1a[] = {0x0202010807020100, 0x0a05000f06010c0b};
MYALIGN const unsigned long long _supermix1b[] = {0x0b0d080703060504, 0x0e0a090c050e0f0a};
MYALIGN const unsigned long long _supermix1c[] = {0x0402060c070d0003, 0x090a060580808080};
MYALIGN const unsigned long long _supermix1d[] = {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
MYALIGN const unsigned long long _supermix2a[] = {0x07020d0880808080, 0x0b06010c050e0f0a};
MYALIGN const unsigned long long _supermix4a[] = {0x000f0a050c0b0601, 0x0302020404030e09};
MYALIGN const unsigned long long _supermix4b[] = {0x07020d08080e0d0d, 0x07070908050e0f0a};
MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x0302000007060504};
MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104};
MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06};
//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
//MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
//MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
//MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
static const v128u64_t _supermix1b __attribute__ ((aligned (16))) =
{ 0x0b0d080703060504, 0x0e0a090c050e0f0a };
static const v128u64_t _supermix1c __attribute__ ((aligned (16))) =
{ 0x0402060c070d0003, 0x090a060580808080 };
MYALIGN const unsigned int _IV512[] = {
0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
static const v128u64_t _supermix1d __attribute__ ((aligned (16))) =
{ 0x808080800f0e0d0c, 0x0f0e0d0c80808080 };
static const v128u64_t _supermix2a __attribute__ ((aligned (16))) =
{ 0x07020d0880808080, 0x0b06010c050e0f0a };
static const v128u64_t _supermix4a __attribute__ ((aligned (16))) =
{ 0x000f0a050c0b0601, 0x0302020404030e09 };
static const v128u64_t _supermix4b __attribute__ ((aligned (16))) =
{ 0x07020d08080e0d0d, 0x07070908050e0f0a };
static const v128u64_t _supermix4c __attribute__ ((aligned (16))) =
{ 0x0706050403020000, 0x0302000007060504 };
static const v128u64_t _supermix7a __attribute__ ((aligned (16))) =
{ 0x010c0b060d080702, 0x0904030e03000104 };
static const v128u64_t _supermix7b __attribute__ ((aligned (16))) =
{ 0x8080808080808080, 0x0504070605040f06 };
static const v128u64_t _inv_shift_rows __attribute__ ((aligned (16))) =
{ 0x0b0e0104070a0d00, 0x0306090c0f020508 };
static const v128u64_t _mul2mask __attribute__ ((aligned (16))) =
{ 0x000000001b1b0000, 0x0000000000000000 };
static const v128u64_t _mul4mask __attribute__ ((aligned (16))) =
{ 0x000000002d361b00, 0x0000000000000000 };
static const v128u64_t _lsbmask2 __attribute__ ((aligned (16))) =
{ 0x0303030303030303, 0x0303030303030303 };
static const uint32_t _IV512[] __attribute__ ((aligned (32))) =
{ 0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000
};
#if defined(__SSE4_1__)
#if defined(__ARM_NEON)
#define PACK_S0(s0, s1, t1)\
s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
#define mask_1000(v) v128_put32( v, 0, 3 )
#define UNPACK_S0(s0, s1, t1)\
s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
s0 = mm128_mask_32( s0, 8 )
static const v128u32_t MASK_3321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x0f0e0d0c };
#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = s1;\
t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1);
static const v128u32_t MASK_3033 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c };
#else // SSE2
static const v128u32_t MASK_3303 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c, 0x0f0e0d0c };
#define PACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
s0 = _mm_xor_si128(s0, t1);
static const v128u32_t MASK_0321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100 };
#define UNPACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
s0 = mm128_mask_32( s0, 8 )
#define shuffle_3303(v) vqtbl1q_u8( v, MASK_3303 )
#define shuffle_0321(v) vqtbl1q_u8( v, MASK_0321 )
#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = _mm_shuffle_epi32(s1, 0xf9);\
t2 = _mm_shuffle_epi32(s2, 0xcf);\
t1 = _mm_xor_si128(t1, t2);\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1)
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = vqtbl1q_u8( s1, MASK_3321 ); \
t2 = vqtbl1q_u8( s2, MASK_3033 ); \
t1 = v128_xor( t1, t2 ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );
#elif defined(__SSE4_1__)
#define mask_1000(v) v128_mask32( v, 8 )
#define shuffle_3303(v) _mm_shuffle_epi32( v, 0xf3 )
#define shuffle_0321(v) _mm_shuffle_epi32( v, 0x39 )
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = s1; \
t1 = v128_shuffle2_32( t1, s2, _MM_SHUFFLE( 3, 0, 2, 1 ) ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );
#endif
#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s10 = _mm_xor_si128(s10, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1)
#define PACK_S0( s0, s1, t1 ) \
s0 = v128_movlane32( s0, 3, s1, 0 )
#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s16 = _mm_xor_si128(s16, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1)
#define UNPACK_S0( s0, s1, t1 ) \
s1 = v128_movlane32( s1, 0, s0, 3 ); \
s0 = mask_1000( s0 )
#define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s22 = _mm_xor_si128(s22, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s7 = _mm_xor_si128(s7, t1)
t1 = shuffle_3303( s0 ); \
s22 = v128_xor(s22, t1);\
t1 = v128_put32( v128_zero, *(uint32_t*)msg, 0 ); \
s0 = v128_movlane32( s0, 0, t1, 0 ); \
t1 = v128_alignr64( t1, v128_zero, 1 ); \
s8 = v128_xor(s8, t1);\
t1 = shuffle_3303( s24 ); \
s0 = v128_xor(s0, t1);\
t1 = shuffle_3303( s27 ); \
s4 = v128_xor(s4, t1);\
t1 = shuffle_3303( s30 ); \
s7 = v128_xor(s7, t1)
#define PRESUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
/*
#define PRESUPERMIX(x, t1, s1, s2, t2)\
s1 = x;\
s2 = _mm_add_epi8(x, x);\
t2 = _mm_add_epi8(s2, s2);\
t1 = _mm_srli_epi16(x, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
*/
#define SUBSTITUTE(r0, _t2 )\
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
_t2 = _mm_aesenclast_si128( _t2, v128_zero )
#define SUBSTITUTE( r0, _t2 ) \
_t2 = v128_shuffle8( r0, _inv_shift_rows ); \
_t2 = v128_aesenclast_nokey( _t2 )
#define SUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t2 = v128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t3 = v128_add8( t0, t0 ); \
t4 = v128_add8( t3, t3 ); \
t1 = v128_sr16( t0, 6 ); \
t1 = v128_and( t1, _lsbmask2 ); \
t0 = v128_xor( t4, v128_shuffle8( _mul4mask, t1 ) ); \
t4 = v128_shuffle8( t2, _supermix1b ); \
t3 = v128_xor( t3, v128_shuffle8( _mul2mask, t1 ) ); \
t1 = v128_shuffle8( t4, _supermix1c ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t4, _supermix1d ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t2, _supermix1a ); \
t2 = v128_xor3( t2, t3, t0 ); \
t2 = v128_shuffle8( t2, _supermix7a ); \
t4 = v128_xor3( t4, t1, t2 ); \
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t2 = v128_shuffle8( t2, _supermix7b ); \
t3 = v128_shuffle8( t3, _supermix2a ); \
t1 = v128_shuffle8( t0, _supermix4a ); \
t0 = v128_shuffle8( t0, _supermix4b ); \
t4 = v128_xor3( t4, t2, t1 ); \
t0 = _mm_xor_si128(t0, t3);\
t4 = v128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
/*
#define SUPERMIX(t0, t1, t2, t3, t4)\
PRESUPERMIX(t0, t1, t2, t3, t4);\
POSTSUPERMIX(t0, t1, t2, t3, t4)
*/
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t4 = t1;\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t4 = _mm_xor_si128(t4, t1);\
t2 = v128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = _mm_xor_si128(t4, t2);\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t4 = _mm_xor_si128(t4, t2);\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t4 = _mm_xor_si128(t4, t1);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t0 = _mm_xor_si128(t0, t3);\
t4 = _mm_xor_si128(t4, t0);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
t4 = _mm_xor_si128(t4, t0)
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE(r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE(r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
UNPACK_S0(r3c, r3a, _t3)
t0 = v128_xor( t0, t3 ); \
t4 = v128_xor3( t4, t0, v128_shuffle8( t0, _supermix4c ) );
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE( r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
_t0 = shuffle_0321( r1c ); \
r2c = v128_xor(r2c, _t0);\
_t0 = mask_1000( _t0 ); \
r2d = v128_xor(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
_t0 = shuffle_0321( r2c ); \
r3c = v128_xor(r3c, _t0);\
_t0 = mask_1000( _t0 ); \
r3d = v128_xor(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE( r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
_t0 = _mm_shuffle_epi32(r3c, 0x39);\
r4c = _mm_xor_si128(r4c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r4d = _mm_xor_si128(r4d, _t0);\
_t0 = shuffle_0321( r3c ); \
r4c = v128_xor(r4c, _t0);\
_t0 = mask_1000( _t0 ); \
r4d = v128_xor(r4d, _t0);\
UNPACK_S0(r3c, r3a, _t3);\
SUBSTITUTE( r4c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
@@ -256,18 +195,19 @@ MYALIGN const unsigned int _IV512[] = {
block[1] = col[(base + a + 1) % s];\
block[2] = col[(base + a + 2) % s];\
block[3] = col[(base + a + 3) % s];\
x = _mm_load_si128((__m128i*)block)
x = v128_load( (v128_t*)block )
#define STORECOLUMN(x, s)\
_mm_store_si128((__m128i*)block, x);\
v128_store((v128_t*)block, x );\
col[(base + 0) % s] = block[0];\
col[(base + 1) % s] = block[1];\
col[(base + 2) % s] = block[2];\
col[(base + 3) % s] = block[3]
void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
void Compress512( hashState_fugue *ctx, const unsigned char *pmsg,
unsigned int uBlockCount )
{
__m128i _t0, _t1, _t2, _t3;
v128_t _t0, _t1, _t2, _t3;
switch(ctx->base)
{
@@ -346,134 +286,133 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
pmsg += 4;
uBlockCount--;
}
}
void Final512(hashState_fugue *ctx, BitSequence *hashval)
void Final512( hashState_fugue *ctx, uint8_t *hashval )
{
unsigned int block[4] __attribute__ ((aligned (32)));
unsigned int col[36] __attribute__ ((aligned (16)));
unsigned int i, base;
__m128i r0, _t0, _t1, _t2, _t3;
v128_t r0, _t0, _t1, _t2, _t3;
for(i = 0; i < 12; i++)
for( i = 0; i < 12; i++ )
{
_mm_store_si128((__m128i*)block, ctx->state[i]);
v128_store( (v128_t*)block, ctx->state[i] );
col[3 * i + 0] = block[0];
col[3 * i + 1] = block[1];
col[3 * i + 2] = block[2];
}
base = (36 - (12 * ctx->base)) % 36;
base = ( 36 - (12 * ctx->base) ) % 36;
for(i = 0; i < 32; i++)
for( i = 0; i < 32; i++ )
{
// ROR3
base = (base + 33) % 36;
// CMIX
col[(base + 0) % 36] ^= col[(base + 4) % 36];
col[(base + 1) % 36] ^= col[(base + 5) % 36];
col[(base + 2) % 36] ^= col[(base + 6) % 36];
col[(base + 18) % 36] ^= col[(base + 4) % 36];
col[(base + 19) % 36] ^= col[(base + 5) % 36];
col[(base + 20) % 36] ^= col[(base + 6) % 36];
col[ (base + 0) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 1) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 2) % 36 ] ^= col[ (base + 6) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 20) % 36 ] ^= col[ (base + 6) % 36 ];
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}
for(i = 0; i < 13; i++)
for( i = 0; i < 13; i++ )
{
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S19 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 28) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 28) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR8
base = (base + 28) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// Transform to the standard basis and store output; S1 || S2 || S3 || S4
LOADCOLUMN(r0, 36, 1);
_mm_store_si128((__m128i*)hashval, r0);
LOADCOLUMN( r0, 36, 1 );
v128_store( (v128_t*)hashval, r0 );
// Transform to the standard basis and store output; S9 || S10 || S11 || S12
LOADCOLUMN(r0, 36, 9);
_mm_store_si128((__m128i*)hashval + 1, r0);
LOADCOLUMN( r0, 36, 9 );
v128_store( (v128_t*)hashval + 1, r0 );
// Transform to the standard basis and store output; S18 || S19 || S20 || S21
LOADCOLUMN(r0, 36, 18);
_mm_store_si128((__m128i*)hashval + 2, r0);
LOADCOLUMN( r0, 36, 18 );
v128_store( (v128_t*)hashval + 2, r0 );
// Transform to the standard basis and store output; S27 || S28 || S29 || S30
LOADCOLUMN(r0, 36, 27);
_mm_store_si128((__m128i*)hashval + 3, r0);
LOADCOLUMN( r0, 36, 27 );
v128_store( (v128_t*)hashval + 3, r0 );
}
HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
int fugue512_Init( hashState_fugue *ctx, int nHashSize )
{
int i;
ctx->processed_bits = 0;
@@ -487,18 +426,18 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
for(i = 0; i < 6; i++)
ctx->state[i] = v128_zero;
ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0);
ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1);
ctx->state[8] = _mm_load_si128((__m128i*)_IV512 + 2);
ctx->state[9] = _mm_load_si128((__m128i*)_IV512 + 3);
ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
ctx->state[6] = casti_v128( _IV512, 0 );
ctx->state[7] = casti_v128( _IV512, 1 );
ctx->state[8] = casti_v128( _IV512, 2 );
ctx->state[9] = casti_v128( _IV512, 3 );
ctx->state[10] = casti_v128( _IV512, 4 );
ctx->state[11] = casti_v128( _IV512, 5 );
return SUCCESS;
return 0;
}
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
int fugue512_Update( hashState_fugue *state, const void *data,
uint64_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -509,7 +448,8 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
if(state->uBufferBytes != 0)
{
// Fill the buffer
memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
memcpy( state->buffer + state->uBufferBytes, (void*)data,
state->uBlockLength - state->uBufferBytes );
// Process the buffer
Compress512(state, state->buffer, 1);
@@ -545,13 +485,13 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
state->uBufferBytes += uByteLength;
}
return SUCCESS;
return 0;
}
HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
int fugue512_Final( hashState_fugue *state, void *hashval )
{
unsigned int i;
BitSequence lengthbuf[8] __attribute__((aligned(64)));
uint8_t lengthbuf[8] __attribute__((aligned(64)));
// Update message bit count
state->processed_bits += state->uBufferBytes * 8;
@@ -575,16 +515,17 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
// Finalization
Final512(state, hashval);
return SUCCESS;
return 0;
}
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
uint64_t databitlen )
{
fugue512_Init(hs, 512);
fugue512_Update(hs, data, databitlen*8);
fugue512_Final(hs, hashval);
return SUCCESS;
fugue512_Init( hs, 512 );
fugue512_Update( hs, data, databitlen*8 );
fugue512_Final( hs, hashval );
return 0;
}
#endif // AES

View File

@@ -14,37 +14,31 @@
#ifndef FUGUE_HASH_API_H
#define FUGUE_HASH_API_H
#if defined(__AES__)
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#if !defined(__SSE4_1__)
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
#endif
#include "compat/sha3_common.h"
#include "simd-utils.h"
typedef struct
{
__m128i state[12];
v128_t state[12];
unsigned int base;
unsigned int uHashSize;
unsigned int uBlockLength;
unsigned int uBufferBytes;
DataLength processed_bits;
BitSequence buffer[4];
uint64_t processed_bits;
uint8_t buffer[4];
} hashState_fugue __attribute__ ((aligned (64)));
// These functions are deprecated, use the lower case macro aliases that use
// the standard interface. This will be cleaned up at a later date.
HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
int fugue512_Init( hashState_fugue *state, int hashbitlen );
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
int fugue512_Update( hashState_fugue *state, const void *data,
uint64_t databitlen );
HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
int fugue512_Final( hashState_fugue *state, void *hashval );
#define fugue512_init( state ) \
fugue512_Init( state, 512 )
@@ -54,7 +48,8 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
fugue512_Final
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
uint64_t databitlen);
#endif // AES
#endif // HASH_API_H

View File

@@ -696,7 +696,7 @@ static void AddModulo512(const void *a,const void *b,void *c)
static void AddXor512(const void *a,const void *b,void *c)
{
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
casti_m512i( b, 0 ) );
#elif defined(__AVX2__)
@@ -704,15 +704,15 @@ static void AddXor512(const void *a,const void *b,void *c)
casti_m256i( b, 0 ) );
casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
casti_m256i( b, 1 ) );
#elif defined(__SSE2__)
casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
casti_m128i( b, 0 ) );
casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
casti_m128i( b, 1 ) );
casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
casti_m128i( b, 2 ) );
casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
casti_m128i( b, 3 ) );
#elif defined(__SSE2__) || defined(__ARM_NEON)
casti_v128( c, 0 ) = v128_xor( casti_v128( a, 0 ),
casti_v128( b, 0 ) );
casti_v128( c, 1 ) = v128_xor( casti_v128( a, 1 ),
casti_v128( b, 1 ) );
casti_v128( c, 2 ) = v128_xor( casti_v128( a, 2 ),
casti_v128( b, 2 ) );
casti_v128( c, 3 ) = v128_xor( casti_v128( a, 3 ),
casti_v128( b, 3 ) );
#else
const unsigned long long *A=a, *B=b;
unsigned long long *C=c;

View File

@@ -60,54 +60,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
#if defined(__ARM_NEON)
// No fast shuffle on NEON
//static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
{ 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
#define gr_shuffle32( v ) v128_blendv( v128_qrev32( v ), v, BLEND_MASK )
/*
#define TRANSP_MASK \
0xd,0x5,0x9,0x1,0xc,0x4,0x8,0x0,0xf,0x7,0xb,0x3,0xe,0x6,0xa,0x2
#define SUBSH_MASK0 \
0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8
#define SUBSH_MASK1 \
0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9
#define SUBSH_MASK2 \
0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa
#define SUBSH_MASK3 \
0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb
#define SUBSH_MASK4 \
0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc
#define SUBSH_MASK5 \
0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd
#define SUBSH_MASK6 \
0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe
#define SUBSH_MASK7 \
0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3
//#define gr_shuffle8( v, c ) v128_shullfev8( v, c )
#define gr_shuffle8( v, c15, c14, c13, c12, c11, c10, c09, c08, \
c07, c06, c05, c04, c03, c02, c01, c00 ) \
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
v, 15, v, c15 ), 14, v, c14 ), 13, v, c13 ), 12, v, c12 ), \
11, v, c11 ), 10, v, c10 ), 9, v, c09 ), 8, v, c08 ), \
7, v, c07 ), 6, v, c06 ), 5, v, c05 ), 4, v, c04 ), \
3, v, c03 ), 2, v, c02 ), 1, v, c01 ), 0, v, c00 )
*/
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )
#else
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )
#endif
#define tos(a) #a
#define tostr(a) tos(a)
@@ -140,7 +103,7 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#if defined(VL256)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
@@ -334,17 +297,16 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
*/
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* SubBytes */\
b0 = v128_xor(b0, b0);\
a0 = v128_aesenclast(a0, b0);\
a1 = v128_aesenclast(a1, b0);\
a2 = v128_aesenclast(a2, b0);\
a3 = v128_aesenclast(a3, b0);\
a4 = v128_aesenclast(a4, b0);\
a5 = v128_aesenclast(a5, b0);\
a6 = v128_aesenclast(a6, b0);\
a7 = v128_aesenclast(a7, b0);\
a0 = v128_aesenclast_nokey( a0 ); \
a1 = v128_aesenclast_nokey( a1 ); \
a2 = v128_aesenclast_nokey( a2 ); \
a3 = v128_aesenclast_nokey( a3 ); \
a4 = v128_aesenclast_nokey( a4 ); \
a5 = v128_aesenclast_nokey( a5 ); \
a6 = v128_aesenclast_nokey( a6 ); \
a7 = v128_aesenclast_nokey( a7 ); \
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7 ); \
}
#define ROUNDS_P(){\
@@ -362,10 +324,9 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
xmm13 = v128_shuffle8( xmm13, SUBSH_MASK5 ); \
xmm14 = v128_shuffle8( xmm14, SUBSH_MASK6 ); \
xmm15 = v128_shuffle8( xmm15, SUBSH_MASK7 ); \
/* SubBytes + MixBytes */\
/* SubBytes + MixBytes */\
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
\
/* AddRoundConstant P1024 */\
xmm0 = v128_xor( xmm0, \
casti_v128( round_const_p, round_counter+1 ) ); \
@@ -467,7 +428,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
t1 = v128_unpackhi16(t1, i3);\
i2 = v128_unpacklo16(i2, i3);\
i0 = v128_unpacklo16(i0, i1);\
\
/* shuffle with immediate */\
t0 = gr_shuffle32( t0 ); \
t1 = gr_shuffle32( t1 ); \
@@ -477,7 +437,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
i2 = gr_shuffle32( i2 ); \
i4 = gr_shuffle32( i4 ); \
i6 = gr_shuffle32( i6 ); \
\
/* continue with unpack */\
t4 = i0;\
i0 = v128_unpacklo32(i0, i2);\
@@ -584,7 +543,8 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
/* transpose done */\
}/**/
#if 0
// not used
void INIT( v128_t* chaining )
{
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -613,6 +573,7 @@ void INIT( v128_t* chaining )
chaining[6] = xmm14;
chaining[7] = xmm15;
}
#endif
void TF1024( v128_t* chaining, const v128_t* message )
{

View File

@@ -1,3 +1,6 @@
#if !defined GROESTL256_INTR_AES_H__
#define GROESTL256_INTR_AES_H__
/* groestl-intr-aes.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -50,18 +53,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
#if defined(__ARM_NEON)
// No fast shuffle on NEON
static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
{ 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )
#else
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )
#endif
#define tos(a) #a
#define tostr(a) tos(a)
@@ -93,7 +95,7 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#if defined(VL256)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
@@ -598,4 +600,4 @@ void OF512( v128_t* chaining )
chaining[3] = xmm11;
}
#endif

View File

@@ -146,7 +146,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
const int hash_offset = SIZE512 - hashlen_m128i;
uint64_t blocks = len / SIZE512;
v128_t* in = (v128_t*)input;
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
@@ -181,6 +181,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output

View File

@@ -87,6 +87,7 @@ int final_groestl( hashState_groestl*, void* );
int update_and_final_groestl( hashState_groestl*, void*, const void*, int );
int groestl512( hashState_groestl*, void*, const void*, uint64_t );
#define groestl512_full groestl512
#define groestl512_ctx groestl512
#endif /* __hash_h */

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#define GROESTL_4WAY_VAES 1
#endif

View File

@@ -17,7 +17,7 @@
#if defined(__AVX2__) && defined(__VAES__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )

View File

@@ -43,7 +43,7 @@
#define SIZE256 (SIZE_512/16)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE256];

View File

@@ -42,7 +42,7 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12,
@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =
#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = mm256_bcast_m128( mm128_mask_32( v128_neg1, 0x3 ) ); \
b1 = mm256_bcast_m128( v128_mask32( v128_neg1, 0x3 ) ); \
a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
a1 = _mm256_xor_si256( a1, b1 );\
a2 = _mm256_xor_si256( a2, b1 );\

View File

@@ -17,7 +17,7 @@
#if defined(__AVX2__) && defined(__VAES__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{

View File

@@ -33,7 +33,7 @@
#define SIZE512 (SIZE_1024/16)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE512];

View File

@@ -50,7 +50,7 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12,
@@ -239,7 +239,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
casti_m128i( round_const_p, round_counter ) ) ); \
casti_v128u32( round_const_p, round_counter ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK1 );\
@@ -254,7 +254,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
\
/* AddRoundConstant P1024 */\
xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
casti_m128i( round_const_p, round_counter+1 ) ) ); \
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
@@ -283,7 +283,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
casti_m128i( round_const_q, round_counter ) ) ); \
casti_v128u32( round_const_q, round_counter ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK3 );\
@@ -306,7 +306,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
casti_m128i( round_const_q, round_counter+1 ) ) ); \
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
@@ -812,7 +812,7 @@ static const __m256i SUBSH_MASK7_2WAY =
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
casti_m128i( round_const_p, round_counter ) ) ); \
casti_v128u32( round_const_p, round_counter ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\
@@ -827,7 +827,7 @@ static const __m256i SUBSH_MASK7_2WAY =
\
/* AddRoundConstant P1024 */\
xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
casti_m128i( round_const_p, round_counter+1 ) ) ); \
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
@@ -856,7 +856,7 @@ static const __m256i SUBSH_MASK7_2WAY =
xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
casti_m128i( round_const_q, round_counter ) ) ); \
casti_v128u32( round_const_q, round_counter ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\
@@ -879,7 +879,7 @@ static const __m256i SUBSH_MASK7_2WAY =
xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
casti_m128i( round_const_q, round_counter+1 ) ) ); \
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\

View File

@@ -17,7 +17,7 @@ typedef struct {
#else
hashState_groestl groestl;
#endif
sha256_8way_context sha;
sha256_8x32_context sha;
} myrgr_8way_ctx_holder;
myrgr_8way_ctx_holder myrgr_8way_ctx;
@@ -29,7 +29,7 @@ void init_myrgr_8way_ctx()
#else
init_groestl( &myrgr_8way_ctx.groestl, 64 );
#endif
sha256_8way_init( &myrgr_8way_ctx.sha );
sha256_8x32_init( &myrgr_8way_ctx.sha );
}
void myriad_8way_hash( void *output, const void *input )
@@ -96,8 +96,8 @@ void myriad_8way_hash( void *output, const void *input )
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
sha256_8way_update( &ctx.sha, vhash, 64 );
sha256_8way_close( &ctx.sha, output );
sha256_8x32_update( &ctx.sha, vhash, 64 );
sha256_8x32_close( &ctx.sha, output );
}
int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
@@ -156,7 +156,7 @@ int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
typedef struct {
hashState_groestl groestl;
sha256_4way_context sha;
sha256_4x32_context sha;
} myrgr_4way_ctx_holder;
myrgr_4way_ctx_holder myrgr_4way_ctx;
@@ -164,7 +164,7 @@ myrgr_4way_ctx_holder myrgr_4way_ctx;
void init_myrgr_4way_ctx()
{
init_groestl (&myrgr_4way_ctx.groestl, 64 );
sha256_4way_init( &myrgr_4way_ctx.sha );
sha256_4x32_init( &myrgr_4way_ctx.sha );
}
void myriad_4way_hash( void *output, const void *input )
@@ -189,8 +189,8 @@ void myriad_4way_hash( void *output, const void *input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way_update( &ctx.sha, vhash, 64 );
sha256_4way_close( &ctx.sha, output );
sha256_4x32_update( &ctx.sha, vhash, 64 );
sha256_4x32_close( &ctx.sha, output );
}
int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
@@ -213,7 +213,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3,n+2,n+1,n ) );
myriad_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate )
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriad_hash;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT;
#endif
return true;
};

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#define MYRGR_8WAY 1
#elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
#define MYRGR_4WAY 1

View File

@@ -35,8 +35,6 @@
#include "sph_groestl.h"
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
#endif
@@ -3119,5 +3117,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}
#endif // !AES
#endif

View File

@@ -42,7 +42,6 @@ extern "C"{
#include <stddef.h>
#include "compat/sph_types.h"
#if !defined(__AES__)
/**
* Output size (in bits) for Groestl-224.
*/
@@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close(
}
#endif
#endif // !AES
#endif

View File

@@ -382,12 +382,12 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
#define S1F MF
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Hamsi 8 way AVX512
// Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have same
// timig. However, when tested hashing X13 on i9-9940x using cmplt with zero
// timing. However, testing hashing X13 on i9-9940x using cmplt with zero
// had a 3% faster overall hashrate than than using movepi.
#define INPUT_BIG8 \
@@ -418,13 +418,11 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
tb = mm512_xoror( b, d, a ); \
a = _mm512_xor_si512( a, c ); \
b = mm512_xoror( td, tb, a ); \
td = mm512_xorand( a, td, tb ); \
d = _mm512_ternarylogic_epi64( a, td, tb, 0x87 );/* not( xorand( a, td, tb ) ); */ \
a = c; \
c = mm512_xor3( tb, b, td ); \
d = mm512_not( td ); \
c = _mm512_ternarylogic_epi64( tb, b, d, 0x69 ); /* not( xor3( tb, b, d ) ); */ \
}
/*
#define SBOX8( a, b, c, d ) \
do { \
@@ -1061,7 +1059,7 @@ void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
WRITE_STATE_BIG8( sc );
}
void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
void hamsi_8way_big_final( hamsi512_8x64_context *sc, __m512i *buf )
{
__m512i m0, m1, m2, m3, m4, m5, m6, m7;
@@ -1073,7 +1071,7 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
WRITE_STATE_BIG8( sc );
}
void hamsi512_8way_init( hamsi_8way_big_context *sc )
void hamsi512_8x64_init( hamsi512_8x64_context *sc )
{
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
@@ -1089,7 +1087,7 @@ void hamsi512_8way_init( hamsi_8way_big_context *sc )
sc->h[7] = v512_64( iv[7] );
}
void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
@@ -1101,7 +1099,7 @@ void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
sc->partial_len = len;
}
void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst )
{
__m512i pad[1];
uint32_t ch, cl;
@@ -1122,7 +1120,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
// Hamsi 4 way AVX2
#if defined(__AVX512VL__)
#if defined(VL256)
#define INPUT_BIG \
do { \
@@ -1155,11 +1153,99 @@ do { \
b = mm256_xoror( td, tb, a ); \
d = _mm256_ternarylogic_epi64( a, td, tb, 0x87 );/* mm256_not( mm256_xorand( a, td, tb ) ); */ \
a = c; \
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /*mm256_not( mm256_xor3( tb, b, d ) );*/ \
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /* mm256_not( mm256_xor3( tb, b, d ) ); */ \
}
#else
#define INPUT_BIG_sub( db_i ) \
{ \
const __m256i dm = _mm256_cmpgt_epi64( zero, db_i ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
tp += 8; \
}
#define INPUT_BIG \
{ \
const __m256i db = *buf; \
const __m256i zero = m256_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
INPUT_BIG_sub( _mm256_slli_epi64( db,63 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,62 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,61 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,60 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,59 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,58 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,57 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,56 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,55 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,54 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,53 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,52 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,51 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,50 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,49 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,48 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,47 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,46 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,45 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,44 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,43 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,42 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,41 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,40 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,39 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,38 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,37 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,36 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,35 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,34 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,33 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,32 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,31 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,30 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,29 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,28 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,27 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,26 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,25 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,24 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,23 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,22 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,21 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,20 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,19 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,18 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,17 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,16 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,15 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,14 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,13 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,12 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,11 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,10 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 9 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 8 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 7 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 6 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 5 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 4 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 3 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 2 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 1 ) ); \
INPUT_BIG_sub( db ); \
}
#if 0
// dependent on the compiler unrolling the loop
#define INPUT_BIG \
do { \
__m256i db = *buf; \
@@ -1180,6 +1266,7 @@ do { \
tp += 8; \
} \
} while (0)
#endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX( a, b, c, d ) \
@@ -1219,7 +1306,7 @@ do { \
do { \
a = mm256_rol_32( a, 13 ); \
c = mm256_rol_32( c, 3 ); \
b = mm256_xor3( a, b, c ); \
b = mm256_xor3( b, a, c ); \
d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
b = mm256_rol_32( b, 1 ); \
d = mm256_rol_32( d, 7 ); \
@@ -1501,7 +1588,7 @@ do { /* order is important */ \
sc->h[14] = CE; \
sc->h[15] = CF;
#if defined(__AVX512VL__)
#if defined(VL256)
#define INPUT_8X32 \
{ \
@@ -1857,7 +1944,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void * dst,
////////////
void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
void hamsi_big( hamsi512_4x64_context *sc, __m256i *buf, size_t num )
{
DECL_STATE_BIG
uint32_t tmp;
@@ -1881,7 +1968,7 @@ void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
WRITE_STATE_BIG( sc );
}
void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
void hamsi_big_final( hamsi512_4x64_context *sc, __m256i *buf )
{
__m256i m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_BIG
@@ -1892,7 +1979,7 @@ void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
WRITE_STATE_BIG( sc );
}
void hamsi512_4way_init( hamsi_4way_big_context *sc )
void hamsi512_4x64_init( hamsi512_4x64_context *sc )
{
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
@@ -1907,7 +1994,7 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
sc->h[7] = v256_64( iv[7] );
}
void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
@@ -1919,7 +2006,7 @@ void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
sc->partial_len = len;
}
void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst )
{
__m256i pad[1];
uint32_t ch, cl;
@@ -1961,6 +2048,94 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
sc->h[6] = c6; \
sc->h[7] = c7;
#define INPUT_2x64_sub( db_i ) \
{ \
const v128u64_t dm = v128_cmpgt64( zero, db_i ); \
m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
tp += 8; \
}
#define INPUT_2x64 \
{ \
const v128u64_t db = *buf; \
const v128u64_t zero = v128_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
INPUT_2x64_sub( v128_sl64( db,63 ) ); \
INPUT_2x64_sub( v128_sl64( db,62 ) ); \
INPUT_2x64_sub( v128_sl64( db,61 ) ); \
INPUT_2x64_sub( v128_sl64( db,60 ) ); \
INPUT_2x64_sub( v128_sl64( db,59 ) ); \
INPUT_2x64_sub( v128_sl64( db,58 ) ); \
INPUT_2x64_sub( v128_sl64( db,57 ) ); \
INPUT_2x64_sub( v128_sl64( db,56 ) ); \
INPUT_2x64_sub( v128_sl64( db,55 ) ); \
INPUT_2x64_sub( v128_sl64( db,54 ) ); \
INPUT_2x64_sub( v128_sl64( db,53 ) ); \
INPUT_2x64_sub( v128_sl64( db,52 ) ); \
INPUT_2x64_sub( v128_sl64( db,51 ) ); \
INPUT_2x64_sub( v128_sl64( db,50 ) ); \
INPUT_2x64_sub( v128_sl64( db,49 ) ); \
INPUT_2x64_sub( v128_sl64( db,48 ) ); \
INPUT_2x64_sub( v128_sl64( db,47 ) ); \
INPUT_2x64_sub( v128_sl64( db,46 ) ); \
INPUT_2x64_sub( v128_sl64( db,45 ) ); \
INPUT_2x64_sub( v128_sl64( db,44 ) ); \
INPUT_2x64_sub( v128_sl64( db,43 ) ); \
INPUT_2x64_sub( v128_sl64( db,42 ) ); \
INPUT_2x64_sub( v128_sl64( db,41 ) ); \
INPUT_2x64_sub( v128_sl64( db,40 ) ); \
INPUT_2x64_sub( v128_sl64( db,39 ) ); \
INPUT_2x64_sub( v128_sl64( db,38 ) ); \
INPUT_2x64_sub( v128_sl64( db,37 ) ); \
INPUT_2x64_sub( v128_sl64( db,36 ) ); \
INPUT_2x64_sub( v128_sl64( db,35 ) ); \
INPUT_2x64_sub( v128_sl64( db,34 ) ); \
INPUT_2x64_sub( v128_sl64( db,33 ) ); \
INPUT_2x64_sub( v128_sl64( db,32 ) ); \
INPUT_2x64_sub( v128_sl64( db,31 ) ); \
INPUT_2x64_sub( v128_sl64( db,30 ) ); \
INPUT_2x64_sub( v128_sl64( db,29 ) ); \
INPUT_2x64_sub( v128_sl64( db,28 ) ); \
INPUT_2x64_sub( v128_sl64( db,27 ) ); \
INPUT_2x64_sub( v128_sl64( db,26 ) ); \
INPUT_2x64_sub( v128_sl64( db,25 ) ); \
INPUT_2x64_sub( v128_sl64( db,24 ) ); \
INPUT_2x64_sub( v128_sl64( db,23 ) ); \
INPUT_2x64_sub( v128_sl64( db,22 ) ); \
INPUT_2x64_sub( v128_sl64( db,21 ) ); \
INPUT_2x64_sub( v128_sl64( db,20 ) ); \
INPUT_2x64_sub( v128_sl64( db,19 ) ); \
INPUT_2x64_sub( v128_sl64( db,18 ) ); \
INPUT_2x64_sub( v128_sl64( db,17 ) ); \
INPUT_2x64_sub( v128_sl64( db,16 ) ); \
INPUT_2x64_sub( v128_sl64( db,15 ) ); \
INPUT_2x64_sub( v128_sl64( db,14 ) ); \
INPUT_2x64_sub( v128_sl64( db,13 ) ); \
INPUT_2x64_sub( v128_sl64( db,12 ) ); \
INPUT_2x64_sub( v128_sl64( db,11 ) ); \
INPUT_2x64_sub( v128_sl64( db,10 ) ); \
INPUT_2x64_sub( v128_sl64( db, 9 ) ); \
INPUT_2x64_sub( v128_sl64( db, 8 ) ); \
INPUT_2x64_sub( v128_sl64( db, 7 ) ); \
INPUT_2x64_sub( v128_sl64( db, 6 ) ); \
INPUT_2x64_sub( v128_sl64( db, 5 ) ); \
INPUT_2x64_sub( v128_sl64( db, 4 ) ); \
INPUT_2x64_sub( v128_sl64( db, 3 ) ); \
INPUT_2x64_sub( v128_sl64( db, 2 ) ); \
INPUT_2x64_sub( v128_sl64( db, 1 ) ); \
INPUT_2x64_sub( db ); \
}
#if 0
// Dependent on the compiler unrolling the loop.
#define INPUT_2x64 \
{ \
v128u64_t db = *buf; \
@@ -1981,6 +2156,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
tp += 8; \
} \
}
#endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX_2x64( a, b, c, d ) \
@@ -2001,7 +2177,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
{ \
a = v128_rol32( a, 13 ); \
c = v128_rol32( c, 3 ); \
b = v128_xor3( a, b, c ); \
b = v128_xor3( c, a, b ); \
d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
b = v128_rol32( b, 1 ); \
d = v128_rol32( d, 7 ); \

View File

@@ -72,17 +72,17 @@ typedef struct
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_4way_big_context;
typedef hamsi_4way_big_context hamsi512_4way_context;
typedef hamsi_4way_big_context hamsi512_4x64_context;
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
void hamsi512_4x64_init( hamsi512_4x64_context *sc );
void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data,
size_t len );
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst );
#define hamsi512_4x64_context hamsi512_4way_context
#define hamsi512_4x64_init hamsi512_4way_init
#define hamsi512_4x64_update hamsi512_4way_update
#define hamsi512_4x64_close hamsi512_4way_close
#define hamsi512_4way_context hamsi512_4x64_context
#define hamsi512_4way_init hamsi512_4x64_init
#define hamsi512_4way_update hamsi512_4x64_update
#define hamsi512_4way_close hamsi512_4x64_close
// Hamsi-512 8x32
@@ -104,7 +104,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Hamsi-512 8x64
@@ -115,17 +115,17 @@ typedef struct
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_8way_big_context;
typedef hamsi_8way_big_context hamsi512_8way_context;
typedef hamsi_8way_big_context hamsi512_8x64_context;
void hamsi512_8way_init( hamsi512_8way_context *sc );
void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
void hamsi512_8x64_init( hamsi512_8x64_context *sc );
void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data,
size_t len );
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst );
#define hamsi512_8x64_context hamsi512_8way_context
#define hamsi512_8x64_init hamsi512_8way_init
#define hamsi512_8x64_update hamsi512_8way_update
#define hamsi512_8x64_close hamsi512_8way_close
#define hamsi512_8way_context hamsi512_8x64_context
#define hamsi512_8way_init hamsi512_8x64_init
#define hamsi512_8way_update hamsi512_8x64_update
#define hamsi512_8way_close hamsi512_8x64_close
// Hamsi-512 16x32

View File

@@ -53,7 +53,7 @@ extern "C"{
#define SPH_SMALL_FOOTPRINT_HAVAL 1
//#endif
#if defined(__AVX512VL__)
#if defined(VL256)
// ( ~( a ^ b ) ) & c
#define v128_andnotxor( a, b, c ) \
@@ -583,7 +583,7 @@ do { \
// Haval-256 8 way 32 bit avx2
#if defined (__AVX512VL__)
#if defined (VL256)
// ( ~( a ^ b ) ) & c
#define mm256_andnotxor( a, b, c ) \
@@ -882,7 +882,7 @@ do { \
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// ( ~( a ^ b ) ) & c
#define mm512_andnotxor( a, b, c ) \

View File

@@ -82,12 +82,15 @@ typedef struct {
typedef haval_4way_context haval256_5_4way_context;
void haval256_5_4way_init( void *cc );
void haval256_5_4way_update( void *cc, const void *data, size_t len );
//#define haval256_5_4way haval256_5_4way_update
void haval256_5_4way_close( void *cc, void *dst );
#define haval256_4x32_context haval256_5_4way_context
#define haval256_4x32_init haval256_5_4way_init
#define haval256_4x32_update haval256_5_4way_update
#define haval256_4x32_close haval256_5_4way_close
#if defined(__AVX2__)
typedef struct {
@@ -100,14 +103,17 @@ typedef struct {
typedef haval_8way_context haval256_5_8way_context;
void haval256_5_8way_init( void *cc );
void haval256_5_8way_update( void *cc, const void *data, size_t len );
void haval256_5_8way_close( void *cc, void *dst );
#define haval256_8x32_context haval256_5_8way_context
#define haval256_8x32_init haval256_5_8way_init
#define haval256_8x32_update haval256_5_8way_update
#define haval256_8x32_close haval256_5_8way_close
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__m512i buf[32];
@@ -119,11 +125,14 @@ typedef struct {
typedef haval_16way_context haval256_5_16way_context;
void haval256_5_16way_init( void *cc );
void haval256_5_16way_update( void *cc, const void *data, size_t len );
void haval256_5_16way_close( void *cc, void *dst );
#define haval256_16x32_context haval256_5_16way_context
#define haval256_16x32_init haval256_5_16way_init
#define haval256_16x32_update haval256_5_16way_update
#define haval256_16x32_close haval256_5_16way_close
#endif // AVX512
#ifdef __cplusplus

View File

@@ -204,7 +204,7 @@ static const uint64_t IV512[] =
(state)->H[15] = h7l; \
} while (0)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define Sb_8W(x0, x1, x2, x3, c) \
{ \
@@ -364,8 +364,7 @@ static const uint64_t IV512[] =
#if defined(__AVX2__)
#if defined(__AVX512VL__)
//TODO enable for AVX10_256, not used with AVX512VL
#if defined(VL256)
#define notxorandnot( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x2d )
@@ -522,7 +521,7 @@ static const uint64_t IV512[] =
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
void jh256_8x64_init( jh_8x64_context *sc )
{

View File

@@ -55,7 +55,7 @@
* <code>memcpy()</code>).
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -78,7 +78,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
@@ -108,4 +108,53 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(KECCAK_2WAY)
void keccakhash_2x64(void *state, const void *input)
{
keccak256_2x64_context ctx;
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, input, 80 );
keccak256_2x64_close( &ctx, state );
}
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 3*4+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do {
keccakhash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ))
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( (n < max_nonce-2) && !work_restart[thr_id].restart);
pdata[19] = n;
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

View File

@@ -17,6 +17,9 @@ bool register_keccak_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -37,6 +40,9 @@ bool register_keccakc_algo( algo_gate_t* gate )
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#elif defined (KECCAK_2WAY)
gate->scanhash = (void*)&scanhash_keccak_2x64;
gate->hash = (void*)&keccakhash_2x64;
#else
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
@@ -75,15 +81,17 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
bool register_sha3d_algo( algo_gate_t* gate )
{
hard_coded_eb = 6;
// opt_extranonce = false;
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
#if defined (KECCAK_8WAY)
#if defined (SHA3D_8WAY)
gate->scanhash = (void*)&scanhash_sha3d_8way;
gate->hash = (void*)&sha3d_hash_8way;
#elif defined (KECCAK_4WAY)
#elif defined (SHA3D_4WAY)
gate->scanhash = (void*)&scanhash_sha3d_4way;
gate->hash = (void*)&sha3d_hash_4way;
#elif defined (SHA3D_2WAY)
gate->scanhash = (void*)&scanhash_sha3d_2x64;
gate->hash = (void*)&sha3d_hash_2x64;
#else
gate->scanhash = (void*)&scanhash_sha3d;
gate->hash = (void*)&sha3d_hash;

View File

@@ -4,10 +4,20 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define KECCAK_8WAY 1
#elif defined(__AVX2__)
#define KECCAK_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define KECCAK_2WAY 1
#endif
#if defined(SIMD512)
#define SHA3D_8WAY 1
#elif defined(__AVX2__)
#define SHA3D_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA3D_2WAY 1
#endif
extern int hard_coded_eb;
@@ -16,27 +26,47 @@ extern int hard_coded_eb;
void keccakhash_8way( void *state, const void *input );
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_4WAY)
void keccakhash_4way( void *state, const void *input );
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_2WAY)
void keccakhash_2x64( void *state, const void *input );
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void keccakhash( void *state, const void *input );
int scanhash_keccak( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA3D_8WAY)
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SHA3D_4WAY)
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SHA3D_2WAY)
void sha3d_hash_2x64( void *state, const void *input );
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void sha3d_hash( void *state, const void *input );
int scanhash_sha3d( struct work *work, uint32_t max_nonce,

View File

@@ -57,7 +57,7 @@ static const uint64_t RC[] = {
#define DO(x) x
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define INPUT_BUF(size) do { \
size_t j; \
@@ -161,29 +161,25 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
size_t byte_len, size_t lim )
{
unsigned eb;
union {
__m512i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m512i tmp[lim + 1] __attribute__ ((aligned (64)));
size_t j;
size_t m512_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;
eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm512_set1_epi64( t );
tmp[0] = _mm512_set1_epi64( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
}
keccak64_8way_core( kc, u.tmp, j, lim );
keccak64_8way_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
@@ -194,7 +190,7 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
memcpy_512( dst, kc->w, m512_len );
}
void keccak256_8way_init( void *kc )
void keccak256_8x64_init( void *kc )
{
keccak64_8way_init( kc, 256 );
}
@@ -361,29 +357,25 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
size_t lim )
{
unsigned eb;
union {
__m256i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m256i tmp[lim + 1] __attribute__ ((aligned (32)));
size_t j;
size_t m256_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;
eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm256_set1_epi64x( t );
tmp[0] = _mm256_set1_epi64x( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
}
keccak64_core( kc, u.tmp, j, lim );
keccak64_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );

View File

@@ -4,7 +4,7 @@
#include <stddef.h>
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -4,12 +4,12 @@
#include <stdint.h>
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)
#if defined(SHA3D_8WAY)
void sha3d_hash_8way(void *state, const void *input)
{
uint32_t buffer[16*8] __attribute__ ((aligned (128)));
keccak256_8way_context ctx;
keccak256_8x64_context ctx;
keccak256_8x64_init( &ctx );
keccak256_8x64_update( &ctx, input, 80 );
@@ -64,12 +64,12 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(KECCAK_4WAY)
#elif defined(SHA3D_4WAY)
void sha3d_hash_4way(void *state, const void *input)
{
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
keccak256_4way_context ctx;
keccak256_4x64_context ctx;
keccak256_4x64_init( &ctx );
keccak256_4x64_update( &ctx, input, 80 );
@@ -122,4 +122,60 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SHA3D_2WAY)
void sha3d_hash_2x64(void *state, const void *input)
{
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
keccak256_2x64_context ctx;
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, input, 80 );
keccak256_2x64_close( &ctx, buffer );
keccak256_2x64_init( &ctx );
keccak256_2x64_update( &ctx, buffer, 32 );
keccak256_2x64_close( &ctx, state );
}
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
uint32_t hash[16*2] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[13]); // 3*4+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
v128_t *noncev = (v128_t*)vdata + 9;
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do {
sha3d_hash_2x64( hash, vdata );
for ( int lane = 0; lane < 2; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
n += 2;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -59,7 +59,7 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define cns4w(i) mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] )
@@ -273,8 +273,6 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
uint32_t hash[8*4] __attribute((aligned(128)));
__m512i* chainv = state->chainv;
__m512i t[2];
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
/*---- blank round with m=0 ----*/
rnd512_4way( state, NULL );
@@ -289,10 +287,8 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
_mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
casti_m512i( b,0 ) = _mm512_shuffle_epi8(
casti_m512i( hash,0 ), shuff_bswap32 );
casti_m512i( b,1 ) = _mm512_shuffle_epi8(
casti_m512i( hash,1 ), shuff_bswap32 );
casti_m512i( b,0 ) = mm512_bswap_32( casti_m512i( hash,0 ) );
casti_m512i( b,1 ) = mm512_bswap_32( casti_m512i( hash,1 ) );
rnd512_4way( state, NULL );
@@ -306,10 +302,8 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
_mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
casti_m512i( b,2 ) = _mm512_shuffle_epi8(
casti_m512i( hash,0 ), shuff_bswap32 );
casti_m512i( b,3 ) = _mm512_shuffle_epi8(
casti_m512i( hash,1 ), shuff_bswap32 );
casti_m512i( b,2 ) = mm512_bswap_32( casti_m512i( hash,0 ) );
casti_m512i( b,3 ) = mm512_bswap_32( casti_m512i( hash,1 ) );
}
int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
@@ -349,16 +343,14 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
__m512i msg[2];
int i;
int blocks = (int)len >> 5;
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
state->rembytes = (int)len & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bswap_32( vdata[ 1 ] );
rnd512_4way( state, msg );
}
@@ -367,7 +359,7 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
if ( state->rembytes )
{
// remaining data bytes
buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
buffer[0] = mm512_bswap_32( vdata[0] );
buffer[1] = mm512_bcast128lo_64( 0x0000000080000000 );
}
return 0;
@@ -434,16 +426,14 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
__m512i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bswap_32( vdata[ 1 ] );
rnd512_4way( state, msg );
}
@@ -451,7 +441,7 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
rnd512_4way( state, msg );
}
@@ -479,16 +469,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
__m512i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bswap_32( vdata[ 1 ] );
rnd512_4way( state, msg );
}
@@ -496,7 +484,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm512_bswap_32( vdata[ 0 ] );
msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
rnd512_4way( state, msg );
}
@@ -524,8 +512,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
a = _mm256_xor_si256( a, c0 ); \
b = _mm256_xor_si256( b, c1 );
//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512
#if defined(__AVX512VL__)
#if defined(VL256)
#define MULT2( a0, a1 ) \
{ \
@@ -776,8 +763,6 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
uint32 hash[8*2] __attribute((aligned(64)));
__m256i* chainv = state->chainv;
__m256i t0, t1;
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
/*---- blank round with m=0 ----*/
rnd512_2way( state, NULL );
@@ -792,10 +777,8 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
_mm256_store_si256( (__m256i*)&hash[0], t0 );
_mm256_store_si256( (__m256i*)&hash[8], t1 );
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 1 ), shuff_bswap32 );
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
rnd512_2way( state, NULL );
@@ -810,10 +793,8 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
_mm256_store_si256( (__m256i*)&hash[0], t0 );
_mm256_store_si256( (__m256i*)&hash[8], t1 );
casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
casti_m256i( b, 3 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 1 ), shuff_bswap32 );
casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
}
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
@@ -848,15 +829,13 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
__m256i msg[2];
int i;
int blocks = (int)len >> 5;
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
state-> rembytes = (int)len & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}
@@ -865,7 +844,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
if ( state->rembytes )
{
// remaining data bytes
buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
buffer[0] = mm256_bswap_32( vdata[0] );
buffer[1] = mm256_bcast128lo_64( 0x0000000080000000 );
}
return 0;
@@ -917,16 +896,14 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}
@@ -934,7 +911,7 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
rnd512_2way( state, msg );
}
@@ -962,16 +939,14 @@ int luffa_2way_update_close( luffa_2way_context *state,
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}
@@ -979,7 +954,7 @@ int luffa_2way_update_close( luffa_2way_context *state,
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
rnd512_2way( state, msg );
}

View File

@@ -51,7 +51,7 @@
#define LIMIT_512 128
/*********************************/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
uint32_t buffer[8*4];

View File

@@ -28,8 +28,7 @@
a = v128_xor( a, c0 ); \
b = v128_xor( b, c1 ); \
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#if defined(VL256)
#define MULT2( a0, a1 ) \
{ \
@@ -48,29 +47,22 @@
a1 = _mm_alignr_epi8( b, a1, 4 ); \
}
#elif defined(__ARM_NEON)
#elif defined(__ARM_NEON) || defined(__SSE2__)
// { a1_0, 0, a1_0, a1_0 }
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( vdupq_laneq_u32( a1, 0 ), MASK ) ); \
v128_t b = v128_xor( a0, v128_and( v128_bcast32( a1 ), MASK ) ); \
a0 = v128_alignr32( a1, b, 1 ); \
a1 = v128_alignr32( b, a1, 1 ); \
}
#else // assume SSE2
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( _mm_shuffle_epi32( a1, 0 ), MASK ) ); \
a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
}
#else
#warning __FILE__ ":" __LINE__ " Unknown or unsupported CPU architecture."
#endif
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#if defined(VL256)
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \

View File

@@ -68,4 +68,4 @@ int update_and_final_luffa( hashState_luffa *state, void* output,
int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
const void* data, size_t inlen );
#endif // LUFFA_FOR_SSE2_H___
#endif // LUFFA_FOR_SSE2_H__

View File

@@ -15,7 +15,7 @@
#include "algo/groestl/sph_groestl.h"
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ALLIUM_16WAY 1
#elif defined(__AVX2__)
#define ALLIUM_8WAY 1
@@ -26,9 +26,9 @@
#if defined (ALLIUM_16WAY)
typedef union {
keccak256_8way_context keccak;
keccak256_8x64_context keccak;
cube_4way_2buf_context cube;
skein256_8way_context skein;
skein256_8x64_context skein;
#if defined(__VAES__)
groestl256_4way_context groestl;
#else
@@ -60,7 +60,7 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
uint32_t hash15[8] __attribute__ ((aligned (32)));
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -70,12 +70,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhashA);
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhashB, 32 );
keccak256_8way_close( &ctx.keccak, vhashB);
keccak256_8x64_init( &ctx.keccak );
keccak256_8x64_update( &ctx.keccak, vhashA, 32 );
keccak256_8x64_close( &ctx.keccak, vhashA);
keccak256_8x64_init( &ctx.keccak );
keccak256_8x64_update( &ctx.keccak, vhashB, 32 );
keccak256_8x64_close( &ctx.keccak, vhashB);
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
@@ -153,12 +153,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhashA, 32 );
skein256_8way_close( &ctx.skein, vhashA );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhashB, 32 );
skein256_8way_close( &ctx.skein, vhashB );
skein256_8x64_init( &ctx.skein );
skein256_8x64_update( &ctx.skein, vhashA, 32 );
skein256_8x64_close( &ctx.skein, vhashA );
skein256_8x64_init( &ctx.skein );
skein256_8x64_update( &ctx.skein, vhashB, 32 );
skein256_8x64_close( &ctx.skein, vhashB );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
@@ -251,7 +251,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -273,9 +273,9 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
#elif defined (ALLIUM_8WAY)
typedef union {
keccak256_4way_context keccak;
keccak256_4x64_context keccak;
cube_2way_context cube;
skein256_4way_context skein;
skein256_4x64_context skein;
#if defined(__VAES__)
groestl256_2way_context groestl;
#else
@@ -298,19 +298,19 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
uint64_t *hash7 = (uint64_t*)hash+28;
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
blake256_8x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhashA, 32 );
keccak256_4way_close( &ctx.keccak, vhashA );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhashB, 32 );
keccak256_4way_close( &ctx.keccak, vhashB );
keccak256_4x64_init( &ctx.keccak );
keccak256_4x64_update( &ctx.keccak, vhashA, 32 );
keccak256_4x64_close( &ctx.keccak, vhashA );
keccak256_4x64_init( &ctx.keccak );
keccak256_4x64_update( &ctx.keccak, vhashB, 32 );
keccak256_4x64_close( &ctx.keccak, vhashB );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
@@ -350,12 +350,12 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhashA, 32 );
skein256_4way_close( &ctx.skein, vhashA );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhashB, 32 );
skein256_4way_close( &ctx.skein, vhashB );
skein256_4x64_init( &ctx.skein );
skein256_4x64_update( &ctx.skein, vhashA, 32 );
skein256_4x64_close( &ctx.skein, vhashA );
skein256_4x64_init( &ctx.skein );
skein256_4x64_update( &ctx.skein, vhashB, 32 );
skein256_4x64_close( &ctx.skein, vhashB );
#if defined(__VAES__)
@@ -433,7 +433,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
n+ 3, n+ 2, n+ 1, n );
// Partialy prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -465,12 +465,8 @@ typedef union
{
keccak256_2x64_context keccak;
cubehashParam cube;
//#if defined(__x86_64__)
skein256_2x64_context skein;
//#else
// sph_skein512_context skein;
//#endif
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl256 groestl;
#else
sph_groestl256_context groestl;
@@ -487,7 +483,7 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
uint64_t *hash3 = (uint64_t*)hash+12;
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
blake256_4way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
blake256_4x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhashA, 256 );
intrlv_2x64( vhashA, hash0, hash1, 256 );
@@ -516,7 +512,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
//#if defined(__x86_64__)
intrlv_2x64( vhashA, hash0, hash1, 256 );
skein256_2x64_init( &ctx.skein );
skein256_2x64_update( &ctx.skein, vhashA, 32 );
@@ -527,23 +522,8 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
skein256_2x64_update( &ctx.skein, vhashA, 32 );
skein256_2x64_close( &ctx.skein, vhashA );
dintrlv_2x64( hash2, hash3, vhashA, 256 );
/*
#else
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash0, 32 );
sph_skein256_close( &ctx.skein, hash0 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash1, 32 );
sph_skein256_close( &ctx.skein, hash1 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash2, 32 );
sph_skein256_close( &ctx.skein, hash2 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash3, 32 );
sph_skein256_close( &ctx.skein, hash3 );
#endif
*/
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl256_full( &ctx.groestl, hash0, hash0, 256 );
groestl256_full( &ctx.groestl, hash1, hash1, 256 );
groestl256_full( &ctx.groestl, hash2, hash2, 256 );
@@ -608,7 +588,7 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
block_buf[15] = v128_32( 640 );
// Partialy prehash second block without touching nonces
blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_4way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -636,7 +616,6 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
//
// 1 way
typedef struct
{
blake256_context blake;

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2REV3_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV3_8WAY 1
@@ -49,7 +49,7 @@ bool init_lyra2rev3_ctx();
//////////////////////////////////
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2REV2_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV2_8WAY 1
@@ -108,7 +108,7 @@ bool lyra2h_thread_init();
/////////////////////////////////////////
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define PHI2_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define PHI2_4WAY 1

View File

@@ -41,7 +41,7 @@
// lyra2z330, lyra2h,
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
/**
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords

View File

@@ -59,7 +59,7 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );

View File

@@ -14,12 +14,12 @@ bool lyra2h_4way_thread_init()
return ( lyra2h_4way_matrix = mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
}
static __thread blake256_4way_context l2h_4way_blake_mid;
static __thread blake256_4x32_context l2h_4way_blake_mid;
void lyra2h_4way_midstate( const void* input )
{
blake256_4way_init( &l2h_4way_blake_mid );
blake256_4way_update( &l2h_4way_blake_mid, input, 64 );
blake256_4x32_init( &l2h_4way_blake_mid );
blake256_4x32_update( &l2h_4way_blake_mid, input, 64 );
}
void lyra2h_4way_hash( void *state, const void *input )
@@ -29,11 +29,11 @@ void lyra2h_4way_hash( void *state, const void *input )
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
blake256_4x32_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
blake256_4x32_update( &ctx_blake, input + (64*4), 16 );
blake256_4x32_close( &ctx_blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
@@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
lyra2h_4way_midstate( vdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2h_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )

View File

@@ -7,25 +7,24 @@
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#if defined (LYRA2REV2_16WAY)
typedef struct {
blake256_16way_context blake;
keccak256_8way_context keccak;
blake256_16x32_context blake;
keccak256_8x64_context keccak;
cubehashParam cube;
skein256_8way_context skein;
bmw256_16way_context bmw;
skein256_8x64_context skein;
bmw256_16x32_context bmw;
} lyra2v2_16way_ctx_holder __attribute__ ((aligned (64)));
static lyra2v2_16way_ctx_holder l2v2_16way_ctx;
bool init_lyra2rev2_16way_ctx()
{
keccak256_8way_init( &l2v2_16way_ctx.keccak );
keccak256_8x64_init( &l2v2_16way_ctx.keccak );
cubehashInit( &l2v2_16way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &l2v2_16way_ctx.skein );
bmw256_16way_init( &l2v2_16way_ctx.bmw );
skein256_8x64_init( &l2v2_16way_ctx.skein );
bmw256_16x32_init( &l2v2_16way_ctx.bmw );
return true;
}
@@ -51,8 +50,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
lyra2v2_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_16way_ctx, sizeof(l2v2_16way_ctx) );
blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
blake256_16way_close( &ctx.blake, vhash );
blake256_16x32_update( &ctx.blake, input + (64<<4), 16 );
blake256_16x32_close( &ctx.blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7,
@@ -62,17 +61,17 @@ void lyra2rev2_16way_hash( void *state, const void *input )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
keccak256_8way_update( &ctx.keccak, vhash, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
keccak256_8x64_update( &ctx.keccak, vhash, 32 );
keccak256_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, 256 );
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhash, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
keccak256_8x64_init( &ctx.keccak );
keccak256_8x64_update( &ctx.keccak, vhash, 32 );
keccak256_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, vhash, 256 );
@@ -122,21 +121,20 @@ void lyra2rev2_16way_hash( void *state, const void *input )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
skein256_8x64_update( &ctx.skein, vhash, 32 );
skein256_8x64_close( &ctx.skein, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash8, hash9, hash10, hash11, hash12,
hash13, hash14, hash15, 256 );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
skein256_8x64_init( &ctx.skein );
skein256_8x64_update( &ctx.skein, vhash, 32 );
skein256_8x64_close( &ctx.skein, vhash );
dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
@@ -160,8 +158,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, 256 );
bmw256_16way_update( &ctx.bmw, vhash, 32 );
bmw256_16way_close( &ctx.bmw, state );
bmw256_16x32_update( &ctx.bmw, vhash, 32 );
bmw256_16x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
@@ -186,8 +184,8 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
blake256_16way_init( &l2v2_16way_ctx.blake );
blake256_16way_update( &l2v2_16way_ctx.blake, vdata, 64 );
blake256_16x32_init( &l2v2_16way_ctx.blake );
blake256_16x32_update( &l2v2_16way_ctx.blake, vdata, 64 );
do
{
@@ -214,21 +212,21 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
#elif defined (LYRA2REV2_8WAY)
typedef struct {
blake256_8way_context blake;
keccak256_4way_context keccak;
blake256_8x32_context blake;
keccak256_4x64_context keccak;
cubehashParam cube;
skein256_4way_context skein;
bmw256_8way_context bmw;
skein256_4x64_context skein;
bmw256_8x32_context bmw;
} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));
static lyra2v2_8way_ctx_holder l2v2_8way_ctx;
bool init_lyra2rev2_8way_ctx()
{
keccak256_4way_init( &l2v2_8way_ctx.keccak );
keccak256_4x64_init( &l2v2_8way_ctx.keccak );
cubehashInit( &l2v2_8way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_8way_ctx.skein );
bmw256_8way_init( &l2v2_8way_ctx.bmw );
skein256_4x64_init( &l2v2_8way_ctx.skein );
bmw256_8x32_init( &l2v2_8way_ctx.bmw );
return true;
}
@@ -246,20 +244,20 @@ void lyra2rev2_8way_hash( void *state, const void *input )
lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhash );
blake256_8x32_update( &ctx.blake, input + (64<<3), 16 );
blake256_8x32_close( &ctx.blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
keccak256_4way_update( &ctx.keccak, vhash, 32 );
keccak256_4way_close( &ctx.keccak, vhash );
keccak256_4x64_update( &ctx.keccak, vhash, 32 );
keccak256_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhash, 32 );
keccak256_4way_close( &ctx.keccak, vhash );
keccak256_4x64_init( &ctx.keccak );
keccak256_4x64_update( &ctx.keccak, vhash, 32 );
keccak256_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
@@ -282,13 +280,13 @@ void lyra2rev2_8way_hash( void *state, const void *input )
LYRA2REV2( l2v2_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
skein256_4way_update( &ctx.skein, vhash, 32 );
skein256_4way_close( &ctx.skein, vhash );
skein256_4x64_update( &ctx.skein, vhash, 32 );
skein256_4x64_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhash, 32 );
skein256_4way_close( &ctx.skein, vhash );
skein256_4x64_init( &ctx.skein );
skein256_4x64_update( &ctx.skein, vhash, 32 );
skein256_4x64_close( &ctx.skein, vhash );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
@@ -303,8 +301,8 @@ void lyra2rev2_8way_hash( void *state, const void *input )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
bmw256_8x32_update( &ctx.bmw, vhash, 32 );
bmw256_8x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
@@ -328,8 +326,8 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
blake256_8way_init( &l2v2_8way_ctx.blake );
blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
blake256_8x32_init( &l2v2_8way_ctx.blake );
blake256_8x32_update( &l2v2_8way_ctx.blake, vdata, 64 );
do
{
@@ -356,21 +354,21 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
#elif defined (LYRA2REV2_4WAY)
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
blake256_4x32_context blake;
keccak256_4x64_context keccak;
cubehashParam cube;
skein256_4way_context skein;
bmw256_4way_context bmw;
skein256_4x64_context skein;
bmw256_4x32_context bmw;
} lyra2v2_4way_ctx_holder;
static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
bool init_lyra2rev2_4way_ctx()
{
keccak256_4way_init( &l2v2_4way_ctx.keccak );
keccak256_4x64_init( &l2v2_4way_ctx.keccak );
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_4way_ctx.skein );
bmw256_4way_init( &l2v2_4way_ctx.bmw );
skein256_4x64_init( &l2v2_4way_ctx.skein );
bmw256_4x32_init( &l2v2_4way_ctx.bmw );
return true;
}
@@ -385,13 +383,13 @@ void lyra2rev2_4way_hash( void *state, const void *input )
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash );
blake256_4x32_update( &ctx.blake, input + (64<<2), 16 );
blake256_4x32_close( &ctx.blake, vhash );
rintrlv_4x32_4x64( vhash64, vhash, 256 );
keccak256_4way_update( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
keccak256_4x64_update( &ctx.keccak, vhash64, 32 );
keccak256_4x64_close( &ctx.keccak, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -410,8 +408,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )
intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way_update( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
skein256_4x64_update( &ctx.skein, vhash64, 32 );
skein256_4x64_close( &ctx.skein, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -426,8 +424,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way_update( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
bmw256_4x32_update( &ctx.bmw, vhash, 32 );
bmw256_4x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
@@ -451,12 +449,12 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 );
blake256_4x32_init( &l2v2_4way_ctx.blake );
blake256_4x32_update( &l2v2_4way_ctx.blake, vdata, 64 );
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2rev2_4way_hash( hash, vdata );

View File

@@ -9,18 +9,18 @@
#if defined (LYRA2REV3_16WAY)
typedef struct {
blake256_16way_context blake;
blake256_16x32_context blake;
cube_4way_context cube;
bmw256_16way_context bmw;
bmw256_16x32_context bmw;
} lyra2v3_16way_ctx_holder;
static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;
bool init_lyra2rev3_16way_ctx()
{
blake256_16way_init( &l2v3_16way_ctx.blake );
blake256_16x32_init( &l2v3_16way_ctx.blake );
cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
bmw256_16way_init( &l2v3_16way_ctx.bmw );
bmw256_16x32_init( &l2v3_16way_ctx.bmw );
return true;
}
@@ -46,8 +46,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );
blake256_16way_update( &ctx.blake, input + (64*16), 16 );
blake256_16way_close( &ctx.blake, vhash );
blake256_16x32_update( &ctx.blake, input + (64*16), 16 );
blake256_16x32_close( &ctx.blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -120,8 +120,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
bmw256_16way_update( &ctx.bmw, vhash, 32 );
bmw256_16way_close( &ctx.bmw, state );
bmw256_16x32_update( &ctx.bmw, vhash, 32 );
bmw256_16x32_close( &ctx.bmw, state );
}
@@ -145,8 +145,8 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake256_16way_init( &l2v3_16way_ctx.blake );
blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
blake256_16x32_init( &l2v3_16way_ctx.blake );
blake256_16x32_update( &l2v3_16way_ctx.blake, vdata, 64 );
do
{
@@ -178,18 +178,18 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
#elif defined (LYRA2REV3_8WAY)
typedef struct {
blake256_8way_context blake;
blake256_8x32_context blake;
cubehashParam cube;
bmw256_8way_context bmw;
bmw256_8x32_context bmw;
} lyra2v3_8way_ctx_holder;
static __thread lyra2v3_8way_ctx_holder l2v3_8way_ctx;
bool init_lyra2rev3_8way_ctx()
{
blake256_8way_init( &l2v3_8way_ctx.blake );
blake256_8x32_init( &l2v3_8way_ctx.blake );
cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 );
bmw256_8way_init( &l2v3_8way_ctx.bmw );
bmw256_8x32_init( &l2v3_8way_ctx.bmw );
return true;
}
@@ -207,8 +207,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64*8), 16 );
blake256_8way_close( &ctx.blake, vhash );
blake256_8x32_update( &ctx.blake, input + (64*8), 16 );
blake256_8x32_close( &ctx.blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
@@ -243,8 +243,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
bmw256_8x32_update( &ctx.bmw, vhash, 32 );
bmw256_8x32_close( &ctx.bmw, state );
}
@@ -269,8 +269,8 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
blake256_8way_init( &l2v3_8way_ctx.blake );
blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );
blake256_8x32_init( &l2v3_8way_ctx.blake );
blake256_8x32_update( &l2v3_8way_ctx.blake, vdata, 64 );
do
{
@@ -300,19 +300,18 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
#if defined (LYRA2REV3_4WAY)
typedef struct {
blake256_4way_context blake;
blake256_4x32_context blake;
cubehashParam cube;
bmw256_4way_context bmw;
bmw256_4x32_context bmw;
} lyra2v3_4way_ctx_holder;
//static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
static __thread lyra2v3_4way_ctx_holder l2v3_4way_ctx;
bool init_lyra2rev3_4way_ctx()
{
blake256_4way_init( &l2v3_4way_ctx.blake );
blake256_4x32_init( &l2v3_4way_ctx.blake );
cubehashInit( &l2v3_4way_ctx.cube, 256, 16, 32 );
bmw256_4way_init( &l2v3_4way_ctx.bmw );
bmw256_4x32_init( &l2v3_4way_ctx.bmw );
return true;
}
@@ -326,8 +325,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );
blake256_4way_update( &ctx.blake, input + (64*4), 16 );
blake256_4way_close( &ctx.blake, vhash );
blake256_4x32_update( &ctx.blake, input + (64*4), 16 );
blake256_4x32_close( &ctx.blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -349,8 +348,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way_update( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
bmw256_4x32_update( &ctx.bmw, vhash, 32 );
bmw256_4x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
@@ -374,8 +373,8 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
*noncev = _mm_set_epi32( n+3, n+2, n+1, n );
blake256_4way_init( &l2v3_4way_ctx.blake );
blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 );
blake256_4x32_init( &l2v3_4way_ctx.blake );
blake256_4x32_update( &l2v3_4way_ctx.blake, vdata, 64 );
do
{

View File

@@ -3,7 +3,7 @@
#include "lyra2.h"
#include "algo/blake/blake256-hash.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2Z_16WAY 1
#elif defined(__AVX2__)
#define LYRA2Z_8WAY 1
@@ -45,7 +45,7 @@ static void lyra2z_16way_hash( void *state, const void *midstate_vars,
uint32_t hash14[8] __attribute__ ((aligned (32)));
uint32_t hash15[8] __attribute__ ((aligned (32)));
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -139,7 +139,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -180,7 +180,7 @@ static void lyra2z_8way_hash( void *state, const void *midstate_vars,
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
blake256_8x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
@@ -246,7 +246,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partialy prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -279,12 +279,12 @@ bool lyra2z_4way_thread_init()
return ( lyra2z_4way_matrix = mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_4way_context l2z_4way_blake_mid;
static __thread blake256_4x32_context l2z_4way_blake_mid;
void lyra2z_4way_midstate( const void* input )
{
blake256_4way_init( &l2z_4way_blake_mid );
blake256_4way_update( &l2z_4way_blake_mid, input, 64 );
blake256_4x32_init( &l2z_4way_blake_mid );
blake256_4x32_update( &l2z_4way_blake_mid, input, 64 );
}
void lyra2z_4way_hash( void *hash, const void *midstate_vars,
@@ -295,15 +295,8 @@ void lyra2z_4way_hash( void *hash, const void *midstate_vars,
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
// blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
blake256_4way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
/*
memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
*/
blake256_4x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
@@ -357,7 +350,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
block_buf[15] = v128_32( 640 );
// Partialy prehash second block without touching nonces
blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_4way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -454,11 +447,9 @@ bool register_lyra2z_algo( algo_gate_t* gate )
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
// gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
// gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;

View File

@@ -4,7 +4,7 @@
#include "algo/gost/sph_gost.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "lyra2.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#include "algo/echo/echo-hash-4way.h"
#elif defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"

View File

@@ -27,7 +27,7 @@
#include "lyra2.h"
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
{

View File

@@ -43,9 +43,9 @@ static const uint64_t blake2b_IV[8] =
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define G2W_4X64(a,b,c,d) \
#define G2W(a,b,c,d) \
a = _mm512_add_epi64( a, b ); \
d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 32 ); \
c = _mm512_add_epi64( c, d ); \
@@ -56,27 +56,15 @@ static const uint64_t blake2b_IV[8] =
b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 63 );
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
G2W_4X64( s0, s1, s2, s3 ); \
G2W( s0, s1, s2, s3 ); \
s0 = mm512_shufll256_64( s0 ); \
s3 = mm512_swap256_128( s3); \
s3 = mm512_swap256_128( s3 ); \
s2 = mm512_shuflr256_64( s2 ); \
G2W_4X64( s0, s1, s2, s3 ); \
G2W( s0, s1, s2, s3 ); \
s0 = mm512_shuflr256_64( s0 ); \
s3 = mm512_swap256_128( s3 ); \
s2 = mm512_shufll256_64( s2 );
/*
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
G2W_4X64( s0, s1, s2, s3 ); \
s3 = mm512_shufll256_64( s3 ); \
s1 = mm512_shuflr256_64( s1); \
s2 = mm512_swap256_128( s2 ); \
G2W_4X64( s0, s1, s2, s3 ); \
s3 = mm512_shuflr256_64( s3 ); \
s1 = mm512_shufll256_64( s1 ); \
s2 = mm512_swap256_128( s2 );
*/
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -95,7 +83,7 @@ static const uint64_t blake2b_IV[8] =
#if defined(__AVX2__)
#define G_4X64(a,b,c,d) \
#define G_AVX2(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
c = _mm256_add_epi64( c, d ); \
@@ -107,27 +95,15 @@ static const uint64_t blake2b_IV[8] =
// Pivot about s1 instead of s0 reduces latency.
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
G_AVX2( s0, s1, s2, s3 ); \
s0 = mm256_shufll_64( s0 ); \
s3 = mm256_swap_128( s3); \
s3 = mm256_swap_128( s3 ); \
s2 = mm256_shuflr_64( s2 ); \
G_4X64( s0, s1, s2, s3 ); \
G_AVX2( s0, s1, s2, s3 ); \
s0 = mm256_shuflr_64( s0 ); \
s3 = mm256_swap_128( s3 ); \
s2 = mm256_shufll_64( s2 );
/*
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s3 = mm256_shufll_64( s3 ); \
s1 = mm256_shuflr_64( s1); \
s2 = mm256_swap_128( s2 ); \
G_4X64( s0, s1, s2, s3 ); \
s3 = mm256_shuflr_64( s3 ); \
s1 = mm256_shufll_64( s1 ); \
s2 = mm256_swap_128( s2 );
*/
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -148,29 +124,29 @@ static const uint64_t blake2b_IV[8] =
// process 2 columns in parallel
// returns void, all args updated
#define G_2X64(a,b,c,d) \
#define G_128(a,b,c,d) \
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a), 32 ); \
d = v128_ror64xor( d, a, 32 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 24 ); \
b = v128_ror64xor( b, c, 24 ); \
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a ), 16 ); \
d = v128_ror64xor( d, a, 16 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 63 );
b = v128_ror64xor( b, c, 63 );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
v128u64_t t; \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
G_128( s0, s2, s4, s6 ); \
G_128( s1, s3, s5, s7 ); \
t = v128_alignr64( s7, s6, 1 ); \
s6 = v128_alignr64( s6, s7, 1 ); \
s7 = t; \
t = v128_alignr64( s2, s3, 1 ); \
s2 = v128_alignr64( s3, s2, 1 ); \
s3 = t; \
G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \
G_128( s0, s2, s5, s6 ); \
G_128( s1, s3, s4, s7 ); \
t = v128_alignr64( s6, s7, 1 ); \
s6 = v128_alignr64( s7, s6, 1 ); \
s7 = t; \
@@ -195,10 +171,6 @@ static const uint64_t blake2b_IV[8] =
#endif // AVX2 else SSE2
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
#define G( r, i, a, b, c, d ) \
{ \
a = a + b; \
@@ -222,7 +194,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
G( r, 7, v[ 3], v[ 4], v[ 9], v[14] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
union _ovly_512
{

View File

@@ -1,8 +1,6 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#if !defined(__APPLE__)
#include <gmp.h>
#include <stdbool.h>
#include <stdlib.h>
@@ -21,7 +19,7 @@
#define EPS1 DBL_EPSILON
#define EPS2 3.0e-11
inline double exp_n( double xt )
static inline double exp_n( double xt )
{
if ( xt < -700.0 )
return 0;
@@ -33,7 +31,8 @@ inline double exp_n( double xt )
return exp( xt );
}
inline double exp_n2( double x1, double x2 )
/*
static inline double exp_n2( double x1, double x2 )
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
p5 = 37., p6 = 700.;
@@ -53,6 +52,7 @@ inline double exp_n2( double x1, double x2 )
else if ( xt > p6 - 1.e-200 )
return 0.;
}
*/
double swit2_( double wvnmb )
{
@@ -298,15 +298,9 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
return 0;
}
#endif // not apple
bool register_m7m_algo( algo_gate_t *gate )
{
#if defined(__APPLE__)
applog( LOG_ERR, "M7M algo is not supported on MacOS");
return false;
#else
gate->optimizations = SHA_OPT;
gate->optimizations = SHA256_OPT;
init_m7m_ctx();
gate->scanhash = (void*)&scanhash_m7m_hash;
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
@@ -315,6 +309,5 @@ bool register_m7m_algo( algo_gate_t *gate )
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
opt_target_factor = 65536.0;
return true;
#endif
}

View File

@@ -1,75 +0,0 @@
// Copyright (c) 2014 The Magi developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include <iostream>
#include <cfloat>
#include <limits>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "magimath.h"
#define EPS1 (std::numeric_limits<double>::epsilon())
#define EPS2 3.0e-11
static void gauleg(double x1, double x2, double x[], double w[], const int n)
{
int m,j,i;
double z1, z, xm, xl, pp, p3, p2, p1;
m=(n+1)/2;
xm=0.5*(x2+x1);
xl=0.5*(x2-x1);
for (i=1;i<=m;i++) {
z=cos(3.141592654*(i-0.25)/(n+0.5));
do {
p1=1.0;
p2=0.0;
for (j=1;j<=n;j++) {
p3=p2;
p2=p1;
p1=((2.0*j-1.0)*z*p2-(j-1.0)*p3)/j;
}
pp=n*(z*p1-p2)/(z*z-1.0);
z1=z;
z=z1-p1/pp;
} while (fabs(z-z1) > EPS2);
x[i]=xm-xl*z;
x[n+1-i]=xm+xl*z;
w[i]=2.0*xl/((1.0-z*z)*pp*pp);
w[n+1-i]=w[i];
}
}
static double GaussianQuad_N(double func(const double), const double a2, const double b2, const int NptGQ)
{
double s=0.0;
#ifdef _MSC_VER
#define SW_DIVS 23
double x[SW_DIVS+1], w[SW_DIVS+1];
#else
double x[NptGQ+1], w[NptGQ+1];
#endif
gauleg(a2, b2, x, w, NptGQ);
for (int j=1; j<=NptGQ; j++) {
s += w[j]*func(x[j]);
}
return s;
}
static double swit_(double wvnmb)
{
return pow( (5.55243*(exp_n(-0.3*wvnmb/15.762) - exp_n(-0.6*wvnmb/15.762)))*wvnmb, 0.5)
/ 1034.66 * pow(sin(wvnmb/65.), 2.);
}
uint32_t sw_(int nnounce, int divs)
{
double wmax = ((sqrt((double)(nnounce))*(1.+EPS1))/450+100);
return ((uint32_t)(GaussianQuad_N(swit_, 0., wmax, divs)*(1.+EPS1)*1.e6));
}

View File

@@ -1,54 +0,0 @@
// Copyright (c) 2014 The Magi developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef MAGI_MATH_H
#define MAGI_MATH_H
#include <math.h>
#ifdef __cplusplus
extern "C" {
#endif
uint32_t sw_(int nnounce, int divs);
#ifdef __cplusplus
}
#endif
inline double exp_n(double xt)
{
double p1 = -700.0, p3 = -0.8e-8, p4 = 0.8e-8, p6 = 700.0;
if(xt < p1)
return 0;
else if(xt > p6)
return 1e200;
else if(xt > p3 && xt < p4)
return (1.0 + xt);
else
return exp(xt);
}
// 1 / (1 + exp(x1-x2))
inline double exp_n2(double x1, double x2)
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.;
double xt = x1 - x2;
if (xt < p1+1.e-200)
return 1.;
else if (xt > p1 && xt < p2 + 1.e-200)
return ( 1. - exp(xt) );
else if (xt > p2 && xt < p3 + 1.e-200)
return ( 1. / (1. + exp(xt)) );
else if (xt > p3 && xt < p4)
return ( 1. / (2. + xt) );
else if (xt > p4 - 1.e-200 && xt < p5)
return ( exp(-xt) / (1. + exp(-xt)) );
else if (xt > p5 - 1.e-200 && xt < p6)
return ( exp(-xt) );
else //if (xt > p6 - 1.e-200)
return 0.;
}
#endif

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define NIST5_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define NIST5_4WAY 1

View File

@@ -71,8 +71,7 @@ do { \
} while (0)
#define GAMMA_4W(n0, n1, n2, n4) \
(g ## n0 = v128_xor( a ## n0, \
v128_or( a ## n1, v128_not( a ## n2 ) ) ) )
(g ## n0 = v128_xor( a ## n0, v128_ornot( a ## n2, a ## n1 ) ) )
#define PI_ALL_4W do { \
a0 = g0; \
@@ -312,7 +311,7 @@ do { \
BUPDATE1_8W( 7, 1 ); \
} while (0)
#if defined(__AVX512VL__)
#if defined(VL256)
#define GAMMA_8W(n0, n1, n2, n4) \
( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )

View File

@@ -18,11 +18,14 @@ typedef struct {
} panama_4way_context __attribute__ ((aligned (64)));
void panama_4way_init( void *cc );
void panama_4way_update( void *cc, const void *data, size_t len );
void panama_4way_close( void *cc, void *dst );
#define panama_4x32_context panama_4way_context
#define panama_4x32_init panama_4way_init
#define panama_4x32_update panama_4way_update
#define panama_4x32_close panama_4way_close
#if defined(__AVX2__)
typedef struct {
@@ -34,10 +37,13 @@ typedef struct {
} panama_8way_context __attribute__ ((aligned (128)));
void panama_8way_init( void *cc );
void panama_8way_update( void *cc, const void *data, size_t len );
void panama_8way_close( void *cc, void *dst );
#define panama_8x32_context panama_8way_context
#define panama_8x32_init panama_8way_init
#define panama_8x32_update panama_8way_update
#define panama_8x32_close panama_8way_close
#endif
#endif

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ANIME_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define ANIME_4WAY 1

View File

@@ -11,7 +11,6 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
@@ -32,20 +31,20 @@
union _hmq1725_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
blake512_8x64_context blake;
bmw512_8x64_context bmw;
skein512_8x64_context skein;
jh512_8x64_context jh;
keccak512_8x64_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
hamsi512_8x64_context hamsi;
hashState_fugue fugue;
shabal512_8way_context shabal;
shabal512_8x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
sha512_8x64_context sha512;
haval256_8x32_context haval;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
@@ -82,7 +81,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
__m512i* vhB = (__m512i*)vhashB;
__m512i* vhC = (__m512i*)vhashC;
bmw512_8way_full( &ctx.bmw, vhash, input, 80 );
bmw512_8x64_full( &ctx.bmw, vhash, input, 80 );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
@@ -142,26 +141,26 @@ extern void hmq1725_8way_hash(void *state, const void *input)
// B
if ( likely( vh_mask & 0xff ) )
skein512_8way_full( &ctx.skein, vhashB, vhash, 64 );
skein512_8x64_full( &ctx.skein, vhashB, vhash, 64 );
mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhash );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
// A
if ( ( vh_mask & 0xff ) != 0xff )
blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
blake512_8x64_full( &ctx.blake, vhashA, vhash, 64 );
// B
if ( vh_mask & 0xff )
bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 );
bmw512_8x64_full( &ctx.bmw, vhashB, vhash, 64 );
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
@@ -177,16 +176,16 @@ extern void hmq1725_8way_hash(void *state, const void *input)
if ( likely( ( vh_mask & 0xff ) != 0xff ) )
{
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhashA );
keccak512_8x64_init( &ctx.keccak );
keccak512_8x64_update( &ctx.keccak, vhash, 64 );
keccak512_8x64_close( &ctx.keccak, vhashA );
}
if ( likely( vh_mask & 0xff ) )
{
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhashB );
jh512_8x64_init( &ctx.jh );
jh512_8x64_update( &ctx.jh, vhash, 64 );
jh512_8x64_close( &ctx.jh, vhashB );
}
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
@@ -252,9 +251,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
// B
if ( likely( vh_mask & 0xff ) )
{
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhash, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhash, 64 );
haval256_8x32_close( &ctx.haval, vhash );
memset( &vhash[8<<3], 0, 32<<3 );
rintrlv_8x32_8x64( vhashB, vhash, 512 );
}
@@ -297,7 +296,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
#endif
blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
blake512_8x64_full( &ctx.blake, vhash, vhash, 64 );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
@@ -352,9 +351,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
hamsi512_8x64_init( &ctx.hamsi );
hamsi512_8x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_8x64_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
@@ -430,9 +429,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
rintrlv_8x64_8x32( vhashA, vhash, 512 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhashA, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
shabal512_8x32_init( &ctx.shabal );
shabal512_8x32_update( &ctx.shabal, vhashA, 64 );
shabal512_8x32_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
@@ -475,9 +474,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
// B
if ( likely( vh_mask & 0xff ) )
{
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhashB );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhashB );
}
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
@@ -510,9 +509,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
#endif
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
sha512_8x64_init( &ctx.sha512 );
sha512_8x64_update( &ctx.sha512, vhash, 64 );
sha512_8x64_close( &ctx.sha512, vhash );
vh_mask = _mm512_testn_epi64_mask( vh[0], vmask );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
@@ -523,9 +522,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
{
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhash, 64 );
haval256_5_8way_close( &ctx.haval, vhash );
haval256_8x32_init( &ctx.haval );
haval256_8x32_update( &ctx.haval, vhash, 64 );
haval256_8x32_close( &ctx.haval, vhash );
memset( &vhash[8<<3], 0, 32<<3 );
rintrlv_8x32_8x64( vhashA, vhash, 512 );
}
@@ -552,9 +551,9 @@ extern void hmq1725_8way_hash(void *state, const void *input)
hash7 );
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, state );
bmw512_8x64_init( &ctx.bmw );
bmw512_8x64_update( &ctx.bmw, vhash, 64 );
bmw512_8x64_close( &ctx.bmw, state );
}
int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
@@ -606,27 +605,27 @@ int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
union _hmq1725_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
blake512_4x64_context blake;
bmw512_4x64_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
skein512_4x64_context skein;
jh512_4x64_context jh;
keccak512_4x64_context keccak;
hashState_luffa luffa;
luffa_2way_context luffa2;
cubehashParam cube;
cube_2way_context cube2;
sph_shavite512_context shavite;
hashState_sd sd;
simd512_context simd;
shavite512_2way_context shavite2;
simd_2way_context simd;
simd_2way_context simd_2way;
hashState_echo echo;
hamsi512_4way_context hamsi;
hamsi512_4x64_context hamsi;
hashState_fugue fugue;
shabal512_4way_context shabal;
shabal512_4x32_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
sha512_4x64_context sha512;
haval256_4x32_context haval;
#if defined(__VAES__)
groestl512_2way_context groestl2;
echo_2way_context echo2;
@@ -653,9 +652,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, input, 80 );
bmw512_4way_close( &ctx.bmw, vhash );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, input, 80 );
bmw512_4x64_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -687,17 +686,17 @@ extern void hmq1725_4way_hash(void *state, const void *input)
// B
if ( h_mask & 0xffffffff )
skein512_4way_full( &ctx.skein, vhashB, vhash, 64 );
skein512_4x64_full( &ctx.skein, vhashB, vhash, 64 );
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhash );
// second fork, A = blake parallel, B= bmw parallel.
@@ -705,13 +704,13 @@ extern void hmq1725_4way_hash(void *state, const void *input)
h_mask = _mm256_movemask_epi8( vh_mask );
if ( ( h_mask & 0xffffffff ) != 0xffffffff )
blake512_4way_full( &ctx.blake, vhashA, vhash, 64 );
blake512_4x64_full( &ctx.blake, vhashA, vhash, 64 );
if ( h_mask & 0xffffffff )
{
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
@@ -734,16 +733,16 @@ extern void hmq1725_4way_hash(void *state, const void *input)
if ( ( h_mask & 0xffffffff ) != 0xffffffff )
{
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
keccak512_4x64_init( &ctx.keccak );
keccak512_4x64_update( &ctx.keccak, vhash, 64 );
keccak512_4x64_close( &ctx.keccak, vhashA );
}
if ( h_mask & 0xffffffff )
{
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
jh512_4x64_init( &ctx.jh );
jh512_4x64_update( &ctx.jh, vhash, 64 );
jh512_4x64_close( &ctx.jh, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
@@ -753,8 +752,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd_2way, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd_2way, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
@@ -779,9 +778,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
// B
if ( h_mask & 0xffffffff )
{
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhash, 64 );
haval256_5_4way_close( &ctx.haval, vhash );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhash, 64 );
haval256_4x32_close( &ctx.haval, vhash );
memset( &vhash[8<<2], 0, 32<<2 );
rintrlv_4x32_4x64( vhashB, vhash, 512 );
}
@@ -814,7 +813,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
#endif
blake512_4way_full( &ctx.blake, vhash, vhash, 64 );
blake512_4x64_full( &ctx.blake, vhash, vhash, 64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -846,9 +845,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
hamsi512_4x64_init( &ctx.hamsi );
hamsi512_4x64_update( &ctx.hamsi, vhash, 64 );
hamsi512_4x64_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -869,47 +868,31 @@ extern void hmq1725_4way_hash(void *state, const void *input)
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
}
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
if ( hash1[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
}
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
if ( hash2[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
}
simd512_ctx( &ctx.simd, hash2, hash2, 64 );
if ( hash3[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
}
simd512_ctx( &ctx.simd, hash3, hash3, 64 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
shabal512_4x32_init( &ctx.shabal );
shabal512_4x32_update( &ctx.shabal, vhash, 64 );
shabal512_4x32_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -938,9 +921,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
if ( h_mask & 0xffffffff )
{
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhashB );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
@@ -967,9 +950,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
#endif
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
sha512_4x64_init( &ctx.sha512 );
sha512_4x64_update( &ctx.sha512, vhash, 64 );
sha512_4x64_close( &ctx.sha512, vhash );
// A = haval parallel, B = Whirlpool serial
@@ -981,9 +964,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
if ( ( h_mask & 0xffffffff ) != 0xffffffff )
{
haval256_5_4way_init( &ctx.haval );
haval256_5_4way_update( &ctx.haval, vhash, 64 );
haval256_5_4way_close( &ctx.haval, vhash );
haval256_4x32_init( &ctx.haval );
haval256_4x32_update( &ctx.haval, vhash, 64 );
haval256_4x32_close( &ctx.haval, vhash );
memset( &vhash[8<<2], 0, 32<<2 );
rintrlv_4x32_4x64( vhashA, vhash, 512 );
}
@@ -1001,9 +984,9 @@ extern void hmq1725_4way_hash(void *state, const void *input)
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, state );
bmw512_4x64_init( &ctx.bmw );
bmw512_4x64_update( &ctx.bmw, vhash, 64 );
bmw512_4x64_close( &ctx.bmw, state );
}
int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,

View File

@@ -9,11 +9,11 @@ bool register_hmq1725_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hmq1725_4way;
gate->hash = (void*)&hmq1725_4way_hash;
#else
init_hmq1725_ctx();
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define HMQ1725_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define HMQ1725_4WAY 1
@@ -29,7 +29,6 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
void hmq1725hash( void *state, const void *input );
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_hmq1725_ctx();
#endif

View File

@@ -4,367 +4,273 @@
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
typedef struct {
sph_blake512_context blake1, blake2;
sph_bmw512_context bmw1, bmw2, bmw3;
sph_skein512_context skein1, skein2;
sph_jh512_context jh1, jh2;
sph_keccak512_context keccak1, keccak2;
hashState_luffa luffa1, luffa2;
cubehashParam cube;
sph_shavite512_context shavite1, shavite2;
#if defined(__aarch64__)
sph_simd512_context simd1, simd2;
#else
hashState_sd simd1, simd2;
#endif
sph_hamsi512_context hamsi1;
sph_shabal512_context shabal1;
sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4;
sph_sha512_context sha1, sha2;
sph_haval256_5_context haval1, haval2;
#if defined(__AES__)
hashState_echo echo1, echo2;
hashState_groestl groestl1, groestl2;
hashState_fugue fugue1, fugue2;
#else
sph_groestl512_context groestl1, groestl2;
sph_echo512_context echo1, echo2;
sph_fugue512_context fugue1, fugue2;
#endif
} hmq1725_ctx_holder;
static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
void init_hmq1725_ctx()
union _hmq1725_ctx_holder
{
sph_blake512_init(&hmq1725_ctx.blake1);
sph_blake512_init(&hmq1725_ctx.blake2);
sph_bmw512_init(&hmq1725_ctx.bmw1);
sph_bmw512_init(&hmq1725_ctx.bmw2);
sph_bmw512_init(&hmq1725_ctx.bmw3);
sph_skein512_init(&hmq1725_ctx.skein1);
sph_skein512_init(&hmq1725_ctx.skein2);
sph_jh512_init(&hmq1725_ctx.jh1);
sph_jh512_init(&hmq1725_ctx.jh2);
sph_keccak512_init(&hmq1725_ctx.keccak1);
sph_keccak512_init(&hmq1725_ctx.keccak2);
init_luffa( &hmq1725_ctx.luffa1, 512 );
init_luffa( &hmq1725_ctx.luffa2, 512 );
cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 );
sph_shavite512_init(&hmq1725_ctx.shavite1);
sph_shavite512_init(&hmq1725_ctx.shavite2);
#if defined(__aarch64__)
sph_simd512_init(&hmq1725_ctx.simd1);
sph_simd512_init(&hmq1725_ctx.simd2);
#else
init_sd( &hmq1725_ctx.simd1, 512 );
init_sd( &hmq1725_ctx.simd2, 512 );
#endif
sph_hamsi512_init(&hmq1725_ctx.hamsi1);
#if defined(__AES__)
fugue512_Init( &hmq1725_ctx.fugue1, 512 );
fugue512_Init( &hmq1725_ctx.fugue2, 512 );
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_init(&hmq1725_ctx.fugue1);
sph_fugue512_init(&hmq1725_ctx.fugue2);
sph_fugue512_context fugue;
#endif
sph_shabal512_init(&hmq1725_ctx.shabal1);
sph_whirlpool_init(&hmq1725_ctx.whirlpool1);
sph_whirlpool_init(&hmq1725_ctx.whirlpool2);
sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
sph_sha512_init( &hmq1725_ctx.sha1 );
sph_sha512_init( &hmq1725_ctx.sha2 );
sph_haval256_5_init(&hmq1725_ctx.haval1);
sph_haval256_5_init(&hmq1725_ctx.haval2);
#if defined(__AES__)
init_echo( &hmq1725_ctx.echo1, 512 );
init_echo( &hmq1725_ctx.echo2, 512 );
init_groestl( &hmq1725_ctx.groestl1, 64 );
init_groestl( &hmq1725_ctx.groestl2, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_init( &hmq1725_ctx.groestl1 );
sph_groestl512_init( &hmq1725_ctx.groestl2 );
sph_echo512_init( &hmq1725_ctx.echo1 );
sph_echo512_init( &hmq1725_ctx.echo2 );
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
}
void hmq_bmw512_midstate( const void* input )
{
memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, sizeof hmq_bmw_mid );
sph_bmw512( &hmq_bmw_mid, input, 64 );
}
__thread hmq1725_ctx_holder h_ctx __attribute__ ((aligned (64)));
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha;
sph_haval256_5_context haval;
};
typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;
extern void hmq1725hash(void *state, const void *input)
{
const uint32_t mask = 24;
uint32_t hashA[32] __attribute__((aligned(64)));
uint32_t hashB[32] __attribute__((aligned(64)));
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
uint32_t hashA[32] __attribute__((aligned(32)));
uint32_t hashB[32] __attribute__((aligned(32)));
hmq1725_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx));
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, input, 80 );
sph_bmw512_close( &ctx.bmw, hashA ); //1
memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid );
sph_bmw512( &h_ctx.bmw1, input + midlen, tail );
sph_bmw512_close(&h_ctx.bmw1, hashA); //1
sph_whirlpool (&h_ctx.whirlpool1, hashA, 64); //0
sph_whirlpool_close(&h_ctx.whirlpool1, hashB); //1
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //0
sph_whirlpool_close( &ctx.whirlpool, hashB ); //1
if ( hashB[0] & mask ) //1
{
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
(const char*)hashB, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hashA, hashB, 512 );
#else
sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1
sph_groestl512_close(&h_ctx.groestl1, hashA); //2
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hashB, 64 ); //1
sph_groestl512_close( &ctx.groestl, hashA ); //2
#endif
}
else
{
sph_skein512 (&h_ctx.skein1, hashB, 64); //1
sph_skein512_close(&h_ctx.skein1, hashA); //2
sph_skein512_init( &ctx.skein );
sph_skein512( &ctx.skein, hashB, 64 ); //1
sph_skein512_close( &ctx.skein, hashA ); //2
}
sph_jh512 (&h_ctx.jh1, hashA, 64); //3
sph_jh512_close(&h_ctx.jh1, hashB); //4
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hashA, 64 ); //3
sph_jh512_close( &ctx.jh, hashB ); //4
sph_keccak512 (&h_ctx.keccak1, hashB, 64); //2
sph_keccak512_close(&h_ctx.keccak1, hashA); //3
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hashB, 64 ); //2
sph_keccak512_close( &ctx.keccak, hashA ); //3
if ( hashA[0] & mask ) //4
{
sph_blake512 (&h_ctx.blake1, hashA, 64); //
sph_blake512_close(&h_ctx.blake1, hashB); //5
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, hashA, 64 );
blake512_close( &ctx.blake, hashB );
}
else
{
sph_bmw512 (&h_ctx.bmw2, hashA, 64); //4
sph_bmw512_close(&h_ctx.bmw2, hashB); //5
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hashA, 64 ); //4
sph_bmw512_close( &ctx.bmw, hashB ); //5
}
update_and_final_luffa( &h_ctx.luffa1, hashA, hashB, 64 );
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
cubehashUpdateDigest( &h_ctx.cube, hashB, hashA, 64 );
cubehash_full( &ctx.cube, hashB, 512, hashA, 64 );
if ( hashB[0] & mask ) //7
{
sph_keccak512 (&h_ctx.keccak2, hashB, 64); //
sph_keccak512_close(&h_ctx.keccak2, hashA); //8
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hashB, 64 ); //
sph_keccak512_close( &ctx.keccak, hashA ); //8
}
else
{
sph_jh512 (&h_ctx.jh2, hashB, 64); //7
sph_jh512_close(&h_ctx.jh2, hashA); //8
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hashB, 64 ); //7
sph_jh512_close( &ctx.jh, hashA ); //8
}
sph_shavite512 (&h_ctx.shavite1, hashA, 64); //3
sph_shavite512_close(&h_ctx.shavite1, hashB); //4
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hashA, 64 ); //3
sph_shavite512_close( &ctx.shavite, hashB ); //4
#if defined(__aarch64__)
sph_simd512 (&h_ctx.simd1, hashB, 64); //3
sph_simd512_close(&h_ctx.simd1, hashA); //4
#else
update_final_sd( &h_ctx.simd1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#endif
simd512_ctx( &ctx.simd, hashA, hashB, 64 );
if ( hashA[0] & mask ) //4
{
sph_whirlpool (&h_ctx.whirlpool2, hashA, 64); //
sph_whirlpool_close(&h_ctx.whirlpool2, hashB); //5
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
}
else
{
sph_haval256_5 (&h_ctx.haval1, hashA, 64); //4
sph_haval256_5_close(&h_ctx.haval1, hashB); //5
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hashA, 64 ); //4
sph_haval256_5_close( &ctx.haval, hashB ); //5
memset(&hashB[8], 0, 32);
}
#if defined(__AES__)
update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hashA, 512, hashB, 64 );
#else
sph_echo512 (&h_ctx.echo1, hashB, 64); //5
sph_echo512_close(&h_ctx.echo1, hashA); //6
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hashB, 64 ); //5
sph_echo512_close( &ctx.echo, hashA ); //6
#endif
sph_blake512 (&h_ctx.blake2, hashA, 64); //6
sph_blake512_close(&h_ctx.blake2, hashB); //7
blake512_init( &ctx.blake );
blake512_update( &ctx.blake, hashA, 64 );
blake512_close( &ctx.blake, hashB );
if ( hashB[0] & mask ) //7
{
sph_shavite512 (&h_ctx.shavite2, hashB, 64); //
sph_shavite512_close(&h_ctx.shavite2, hashA); //8
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hashB, 64 ); //
sph_shavite512_close( &ctx.shavite, hashA ); //8
}
else
{
update_and_final_luffa( &h_ctx.luffa2, hashA, hashB, 64 );
}
luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
sph_hamsi512 (&h_ctx.hamsi1, hashA, 64); //3
sph_hamsi512_close(&h_ctx.hamsi1, hashB); //4
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, hashA, 64 ); //3
sph_hamsi512_close( &ctx.hamsi, hashB ); //4
#if defined(__AES__)
fugue512_Update( &h_ctx.fugue1, hashB, 512 ); //2 ////
fugue512_Final( &h_ctx.fugue1, hashA ); //3
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512 (&h_ctx.fugue1, hashB, 64); //2 ////
sph_fugue512_close(&h_ctx.fugue1, hashA); //3
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hashB, 64 ); //2 ////
sph_fugue512_close( &ctx.fugue, hashA ); //3
#endif
if ( hashA[0] & mask ) //4
{
#if defined(__AES__)
update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hashB, 512, hashA, 64 );
#else
sph_echo512 (&h_ctx.echo2, hashA, 64); //
sph_echo512_close(&h_ctx.echo2, hashB); //5
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hashA, 64 ); //
sph_echo512_close( &ctx.echo, hashB ); //5
#endif
}
else
{
#if defined(__aarch64__)
sph_simd512(&h_ctx.simd2, hashA, 64); //6
sph_simd512_close(&h_ctx.simd2, hashB); //7
#else
update_final_sd( &h_ctx.simd2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#endif
}
simd512_ctx( &ctx.simd, hashB, hashA, 64 );
sph_shabal512 (&h_ctx.shabal1, hashB, 64); //5
sph_shabal512_close(&h_ctx.shabal1, hashA); //6
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, hashB, 64 ); //5
sph_shabal512_close( &ctx.shabal, hashA ); //6
sph_whirlpool (&h_ctx.whirlpool3, hashA, 64); //6
sph_whirlpool_close(&h_ctx.whirlpool3, hashB); //7
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //6
sph_whirlpool_close( &ctx.whirlpool, hashB ); //7
if ( hashB[0] & mask ) //7
{
#if defined(__AES__)
fugue512_Update( &h_ctx.fugue2, hashB, 512 ); //
fugue512_Final( &h_ctx.fugue2, hashA ); //8
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512 (&h_ctx.fugue2, hashB, 64); //
sph_fugue512_close(&h_ctx.fugue2, hashA); //8
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hashB, 64 ); //
sph_fugue512_close( &ctx.fugue, hashA ); //8
#endif
}
else
{
sph_sha512( &h_ctx.sha1, hashB, 64 );
sph_sha512_close( &h_ctx.sha1, hashA );
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hashB, 64 );
sph_sha512_close( &ctx.sha, hashA );
}
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl2, (char*)hashB,
(const char*)hashA, 512 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hashB, hashA, 512 );
#else
sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
sph_groestl512_close(&h_ctx.groestl2, hashB); //4
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hashA, 64 ); //3
sph_groestl512_close( &ctx.groestl, hashB ); //4
#endif
sph_sha512( &h_ctx.sha2, hashB, 64 );
sph_sha512_close( &h_ctx.sha2, hashA );
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hashB, 64 );
sph_sha512_close( &ctx.sha, hashA );
if ( hashA[0] & mask ) //4
{
sph_haval256_5 (&h_ctx.haval2, hashA, 64); //
sph_haval256_5_close(&h_ctx.haval2, hashB); //5
memset(&hashB[8], 0, 32);
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hashA, 64 ); //
sph_haval256_5_close( &ctx.haval, hashB ); //5
memset( &hashB[8], 0, 32 );
}
else
{
sph_whirlpool (&h_ctx.whirlpool4, hashA, 64); //4
sph_whirlpool_close(&h_ctx.whirlpool4, hashB); //5
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hashA, 64 ); //4
sph_whirlpool_close( &ctx.whirlpool, hashB ); //5
}
sph_bmw512 (&h_ctx.bmw3, hashB, 64); //5
sph_bmw512_close(&h_ctx.bmw3, hashA); //6
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hashB, 64 ); //5
sph_bmw512_close( &ctx.bmw, hashA ); //6
memcpy(state, hashA, 32);
memcpy( state, hashA, 32 );
}
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
// uint32_t endiandata[32] __attribute__((aligned(64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
//const uint32_t Htarg = ptarget[7];
//we need bigendian data...
// for (int k = 0; k < 32; k++)
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
hmq_bmw512_midstate( endiandata );
// if (opt_debug)
// {
// applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
// }
/* I'm to lazy to put the loop in an inline function... so dirty copy'n'paste.... */
/* i know that i could set a variable, but i don't know how the compiler will optimize it, not that then the cpu needs to load the value *everytime* in a register */
if (ptarget[7]==0) {
do {
pdata[19] = ++n;

View File

@@ -14,7 +14,8 @@ bool register_quark_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define QUARK_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define QUARK_4WAY 1

View File

@@ -7,12 +7,12 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
@@ -21,9 +21,9 @@
void quark_hash(void *state, const void *input)
{
uint32_t hash[16] __attribute__((aligned(64)));
sph_blake512_context ctx_blake;
blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl ctx_groestl;
#else
sph_groestl512_context ctx_groestl;
@@ -33,17 +33,15 @@ void quark_hash(void *state, const void *input)
sph_keccak512_context ctx_keccak;
uint32_t mask = 8;
sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, input, 80 );
sph_blake512_close( &ctx_blake, hash );
blake512_full( &ctx_blake, hash, input, 80 );
sph_bmw512_init( &ctx_bmw );
sph_bmw512( &ctx_bmw, hash, 64 );
sph_bmw512_close( &ctx_bmw, hash );
if ( hash[0] & mask )
{
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(const char*)hash, 512 );
@@ -60,7 +58,7 @@ void quark_hash(void *state, const void *input)
sph_skein512_close( &ctx_skein, hash );
}
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(const char*)hash, 512 );
@@ -76,9 +74,7 @@ void quark_hash(void *state, const void *input)
if ( hash[0] & mask )
{
sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, hash, 64 );
sph_blake512_close( &ctx_blake, hash );
blake512_full( &ctx_blake, hash, hash, 64 );
}
else
{

Some files were not shown because too many files have changed in this diff Show More