Mirror of https://github.com/JayDDee/cpuminer-opt.git
Synced 2025-09-17 23:44:27 +00:00

Compare commits (10 commits)

8e91bfbe19
47e24b50e8
c47c4a8885
042d13d1e1
4f930574cc
9d3a46c355
4e3f1b926f
045b42babf
fc696dbbe5
f3fde95f27
@@ -16,6 +16,7 @@ bin_PROGRAMS = cpuminer
 dist_man_MANS = cpuminer.1
 
 cpuminer_SOURCES = \
+  dummy.cpp \
   cpu-miner.c \
   util.c \
   api.c \
@@ -113,7 +114,6 @@ cpuminer_SOURCES = \
   algo/lyra2/phi2-4way.c \
   algo/lyra2/phi2.c \
   algo/m7m/m7m.c \
-  algo/m7m/magimath.cpp \
   algo/nist5/nist5-gate.c \
   algo/nist5/nist5-4way.c \
   algo/nist5/nist5.c \
@@ -166,7 +166,6 @@ cpuminer_SOURCES = \
   algo/shavite/sph-shavite-aesni.c \
   algo/shavite/shavite-hash-2way.c \
   algo/shavite/shavite-hash-4way.c \
-  algo/shavite/shavite.c \
   algo/simd/nist.c \
   algo/simd/vector.c \
   algo/simd/sph_simd.c \
@@ -250,6 +249,7 @@ cpuminer_SOURCES = \
   algo/x16/x16rt.c \
   algo/x16/x16rt-4way.c \
   algo/x16/hex.c \
+  algo/x16/x20r.c \
   algo/x16/x21s-4way.c \
   algo/x16/x21s.c \
   algo/x16/minotaur.c \
@@ -288,7 +288,7 @@ if HAVE_WINDOWS
 endif
 
 cpuminer_LDFLAGS = @LDFLAGS@
 cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
 cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
 cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
 
@@ -87,7 +87,6 @@ Supported Algorithms
   groestl       Groestl coin
   hex           x16r-hex
   hmq1725
-  hodl          Hodlcoin
   jha           Jackpotcoin
   keccak        Maxcoin
   keccakc       Creative coin
@@ -115,9 +114,11 @@ Supported Algorithms
   scrypt:N      scrypt(N, 1, 1)
   scryptn2      scrypt(1048576, 1, 1)
   sha256d       Double SHA-256
+  sha256dt
   sha256q       Quad SHA-256
   sha256t       Triple SHA-256
   sha3d         Double keccak256 (BSHA3)
+  sha512256d
   skein         Skein+Sha (Skeincoin)
   skein2        Double Skein (Woodcoin)
   skunk         Signatum (SIGT)
@@ -145,6 +146,7 @@ Supported Algorithms
   x16rt-veil    veil
   x16s
   x17
+  x20r
   x21s
   x22i
   x25x
@@ -27,17 +27,19 @@ See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
 Requirements
 ------------
 
-Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
-supported.
+- A x86_64 architecture CPU with a minimum of SSE2 support. This includes Intel Core2 and newer and AMD equivalents.
+- Arm CPU supporting AArch64 and NEON.
 
-64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
-are not supported. FreeBSD YMMV.
+32 bit CPUs are not supported.
 
-ARM requirements (Beta):
+Older CPUs are supported by open source cpuminer-multi by TPruvot but at reduced performance.
 
-CPU: Armv8 and NEON, SHA2 & AES are optional
-OS: Linux distribution built for AArch64.
-Packages: source code only.
+Mining on mobile devices that meet the requirements is not recommended due to the risk of
+overheating and damaging the battery. Mining has unlimited demand, it will push any device
+to or beyond its limits. There is also a fire risk with overheated lithium batteries.
 
+Beware of apps claiming "mobile only mining". There is no such thing, they aren't miners.
+If a mobile CPU can mine it any CPU can.
 
 See wiki for details.
 
@@ -73,6 +75,77 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v24.5
+
+Fix MinGW compile error after MSys2 upgrade to GCC-14.2.
+#427: GBT: Improved handling of new work.
+Removed shavite3 algo.
+
+v24.4
+
+x86_64: fixed a bug in ornot macro for AVX2 which broke some algos in v24.2.
+x86_64: fixed a bug in alignr macros for SSE2.
+ARM: CPU feature reporting enhancements.
+Some code cleanup.
+
+v24.3
+
+ARM: CPU feature detection and reporting is now working.
+ARM: Verthash is now working.
+ARM: Small speedup for yescrypt, yespower & argon2d.
+Code cleanup.
+
+v24.2
+
+x86_64: Fixed blakes2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4.
+x86_64: Initial support for CPUs with AVX10, needs GCC-14.
+ARM NEON: Various code optimisations.
+
+v24.1
+
+#414: fix bug in merkle error handling.
+#416: change $nproc to $(nproc) in build scripts.
+#420: change some inline function definitions to static inline.
+#413: Fix formatting error for share result log when using no-color.
+Faster 2 way interleaving.
+Cleanup sha256 architecture targetting.
+
+v23.15
+
+Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
+ARM: Fugue AES optimizations enabled.
+ARM: quark, qubit, x11gost algos optimized with NEON & AES.
+
+v23.14
+
+ARM: Groestl AES optimizations enabled.
+All: Small optimization to Shabal 4way.
+x86_64: Extend Shabal 4way support to SSE2 from SSE4.1.
+All: deleted some unused files.
+
+v23.13
+
+Added x20r algo.
+Eliminated redundant hash order calculations for x16r family.
+
+v23.12
+
+Several bugs fixes and speed improvements for x16r family for all CPU architectures.
+
+v23.11
+
+This is a release candidate for full AArch64 support, marking the end of the Beta phase.
+Fixed hmq1725 & x25x algos, SSE2 & NEON, broken in v3.23.4.
+Most CPU-mineable SHA3 algos (X*) upgraded to 2-way SSE2 & NEON.
+
+v23.10
+
+x86_64: Fixed scrypt, scryptn2 algos SSE2.
+Fixed sha512256d algo AVX2, SSE2, NEON.
+Fixed a bug in Skein N-way that reduced performance.
+ARM: Skein optimized for NEON, SHA2 & SSE2.
+Skein2 algo 2-way optimized for NEON & SSE2.
+
 v23.9
 
 x86_64: fixed minotaurx crash, broken in 23.7.
@@ -184,7 +184,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 //int scanhash_8way_64_64( struct work *work, uint32_t max_nonce,
 //                 uint64_t *hashes_done, struct thr_info *mythr )
@@ -263,8 +263,8 @@ static void init_algo_gate( algo_gate_t* gate )
   gate->build_block_header   = (void*)&std_build_block_header;
   gate->build_extraheader    = (void*)&std_build_extraheader;
   gate->set_work_data_endian = (void*)&do_nothing;
-  gate->resync_threads       = (void*)&do_nothing;
-  gate->do_this_thread       = (void*)&return_true;
+//  gate->resync_threads       = (void*)&do_nothing;
+//  gate->do_this_thread       = (void*)&return_true;
   gate->longpoll_rpc_call    = (void*)&std_longpoll_rpc_call;
   gate->get_work_data_size   = (void*)&std_get_work_data_size;
   gate->optimizations        = EMPTY_SET;
@@ -340,7 +340,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_SHA256T:      rc = register_sha256t_algo     ( gate ); break;
     case ALGO_SHA3D:        rc = register_sha3d_algo       ( gate ); break;
     case ALGO_SHA512256D:   rc = register_sha512256d_algo  ( gate ); break;
-    case ALGO_SHAVITE3:     rc = register_shavite_algo     ( gate ); break;
     case ALGO_SKEIN:        rc = register_skein_algo       ( gate ); break;
     case ALGO_SKEIN2:       rc = register_skein2_algo      ( gate ); break;
     case ALGO_SKUNK:        rc = register_skunk_algo       ( gate ); break;
@@ -368,6 +367,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_X16RT_VEIL:   rc = register_x16rt_veil_algo  ( gate ); break;
     case ALGO_X16S:         rc = register_x16s_algo        ( gate ); break;
     case ALGO_X17:          rc = register_x17_algo         ( gate ); break;
+    case ALGO_X20R:         rc = register_x20r_algo        ( gate ); break;
     case ALGO_X21S:         rc = register_x21s_algo        ( gate ); break;
     case ALGO_X22I:         rc = register_x22i_algo        ( gate ); break;
     case ALGO_X25X:         rc = register_x25x_algo        ( gate ); break;
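Note: the registration cases above all fill the gate the same way. A hedged sketch of what the newly referenced register_x20r_algo presumably does; the entry-point names and the feature mask below are hypothetical illustrations, not code from this change set:

// Hypothetical sketch only; the real function is in the x20r source added
// by this change, and its entry points and optimizations mask may differ.
bool register_x20r_algo( algo_gate_t* gate )
{
   gate->scanhash      = (void*)&scanhash_x20r;             // hypothetical
   gate->hash          = (void*)&x20r_hash;                 // hypothetical
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;  // illustrative
   return true;
}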
@@ -98,25 +98,27 @@ typedef uint32_t set_t;
 #define AVX512_OPT  1 << 6   // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
 #define AES_OPT     1 << 7   // Intel Westmere, AArch64
 #define VAES_OPT    1 << 8   // Icelake, Zen3
-#define SHA_OPT     1 << 9   // Zen1, Icelake, AArch64
-#define SHA512_OPT  1 << 10  // AArch64
+#define SHA256_OPT  1 << 9   // Zen1, Icelake, AArch64
+#define SHA512_OPT  1 << 10  // Intel Arrow Lake, AArch64
 #define NEON_OPT    1 << 11  // AArch64
+#define AVX10_256   1 << 12
+#define AVX10_512   1 << 13
 
 // AVX10 does not have explicit algo features:
 // AVX10_512 is compatible with AVX512 + VAES
 // AVX10_256 is compatible with AVX2 + VAES
 
 // return set containing all elements from sets a & b
-inline set_t set_union ( set_t a, set_t b ) { return a | b; }
+static inline set_t set_union ( set_t a, set_t b ) { return a | b; }
 
 // return set contained common elements from sets a & b
-inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
+static inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
 
 // all elements in set a are included in set b
-inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
+static inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
 
 // no elements in set a are included in set b
-inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
+static inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
 
 typedef struct
 {
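Note: the set_t helpers above are plain bitmask operations. A small self-contained illustration of how they compose with a feature mask; the AVX2_OPT bit position is an assumption made only for this example, since this hunk shows only AVX512_OPT's position (1 << 6):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint32_t set_t;
#define AVX2_OPT   (1 << 4)   // assumed bit position, for illustration only
#define AVX512_OPT (1 << 6)   // matches the definition above

static inline set_t set_union ( set_t a, set_t b ) { return a | b; }
static inline bool  set_incl  ( set_t a, set_t b ) { return (a & b) == a; }

int main(void)
{
   set_t cpu  = set_union( AVX2_OPT, AVX512_OPT );  // features the CPU reports
   set_t algo = AVX2_OPT;                           // features an algo can use
   printf( "algo fully supported: %s\n", set_incl( algo, cpu ) ? "yes" : "no" );
   return 0;
}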
@@ -163,10 +165,10 @@ char* ( *malloc_txs_request ) ( struct work* );
 void ( *set_work_data_endian ) ( struct work* );
 
 // Diverge mining threads
-bool ( *do_this_thread ) ( int );
+//bool ( *do_this_thread ) ( int );
 
 // After do_this_thread
-void ( *resync_threads ) ( int, struct work* );
+//void ( *resync_threads ) ( int, struct work* );
 
 json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
 
@@ -246,7 +248,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 //int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce,
 //                 uint64_t *hashes_done, struct thr_info *mythr );
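Note: this hunk and many below collapse the four-macro AVX-512 feature test into a single SIMD512 macro. Its definition is not part of this comparison; a plausible shape, inferred from the pattern being replaced (an assumption, the real macro likely lives in simd-utils.h):

// Assumed definition, inferred from the replaced pattern; not shown in
// this change set.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define SIMD512 1
#endif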
@@ -35,7 +35,7 @@
  * @pre all block pointers must be valid
  */
 
-#if defined(__AVX512F__)
+#if defined(SIMD512)
 
 static inline __m512i blamka( __m512i x, __m512i y )
 {
@@ -237,7 +237,7 @@ void fill_segment(const argon2_instance_t *instance,
    uint64_t pseudo_rand, ref_index, ref_lane;
    uint32_t prev_offset, curr_offset;
    uint32_t starting_index, i;
-#if defined(__AVX512F__)
+#if defined(SIMD512)
    __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
 #elif defined(__AVX2__)
    __m256i state[ARGON2_HWORDS_IN_BLOCK];
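Note: blamka is Argon2's multiply-hardened mixing primitive; per 64-bit lane it computes x + y + 2*lo32(x)*lo32(y). A minimal AVX-512 sketch consistent with the declaration above (an assumption, the body in this tree may be written differently):

#include <immintrin.h>

// Sketch of fBlaMka(x, y) = x + y + 2 * lo32(x) * lo32(y) per 64-bit lane.
static inline __m512i blamka_sketch( __m512i x, __m512i y )
{
   const __m512i xy = _mm512_mul_epu32( x, y );   // lo32(x) * lo32(y)
   return _mm512_add_epi64( _mm512_add_epi64( x, y ),
                            _mm512_add_epi64( xy, xy ) );
}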
@@ -21,7 +21,7 @@
 #include "blake2-impl.h"
 #include "simd-utils.h"
 
-#if !defined(__AVX512F__)
+#if !defined(SIMD512)
 
 #if !defined(__AVX2__)
 
@@ -39,7 +39,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
    blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
 
    do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
 
       blakehash_4way( hash, vdata );
 
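Note: mm128_bswap_32 to v128_bswap32 is part of the portable v128 naming sweep. On x86 the helper presumably reduces to one byte shuffle; the control constant is visible in the hunks that follow (a sketch, assuming SSSE3, the real definition is in simd-utils):

#include <immintrin.h>

// Assumed x86 shape of v128_bswap32, using the shuf_bswap32 constant that
// appears later in this comparison.
static inline __m128i v128_bswap32_sketch( __m128i x )
{
   const __m128i ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
   return _mm_shuffle_epi8( x, ctl );   // byte-swap each 32-bit lane
}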
@@ -429,7 +429,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
 #define BLAKE256_4X32_BLOCK_BSWAP32 \
 { \
    v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
                                      0x0405060700010203 ); \
    M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
    M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
    M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
@@ -931,14 +931,14 @@ blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
    const v128_t shuf_bswap32 =
                 v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
 
-   H[0] = _mm_shuffle_epi8( mm128_xor3( V8, V0, h[0] ), shuf_bswap32 );
-   H[1] = _mm_shuffle_epi8( mm128_xor3( V9, V1, h[1] ), shuf_bswap32 );
-   H[2] = _mm_shuffle_epi8( mm128_xor3( VA, V2, h[2] ), shuf_bswap32 );
-   H[3] = _mm_shuffle_epi8( mm128_xor3( VB, V3, h[3] ), shuf_bswap32 );
-   H[4] = _mm_shuffle_epi8( mm128_xor3( VC, V4, h[4] ), shuf_bswap32 );
-   H[5] = _mm_shuffle_epi8( mm128_xor3( VD, V5, h[5] ), shuf_bswap32 );
-   H[6] = _mm_shuffle_epi8( mm128_xor3( VE, V6, h[6] ), shuf_bswap32 );
-   H[7] = _mm_shuffle_epi8( mm128_xor3( VF, V7, h[7] ), shuf_bswap32 );
+   H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
+   H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
+   H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
+   H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
+   H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
+   H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
+   H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
+   H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );
 
 #else
 
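Note: v128_xor3 folds a three-input XOR into one helper. An equivalent SSE2 fallback (an assumed shape; AVX-512VL targets could instead use a single ternary-logic instruction):

#include <emmintrin.h>

// Assumed SSE2 fallback for v128_xor3; the real helper is in simd-utils.
static inline __m128i v128_xor3_sketch( __m128i a, __m128i b, __m128i c )
{
   return _mm_xor_si128( a, _mm_xor_si128( b, c ) );
}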
@@ -1611,7 +1611,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 ///////////////////////////////////////
 //
@@ -2617,7 +2617,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
 #endif
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 //Blake-256 16 way AVX512
 
@@ -147,7 +147,7 @@ void blake256r8_8way_close(void *cc, void *dst);
 #define blake256r8_8x32_update blake256r14_8way_update
 #define blake256r8_8x32_close blake256r14_8way_close
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 ///////////////////////////////////
 //
@@ -226,7 +226,7 @@ static const uint8_t sigma[12][16] =
 #define Mx_(n) Mx__(n)
 #define Mx__(n) M ## n
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 #define B2B8W_G(a, b, c, d, x, y) \
 { \
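Note: the B2B8W_G body in the next hunk rotates with mm512_ror_64, presumably a thin wrapper over the native AVX-512 64-bit rotate (an assumption, the actual definition is in simd-utils):

#include <immintrin.h>

// Assumed wrapper; AVX-512F provides a native 64-bit rotate-right.
#define mm512_ror_64_sketch( v, c )  _mm512_ror_epi64( v, c )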
@@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] =
    v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
 }
 
-static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
+static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last )
 {
    __m512i v[16], m[16];
 
@@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
    ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
 }
 
-int blake2b_8way_init( blake2b_8way_ctx *ctx )
+int blake2b_8x64_init( blake2b_8x64_ctx *ctx )
 {
    size_t i;
 
@@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
 }
 
 
-void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
+void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
                           size_t inlen )
 {
    __m512i* in =(__m512i*)input;
@@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
          ctx->t[0] += ctx->c;
          if ( ctx->t[0] < ctx->c )
             ctx->t[1]++;
-         blake2b_8way_compress( ctx, 0 );
+         blake2b_8x64_compress( ctx, 0 );
          ctx->c = 0;
       }
       ctx->b[ c++ ] = in[i];
@@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
    }
 }
 
-void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
+void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out )
 {
    size_t c;
    c = ctx->c >> 3;
@@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
       ctx->c += 8;
    }
 
-   blake2b_8way_compress( ctx, 1 );   // final block flag = 1
+   blake2b_8x64_compress( ctx, 1 );   // final block flag = 1
 
    casti_m512i( out, 0 ) = ctx->h[0];
   casti_m512i( out, 1 ) = ctx->h[1];
@@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = {
 };
 */
 
-static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
+static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last )
 {
    __m256i v[16], m[16];
 
@@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
    ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
 }
 
-int blake2b_4way_init( blake2b_4way_ctx *ctx )
+int blake2b_4x64_init( blake2b_4x64_ctx *ctx )
 {
    size_t i;
 
@@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
    return 0;
 }
 
-void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
                           size_t inlen )
 {
    __m256i* in =(__m256i*)input;
@@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
          ctx->t[0] += ctx->c;
         if ( ctx->t[0] < ctx->c )
            ctx->t[1]++;
-         blake2b_4way_compress( ctx, 0 );
+         blake2b_4x64_compress( ctx, 0 );
          ctx->c = 0;
      }
      ctx->b[ c++ ] = in[i];
@@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
    }
 }
 
-void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
+void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out )
 {
    size_t c;
    c = ctx->c >> 3;
@@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
    ctx->c += 8;
    }
 
-   blake2b_4way_compress( ctx, 1 );   // final block flag = 1
+   blake2b_4x64_compress( ctx, 1 );   // final block flag = 1
 
   casti_m256i( out, 0 ) = ctx->h[0];
   casti_m256i( out, 1 ) = ctx->h[1];
@@ -1,6 +1,6 @@
 #pragma once
-#ifndef __BLAKE2B_HASH_4WAY_H__
-#define __BLAKE2B_HASH_4WAY_H__
+#ifndef BLAKE2B_HASH_4WAY_H__
+#define BLAKE2B_HASH_4WAY_H__
 
 #include "simd-utils.h"
 #include <stddef.h>
@@ -15,7 +15,7 @@
 #endif
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct ALIGN( 64 ) {
    __m512i b[16];    // input buffer
@@ -23,12 +23,17 @@ typedef struct ALIGN( 64 ) {
    uint64_t t[2];    // total number of bytes
    size_t c;         // pointer for b[]
    size_t outlen;    // digest size
-} blake2b_8way_ctx;
+} blake2b_8x64_ctx;
 
-int blake2b_8way_init( blake2b_8way_ctx *ctx );
-void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
+int blake2b_8x64_init( blake2b_8x64_ctx *ctx );
+void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
                           size_t inlen );
-void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
+void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
 
+#define blake2b_8way_ctx    blake2b_8x64_ctx
+#define blake2b_8way_init   blake2b_8x64_init
+#define blake2b_8way_update blake2b_8x64_update
+#define blake2b_8way_final  blake2b_8x64_final
+
 #endif
 
@@ -41,12 +46,17 @@ typedef struct ALIGN( 64 ) {
    uint64_t t[2];    // total number of bytes
    size_t c;         // pointer for b[]
    size_t outlen;    // digest size
-} blake2b_4way_ctx;
+} blake2b_4x64_ctx;
 
-int blake2b_4way_init( blake2b_4way_ctx *ctx );
-void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+int blake2b_4x64_init( blake2b_4x64_ctx *ctx );
+void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
                           size_t inlen );
-void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
+void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
 
+#define blake2b_4way_ctx    blake2b_4x64_ctx
+#define blake2b_4way_init   blake2b_4x64_init
+#define blake2b_4way_update blake2b_4x64_update
+#define blake2b_4way_final  blake2b_4x64_final
+
 #endif
 
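Note: the compatibility #defines added above keep old call sites compiling while the canonical names move to the lane-width convention (8x64 means eight 64-bit lanes). An illustrative caller, not taken from this change set:

// Old-style code still compiles through the aliases:
blake2b_8way_ctx ctx;         // expands to blake2b_8x64_ctx
blake2b_8way_init( &ctx );    // expands to blake2b_8x64_init( &ctx )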
@@ -3,7 +3,7 @@
 #include <stdint.h>
 #include "blake2b-hash.h"
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   #define BLAKE2B_8WAY
 #elif defined(__AVX2__)
   #define BLAKE2B_4WAY
@@ -497,7 +497,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
 
 #endif // __AVX2__
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // Blake2s-256 16 way
 
@@ -11,8 +11,8 @@
  * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
  */
 //#pragma once
-#ifndef __BLAKE2S_HASH_4WAY_H__
-#define __BLAKE2S_HASH_4WAY_H__ 1
+#ifndef BLAKE2S_HASH_4WAY_H__
+#define BLAKE2S_HASH_4WAY_H__ 1
 
 #if defined(__SSE2__) || defined(__ARM_NEON)
 
@@ -29,20 +29,20 @@
 #define ALIGN(x) __attribute__((aligned(x)))
 #endif
 
 typedef struct __blake2s_nway_param
 {
    uint8_t  digest_length; // 1
    uint8_t  key_length;    // 2
    uint8_t  fanout;        // 3
    uint8_t  depth;         // 4
    uint32_t leaf_length;   // 8
    uint8_t  node_offset[6];// 14
    uint8_t  node_depth;    // 15
    uint8_t  inner_length;  // 16
    // uint8_t  reserved[0];
    uint8_t  salt[8];       // 24
    uint8_t  personal[8];   // 32
 } blake2s_nway_param;
 
 typedef struct ALIGN( 64 ) __blake2s_4way_state
 {
@@ -67,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
 typedef struct ALIGN( 64 ) __blake2s_8way_state
 {
    __m256i h[8];
-   uint8_t buf[ 32 * 8 ];
+   uint8_t buf[ 64 * 8 ];
    uint32_t t[2];
    uint32_t f[2];
    size_t buflen;
@@ -83,12 +83,12 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct ALIGN( 64 ) __blake2s_16way_state
 {
    __m512i h[8];
-   uint8_t buf[ 32 * 16 ];
+   uint8_t buf[ 64 * 16 ];
    uint32_t t[2];
    uint32_t f[2];
    size_t buflen;
@@ -3,7 +3,7 @@
 #include <string.h>
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   #define BLAKE2S_16WAY
 #elif defined(__AVX2__)
   #define BLAKE2S_8WAY
@@ -349,16 +349,16 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
    Va = v128_add64( Va, v128_add64( Vb, \
                     v128_set64( CBx( r, Sd ) ^ Mx( r, Sc ), \
                                 CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
-   Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
+   Vd = v128_ror64xor( Vd, Va, 32 ); \
    Vc = v128_add64( Vc, Vd ); \
-   Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
+   Vb = v128_ror64xor( Vb, Vc, 25 ); \
 \
    Va = v128_add64( Va, v128_add64( Vb, \
                     v128_set64( CBx( r, Sc ) ^ Mx( r, Sd ), \
                                 CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
-   Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
+   Vd = v128_ror64xor( Vd, Va, 16 ); \
    Vc = v128_add64( Vc, Vd ); \
-   Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
+   Vb = v128_ror64xor( Vb, Vc, 11 ); \
 }
 
 #define BLAKE512_ROUND( R ) \
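Note: v128_ror64xor fuses the XOR-then-rotate idiom replaced throughout these hunks. A definition equivalent to the old code (the real helper is in simd-utils, where AArch64 builds can exploit fused shift instructions):

// Equivalent to the replaced pattern, shown here for reference.
#define v128_ror64xor_sketch( a, b, c )  v128_ror64( v128_xor( (a), (b) ), (c) )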
@@ -559,7 +559,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
 
 #if defined(__AVX2__)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 ////////////////////////////////////
 //
@@ -1887,13 +1887,13 @@ blake512_4x64_close(void *cc, void *dst)
 #define GB_2X64( m0, m1, c0, c1, a, b, c, d ) \
 { \
    a = v128_add64( v128_add64( v128_xor( v128_64( c1 ), m0 ), b ), a ); \
-   d = v128_ror64( v128_xor( d, a ), 32 ); \
+   d = v128_ror64xor( d, a, 32 ); \
    c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 25 ); \
+   b = v128_ror64xor( b, c, 25 ); \
    a = v128_add64( v128_add64( v128_xor( v128_64( c0 ), m1 ), b ), a ); \
-   d = v128_ror64( v128_xor( d, a ), 16 ); \
+   d = v128_ror64xor( d, a, 16 ); \
    c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 11 ); \
+   b = v128_ror64xor( b, c, 11 ); \
 }
 
 #define ROUND_B_2X64(r) \
@@ -2054,9 +2054,9 @@ void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc,
    // G4 skip nonce
    V0 = v128_add64( v128_add64( v128_xor( v128_64( CB9 ), sc->buf[ 8] ), V5 ),
                     V0 );
-   VF = v128_ror64( v128_xor( VF, V0 ), 32 );
+   VF = v128_ror64xor( VF, V0, 32 );
    VA = v128_add64( VA, VF );
-   V5 = v128_ror64( v128_xor( V5, VA ), 25 );
+   V5 = v128_ror64xor( V5, VA, 25 );
    V0 = v128_add64( V0, V5 );
 
    GB_2X64( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
@@ -2137,9 +2137,9 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
 
    // finish round 0, with the nonce now available
    V0 = v128_add64( V0, v128_xor( v128_64( CB8 ), M9 ) );
-   VF = v128_ror64( v128_xor( VF, V0 ), 16 );
+   VF = v128_ror64xor( VF, V0, 16 );
    VA = v128_add64( VA, VF );
-   V5 = v128_ror64( v128_xor( V5, VA ), 11 );
+   V5 = v128_ror64xor( V5, VA, 11 );
 
    // Round 1
   // G0
@@ -2147,34 +2147,34 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
 
    // G1
    V1 = v128_add64( V1, V5 );
-   VD = v128_ror64( v128_xor( VD, V1 ), 32 );
+   VD = v128_ror64xor( VD, V1, 32 );
    V9 = v128_add64( V9, VD );
-   V5 = v128_ror64( v128_xor( V5, V9 ), 25 );
+   V5 = v128_ror64xor( V5, V9, 25 );
    V1 = v128_add64( V1, v128_add64( v128_xor( v128_64( CBx(1,2) ), Mx(1,3) ),
                     V5 ) );
-   VD = v128_ror64( v128_xor( VD, V1 ), 16 );
+   VD = v128_ror64xor( VD, V1, 16 );
    V9 = v128_add64( V9, VD );
-   V5 = v128_ror64( v128_xor( V5, V9 ), 11 );
+   V5 = v128_ror64xor( V5, V9, 11 );
 
    // G2
    V2 = v128_add64( V2, v128_xor( v128_64( CBF ), M9 ) );
-   VE = v128_ror64( v128_xor( VE, V2 ), 32 );
+   VE = v128_ror64xor( VE, V2, 32 );
    VA = v128_add64( VA, VE );
-   V6 = v128_ror64( v128_xor( V6, VA ), 25 );
+   V6 = v128_ror64xor( V6, VA, 25 );
    V2 = v128_add64( V2, v128_add64( v128_xor( v128_64( CB9 ), MF ), V6 ) );
-   VE = v128_ror64( v128_xor( VE, V2 ), 16 );
+   VE = v128_ror64xor( VE, V2, 16 );
    VA = v128_add64( VA, VE );
-   V6 = v128_ror64( v128_xor( V6, VA ), 11 );
+   V6 = v128_ror64xor( V6, VA, 11 );
 
    // G3
-   VF = v128_ror64( v128_xor( VF, V3 ), 32 );
+   VF = v128_ror64xor( VF, V3, 32 );
    VB = v128_add64( VB, VF );
-   V7 = v128_ror64( v128_xor( V7, VB ), 25 );
+   V7 = v128_ror64xor( V7, VB, 25 );
    V3 = v128_add64( V3, v128_add64( v128_xor( v128_64( CBx(1, 6) ), Mx(1, 7) ),
                     V7 ) );
-   VF = v128_ror64( v128_xor( VF, V3 ), 16 );
+   VF = v128_ror64xor( VF, V3, 16 );
    VB = v128_add64( VB, VF );
-   V7 = v128_ror64( v128_xor( V7, VB ), 11 );
+   V7 = v128_ror64xor( V7, VB, 11 );
 
    // G4, G5, G6, G7
    GB_2X64(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
@@ -92,7 +92,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
 #define blake512_4way_prehash_le blake512_4x64_prehash_le
 #define blake512_4way_final_le   blake512_4x64_final_le
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 ////////////////////////////
 //
@@ -182,7 +182,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
    blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
 
    do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      pdata[19] = n;
      blakecoin_4way_hash( hash, vdata );
 
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   #define BLAKECOIN_16WAY
 #elif defined(__AVX2__)
   #define BLAKECOIN_8WAY
@@ -101,15 +101,15 @@
 { \
    Va = v128_add64( Va, v128_add64( Vb, \
                v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
-   Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
+   Vd = v128_ror64xor( Vd, Va, 32 ); \
    Vc = v128_add64( Vc, Vd ); \
-   Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \
+   Vb = v128_ror64xor( Vb, Vc, 24 ); \
 \
    Va = v128_add64( Va, v128_add64( Vb, \
                v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
-   Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
+   Vd = v128_ror64xor( Vd, Va, 16 ); \
    Vc = v128_add64( Vc, Vd ); \
-   Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
+   Vb = v128_ror64xor( Vb, Vc, 63 ); \
 }
 
 #define BLAKE2B_ROUND( R ) \
@@ -131,47 +131,7 @@
    V[7] = v128_alignr64( V6, V7, 1 ); \
 }
 
-/*
-#elif defined(__SSE2__)
-// always true
-
-#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
-{ \
-   Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
-               _mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
-   Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
-   Vc = _mm_add_epi64( Vc, Vd ); \
-   Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
-\
-   Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
-               _mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
-   Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
-   Vc = _mm_add_epi64( Vc, Vd ); \
-   Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
-}
-
-#define BLAKE2B_ROUND( R ) \
-{ \
-   v128_t *V = (v128_t*)v; \
-   v128_t V2, V3, V6, V7; \
-   const uint8_t *sigmaR = sigma[R]; \
-   BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
-   BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
-   V2 = mm128_alignr_64( V[3], V[2], 1 ); \
-   V3 = mm128_alignr_64( V[2], V[3], 1 ); \
-   V6 = mm128_alignr_64( V[6], V[7], 1 ); \
-   V7 = mm128_alignr_64( V[7], V[6], 1 ); \
-   BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
-   BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
-   V[2] = mm128_alignr_64( V2, V3, 1 ); \
-   V[3] = mm128_alignr_64( V3, V2, 1 ); \
-   V[6] = mm128_alignr_64( V7, V6, 1 ); \
-   V[7] = mm128_alignr_64( V6, V7, 1 ); \
-}
-*/
-
 #else
-// never used, SSE2 is always available
 
 #ifndef ROTR64
 #define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
@@ -87,7 +87,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // BMW-256 16 way 32
 
@@ -157,7 +157,7 @@ void bmw512_4way_addbits_and_close(
 
 #endif // __AVX2__
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // BMW-512 64 bit 8 way
 typedef struct
@@ -62,78 +62,78 @@ static const uint32_t IV256[] = {
 */
 
 #define ss0(x) \
-   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
-                                 _mm_slli_epi32( (x), 3) ), \
-                  _mm_xor_si128( mm128_rol_32( (x), 4), \
-                                 mm128_rol_32( (x), 19) ) )
+   v128_xor( v128_xor( v128_sr32( (x), 1), \
+                       v128_sl32( (x), 3) ), \
+             v128_xor( v128_rol32( (x), 4), \
+                       v128_rol32( (x), 19) ) )
 
 #define ss1(x) \
-   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
-                                 _mm_slli_epi32( (x), 2) ), \
-                  _mm_xor_si128( mm128_rol_32( (x), 8), \
-                                 mm128_rol_32( (x), 23) ) )
+   v128_xor( v128_xor( v128_sr32( (x), 1), \
+                       v128_sl32( (x), 2) ), \
+             v128_xor( v128_rol32( (x), 8), \
+                       v128_rol32( (x), 23) ) )
 
 #define ss2(x) \
-   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
-                                 _mm_slli_epi32( (x), 1) ), \
-                  _mm_xor_si128( mm128_rol_32( (x), 12), \
-                                 mm128_rol_32( (x), 25) ) )
+   v128_xor( v128_xor( v128_sr32( (x), 2), \
+                       v128_sl32( (x), 1) ), \
+             v128_xor( v128_rol32( (x), 12), \
+                       v128_rol32( (x), 25) ) )
 
 #define ss3(x) \
-   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
-                                 _mm_slli_epi32( (x), 2) ), \
-                  _mm_xor_si128( mm128_rol_32( (x), 15), \
-                                 mm128_rol_32( (x), 29) ) )
+   v128_xor( v128_xor( v128_sr32( (x), 2), \
+                       v128_sl32( (x), 2) ), \
+             v128_xor( v128_rol32( (x), 15), \
+                       v128_rol32( (x), 29) ) )
 
 #define ss4(x) \
-   _mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
+   v128_xor( (x), v128_sr32( (x), 1 ) )
 
 #define ss5(x) \
-   _mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
+   v128_xor( (x), v128_sr32( (x), 2 ) )
 
-#define rs1(x) mm128_rol_32( x, 3 )
-#define rs2(x) mm128_rol_32( x, 7 )
-#define rs3(x) mm128_rol_32( x, 13 )
-#define rs4(x) mm128_rol_32( x, 16 )
-#define rs5(x) mm128_rol_32( x, 19 )
-#define rs6(x) mm128_rol_32( x, 23 )
-#define rs7(x) mm128_rol_32( x, 27 )
+#define rs1(x) v128_rol32( x, 3 )
+#define rs2(x) v128_rol32( x, 7 )
+#define rs3(x) v128_rol32( x, 13 )
+#define rs4(x) v128_rol32( x, 16 )
+#define rs5(x) v128_rol32( x, 19 )
+#define rs6(x) v128_rol32( x, 23 )
+#define rs7(x) v128_rol32( x, 27 )
 
 #define rol_off_32( M, j, off ) \
-   mm128_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
+   v128_rol32( M[ ( (j) + (off) ) & 0xF ] , \
               ( ( (j) + (off) ) & 0xF ) + 1 )
 
 #define add_elt_s( M, H, j ) \
-   _mm_xor_si128( \
-      _mm_add_epi32( \
-         _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
+   v128_xor( \
+      v128_add32( \
+         v128_sub32( v128_add32( rol_off_32( M, j, 0 ), \
                                  rol_off_32( M, j, 3 ) ), \
                      rol_off_32( M, j, 10 ) ), \
-         _mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \
+         v128_32( ( (j)+16 ) * 0x05555555UL ) ), \
       H[ ( (j)+7 ) & 0xF ] )
 
 
 #define expand1s( qt, M, H, i ) \
-   _mm_add_epi32( mm128_add4_32( \
-      mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
+   v128_add32( v128_add4_32( \
+      v128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
                     ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \
-      mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
+      v128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
                     ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \
-      mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
+      v128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
                     ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ), \
-      mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
+      v128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
                     ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \
      add_elt_s( M, H, (i)-16 ) )
 
 #define expand2s( qt, M, H, i) \
-   _mm_add_epi32( mm128_add4_32( \
-      mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
+   v128_add32( v128_add4_32( \
+      v128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
                     qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \
-      mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
+      v128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
                     qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \
-      mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
+      v128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
                     qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \
-      mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
+      v128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
                     ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
      add_elt_s( M, H, (i)-16 ) )
 
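Note: the v128_* names abstract SSE2 and NEON behind one API. SSE2 has no native 32-bit vector rotate, so v128_rol32 presumably expands to the usual shift-or pair (assumed x86 shapes shown here; NEON builds map these names to native intrinsics instead, and the real definitions are in simd-utils):

#include <emmintrin.h>

// Assumed SSE2 fallbacks, for illustration only.
#define v128_sl32( v, c )  _mm_slli_epi32( v, c )
#define v128_sr32( v, c )  _mm_srli_epi32( v, c )
#define v128_rol32_sketch( v, c ) \
   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32 - (c) ) )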
@@ -141,169 +141,169 @@ static const uint32_t IV256[] = {
|
|||||||
// resulting in some sign changes compared to the reference code.
|
// resulting in some sign changes compared to the reference code.
|
||||||
|
|
||||||
#define Ws0 \
|
#define Ws0 \
|
||||||
_mm_add_epi32( \
|
v128_add32( \
|
||||||
_mm_add_epi32( \
|
v128_add32( \
|
||||||
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
|
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
|
||||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
v128_xor( M[ 7], H[ 7] ) ), \
|
||||||
_mm_xor_si128( M[10], H[10] ) ), \
|
v128_xor( M[10], H[10] ) ), \
|
||||||
_mm_add_epi32( _mm_xor_si128( M[13], H[13] ), \
|
v128_add32( v128_xor( M[13], H[13] ), \
|
||||||
_mm_xor_si128( M[14], H[14] ) ) )
|
v128_xor( M[14], H[14] ) ) )
|
||||||
|
|
||||||
#define Ws1 \
|
#define Ws1 \
|
||||||
_mm_add_epi32( \
|
v128_add32( \
|
||||||
_mm_add_epi32( \
|
v128_add32( \
|
||||||
_mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
|
v128_sub32( v128_xor( M[ 6], H[ 6] ), \
|
||||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
v128_xor( M[ 8], H[ 8] ) ), \
|
||||||
_mm_xor_si128( M[11], H[11] ) ), \
|
v128_xor( M[11], H[11] ) ), \
|
||||||
_mm_sub_epi32( _mm_xor_si128( M[14], H[14] ), \
|
v128_sub32( v128_xor( M[14], H[14] ), \
|
||||||
_mm_xor_si128( M[15], H[15] ) ) )
|
v128_xor( M[15], H[15] ) ) )
|
||||||
|
|
||||||
#define Ws2 \
|
#define Ws2 \
|
||||||
_mm_sub_epi32( \
|
v128_sub32( \
|
||||||
_mm_add_epi32( \
|
v128_add32( \
|
||||||
_mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
v128_add32( v128_xor( M[ 0], H[ 0] ), \
|
||||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
v128_xor( M[ 7], H[ 7] ) ), \
|
||||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
v128_xor( M[ 9], H[ 9] ) ), \
|
||||||
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
|
v128_sub32( v128_xor( M[12], H[12] ), \
|
||||||
_mm_xor_si128( M[15], H[15] ) ) )
|
v128_xor( M[15], H[15] ) ) )
|
||||||
|
|
||||||
#define Ws3 \
|
#define Ws3 \
|
||||||
_mm_sub_epi32( \
|
v128_sub32( \
|
||||||
_mm_add_epi32( \
|
v128_add32( \
|
||||||
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
|
||||||
_mm_xor_si128( M[ 1], H[ 1] ) ), \
|
v128_xor( M[ 1], H[ 1] ) ), \
|
||||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
v128_xor( M[ 8], H[ 8] ) ), \
|
||||||
_mm_sub_epi32( _mm_xor_si128( M[10], H[10] ), \
|
v128_sub32( v128_xor( M[10], H[10] ), \
|
||||||
_mm_xor_si128( M[13], H[13] ) ) )
|
v128_xor( M[13], H[13] ) ) )
|
||||||
|
|
||||||
#define Ws4 \
|
#define Ws4 \
|
||||||
_mm_sub_epi32( \
|
v128_sub32( \
|
||||||
_mm_add_epi32( \
|
v128_add32( \
|
||||||
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
|
v128_add32( v128_xor( M[ 1], H[ 1] ), \
|
||||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
v128_xor( M[ 2], H[ 2] ) ), \
|
||||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
v128_xor( M[ 9], H[ 9] ) ), \
|
||||||
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
|
+      v128_add32( v128_xor( M[11], H[11] ), \
-      _mm_xor_si128( M[14], H[14] ) ) )
+      v128_xor( M[14], H[14] ) ) )
 
 #define Ws5 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_add_epi32( \
+      v128_add32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
+         v128_sub32( v128_xor( M[ 3], H[ 3] ), \
-                        _mm_xor_si128( M[ 2], H[ 2] ) ), \
+                     v128_xor( M[ 2], H[ 2] ) ), \
-         _mm_xor_si128( M[10], H[10] ) ), \
+         v128_xor( M[10], H[10] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
+      v128_sub32( v128_xor( M[12], H[12] ), \
-                     _mm_xor_si128( M[15], H[15] ) ) )
+                  v128_xor( M[15], H[15] ) ) )
 
 #define Ws6 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_sub_epi32( \
+      v128_sub32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
+         v128_sub32( v128_xor( M[ 4], H[ 4] ), \
-                        _mm_xor_si128( M[ 0], H[ 0] ) ), \
+                     v128_xor( M[ 0], H[ 0] ) ), \
-         _mm_xor_si128( M[ 3], H[ 3] ) ), \
+         v128_xor( M[ 3], H[ 3] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[11], H[11] ), \
+      v128_sub32( v128_xor( M[11], H[11] ), \
-                     _mm_xor_si128( M[13], H[13] ) ) )
+                  v128_xor( M[13], H[13] ) ) )
 
 #define Ws7 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_sub_epi32( \
+      v128_sub32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
+         v128_sub32( v128_xor( M[ 1], H[ 1] ), \
-                        _mm_xor_si128( M[ 4], H[ 4] ) ), \
+                     v128_xor( M[ 4], H[ 4] ) ), \
-         _mm_xor_si128( M[ 5], H[ 5] ) ), \
+         v128_xor( M[ 5], H[ 5] ) ), \
-      _mm_add_epi32( _mm_xor_si128( M[12], H[12] ), \
+      v128_add32( v128_xor( M[12], H[12] ), \
-                     _mm_xor_si128( M[14], H[14] ) ) )
+                  v128_xor( M[14], H[14] ) ) )
 
 #define Ws8 \
-   _mm_add_epi32( \
+   v128_add32( \
-      _mm_sub_epi32( \
+      v128_sub32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
+         v128_sub32( v128_xor( M[ 2], H[ 2] ), \
-                        _mm_xor_si128( M[ 5], H[ 5] ) ), \
+                     v128_xor( M[ 5], H[ 5] ) ), \
-         _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         v128_xor( M[ 6], H[ 6] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[13], H[13] ), \
+      v128_sub32( v128_xor( M[13], H[13] ), \
-                     _mm_xor_si128( M[15], H[15] ) ) )
+                  v128_xor( M[15], H[15] ) ) )
 #define Ws9 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_add_epi32( \
+      v128_add32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
+         v128_sub32( v128_xor( M[ 0], H[ 0] ), \
-                        _mm_xor_si128( M[ 3], H[ 3] ) ), \
+                     v128_xor( M[ 3], H[ 3] ) ), \
-         _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         v128_xor( M[ 6], H[ 6] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
+      v128_sub32( v128_xor( M[ 7], H[ 7] ), \
-                     _mm_xor_si128( M[14], H[14] ) ) )
+                  v128_xor( M[14], H[14] ) ) )
 
 #define Ws10 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_sub_epi32( \
+      v128_sub32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
+         v128_sub32( v128_xor( M[ 8], H[ 8] ), \
-                        _mm_xor_si128( M[ 1], H[ 1] ) ), \
+                     v128_xor( M[ 1], H[ 1] ) ), \
-         _mm_xor_si128( M[ 4], H[ 4] ) ), \
+         v128_xor( M[ 4], H[ 4] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
+      v128_sub32( v128_xor( M[ 7], H[ 7] ), \
-                     _mm_xor_si128( M[15], H[15] ) ) )
+                  v128_xor( M[15], H[15] ) ) )
 
 #define Ws11 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_sub_epi32( \
+      v128_sub32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
+         v128_sub32( v128_xor( M[ 8], H[ 8] ), \
-                        _mm_xor_si128( M[ 0], H[ 0] ) ), \
+                     v128_xor( M[ 0], H[ 0] ) ), \
-         _mm_xor_si128( M[ 2], H[ 2] ) ), \
+         v128_xor( M[ 2], H[ 2] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
+      v128_sub32( v128_xor( M[ 5], H[ 5] ), \
-                     _mm_xor_si128( M[ 9], H[ 9] ) ) )
+                  v128_xor( M[ 9], H[ 9] ) ) )
 
 #define Ws12 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_sub_epi32( \
+      v128_sub32( \
-         _mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
+         v128_add32( v128_xor( M[ 1], H[ 1] ), \
-                        _mm_xor_si128( M[ 3], H[ 3] ) ), \
+                     v128_xor( M[ 3], H[ 3] ) ), \
-         _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         v128_xor( M[ 6], H[ 6] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
+      v128_sub32( v128_xor( M[ 9], H[ 9] ), \
-                     _mm_xor_si128( M[10], H[10] ) ) )
+                  v128_xor( M[10], H[10] ) ) )
 
 #define Ws13 \
-   _mm_add_epi32( \
+   v128_add32( \
-      _mm_add_epi32( \
+      v128_add32( \
-         _mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
+         v128_add32( v128_xor( M[ 2], H[ 2] ), \
-                        _mm_xor_si128( M[ 4], H[ 4] ) ), \
+                     v128_xor( M[ 4], H[ 4] ) ), \
-         _mm_xor_si128( M[ 7], H[ 7] ) ), \
+         v128_xor( M[ 7], H[ 7] ) ), \
-      _mm_add_epi32( _mm_xor_si128( M[10], H[10] ), \
+      v128_add32( v128_xor( M[10], H[10] ), \
-                     _mm_xor_si128( M[11], H[11] ) ) )
+                  v128_xor( M[11], H[11] ) ) )
 
 #define Ws14 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_add_epi32( \
+      v128_add32( \
-         _mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
+         v128_sub32( v128_xor( M[ 3], H[ 3] ), \
-                        _mm_xor_si128( M[ 5], H[ 5] ) ), \
+                     v128_xor( M[ 5], H[ 5] ) ), \
-         _mm_xor_si128( M[ 8], H[ 8] ) ), \
+         v128_xor( M[ 8], H[ 8] ) ), \
-      _mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
+      v128_add32( v128_xor( M[11], H[11] ), \
-                     _mm_xor_si128( M[12], H[12] ) ) )
+                  v128_xor( M[12], H[12] ) ) )
 
 #define Ws15 \
-   _mm_sub_epi32( \
+   v128_sub32( \
-      _mm_sub_epi32( \
+      v128_sub32( \
-         _mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
+         v128_sub32( v128_xor( M[12], H[12] ), \
-                        _mm_xor_si128( M[ 4], H[4] ) ), \
+                     v128_xor( M[ 4], H[4] ) ), \
-         _mm_xor_si128( M[ 6], H[ 6] ) ), \
+         v128_xor( M[ 6], H[ 6] ) ), \
-      _mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
+      v128_sub32( v128_xor( M[ 9], H[ 9] ), \
-                     _mm_xor_si128( M[13], H[13] ) ) )
+                  v128_xor( M[13], H[13] ) ) )
 
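Note: the Ws0-Ws15 macros above are the BMW-256 message pre-expansion, ported from raw SSE2 intrinsics to the repo's portable v128 wrappers so the same source builds for x86 SSE2 and ARM NEON. The real definitions live in this repo's simd-utils.h; the sketch below only illustrates the kind of one-line mapping involved, and every name in it is an assumption for illustration, not the project's actual header.

/* Illustrative sketch only -- not the repo's simd-utils.h. It shows how a
   portable v128 layer can wrap the three ops the Ws macros use. */
#if defined(__SSE2__)
  #include <emmintrin.h>
  typedef __m128i v128_sketch_t;
  #define v128_add32_sketch( a, b )  _mm_add_epi32( a, b )   // 4x32-bit add
  #define v128_sub32_sketch( a, b )  _mm_sub_epi32( a, b )   // 4x32-bit subtract
  #define v128_xor_sketch( a, b )    _mm_xor_si128( a, b )   // bitwise xor
#elif defined(__ARM_NEON)
  #include <arm_neon.h>
  typedef uint32x4_t v128_sketch_t;
  #define v128_add32_sketch( a, b )  vaddq_u32( a, b )
  #define v128_sub32_sketch( a, b )  vsubq_u32( a, b )
  #define v128_xor_sketch( a, b )    veorq_u32( a, b )
#endif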
-void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
+void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
 {
-   __m128i qt[32], xl, xh; \
+   v128u64_t qt[32], xl, xh; \
 
-   qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
+   qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
-   qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
+   qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
-   qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
+   qt[ 2] = v128_add32( ss2( Ws2 ), H[ 3] );
-   qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
+   qt[ 3] = v128_add32( ss3( Ws3 ), H[ 4] );
-   qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
+   qt[ 4] = v128_add32( ss4( Ws4 ), H[ 5] );
-   qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
+   qt[ 5] = v128_add32( ss0( Ws5 ), H[ 6] );
-   qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
+   qt[ 6] = v128_add32( ss1( Ws6 ), H[ 7] );
-   qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
+   qt[ 7] = v128_add32( ss2( Ws7 ), H[ 8] );
-   qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
+   qt[ 8] = v128_add32( ss3( Ws8 ), H[ 9] );
-   qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
+   qt[ 9] = v128_add32( ss4( Ws9 ), H[10] );
-   qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
+   qt[10] = v128_add32( ss0( Ws10), H[11] );
-   qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
+   qt[11] = v128_add32( ss1( Ws11), H[12] );
-   qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
+   qt[12] = v128_add32( ss2( Ws12), H[13] );
-   qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
+   qt[13] = v128_add32( ss3( Ws13), H[14] );
-   qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
+   qt[14] = v128_add32( ss4( Ws14), H[15] );
-   qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
+   qt[15] = v128_add32( ss0( Ws15), H[ 0] );
    qt[16] = expand1s( qt, M, H, 16 );
    qt[17] = expand1s( qt, M, H, 17 );
    qt[18] = expand2s( qt, M, H, 18 );
@@ -321,92 +321,92 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
    qt[30] = expand2s( qt, M, H, 30 );
    qt[31] = expand2s( qt, M, H, 31 );
 
-   xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ),
+   xl = v128_xor( v128_xor4( qt[16], qt[17], qt[18], qt[19] ),
-                       mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
+                  v128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
-   xh = _mm_xor_si128( xl, _mm_xor_si128(
+   xh = v128_xor( xl, v128_xor(
-                       mm128_xor4( qt[24], qt[25], qt[26], qt[27] ),
+                  v128_xor4( qt[24], qt[25], qt[26], qt[27] ),
-                       mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
+                  v128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
 
-   dH[ 0] = _mm_add_epi32(
+   dH[ 0] = v128_add32(
-                 _mm_xor_si128( M[0],
+                 v128_xor( M[0],
-                      _mm_xor_si128( _mm_slli_epi32( xh, 5 ),
+                      v128_xor( v128_sl32( xh, 5 ),
-                           _mm_srli_epi32( qt[16], 5 ) ) ),
+                           v128_sr32( qt[16], 5 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ));
+                 v128_xor( v128_xor( xl, qt[24] ), qt[ 0] ));
-   dH[ 1] = _mm_add_epi32(
+   dH[ 1] = v128_add32(
-                 _mm_xor_si128( M[1],
+                 v128_xor( M[1],
-                      _mm_xor_si128( _mm_srli_epi32( xh, 7 ),
+                      v128_xor( v128_sr32( xh, 7 ),
-                           _mm_slli_epi32( qt[17], 8 ) ) ),
+                           v128_sl32( qt[17], 8 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ));
+                 v128_xor( v128_xor( xl, qt[25] ), qt[ 1] ));
-   dH[ 2] = _mm_add_epi32(
+   dH[ 2] = v128_add32(
-                 _mm_xor_si128( M[2],
+                 v128_xor( M[2],
-                      _mm_xor_si128( _mm_srli_epi32( xh, 5 ),
+                      v128_xor( v128_sr32( xh, 5 ),
-                           _mm_slli_epi32( qt[18], 5 ) ) ),
+                           v128_sl32( qt[18], 5 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ));
+                 v128_xor( v128_xor( xl, qt[26] ), qt[ 2] ));
-   dH[ 3] = _mm_add_epi32(
+   dH[ 3] = v128_add32(
-                 _mm_xor_si128( M[3],
+                 v128_xor( M[3],
-                      _mm_xor_si128( _mm_srli_epi32( xh, 1 ),
+                      v128_xor( v128_sr32( xh, 1 ),
-                           _mm_slli_epi32( qt[19], 5 ) ) ),
+                           v128_sl32( qt[19], 5 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ));
+                 v128_xor( v128_xor( xl, qt[27] ), qt[ 3] ));
-   dH[ 4] = _mm_add_epi32(
+   dH[ 4] = v128_add32(
-                 _mm_xor_si128( M[4],
+                 v128_xor( M[4],
-                      _mm_xor_si128( _mm_srli_epi32( xh, 3 ),
+                      v128_xor( v128_sr32( xh, 3 ),
-                           _mm_slli_epi32( qt[20], 0 ) ) ),
+                           v128_sl32( qt[20], 0 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ));
+                 v128_xor( v128_xor( xl, qt[28] ), qt[ 4] ));
-   dH[ 5] = _mm_add_epi32(
+   dH[ 5] = v128_add32(
-                 _mm_xor_si128( M[5],
+                 v128_xor( M[5],
-                      _mm_xor_si128( _mm_slli_epi32( xh, 6 ),
+                      v128_xor( v128_sl32( xh, 6 ),
-                           _mm_srli_epi32( qt[21], 6 ) ) ),
+                           v128_sr32( qt[21], 6 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ));
+                 v128_xor( v128_xor( xl, qt[29] ), qt[ 5] ));
-   dH[ 6] = _mm_add_epi32(
+   dH[ 6] = v128_add32(
-                 _mm_xor_si128( M[6],
+                 v128_xor( M[6],
-                      _mm_xor_si128( _mm_srli_epi32( xh, 4 ),
+                      v128_xor( v128_sr32( xh, 4 ),
-                           _mm_slli_epi32( qt[22], 6 ) ) ),
+                           v128_sl32( qt[22], 6 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ));
+                 v128_xor( v128_xor( xl, qt[30] ), qt[ 6] ));
-   dH[ 7] = _mm_add_epi32(
+   dH[ 7] = v128_add32(
-                 _mm_xor_si128( M[7],
+                 v128_xor( M[7],
-                      _mm_xor_si128( _mm_srli_epi32( xh, 11 ),
+                      v128_xor( v128_sr32( xh, 11 ),
-                           _mm_slli_epi32( qt[23], 2 ) ) ),
+                           v128_sl32( qt[23], 2 ) ) ),
-                 _mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
+                 v128_xor( v128_xor( xl, qt[31] ), qt[ 7] ));
-   dH[ 8] = _mm_add_epi32( _mm_add_epi32(
+   dH[ 8] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[4], 9 ),
+                 v128_rol32( dH[4], 9 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
+                 v128_xor( v128_xor( xh, qt[24] ), M[ 8] )),
-                 _mm_xor_si128( _mm_slli_epi32( xl, 8 ),
+                 v128_xor( v128_sl32( xl, 8 ),
-                 _mm_xor_si128( qt[23], qt[ 8] ) ) );
+                 v128_xor( qt[23], qt[ 8] ) ) );
-   dH[ 9] = _mm_add_epi32( _mm_add_epi32(
+   dH[ 9] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[5], 10 ),
+                 v128_rol32( dH[5], 10 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
+                 v128_xor( v128_xor( xh, qt[25] ), M[ 9] )),
-                 _mm_xor_si128( _mm_srli_epi32( xl, 6 ),
+                 v128_xor( v128_sr32( xl, 6 ),
-                 _mm_xor_si128( qt[16], qt[ 9] ) ) );
+                 v128_xor( qt[16], qt[ 9] ) ) );
-   dH[10] = _mm_add_epi32( _mm_add_epi32(
+   dH[10] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[6], 11 ),
+                 v128_rol32( dH[6], 11 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
+                 v128_xor( v128_xor( xh, qt[26] ), M[10] )),
-                 _mm_xor_si128( _mm_slli_epi32( xl, 6 ),
+                 v128_xor( v128_sl32( xl, 6 ),
-                 _mm_xor_si128( qt[17], qt[10] ) ) );
+                 v128_xor( qt[17], qt[10] ) ) );
-   dH[11] = _mm_add_epi32( _mm_add_epi32(
+   dH[11] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[7], 12 ),
+                 v128_rol32( dH[7], 12 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
+                 v128_xor( v128_xor( xh, qt[27] ), M[11] )),
-                 _mm_xor_si128( _mm_slli_epi32( xl, 4 ),
+                 v128_xor( v128_sl32( xl, 4 ),
-                 _mm_xor_si128( qt[18], qt[11] ) ) );
+                 v128_xor( qt[18], qt[11] ) ) );
-   dH[12] = _mm_add_epi32( _mm_add_epi32(
+   dH[12] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[0], 13 ),
+                 v128_rol32( dH[0], 13 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
+                 v128_xor( v128_xor( xh, qt[28] ), M[12] )),
-                 _mm_xor_si128( _mm_srli_epi32( xl, 3 ),
+                 v128_xor( v128_sr32( xl, 3 ),
-                 _mm_xor_si128( qt[19], qt[12] ) ) );
+                 v128_xor( qt[19], qt[12] ) ) );
-   dH[13] = _mm_add_epi32( _mm_add_epi32(
+   dH[13] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[1], 14 ),
+                 v128_rol32( dH[1], 14 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
+                 v128_xor( v128_xor( xh, qt[29] ), M[13] )),
-                 _mm_xor_si128( _mm_srli_epi32( xl, 4 ),
+                 v128_xor( v128_sr32( xl, 4 ),
-                 _mm_xor_si128( qt[20], qt[13] ) ) );
+                 v128_xor( qt[20], qt[13] ) ) );
-   dH[14] = _mm_add_epi32( _mm_add_epi32(
+   dH[14] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[2], 15 ),
+                 v128_rol32( dH[2], 15 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
+                 v128_xor( v128_xor( xh, qt[30] ), M[14] )),
-                 _mm_xor_si128( _mm_srli_epi32( xl, 7 ),
+                 v128_xor( v128_sr32( xl, 7 ),
-                 _mm_xor_si128( qt[21], qt[14] ) ) );
+                 v128_xor( qt[21], qt[14] ) ) );
-   dH[15] = _mm_add_epi32( _mm_add_epi32(
+   dH[15] = v128_add32( v128_add32(
-                 mm128_rol_32( dH[3], 16 ),
+                 v128_rol32( dH[3], 16 ),
-                 _mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
+                 v128_xor( v128_xor( xh, qt[31] ), M[15] )),
-                 _mm_xor_si128( _mm_srli_epi32( xl, 2 ),
+                 v128_xor( v128_sr32( xl, 2 ),
-                 _mm_xor_si128( qt[22], qt[15] ) ) );
+                 v128_xor( qt[22], qt[15] ) ) );
 }
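Note: the output feedback in dH[8]..dH[15] relies on 32-bit lane rotates (mm128_rol_32 before, v128_rol32 after). Neither SSE2 nor NEON has a vector rotate instruction, so a portable layer typically synthesises one from two shifts and an OR. A minimal NEON sketch under that assumption (names are illustrative, not the repo's):

#if defined(__ARM_NEON)
#include <arm_neon.h>
/* Sketch of a NEON 32-bit lane rotate; c must be a compile-time constant
   in 1..31 because vshlq_n/vshrq_n require immediate shift counts. */
#define v128_rol32_sketch( x, c ) \
    vorrq_u32( vshlq_n_u32( (x), (c) ), vshrq_n_u32( (x), 32 - (c) ) )
#endif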
 
 static const uint32_t final_s[16][4] =
@@ -429,7 +429,7 @@ static const uint32_t final_s[16][4] =
    { 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
 };
 /*
-static const __m128i final_s[16] =
+static const v128u64_t final_s[16] =
 {
    { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
    { 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
@@ -451,26 +451,26 @@ static const __m128i final_s[16] =
 */
 void bmw256_4way_init( bmw256_4way_context *ctx )
 {
-   ctx->H[ 0] = _mm_set1_epi64x( 0x4041424340414243 );
+   ctx->H[ 0] = v128_64( 0x4041424340414243 );
-   ctx->H[ 1] = _mm_set1_epi64x( 0x4445464744454647 );
+   ctx->H[ 1] = v128_64( 0x4445464744454647 );
-   ctx->H[ 2] = _mm_set1_epi64x( 0x48494A4B48494A4B );
+   ctx->H[ 2] = v128_64( 0x48494A4B48494A4B );
-   ctx->H[ 3] = _mm_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
+   ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F );
-   ctx->H[ 4] = _mm_set1_epi64x( 0x5051525350515253 );
+   ctx->H[ 4] = v128_64( 0x5051525350515253 );
-   ctx->H[ 5] = _mm_set1_epi64x( 0x5455565754555657 );
+   ctx->H[ 5] = v128_64( 0x5455565754555657 );
-   ctx->H[ 6] = _mm_set1_epi64x( 0x58595A5B58595A5B );
+   ctx->H[ 6] = v128_64( 0x58595A5B58595A5B );
-   ctx->H[ 7] = _mm_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
+   ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F );
-   ctx->H[ 8] = _mm_set1_epi64x( 0x6061626360616263 );
+   ctx->H[ 8] = v128_64( 0x6061626360616263 );
-   ctx->H[ 9] = _mm_set1_epi64x( 0x6465666764656667 );
+   ctx->H[ 9] = v128_64( 0x6465666764656667 );
-   ctx->H[10] = _mm_set1_epi64x( 0x68696A6B68696A6B );
+   ctx->H[10] = v128_64( 0x68696A6B68696A6B );
-   ctx->H[11] = _mm_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
+   ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F );
-   ctx->H[12] = _mm_set1_epi64x( 0x7071727370717273 );
+   ctx->H[12] = v128_64( 0x7071727370717273 );
-   ctx->H[13] = _mm_set1_epi64x( 0x7475767774757677 );
+   ctx->H[13] = v128_64( 0x7475767774757677 );
-   ctx->H[14] = _mm_set1_epi64x( 0x78797A7B78797A7B );
+   ctx->H[14] = v128_64( 0x78797A7B78797A7B );
-   ctx->H[15] = _mm_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
+   ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F );
 
 
    // for ( int i = 0; i < 16; i++ )
-   //    sc->H[i] = _mm_set1_epi32( iv[i] );
+   //    sc->H[i] = v128_32( iv[i] );
    ctx->ptr = 0;
    ctx->bit_count = 0;
 }
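Note: v128_64 broadcasts one 64-bit constant (two packed 32-bit IV words) across the vector, so all four lanes start from the same IV. A plausible pair of definitions, assumed for illustration (the real one is in simd-utils.h):

#if defined(__SSE2__)
  #include <emmintrin.h>
  #define v128_64_sketch( x )  _mm_set1_epi64x( x )   // both 64-bit halves = x
#elif defined(__ARM_NEON)
  #include <arm_neon.h>
  #define v128_64_sketch( x )  vreinterpretq_u32_u64( vdupq_n_u64( x ) )
#endif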
@@ -478,10 +478,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
 static void
 bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
 {
-   __m128i *vdata = (__m128i*)data;
+   v128u64_t *vdata = (v128u64_t*)data;
-   __m128i *buf;
+   v128u64_t *buf;
-   __m128i htmp[16];
+   v128u64_t htmp[16];
-   __m128i *h1, *h2;
+   v128u64_t *h1, *h2;
    size_t ptr;
    const int buf_size = 64;  // bytes of one lane, compatible with len
 
@@ -497,13 +497,13 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
       clen = buf_size - ptr;
       if ( clen > len )
          clen = len;
-      memcpy_128( buf + (ptr>>2), vdata, clen >> 2 );
+      v128_memcpy( buf + (ptr>>2), vdata, clen >> 2 );
       vdata += ( clen >> 2 );
       len -= clen;
       ptr += clen;
       if ( ptr == buf_size )
       {
-         __m128i *ht;
+         v128u64_t *ht;
         compress_small( buf, h1, h2 );
         ht = h1;
         h1 = h2;
@@ -513,46 +513,45 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
      }
    sc->ptr = ptr;
 
 
    if ( h1 != sc->H )
-        memcpy_128( sc->H, h1, 16 );
+        v128_memcpy( sc->H, h1, 16 );
 }
 
 static void
 bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
                  void *dst, size_t out_size_w32)
 {
-   __m128i *buf;
+   v128u64_t *buf;
-   __m128i h1[16], h2[16], *h;
+   v128u64_t h1[16], h2[16], *h;
    size_t ptr, u, v;
    const int buf_size = 64;  // bytes of one lane, compatible with len
 
    buf = sc->buf;
    ptr = sc->ptr;
-   buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
+   buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
    ptr += 4;
    h = sc->H;
 
    // assume bit_count fits in 32 bits
    if ( ptr > buf_size - 4 )
    {
-      memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 );
+      v128_memset_zero( buf + (ptr>>2), (buf_size - ptr) >> 2 );
       compress_small( buf, h, h1 );
       ptr = 0;
       h = h1;
    }
-   memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
+   v128_memset_zero( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
-   buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
+   buf[ (buf_size - 8) >> 2 ] = v128_32( sc->bit_count + n );
-   buf[ (buf_size - 4) >> 2 ] = m128_zero;
+   buf[ (buf_size - 4) >> 2 ] = v128_zero;
    compress_small( buf, h, h2 );
 
    for ( u = 0; u < 16; u ++ )
       buf[u] = h2[u];
 
-   compress_small( buf, (__m128i*)final_s, h1 );
+   compress_small( buf, (v128u64_t*)final_s, h1 );
 
    for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
-      casti_m128i( dst, u ) = h1[v];
+      casti_v128( dst, u ) = h1[v];
 }
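Note: the close routine pads each 64-byte lane the usual Merkle-Damgard way: a 0x80 marker byte, zero fill, then the message length in bits in the last eight bytes, followed by one extra compression against the 0xaaaaaaa0.. final_s constant block. A scalar model of one lane's padding, for illustration only (bmw_pad_lane is not a repo function; assumes ptr <= 55):

#include <stdint.h>
#include <string.h>

static void bmw_pad_lane( uint8_t buf[64], size_t ptr, uint64_t bit_count )
{
    buf[ptr++] = 0x80;                   // padding marker
    memset( buf + ptr, 0, 56 - ptr );    // zero fill up to the length field
    memcpy( buf + 56, &bit_count, 8 );   // bit length, little-endian
}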
 
 /*
@@ -1058,7 +1057,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
 
 #endif // __AVX2__
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // BMW-256 16 way 32
 
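Note: SIMD512 consolidates the four-flag AVX-512 feature test that used to be repeated at every guard; the change is purely cosmetic. Its definition is presumably a one-time version of the same test, along these lines (the exact location in simd-utils.h is an assumption):

/* Sketch of the consolidated feature test. */
#if defined(__AVX512F__) && defined(__AVX512VL__) && \
    defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define SIMD512 1
#endif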
@@ -2,12 +2,11 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
-//#include "sph_keccak.h"
 #include "bmw-hash-4way.h"
 
 #if defined(BMW512_8WAY)
 
-void bmw512hash_8way(void *state, const void *input)
+void bmw512hash_8way( void *state, const void *input )
 {
    bmw512_8way_context ctx;
    bmw512_8way_init( &ctx );
@@ -27,9 +26,9 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    const uint32_t last_nonce = max_nonce - 8;
-   __m512i *noncev = (__m512i*)vdata + 9;   // aligned
+   __m512i *noncev = (__m512i*)vdata + 9;
    const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;
+   const int thr_id = mythr->id;
 
    mm512_bswap32_intrlv80_8x64( vdata, pdata );
    do {
@@ -43,7 +42,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
    if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
    {
       extr_lane_8x64( lane_hash, hash, lane, 256 );
-      if ( fulltest( lane_hash, ptarget ) )
+      if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
      {
         pdata[19] = n + lane;
         submit_solution( work, lane_hash, mythr );
@@ -59,9 +58,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
 
 #elif defined(BMW512_4WAY)
 
-//#ifdef BMW512_4WAY
+void bmw512hash_4way( void *state, const void *input )
 
-void bmw512hash_4way(void *state, const void *input)
 {
    bmw512_4way_context ctx;
    bmw512_4way_init( &ctx );
@@ -80,10 +77,10 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    const uint32_t last_nonce = max_nonce - 4;
-   __m256i *noncev = (__m256i*)vdata + 9;   // aligned
+   __m256i *noncev = (__m256i*)vdata + 9;
    const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;
 
    mm256_bswap32_intrlv80_4x64( vdata, pdata );
    do {
@@ -96,7 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
    if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
    {
       extr_lane_4x64( lane_hash, hash, lane, 256 );
-      if ( fulltest( lane_hash, ptarget ) )
+      if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
      {
        pdata[19] = n + lane;
        submit_solution( work, lane_hash, mythr );
@@ -110,4 +107,55 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
    return 0;
 }
 
+#elif defined(BMW512_2WAY)
+
+void bmw512hash_2x64( void *state, const void *input )
+{
+   bmw512_2x64_context ctx;
+   bmw512_2x64_init( &ctx );
+   bmw512_2x64_update( &ctx, input, 80 );
+   bmw512_2x64_close( &ctx, state );
+}
+
+int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vdata[24*2] __attribute__ ((aligned (64)));
+   uint32_t hash[16*2] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[13]);   // 3*4+1
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 2;
+   v128_t *noncev = (v128_t*)vdata + 9;
+   const uint32_t Htarg = ptarget[7];
+   const int thr_id = mythr->id;
+
+   v128_bswap32_intrlv80_2x64( vdata, pdata );
+   do {
+      *noncev = v128_intrlv_blend_32( v128_bswap32(
+                                v128_set32( n+1, 0, n, 0 ) ), *noncev );
+
+      bmw512hash_2x64( hash, vdata );
+
+      for ( int lane = 0; lane < 2; lane++ )
+      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
+      {
+         extr_lane_2x64( lane_hash, hash, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ))
+         {
+            pdata[19] = n + lane;
+            submit_solution( work, lane_hash, mythr );
+         }
+      }
+      n += 2;
+
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
 #endif
 
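Note: in the new 2x64 path the two lanes' 80-byte block headers are interleaved 64 bits at a time, so vector row 9 holds header words 18 and 19 of each lane, and word 19 is the nonce. The v128_set32 / v128_bswap32 / v128_intrlv_blend_32 sequence above therefore rewrites only the two nonce words in place each iteration. A plausible SSE4.1 shape for the blend, assumed for illustration (the repo's own definition may differ):

#if defined(__SSE4_1__)
#include <smmintrin.h>
/* Take the odd 32-bit words from a and the even words from b: imm 0x33
   selects 16-bit lanes 0,1 and 4,5 (i.e. 32-bit words 0 and 2) from b. */
#define v128_intrlv_blend_32_sketch( a, b )  _mm_blend_epi16( a, b, 0x33 )
#endif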
@@ -2,7 +2,7 @@
 
 bool register_bmw512_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
   opt_target_factor = 256.0;
 #if defined (BMW512_8WAY)
   gate->scanhash = (void*)&scanhash_bmw512_8way;
@@ -10,6 +10,9 @@ bool register_bmw512_algo( algo_gate_t* gate )
 #elif defined (BMW512_4WAY)
   gate->scanhash = (void*)&scanhash_bmw512_4way;
   gate->hash = (void*)&bmw512hash_4way;
+#elif defined (BMW512_2WAY)
+  gate->scanhash = (void*)&scanhash_bmw512_2x64;
+  gate->hash = (void*)&bmw512hash_2x64;
 #else
   gate->scanhash = (void*)&scanhash_bmw512;
   gate->hash = (void*)&bmw512hash;
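Note: register_bmw512_algo fills in an algo_gate_t, the per-algorithm dispatch table the miner core calls through; the compile-time WAY macros decide which scanhash/hash pair gets wired in. A trimmed model of the relevant fields (a sketch only; the full struct lives in algo-gate-api.h and has many more members):

#include <stdint.h>

struct work;       // defined elsewhere in the miner core
struct thr_info;

typedef struct
{
   int   (*scanhash)( struct work*, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info* );
   void  (*hash)( void *output, const void *input );
   uint32_t optimizations;   // e.g. SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT
} algo_gate_sketch_t;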
@@ -4,23 +4,31 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
   #define BMW512_8WAY 1
 #elif defined(__AVX2__)
   #define BMW512_4WAY 1
+#elif defined(__SSE2__) || defined(__ARM_NEON)
+  #define BMW512_2WAY 1
 #endif
 
 #if defined(BMW512_8WAY)
 
 void bmw512hash_8way( void *state, const void *input );
 int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
 
 #elif defined(BMW512_4WAY)
 
 void bmw512hash_4way( void *state, const void *input );
 int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
 
+#elif defined(BMW512_2WAY)
+
+void bmw512hash_2x64( void *state, const void *input );
+int scanhash_bmw512_2x64( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+
 #else
 
@@ -950,7 +950,7 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 
 #endif // __AVX2__
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // BMW-512 8 WAY
 
@@ -26,7 +26,7 @@ static const uint64_t IV512[] =
    0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
 };
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // 4 way 128 is handy to avoid reinterleaving in many algos.
 // If reinterleaving is necessary it may be more efficient to use
@@ -6,7 +6,7 @@
 
 #if defined(__AVX2__)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 struct _cube_4way_context
 {
@@ -13,7 +13,7 @@ static void transform( cubehashParam *sp )
    int r;
    const int rounds = sp->rounds;
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
    register __m512i x0, x1;
 
@@ -236,9 +236,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
 }
 
+HashReturn init_echo( hashState_echo *ctx, int nHashSize )
 
-HashReturn init_echo(hashState_echo *ctx, int nHashSize)
 {
    int i, j;
 
@@ -280,7 +278,8 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
    return SUCCESS;
 }
 
-HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
+HashReturn update_echo( hashState_echo *state, const void *data,
+                        uint32_t databitlen )
 {
    unsigned int uByteLength, uBlockCount, uRemainingBytes;
 
@@ -330,7 +329,7 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt
    return SUCCESS;
 }
 
-HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
+HashReturn final_echo( hashState_echo *state, void *hashval)
 {
    v128_t remainingbits;
 
@@ -407,8 +406,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
    return SUCCESS;
 }
 
-HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
+HashReturn update_final_echo( hashState_echo *state, void *hashval,
-                              const BitSequence *data, DataLength databitlen )
+                              const void *data, uint32_t databitlen )
 {
    unsigned int uByteLength, uBlockCount, uRemainingBytes;
 
@@ -530,8 +529,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
    return SUCCESS;
 }
 
-HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
+HashReturn echo_full( hashState_echo *state, void *hashval,
-                      int nHashSize, const BitSequence *data, DataLength datalen )
+                      int nHashSize, const void *data, uint32_t datalen )
 {
    int i, j;
 
@@ -578,7 +577,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
    {
       // Fill the buffer
       memcpy( state->buffer + state->uBufferBytes,
-              (void*)data, state->uBlockLength - state->uBufferBytes );
+              data, state->uBlockLength - state->uBufferBytes );
 
       // Process buffer
       Compress( state, state->buffer, 1 );
@@ -601,7 +600,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
    }
 
    if( uRemainingBytes > 0 )
-      memcpy(state->buffer, (void*)data, uRemainingBytes);
+      memcpy(state->buffer, data, uRemainingBytes);
 
    state->uBufferBytes = uRemainingBytes;
 }
@@ -689,7 +688,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
 }
 
 
+#if 0
 HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
 {
    HashReturn hRet;
@@ -746,5 +745,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
 
    return SUCCESS;
 }
+#endif
 
 #endif
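Note: the echo changes above swap the legacy NIST SHA-3 competition API types for plain void pointers and fixed-width integers, which keeps the vendor typedefs out of every caller. Those legacy types are conventionally defined as below (for context; the exact vendor header may differ):

/* Conventional definitions from the NIST SHA-3 reference API. */
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;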
@@ -47,16 +47,16 @@ HashReturn init_echo(hashState_echo *state, int hashbitlen);
 
 HashReturn reinit_echo(hashState_echo *state);
 
-HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen);
+HashReturn update_echo(hashState_echo *state, const void *data, uint32_t databitlen);
 
-HashReturn final_echo(hashState_echo *state, BitSequence *hashval);
+HashReturn final_echo(hashState_echo *state, void *hashval);
 
-HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
+HashReturn hash_echo(int hashbitlen, const void *data, uint32_t databitlen, void *hashval);
 
-HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
+HashReturn update_final_echo( hashState_echo *state, void *hashval,
-                              const BitSequence *data, DataLength databitlen );
+                              const void *data, uint32_t databitlen );
-HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
+HashReturn echo_full( hashState_echo *state, void *hashval,
-                      int nHashSize, const BitSequence *data, DataLength databitlen );
+                      int nHashSize, const void *data, uint32_t databitlen );
 
 #endif // HASH_API_H
 
@@ -11,7 +11,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
 };
 */
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 #define ECHO_SUBBYTES4(state, j) \
    state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \
@@ -5,7 +5,7 @@
 
 #include "simd-utils.h"
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct
 {
@@ -36,7 +36,6 @@
 
 #include "sph_echo.h"
 
-#if !defined(__AES__)
 
 #ifdef __cplusplus
 extern "C"{
@@ -1031,4 +1030,3 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 #ifdef __cplusplus
 }
 #endif
-#endif // !AES
@@ -36,8 +36,6 @@
 #ifndef SPH_ECHO_H__
 #define SPH_ECHO_H__
 
-#if !defined(__AES__)
 
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -318,5 +316,4 @@ void sph_echo512_addbits_and_close(
 #ifdef __cplusplus
 }
 #endif
-#endif // !AES
 #endif
@@ -15,237 +15,176 @@
  *
  */
 
-#if defined(__AES__)
+#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
 
-#include <x86intrin.h>
 
 #include <memory.h>
 #include "fugue-aesni.h"
 
-MYALIGN const unsigned long long _supermix1a[] = {0x0202010807020100, 0x0a05000f06010c0b};
-MYALIGN const unsigned long long _supermix1b[] = {0x0b0d080703060504, 0x0e0a090c050e0f0a};
-MYALIGN const unsigned long long _supermix1c[] = {0x0402060c070d0003, 0x090a060580808080};
-MYALIGN const unsigned long long _supermix1d[] = {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
-MYALIGN const unsigned long long _supermix2a[] = {0x07020d0880808080, 0x0b06010c050e0f0a};
-MYALIGN const unsigned long long _supermix4a[] = {0x000f0a050c0b0601, 0x0302020404030e09};
-MYALIGN const unsigned long long _supermix4b[] = {0x07020d08080e0d0d, 0x07070908050e0f0a};
-MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x0302000007060504};
-MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104};
-MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06};
-//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
-//MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
-//MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
-//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
-//MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
-MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
-MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
+static const v128u64_t _supermix1a __attribute__ ((aligned (16))) =
+   { 0x0202010807020100, 0x0a05000f06010c0b };
+
+static const v128u64_t _supermix1b __attribute__ ((aligned (16))) =
+   { 0x0b0d080703060504, 0x0e0a090c050e0f0a };
+
+static const v128u64_t _supermix1c __attribute__ ((aligned (16))) =
+   { 0x0402060c070d0003, 0x090a060580808080 };
+
+static const v128u64_t _supermix1d __attribute__ ((aligned (16))) =
+   { 0x808080800f0e0d0c, 0x0f0e0d0c80808080 };
+
+static const v128u64_t _supermix2a __attribute__ ((aligned (16))) =
+   { 0x07020d0880808080, 0x0b06010c050e0f0a };
+
+static const v128u64_t _supermix4a __attribute__ ((aligned (16))) =
+   { 0x000f0a050c0b0601, 0x0302020404030e09 };
+
+static const v128u64_t _supermix4b __attribute__ ((aligned (16))) =
+   { 0x07020d08080e0d0d, 0x07070908050e0f0a };
+
+static const v128u64_t _supermix4c __attribute__ ((aligned (16))) =
+   { 0x0706050403020000, 0x0302000007060504 };
+
+static const v128u64_t _supermix7a __attribute__ ((aligned (16))) =
+   { 0x010c0b060d080702, 0x0904030e03000104 };
+
+static const v128u64_t _supermix7b __attribute__ ((aligned (16))) =
+   { 0x8080808080808080, 0x0504070605040f06 };
+
+static const v128u64_t _inv_shift_rows __attribute__ ((aligned (16))) =
+   { 0x0b0e0104070a0d00, 0x0306090c0f020508 };
+
+static const v128u64_t _mul2mask __attribute__ ((aligned (16))) =
+   { 0x000000001b1b0000, 0x0000000000000000 };
+
+static const v128u64_t _mul4mask __attribute__ ((aligned (16))) =
+   { 0x000000002d361b00, 0x0000000000000000 };
+
+static const v128u64_t _lsbmask2 __attribute__ ((aligned (16))) =
+   { 0x0303030303030303, 0x0303030303030303 };
 
-MYALIGN const unsigned int _IV512[] = {
-   0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
+static const uint32_t _IV512[] __attribute__ ((aligned (32))) =
+   { 0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
    0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
    0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
    0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
    0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
-   0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
+   0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000
+};
 
-#if defined(__SSE4_1__)
+#if defined(__ARM_NEON)
 
-#define PACK_S0(s0, s1, t1)\
-   s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
+#define mask_1000(v) v128_put32( v, 0, 3 )
 
-#define UNPACK_S0(s0, s1, t1)\
-   s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
-   s0 = mm128_mask_32( s0, 8 )
+static const v128u32_t MASK_3321 __attribute__ ((aligned (16))) =
+   { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x0f0e0d0c };
 
-#define CMIX(s1, s2, r1, r2, t1, t2)\
-   t1 = s1;\
-   t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
-   r1 = _mm_xor_si128(r1, t1);\
-   r2 = _mm_xor_si128(r2, t1);
+static const v128u32_t MASK_3033 __attribute__ ((aligned (16))) =
+   { 0x0f0e0d0c, 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c };
 
-#else // SSE2
+static const v128u32_t MASK_3303 __attribute__ ((aligned (16))) =
+   { 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c, 0x0f0e0d0c };
 
-#define PACK_S0(s0, s1, t1)\
-   t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
-   s0 = _mm_xor_si128(s0, t1);
+static const v128u32_t MASK_0321 __attribute__ ((aligned (16))) =
+   { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100 };
 
-#define UNPACK_S0(s0, s1, t1)\
-   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
-   s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
-   s0 = mm128_mask_32( s0, 8 )
+#define shuffle_3303(v) vqtbl1q_u8( v, MASK_3303 )
+#define shuffle_0321(v) vqtbl1q_u8( v, MASK_0321 )
 
-#define CMIX(s1, s2, r1, r2, t1, t2)\
-   t1 = _mm_shuffle_epi32(s1, 0xf9);\
-   t2 = _mm_shuffle_epi32(s2, 0xcf);\
-   t1 = _mm_xor_si128(t1, t2);\
-   r1 = _mm_xor_si128(r1, t1);\
-   r2 = _mm_xor_si128(r2, t1)
+#define CMIX( s1, s2, r1, r2, t1, t2 ) \
+   t1 = vqtbl1q_u8( s1, MASK_3321 ); \
+   t2 = vqtbl1q_u8( s2, MASK_3033 ); \
+   t1 = v128_xor( t1, t2 ); \
+   r1 = v128_xor( r1, t1 ); \
+   r2 = v128_xor( r2, t1 );
 
+#elif defined(__SSE4_1__)
+
+#define mask_1000(v) v128_mask32( v, 8 )
+
+#define shuffle_3303(v) _mm_shuffle_epi32( v, 0xf3 )
+#define shuffle_0321(v) _mm_shuffle_epi32( v, 0x39 )
+
+#define CMIX( s1, s2, r1, r2, t1, t2 ) \
+   t1 = s1; \
+   t1 = v128_shuffle2_32( t1, s2, _MM_SHUFFLE( 3, 0, 2, 1 ) ); \
+   r1 = v128_xor( r1, t1 ); \
+   r2 = v128_xor( r2, t1 );
 
 #endif
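Note: the NEON branch replaces _mm_shuffle_epi32 word selects with vqtbl1q_u8 byte-table lookups: each selected 32-bit word becomes four consecutive byte indices in the mask, which is why MASK_3321 corresponds to the old 0xf9 (words 1,2,3,3) select in CMIX. A standalone sketch of that equivalence (AArch64 only; the function name is illustrative):

#if defined(__ARM_NEON)
#include <arm_neon.h>

/* Equivalent of _mm_shuffle_epi32( v, 0xf9 ): pick 32-bit words 1,2,3,3. */
static inline uint8x16_t shuffle_words_1233_sketch( uint8x16_t v )
{
    static const uint8_t idx[16] =
        { 4,5,6,7, 8,9,10,11, 12,13,14,15, 12,13,14,15 };
    return vqtbl1q_u8( v, vld1q_u8( idx ) );
}
#endif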
 
-#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
-   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
-   s10 = _mm_xor_si128(s10, t1);\
-   t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
-   s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
-   t1 = _mm_slli_si128(t1, 8);\
-   s8 = _mm_xor_si128(s8, t1);\
-   t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
-   s0 = _mm_xor_si128(s0, t1)
+#define PACK_S0( s0, s1, t1 ) \
+   s0 = v128_movlane32( s0, 3, s1, 0 )
 
-#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
-   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
-   s16 = _mm_xor_si128(s16, t1);\
-   t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
-   s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
-   t1 = _mm_slli_si128(t1, 8);\
-   s8 = _mm_xor_si128(s8, t1);\
-   t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
-   s0 = _mm_xor_si128(s0, t1);\
-   t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
-   s4 = _mm_xor_si128(s4, t1)
+#define UNPACK_S0( s0, s1, t1 ) \
+   s1 = v128_movlane32( s1, 0, s0, 3 ); \
+   s0 = mask_1000( s0 )
 
 #define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
-   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
+   t1 = shuffle_3303( s0 ); \
-   s22 = _mm_xor_si128(s22, t1);\
+   s22 = v128_xor(s22, t1);\
-   t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
+   t1 = v128_put32( v128_zero, *(uint32_t*)msg, 0 ); \
-   s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
+   s0 = v128_movlane32( s0, 0, t1, 0 ); \
-   t1 = _mm_slli_si128(t1, 8);\
+   t1 = v128_alignr64( t1, v128_zero, 1 ); \
-   s8 = _mm_xor_si128(s8, t1);\
+   s8 = v128_xor(s8, t1);\
-   t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
+   t1 = shuffle_3303( s24 ); \
-   s0 = _mm_xor_si128(s0, t1);\
+   s0 = v128_xor(s0, t1);\
-   t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
+   t1 = shuffle_3303( s27 ); \
-   s4 = _mm_xor_si128(s4, t1);\
+   s4 = v128_xor(s4, t1);\
-   t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
+   t1 = shuffle_3303( s30 ); \
-   s7 = _mm_xor_si128(s7, t1)
+   s7 = v128_xor(s7, t1)
 
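Note: the new PACK_S0/UNPACK_S0 are built on v128_movlane32, a copy of one 32-bit lane between vectors, which both ISAs can do in a single instruction (insert_ps on SSE4.1, a lane copy on NEON). A sketch of a plausible NEON definition (macro name and argument order are assumptions for illustration):

#if defined(__ARM_NEON)
#include <arm_neon.h>
/* Copy lane sl of src into lane dl of dst; other dst lanes unchanged. */
#define v128_movlane32_sketch( dst, dl, src, sl ) \
    vcopyq_laneq_u32( dst, dl, src, sl )
#endif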
-#define PRESUPERMIX(t0, t1, t2, t3, t4)\
-   t2 = t0;\
-   t3 = _mm_add_epi8(t0, t0);\
-   t4 = _mm_add_epi8(t3, t3);\
-   t1 = _mm_srli_epi16(t0, 6);\
-   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
-   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
-   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
+#define SUBSTITUTE( r0, _t2 ) \
+   _t2 = v128_shuffle8( r0, _inv_shift_rows ); \
+   _t2 = v128_aesenclast_nokey( _t2 )
 
-/*
-#define PRESUPERMIX(x, t1, s1, s2, t2)\
-   s1 = x;\
-   s2 = _mm_add_epi8(x, x);\
-   t2 = _mm_add_epi8(s2, s2);\
-   t1 = _mm_srli_epi16(x, 6);\
-   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
-   s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
-   x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
-*/
 
-#define SUBSTITUTE(r0, _t2 )\
-   _t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
-   _t2 = _mm_aesenclast_si128( _t2, m128_zero )
 
 #define SUPERMIX(t0, t1, t2, t3, t4)\
    t2 = t0;\
-   t3 = _mm_add_epi8(t0, t0);\
+   t3 = v128_add8( t0, t0 ); \
-   t4 = _mm_add_epi8(t3, t3);\
+   t4 = v128_add8( t3, t3 ); \
-   t1 = _mm_srli_epi16(t0, 6);\
+   t1 = v128_sr16( t0, 6 ); \
-   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
+   t1 = v128_and( t1, _lsbmask2 ); \
-   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
+   t0 = v128_xor( t4, v128_shuffle8( _mul4mask, t1 ) ); \
-   t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
+   t4 = v128_shuffle8( t2, _supermix1b ); \
-   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
+   t3 = v128_xor( t3, v128_shuffle8( _mul2mask, t1 ) ); \
-   t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
+   t1 = v128_shuffle8( t4, _supermix1c ); \
-   t4 = _mm_xor_si128(t4, t1);\
+   t4 = v128_xor( t4, t1 ); \
-   t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
+   t1 = v128_shuffle8( t4, _supermix1d ); \
-   t4 = _mm_xor_si128(t4, t1);\
+   t4 = v128_xor( t4, t1 ); \
-   t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
+   t1 = v128_shuffle8( t2, _supermix1a ); \
-   t2 = mm128_xor3(t2, t3, t0 );\
+   t2 = v128_xor3( t2, t3, t0 ); \
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
+   t2 = v128_shuffle8( t2, _supermix7a ); \
-   t4 = mm128_xor3( t4, t1, t2 ); \
+   t4 = v128_xor3( t4, t1, t2 ); \
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
+   t2 = v128_shuffle8( t2, _supermix7b ); \
-   t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
+   t3 = v128_shuffle8( t3, _supermix2a ); \
-   t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
+   t1 = v128_shuffle8( t0, _supermix4a ); \
-   t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
+   t0 = v128_shuffle8( t0, _supermix4b ); \
-   t4 = mm128_xor3( t4, t2, t1 ); \
+   t4 = v128_xor3( t4, t2, t1 ); \
-   t0 = _mm_xor_si128(t0, t3);\
+   t0 = v128_xor( t0, t3 ); \
-   t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
+   t4 = v128_xor3( t4, t0, v128_shuffle8( t0, _supermix4c ) );
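Note: SUBSTITUTE first permutes with _inv_shift_rows and then runs a keyless final AES round. Since AESENCLAST performs ShiftRows then SubBytes then AddRoundKey, pre-applying the inverse ShiftRows permutation cancels the shift and a zero key cancels the key add, leaving a pure SubBytes over all 16 bytes. Plausible mappings for v128_aesenclast_nokey, assumed for illustration (the repo's real definition is in simd-utils):

#if defined(__AES__)
  #include <wmmintrin.h>
  #define v128_aesenclast_nokey_sketch( x ) \
      _mm_aesenclast_si128( x, _mm_setzero_si128() )
#elif defined(__ARM_FEATURE_AES)
  #include <arm_neon.h>
  /* AESE xors the key first, so a zero key gives SubBytes(ShiftRows(x)). */
  #define v128_aesenclast_nokey_sketch( x )  vaeseq_u8( x, vdupq_n_u8( 0 ) )
#endif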
 
-/*
-#define SUPERMIX(t0, t1, t2, t3, t4)\
-   PRESUPERMIX(t0, t1, t2, t3, t4);\
-   POSTSUPERMIX(t0, t1, t2, t3, t4)
-*/
 
-#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
-   t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
-   t4 = t1;\
-   t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
-   t4 = _mm_xor_si128(t4, t1);\
-   t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
-   t4 = _mm_xor_si128(t4, t1);\
-   t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
-   t4 = _mm_xor_si128(t4, t1);\
-   t2 = mm128_xor3(t2, t3, t0 );\
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
-   t4 = _mm_xor_si128(t4, t2);\
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
-   t4 = _mm_xor_si128(t4, t2);\
-   t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
-   t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
-   t4 = _mm_xor_si128(t4, t1);\
-   t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
-   t0 = _mm_xor_si128(t0, t3);\
-   t4 = _mm_xor_si128(t4, t0);\
-   t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
-   t4 = _mm_xor_si128(t4, t0)
 
-#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
-   CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
-   PACK_S0(r1c, r1a, _t0);\
-   SUBSTITUTE(r1c, _t2 );\
-   SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
-   _t0 = _mm_shuffle_epi32(r1c, 0x39);\
-   r2c = _mm_xor_si128(r2c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-   r2d = _mm_xor_si128(r2d, _t0);\
-   UNPACK_S0(r1c, r1a, _t3);\
-   SUBSTITUTE(r2c, _t2 );\
-   SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
-   _t0 = _mm_shuffle_epi32(r2c, 0x39);\
-   r3c = _mm_xor_si128(r3c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-   r3d = _mm_xor_si128(r3d, _t0);\
-   UNPACK_S0(r2c, r2a, _t3);\
-   SUBSTITUTE(r3c, _t2 );\
-   SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
-   UNPACK_S0(r3c, r3a, _t3)
 
 #define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
    CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
    PACK_S0(r1c, r1a, _t0);\
    SUBSTITUTE( r1c, _t2 );\
    SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
-   _t0 = _mm_shuffle_epi32(r1c, 0x39);\
+   _t0 = shuffle_0321( r1c ); \
-   r2c = _mm_xor_si128(r2c, _t0);\
+   r2c = v128_xor(r2c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
+   _t0 = mask_1000( _t0 ); \
-   r2d = _mm_xor_si128(r2d, _t0);\
+   r2d = v128_xor(r2d, _t0);\
    UNPACK_S0(r1c, r1a, _t3);\
    SUBSTITUTE(r2c, _t2 );\
    SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
-   _t0 = _mm_shuffle_epi32(r2c, 0x39);\
+   _t0 = shuffle_0321( r2c ); \
-   r3c = _mm_xor_si128(r3c, _t0);\
+   r3c = v128_xor(r3c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
+   _t0 = mask_1000( _t0 ); \
-   r3d = _mm_xor_si128(r3d, _t0);\
+   r3d = v128_xor(r3d, _t0);\
    UNPACK_S0(r2c, r2a, _t3);\
    SUBSTITUTE( r3c, _t2 );\
    SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
-   _t0 = _mm_shuffle_epi32(r3c, 0x39);\
+   _t0 = shuffle_0321( r3c ); \
-   r4c = _mm_xor_si128(r4c, _t0);\
+   r4c = v128_xor(r4c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
+   _t0 = mask_1000( _t0 ); \
-   r4d = _mm_xor_si128(r4d, _t0);\
+   r4d = v128_xor(r4d, _t0);\
    UNPACK_S0(r3c, r3a, _t3);\
    SUBSTITUTE( r4c, _t2 );\
    SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
@@ -256,18 +195,19 @@ MYALIGN const unsigned int _IV512[] = {
    block[1] = col[(base + a + 1) % s];\
    block[2] = col[(base + a + 2) % s];\
    block[3] = col[(base + a + 3) % s];\
-   x = _mm_load_si128((__m128i*)block)
+   x = v128_load( (v128_t*)block )
 
 #define STORECOLUMN(x, s)\
-   _mm_store_si128((__m128i*)block, x);\
+   v128_store((v128_t*)block, x );\
    col[(base + 0) % s] = block[0];\
    col[(base + 1) % s] = block[1];\
    col[(base + 2) % s] = block[2];\
    col[(base + 3) % s] = block[3]
 
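Note: LOADCOLUMN/STORECOLUMN and the CMIX lines treat the fugue state as a 36-word ring indexed modulo 36, so the ROR3/ROR9 "rotations" in Final512 below never move data: they only step the base index. A scalar model of that trick:

/* Rotating a 36-word ring right by r just advances the base index; ROR3
   appears in the code as base = (base + 33) % 36, since 33 = 36 - 3. */
static inline unsigned fugue_ror_base_sketch( unsigned base, unsigned r )
{
    return ( base + 36 - r ) % 36;
}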
-void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
+void Compress512( hashState_fugue *ctx, const unsigned char *pmsg,
+                  unsigned int uBlockCount )
 {
-   __m128i _t0, _t1, _t2, _t3;
+   v128_t _t0, _t1, _t2, _t3;
 
    switch(ctx->base)
    {
@@ -346,134 +286,133 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
       pmsg += 4;
       uBlockCount--;
    }
 
 }
 
-void Final512(hashState_fugue *ctx, BitSequence *hashval)
+void Final512( hashState_fugue *ctx, uint8_t *hashval )
 {
    unsigned int block[4] __attribute__ ((aligned (32)));
    unsigned int col[36] __attribute__ ((aligned (16)));
    unsigned int i, base;
-   __m128i r0, _t0, _t1, _t2, _t3;
+   v128_t r0, _t0, _t1, _t2, _t3;
 
-   for(i = 0; i < 12; i++)
+   for( i = 0; i < 12; i++ )
    {
-      _mm_store_si128((__m128i*)block, ctx->state[i]);
+      v128_store( (v128_t*)block, ctx->state[i] );
 
      col[3 * i + 0] = block[0];
     col[3 * i + 1] = block[1];
    col[3 * i + 2] = block[2];
    }
 
-   base = (36 - (12 * ctx->base)) % 36;
+   base = ( 36 - (12 * ctx->base) ) % 36;
 
-   for(i = 0; i < 32; i++)
+   for( i = 0; i < 32; i++ )
   {
     // ROR3
    base = (base + 33) % 36;
 
    // CMIX
-   col[(base + 0) % 36] ^= col[(base + 4) % 36];
+   col[ (base + 0) % 36 ] ^= col[ (base + 4) % 36 ];
-   col[(base + 1) % 36] ^= col[(base + 5) % 36];
+   col[ (base + 1) % 36 ] ^= col[ (base + 5) % 36 ];
-   col[(base + 2) % 36] ^= col[(base + 6) % 36];
+   col[ (base + 2) % 36 ] ^= col[ (base + 6) % 36 ];
-   col[(base + 18) % 36] ^= col[(base + 4) % 36];
+   col[ (base + 18) % 36 ] ^= col[ (base + 4) % 36 ];
-   col[(base + 19) % 36] ^= col[(base + 5) % 36];
+   col[ (base + 19) % 36 ] ^= col[ (base + 5) % 36 ];
-   col[(base + 20) % 36] ^= col[(base + 6) % 36];
+   col[ (base + 20) % 36 ] ^= col[ (base + 6) % 36 ];
 
    // SMIX
-   LOADCOLUMN(r0, 36, 0);
+   LOADCOLUMN( r0, 36, 0 );
-   SUBSTITUTE(r0, _t2);
+   SUBSTITUTE( r0, _t2 );
-   SUPERMIX(_t2, _t3, _t0, _t1, r0);
+   SUPERMIX( _t2, _t3, _t0, _t1, r0 );
-   STORECOLUMN(r0, 36);
+   STORECOLUMN( r0, 36 );
   }
 
-   for(i = 0; i < 13; i++)
+   for( i = 0; i < 13; i++ )
  {
    // S4 += S0; S9 += S0; S18 += S0; S27 += S0;
-   col[(base + 4) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
-   col[(base + 9) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
-   col[(base + 18) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
-   col[(base + 27) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
 
    // ROR9
    base = (base + 27) % 36;
 
    // SMIX
-   LOADCOLUMN(r0, 36, 0);
+   LOADCOLUMN( r0, 36, 0 );
-   SUBSTITUTE(r0, _t2);
+   SUBSTITUTE( r0, _t2 );
-   SUPERMIX(_t2, _t3, _t0, _t1, r0);
+   SUPERMIX( _t2, _t3, _t0, _t1, r0 );
-   STORECOLUMN(r0, 36);
+   STORECOLUMN( r0, 36 );
 
    // S4 += S0; S10 += S0; S18 += S0; S27 += S0;
-   col[(base + 4) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
-   col[(base + 10) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
-   col[(base + 18) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
-   col[(base + 27) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
 
    // ROR9
    base = (base + 27) % 36;
 
    // SMIX
-   LOADCOLUMN(r0, 36, 0);
+   LOADCOLUMN( r0, 36, 0 );
-   SUBSTITUTE(r0, _t2);
+   SUBSTITUTE( r0, _t2 );
-   SUPERMIX(_t2, _t3, _t0, _t1, r0);
+   SUPERMIX( _t2, _t3, _t0, _t1, r0 );
-   STORECOLUMN(r0, 36);
+   STORECOLUMN( r0, 36 );
 
    // S4 += S0; S10 += S0; S19 += S0; S27 += S0;
-   col[(base + 4) % 36] ^= col[(base + 0) % 36];
+   col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 10) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 19) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 27) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
|
|
||||||
// ROR9
|
// ROR9
|
||||||
base = (base + 27) % 36;
|
base = (base + 27) % 36;
|
||||||
|
|
||||||
// SMIX
|
// SMIX
|
||||||
LOADCOLUMN(r0, 36, 0);
|
LOADCOLUMN( r0, 36, 0 );
|
||||||
SUBSTITUTE(r0, _t2);
|
SUBSTITUTE( r0, _t2 );
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
|
||||||
STORECOLUMN(r0, 36);
|
STORECOLUMN( r0, 36 );
|
||||||
|
|
||||||
// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
|
// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
|
||||||
col[(base + 4) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 10) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 19) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 28) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 28) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
|
|
||||||
// ROR8
|
// ROR8
|
||||||
base = (base + 28) % 36;
|
base = (base + 28) % 36;
|
||||||
|
|
||||||
// SMIX
|
// SMIX
|
||||||
LOADCOLUMN(r0, 36, 0);
|
LOADCOLUMN( r0, 36, 0 );
|
||||||
SUBSTITUTE(r0, _t2);
|
SUBSTITUTE( r0, _t2 );
|
||||||
SUPERMIX(_t2, _t3, _t0, _t1, r0);
|
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
|
||||||
STORECOLUMN(r0, 36);
|
STORECOLUMN( r0, 36 );
|
||||||
}
|
}
|
||||||
|
|
||||||
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
|
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
|
||||||
col[(base + 4) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 9) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 18) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
col[(base + 27) % 36] ^= col[(base + 0) % 36];
|
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
|
||||||
|
|
||||||
// Transform to the standard basis and store output; S1 || S2 || S3 || S4
|
// Transform to the standard basis and store output; S1 || S2 || S3 || S4
|
||||||
LOADCOLUMN(r0, 36, 1);
|
LOADCOLUMN( r0, 36, 1 );
|
||||||
_mm_store_si128((__m128i*)hashval, r0);
|
v128_store( (v128_t*)hashval, r0 );
|
||||||
|
|
||||||
// Transform to the standard basis and store output; S9 || S10 || S11 || S12
|
// Transform to the standard basis and store output; S9 || S10 || S11 || S12
|
||||||
LOADCOLUMN(r0, 36, 9);
|
LOADCOLUMN( r0, 36, 9 );
|
||||||
_mm_store_si128((__m128i*)hashval + 1, r0);
|
v128_store( (v128_t*)hashval + 1, r0 );
|
||||||
|
|
||||||
// Transform to the standard basis and store output; S18 || S19 || S20 || S21
|
// Transform to the standard basis and store output; S18 || S19 || S20 || S21
|
||||||
LOADCOLUMN(r0, 36, 18);
|
LOADCOLUMN( r0, 36, 18 );
|
||||||
_mm_store_si128((__m128i*)hashval + 2, r0);
|
v128_store( (v128_t*)hashval + 2, r0 );
|
||||||
|
|
||||||
// Transform to the standard basis and store output; S27 || S28 || S29 || S30
|
// Transform to the standard basis and store output; S27 || S28 || S29 || S30
|
||||||
LOADCOLUMN(r0, 36, 27);
|
LOADCOLUMN( r0, 36, 27 );
|
||||||
_mm_store_si128((__m128i*)hashval + 3, r0);
|
v128_store( (v128_t*)hashval + 3, r0 );
|
||||||
}
|
}
|
||||||
|
|
||||||
HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
|
int fugue512_Init( hashState_fugue *ctx, int nHashSize )
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
ctx->processed_bits = 0;
|
ctx->processed_bits = 0;
|
||||||
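
The next hunk rewrites fugue512_Init: state words are cleared with v128_zero and the IVs are read through the casti_v128 indexer instead of explicit _mm_load_si128 casts. casti_v128 presumably just views a buffer as an array of 128-bit vectors; a sketch of that accessor (illustrative, not the actual simd-utils definition):

    /* element i of p reinterpreted as an array of v128_t; usable as an
       lvalue, so 'ctx->state[6] = casti_v128( _IV512, 0 );' below is the
       old '_mm_load_si128( (__m128i*)_IV512 + 0 )' with the cast folded
       into the macro */
    #define casti_v128( p, i )  ( ((v128_t*)(p))[i] )
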
@@ -485,20 +424,20 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
    ctx->uBlockLength = 4;
 
    for(i = 0; i < 6; i++)
-      ctx->state[i] = m128_zero;
+      ctx->state[i] = v128_zero;
 
-   ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0);
-   ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1);
-   ctx->state[8] = _mm_load_si128((__m128i*)_IV512 + 2);
-   ctx->state[9] = _mm_load_si128((__m128i*)_IV512 + 3);
-   ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
-   ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
+   ctx->state[6] = casti_v128( _IV512, 0 );
+   ctx->state[7] = casti_v128( _IV512, 1 );
+   ctx->state[8] = casti_v128( _IV512, 2 );
+   ctx->state[9] = casti_v128( _IV512, 3 );
+   ctx->state[10] = casti_v128( _IV512, 4 );
+   ctx->state[11] = casti_v128( _IV512, 5 );
 
-   return SUCCESS;
+   return 0;
 }
 
-HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
+int fugue512_Update( hashState_fugue *state, const void *data,
+                     uint64_t databitlen )
 {
    unsigned int uByteLength, uBlockCount, uRemainingBytes;
 
@@ -509,7 +448,8 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
    if(state->uBufferBytes != 0)
    {
       // Fill the buffer
-      memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
+      memcpy( state->buffer + state->uBufferBytes, (void*)data,
+              state->uBlockLength - state->uBufferBytes );
 
       // Process the buffer
       Compress512(state, state->buffer, 1);
@@ -545,13 +485,13 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
       state->uBufferBytes += uByteLength;
    }
 
-   return SUCCESS;
+   return 0;
 }
 
-HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
+int fugue512_Final( hashState_fugue *state, void *hashval )
 {
    unsigned int i;
-   BitSequence lengthbuf[8] __attribute__((aligned(64)));
+   uint8_t lengthbuf[8] __attribute__((aligned(64)));
 
    // Update message bit count
    state->processed_bits += state->uBufferBytes * 8;
@@ -575,16 +515,17 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
    // Finalization
    Final512(state, hashval);
 
-   return SUCCESS;
+   return 0;
 }
 
 
-HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
+int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
+                   uint64_t databitlen )
 {
-   fugue512_Init(hs, 512);
-   fugue512_Update(hs, data, databitlen*8);
-   fugue512_Final(hs, hashval);
-   return SUCCESS;
+   fugue512_Init( hs, 512 );
+   fugue512_Update( hs, data, databitlen*8 );
+   fugue512_Final( hs, hashval );
+   return 0;
 }
 
 #endif // AES
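
fugue512_full wraps the Init/Update/Final sequence; note that despite the parameter name it takes a byte count and scales to bits itself (the databitlen*8 in the Update call above). A usage sketch under that assumption, hashing an 80-byte block header:

    #include <stdint.h>

    void fugue512_example( const uint8_t header[80], uint8_t digest[64] )
    {
       hashState_fugue ctx;
       fugue512_full( &ctx, digest, header, 80 );   /* length in bytes */
    }
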
@@ -14,37 +14,31 @@
 #ifndef FUGUE_HASH_API_H
 #define FUGUE_HASH_API_H
 
-#if defined(__AES__)
+#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
 
-#if !defined(__SSE4_1__)
-#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
-#endif
 
-#include "compat/sha3_common.h"
 #include "simd-utils.h"
 
 
 typedef struct
 {
-   __m128i state[12];
+   v128_t state[12];
    unsigned int base;
 
    unsigned int uHashSize;
    unsigned int uBlockLength;
    unsigned int uBufferBytes;
-   DataLength processed_bits;
-   BitSequence buffer[4];
+   uint64_t processed_bits;
+   uint8_t buffer[4];
 
 } hashState_fugue __attribute__ ((aligned (64)));
 
 
 // These functions are deprecated, use the lower case macro aliases that use
 // the standard interface. This will be cleaned up at a later date.
-HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
+int fugue512_Init( hashState_fugue *state, int hashbitlen );
 
-HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
+int fugue512_Update( hashState_fugue *state, const void *data,
+                     uint64_t databitlen );
 
-HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
+int fugue512_Final( hashState_fugue *state, void *hashval );
 
 #define fugue512_init( state ) \
    fugue512_Init( state, 512 )
@@ -54,7 +48,8 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
    fugue512_Final
 
 
-HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
+int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
+                   uint64_t databitlen);
 
 #endif // AES
 #endif // HASH_API_H
@@ -696,7 +696,7 @@ static void AddModulo512(const void *a,const void *b,void *c)
 
 static void AddXor512(const void *a,const void *b,void *c)
 {
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
    casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
                                            casti_m512i( b, 0 ) );
 #elif defined(__AVX2__)
@@ -704,15 +704,15 @@ static void AddXor512(const void *a,const void *b,void *c)
                                            casti_m256i( b, 0 ) );
    casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
                                            casti_m256i( b, 1 ) );
-#elif defined(__SSE2__)
+#elif defined(__SSE2__) || defined(__ARM_NEON)
-   casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
-                                        casti_m128i( b, 0 ) );
-   casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
-                                        casti_m128i( b, 1 ) );
-   casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
-                                        casti_m128i( b, 2 ) );
-   casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
-                                        casti_m128i( b, 3 ) );
+   casti_v128( c, 0 ) = v128_xor( casti_v128( a, 0 ),
+                                  casti_v128( b, 0 ) );
+   casti_v128( c, 1 ) = v128_xor( casti_v128( a, 1 ),
+                                  casti_v128( b, 1 ) );
+   casti_v128( c, 2 ) = v128_xor( casti_v128( a, 2 ),
+                                  casti_v128( b, 2 ) );
+   casti_v128( c, 3 ) = v128_xor( casti_v128( a, 3 ),
+                                  casti_v128( b, 3 ) );
 #else
    const unsigned long long *A=a, *B=b;
    unsigned long long *C=c;
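
The AddXor512 hunk is the first of many below in which the four-macro AVX-512 feature test collapses into SIMD512, and plain __AVX512VL__ tests into VL256 (which the removed TODO comments suggest is meant to later cover AVX10/256 as well). The presumed consolidation, as a sketch (illustrative; the real definitions live in simd-utils):

    /* SIMD512: the full 512-bit AVX-512 feature set is available */
    #if defined(__AVX512F__) && defined(__AVX512VL__) && \
        defined(__AVX512DQ__) && defined(__AVX512BW__)
       #define SIMD512 1
    #endif

    /* VL256: AVX-512 VL-style instructions on 256/128-bit registers */
    #if defined(__AVX512VL__)
       #define VL256 1
    #endif
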
@@ -60,21 +60,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
 
 #if defined(__ARM_NEON)
 
-// No fast shuffle on NEON
-//static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
-static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
+static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
+   { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
 
-#define gr_shuffle32( v )  v128_blendv( v128_qrev32( v ), v, BLEND_MASK )
+#define gr_shuffle32(v)    vqtbl1q_u8( v, gr_mask )
 
-//#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
 
 #else
 
-#define gr_shuffle32( v )  _mm_shuffle_epi32( v, 0xd8 )
+#define gr_shuffle32(v)    _mm_shuffle_epi32( v, 0xd8 )
 
 #endif
 
 
 #define tos(a) #a
 #define tostr(a) tos(a)
 
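
The NEON gr_shuffle32 above now uses a single table lookup instead of the earlier blend-of-reversed-quads workaround. The byte mask { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c } gathers dwords 0, 2, 1, 3, which is exactly what _mm_shuffle_epi32( v, 0xd8 ) produces on x86. A sketch spelling out the equivalence (function form only for illustration):

    #include <arm_neon.h>

    static inline uint32x4_t gr_shuffle32_sketch( uint32x4_t v )
    {
       /* byte indices of dwords 0, 2, 1, 3: the 0xd8 dword order */
       static const uint8_t idx[16] =
          { 0,1,2,3,  8,9,10,11,  4,5,6,7,  12,13,14,15 };
       return vreinterpretq_u32_u8(
                vqtbl1q_u8( vreinterpretq_u8_u32( v ), vld1q_u8( idx ) ) );
    }
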
@@ -107,7 +103,7 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
    This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
    K. Matusiewicz, 2011/05/29 */
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
    /* t_i = a_i + a_{i+1} */\
@@ -301,17 +297,16 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
 */
 #define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
    /* SubBytes */\
-   b0 = v128_xor(b0, b0);\
-   a0 = v128_aesenclast(a0, b0);\
-   a1 = v128_aesenclast(a1, b0);\
-   a2 = v128_aesenclast(a2, b0);\
-   a3 = v128_aesenclast(a3, b0);\
-   a4 = v128_aesenclast(a4, b0);\
-   a5 = v128_aesenclast(a5, b0);\
-   a6 = v128_aesenclast(a6, b0);\
-   a7 = v128_aesenclast(a7, b0);\
+   a0 = v128_aesenclast_nokey( a0 ); \
+   a1 = v128_aesenclast_nokey( a1 ); \
+   a2 = v128_aesenclast_nokey( a2 ); \
+   a3 = v128_aesenclast_nokey( a3 ); \
+   a4 = v128_aesenclast_nokey( a4 ); \
+   a5 = v128_aesenclast_nokey( a5 ); \
+   a6 = v128_aesenclast_nokey( a6 ); \
+   a7 = v128_aesenclast_nokey( a7 ); \
    /* MixBytes */\
-   MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
+   MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7 ); \
 }
 
 #define ROUNDS_P(){\
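
In the SUBMIX hunk above, zeroing b0 just to feed aesenclast a null round key is replaced by a dedicated no-key helper, which also frees b0 as a temporary. Plausible definitions (a sketch; the NEON mapping relies on AESE with a zero key reducing to SubBytes plus ShiftRows, matching AESENCLAST with a zero key on x86):

    /* x86: AESENCLAST with an all-zero round key = ShiftRows + SubBytes */
    #define v128_aesenclast_nokey( v ) \
       _mm_aesenclast_si128( v, _mm_setzero_si128() )

    /* NEON sketch of the same operation:
       vaeseq_u8( x, 0 ) performs AddRoundKey(0) + SubBytes + ShiftRows */
    // #define v128_aesenclast_nokey( v ) \
    //    vreinterpretq_u32_u8( vaeseq_u8( vreinterpretq_u8_u32( v ), \
    //                                     vdupq_n_u8( 0 ) ) )
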
@@ -329,10 +324,9 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
    xmm13 = v128_shuffle8( xmm13, SUBSH_MASK5 ); \
    xmm14 = v128_shuffle8( xmm14, SUBSH_MASK6 ); \
    xmm15 = v128_shuffle8( xmm15, SUBSH_MASK7 ); \
    /* SubBytes + MixBytes */\
    SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
            xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
-   \
    /* AddRoundConstant P1024 */\
    xmm0 = v128_xor( xmm0, \
                     casti_v128( round_const_p, round_counter+1 ) ); \
@@ -434,7 +428,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
    t1 = v128_unpackhi16(t1, i3);\
    i2 = v128_unpacklo16(i2, i3);\
    i0 = v128_unpacklo16(i0, i1);\
-   \
    /* shuffle with immediate */\
    t0 = gr_shuffle32( t0 ); \
    t1 = gr_shuffle32( t1 ); \
@@ -444,7 +437,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
    i2 = gr_shuffle32( i2 ); \
    i4 = gr_shuffle32( i4 ); \
    i6 = gr_shuffle32( i6 ); \
-   \
    /* continue with unpack */\
    t4 = i0;\
    i0 = v128_unpacklo32(i0, i2);\
@@ -551,7 +543,8 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
    /* transpose done */\
 }/**/
 
+#if 0
+// not used
 void INIT( v128_t* chaining )
 {
    static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -580,6 +573,7 @@ void INIT( v128_t* chaining )
    chaining[6] = xmm14;
    chaining[7] = xmm15;
 }
+#endif
 
 void TF1024( v128_t* chaining, const v128_t* message )
 {
@@ -1,3 +1,6 @@
+#if !defined GROESTL256_INTR_AES_H__
+#define GROESTL256_INTR_AES_H__
+
 /* groestl-intr-aes.h   Aug 2011
  *
  * Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -50,18 +53,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
 
 #if defined(__ARM_NEON)
 
-// No fast shuffle on NEON
-static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
+static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
+   { 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
 
-#define gr_shuffle32( v )  v128_shufflev32( v, vmask_d8 )
+#define gr_shuffle32(v)    vqtbl1q_u8( v, gr_mask )
 
 #else
 
-#define gr_shuffle32( v )  _mm_shuffle_epi32( v, 0xd8 )
+#define gr_shuffle32(v)    _mm_shuffle_epi32( v, 0xd8 )
 
 #endif
 
 
 #define tos(a) #a
 #define tostr(a) tos(a)
 
@@ -93,7 +95,7 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
    This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
    K. Matusiewicz, 2011/05/29 */
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
    /* t_i = a_i + a_{i+1} */\
@@ -598,4 +600,4 @@ void OF512( v128_t* chaining )
    chaining[3] = xmm11;
 }
 
+#endif
@@ -146,7 +146,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
    const int hash_offset = SIZE512 - hashlen_m128i;
    uint64_t blocks = len / SIZE512;
    v128_t* in = (v128_t*)input;
 
    // digest any full blocks, process directly from input
    for ( i = 0; i < blocks; i++ )
       TF1024( ctx->chaining, &in[ i * SIZE512 ] );
@@ -181,6 +181,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
 
    // digest final padding block and do output transform
    TF1024( ctx->chaining, ctx->buffer );
 
    OF1024( ctx->chaining );
 
    // store hash result in output
@@ -87,6 +87,7 @@ int final_groestl( hashState_groestl*, void* );
 int update_and_final_groestl( hashState_groestl*, void*, const void*, int );
 int groestl512( hashState_groestl*, void*, const void*, uint64_t );
 #define groestl512_full groestl512
+#define groestl512_ctx  groestl512
 
 
 #endif /* __hash_h */
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(__VAES__) && defined(SIMD512)
 #define GROESTL_4WAY_VAES 1
 #endif
 
@@ -17,7 +17,7 @@
 
 #if defined(__AVX2__) && defined(__VAES__)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 
 int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
@@ -43,7 +43,7 @@
 
 #define SIZE256 (SIZE_512/16)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct {
    __attribute__ ((aligned (128))) __m512i chaining[SIZE256];
@@ -42,7 +42,7 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
    { 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
 };
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
                                      0x1d1519111c141810, 0x1f171b131e161a12,
@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =
 
 #define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
    /* AddRoundConstant */\
-   b1 = mm256_bcast_m128( mm128_mask_32( m128_neg1, 0x3 ) ); \
+   b1 = mm256_bcast_m128( v128_mask32( v128_neg1, 0x3 ) ); \
    a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
    a1 = _mm256_xor_si256( a1, b1 );\
    a2 = _mm256_xor_si256( a2, b1 );\
@@ -17,7 +17,7 @@
 
 #if defined(__AVX2__) && defined(__VAES__)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 
 int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
 {
@@ -33,7 +33,7 @@
 
 #define SIZE512 (SIZE_1024/16)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct {
    __attribute__ ((aligned (128))) __m512i chaining[SIZE512];
@@ -50,7 +50,7 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
    { 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
 };
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
                                      0x1d1519111c141810, 0x1f171b131e161a12,
@@ -239,7 +239,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
 { \
    /* AddRoundConstant P1024 */\
    xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
-                     casti_m128i( round_const_p, round_counter ) ) ); \
+                     casti_v128u32( round_const_p, round_counter ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
    xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK1 );\
@@ -254,7 +254,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
 \
    /* AddRoundConstant P1024 */\
    xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
-                     casti_m128i( round_const_p, round_counter+1 ) ) ); \
+                     casti_v128u32( round_const_p, round_counter+1 ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
    xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
@@ -283,7 +283,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
    xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
    xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
    xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
-                     casti_m128i( round_const_q, round_counter ) ) ); \
+                     casti_v128u32( round_const_q, round_counter ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
    xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK3 );\
@@ -306,7 +306,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
    xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
    xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
    xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
-                     casti_m128i( round_const_q, round_counter+1 ) ) ); \
+                     casti_v128u32( round_const_q, round_counter+1 ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
    xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
@@ -812,7 +812,7 @@ static const __m256i SUBSH_MASK7_2WAY =
 { \
    /* AddRoundConstant P1024 */\
    xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
-                     casti_m128i( round_const_p, round_counter ) ) ); \
+                     casti_v128u32( round_const_p, round_counter ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
    xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\
@@ -827,7 +827,7 @@ static const __m256i SUBSH_MASK7_2WAY =
 \
    /* AddRoundConstant P1024 */\
    xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
-                     casti_m128i( round_const_p, round_counter+1 ) ) ); \
+                     casti_v128u32( round_const_p, round_counter+1 ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
    xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
@@ -856,7 +856,7 @@ static const __m256i SUBSH_MASK7_2WAY =
    xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
    xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
    xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
-                     casti_m128i( round_const_q, round_counter ) ) ); \
+                     casti_v128u32( round_const_q, round_counter ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
    xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\
@@ -879,7 +879,7 @@ static const __m256i SUBSH_MASK7_2WAY =
    xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
    xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
    xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
-                     casti_m128i( round_const_q, round_counter+1 ) ) ); \
+                     casti_v128u32( round_const_q, round_counter+1 ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
    xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\
@@ -213,7 +213,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
 
    v128_bswap32_intrlv80_4x32( vdata, pdata );
    do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3,n+2,n+1,n ) );
 
       myriad_4way_hash( hash, vdata );
       pdata[19] = n;
@@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate )
    init_myrgr_ctx();
    gate->scanhash = (void*)&scanhash_myriad;
    gate->hash = (void*)&myriad_hash;
-   gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
+   gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT;
 #endif
    return true;
 };
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(__VAES__) && defined(SIMD512)
 #define MYRGR_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
 #define MYRGR_4WAY 1
@@ -35,8 +35,6 @@
 
 #include "sph_groestl.h"
 
-#if !defined(__AES__)
 
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -3119,5 +3117,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 #ifdef __cplusplus
 }
 
-#endif // !AES
 #endif
@@ -42,7 +42,6 @@ extern "C"{
 #include <stddef.h>
 #include "compat/sph_types.h"
 
-#if !defined(__AES__)
 /**
  * Output size (in bits) for Groestl-224.
  */
@@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close(
 }
 #endif
 
-#endif // !AES
 #endif
@@ -382,7 +382,7 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
 #define S1F   MF
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // Hamsi 8 way AVX512
 
@@ -1122,7 +1122,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
 
 // Hamsi 4 way AVX2
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define INPUT_BIG \
 do { \
@@ -1501,7 +1501,7 @@ do { /* order is important */ \
    sc->h[14] = CE; \
    sc->h[15] = CF;
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define INPUT_8X32 \
 { \
@@ -38,7 +38,7 @@
 #include <stddef.h>
 #include "simd-utils.h"
 
-// SSE2 or NEON Hamsi-512 2x64
+#if defined(__SSE4_2__) || defined(__ARM_NEON)
 
 typedef struct
 {
@@ -57,6 +57,8 @@ void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data,
                         size_t len );
 void hamsi512_2x64( void *dst, const void *data, size_t len );
 
+#endif
 
 #if defined (__AVX2__)
 
 // Hamsi-512 4x64
@@ -102,7 +104,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
 
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // Hamsi-512 8x64
 
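
With the new guard, the Hamsi-512 2x64 code builds only on SSE4.2 or NEON. Its API follows the repo's N-way convention: two lanes are 64-bit interleaved and hashed together. A usage sketch based on the declarations above (assuming the length is in bytes and the input is already 2-way interleaved; both assumptions, not confirmed by this diff):

    /* hash two interleaved 80-byte messages in one call */
    void hamsi_2x64_example( const void *vdata_2x64, void *vhash_2x64 )
    {
       hamsi512_2x64( vhash_2x64, vdata_2x64, 80 );
    }
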
@@ -53,7 +53,7 @@ extern "C"{
 #define SPH_SMALL_FOOTPRINT_HAVAL   1
 //#endif
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 // ( ~( a ^ b ) ) & c
 #define v128_andnotxor( a, b, c ) \
@@ -583,7 +583,7 @@ do { \
 
 // Haval-256 8 way 32 bit avx2
 
-#if defined (__AVX512VL__)
+#if defined (VL256)
 
 // ( ~( a ^ b ) ) & c
 #define mm256_andnotxor( a, b, c ) \
@@ -882,7 +882,7 @@ do { \
 
 #endif // AVX2
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 // ( ~( a ^ b ) ) & c
 #define mm512_andnotxor( a, b, c ) \
@@ -107,7 +107,7 @@ void haval256_5_8way_close( void *cc, void *dst );
 
 #endif // AVX2
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct {
    __m512i buf[32];
@@ -204,7 +204,7 @@ static const uint64_t IV512[] =
    (state)->H[15] = h7l; \
 } while (0)
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 #define Sb_8W(x0, x1, x2, x3, c) \
 { \
@@ -364,8 +364,7 @@ static const uint64_t IV512[] =
 
 #if defined(__AVX2__)
 
-#if defined(__AVX512VL__)
-//TODO enable for AVX10_256, not used with AVX512VL
+#if defined(VL256)
 
 #define notxorandnot( a, b, c ) \
    _mm256_ternarylogic_epi64( a, b, c, 0x2d )
@@ -522,7 +521,7 @@ static const uint64_t IV512[] =
 
 #endif // AVX2
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 void jh256_8x64_init( jh_8x64_context *sc )
 {
@@ -55,7 +55,7 @@
  * <code>memcpy()</code>).
  */
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct
 {
@@ -78,7 +78,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    __m256i *noncev = (__m256i*)vdata + 9;   // aligned
    const uint32_t Htarg = ptarget[7];
    const int thr_id = mythr->id;
    const bool bench = opt_benchmark;
@@ -108,4 +108,53 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
    return 0;
 }
 
+#elif defined(KECCAK_2WAY)
+
+void keccakhash_2x64(void *state, const void *input)
+{
+    keccak256_2x64_context ctx;
+    keccak256_2x64_init( &ctx );
+    keccak256_2x64_update( &ctx, input, 80 );
+    keccak256_2x64_close( &ctx, state );
+}
+
+int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+    uint32_t vdata[24*2] __attribute__ ((aligned (64)));
+    uint32_t hash[16*2] __attribute__ ((aligned (32)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+    uint32_t *hash7 = &(hash[13]);   // 3*4+1
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    uint32_t n = pdata[19];
+    const uint32_t first_nonce = pdata[19];
+    v128_t *noncev = (v128_t*)vdata + 9;
+    const uint32_t Htarg = ptarget[7];
+    const int thr_id = mythr->id;
+    const bool bench = opt_benchmark;
+
+    v128_bswap32_intrlv80_2x64( vdata, pdata );
+    *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
+    do {
+       keccakhash_2x64( hash, vdata );
+
+       for ( int lane = 0; lane < 2; lane++ )
+       if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
+       {
+          extr_lane_2x64( lane_hash, hash, lane, 256 );
+          if ( valid_hash( lane_hash, ptarget ))
+          {
+             pdata[19] = bswap_32( n + lane );
+             submit_solution( work, lane_hash, mythr );
+          }
+       }
+       *noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
+       n += 2;
+    } while ( (n < max_nonce-2) && !work_restart[thr_id].restart);
+    pdata[19] = n;
+    *hashes_done = n - first_nonce + 1;
+    return 0;
+}
+
 #endif
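
A note on the nonce handling in scanhash_keccak_2x64 above: vdata holds two 80-byte headers interleaved 64 bits at a time, so vector 9 covers data words 18-19 of each lane and the nonce (word 19) is the high dword of each 64-bit half.

    /* after v128_set32( n+1, 0, n, 0 ) and the interleave blend:
         lane 0 half = { word18, n   }
         lane 1 half = { word18, n+1 }
       v128_64( 0x0000000200000000 ) adds 2 to the high dword of both
       halves, stepping both lanes' nonces each iteration (n += 2). */
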
@@ -17,6 +17,9 @@ bool register_keccak_algo( algo_gate_t* gate )
|
|||||||
#elif defined (KECCAK_4WAY)
|
#elif defined (KECCAK_4WAY)
|
||||||
gate->scanhash = (void*)&scanhash_keccak_4way;
|
gate->scanhash = (void*)&scanhash_keccak_4way;
|
||||||
gate->hash = (void*)&keccakhash_4way;
|
gate->hash = (void*)&keccakhash_4way;
|
||||||
|
#elif defined (KECCAK_2WAY)
|
||||||
|
gate->scanhash = (void*)&scanhash_keccak_2x64;
|
||||||
|
gate->hash = (void*)&keccakhash_2x64;
|
||||||
#else
|
#else
|
||||||
gate->scanhash = (void*)&scanhash_keccak;
|
gate->scanhash = (void*)&scanhash_keccak;
|
||||||
gate->hash = (void*)&keccakhash;
|
gate->hash = (void*)&keccakhash;
|
||||||
@@ -37,6 +40,9 @@ bool register_keccakc_algo( algo_gate_t* gate )
|
|||||||
#elif defined (KECCAK_4WAY)
|
#elif defined (KECCAK_4WAY)
|
||||||
gate->scanhash = (void*)&scanhash_keccak_4way;
|
gate->scanhash = (void*)&scanhash_keccak_4way;
|
||||||
gate->hash = (void*)&keccakhash_4way;
|
gate->hash = (void*)&keccakhash_4way;
|
||||||
|
#elif defined (KECCAK_2WAY)
|
||||||
|
gate->scanhash = (void*)&scanhash_keccak_2x64;
|
||||||
|
gate->hash = (void*)&keccakhash_2x64;
|
||||||
#else
|
#else
|
||||||
gate->scanhash = (void*)&scanhash_keccak;
|
gate->scanhash = (void*)&scanhash_keccak;
|
||||||
gate->hash = (void*)&keccakhash;
|
gate->hash = (void*)&keccakhash;
|
||||||
@@ -75,15 +81,17 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
|
|||||||
bool register_sha3d_algo( algo_gate_t* gate )
|
bool register_sha3d_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
hard_coded_eb = 6;
|
hard_coded_eb = 6;
|
||||||
// opt_extranonce = false;
|
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
|
||||||
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
|
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
|
||||||
#if defined (KECCAK_8WAY)
|
#if defined (SHA3D_8WAY)
|
||||||
gate->scanhash = (void*)&scanhash_sha3d_8way;
|
gate->scanhash = (void*)&scanhash_sha3d_8way;
|
||||||
gate->hash = (void*)&sha3d_hash_8way;
|
gate->hash = (void*)&sha3d_hash_8way;
|
||||||
#elif defined (KECCAK_4WAY)
|
#elif defined (SHA3D_4WAY)
|
||||||
gate->scanhash = (void*)&scanhash_sha3d_4way;
|
gate->scanhash = (void*)&scanhash_sha3d_4way;
|
||||||
gate->hash = (void*)&sha3d_hash_4way;
|
gate->hash = (void*)&sha3d_hash_4way;
|
||||||
|
#elif defined (SHA3D_2WAY)
|
||||||
|
gate->scanhash = (void*)&scanhash_sha3d_2x64;
|
||||||
|
gate->hash = (void*)&sha3d_hash_2x64;
|
||||||
#else
|
#else
|
||||||
gate->scanhash = (void*)&scanhash_sha3d;
|
gate->scanhash = (void*)&scanhash_sha3d;
|
||||||
gate->hash = (void*)&sha3d_hash;
|
gate->hash = (void*)&sha3d_hash;
|
||||||
|
@@ -4,10 +4,20 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
#define KECCAK_8WAY 1
|
#define KECCAK_8WAY 1
|
||||||
#elif defined(__AVX2__)
|
#elif defined(__AVX2__)
|
||||||
#define KECCAK_4WAY 1
|
#define KECCAK_4WAY 1
|
||||||
|
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||||
|
#define KECCAK_2WAY 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(SIMD512)
|
||||||
|
#define SHA3D_8WAY 1
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
#define SHA3D_4WAY 1
|
||||||
|
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||||
|
#define SHA3D_2WAY 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
extern int hard_coded_eb;
|
extern int hard_coded_eb;
|
||||||
@@ -16,27 +26,47 @@ extern int hard_coded_eb;
|
|||||||
|
|
||||||
void keccakhash_8way( void *state, const void *input );
|
void keccakhash_8way( void *state, const void *input );
|
||||||
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
|
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void sha3d_hash_8way( void *state, const void *input );
|
|
||||||
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
|
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
|
||||||
|
|
||||||
#elif defined(KECCAK_4WAY)
|
#elif defined(KECCAK_4WAY)
|
||||||
|
|
||||||
void keccakhash_4way( void *state, const void *input );
|
void keccakhash_4way( void *state, const void *input );
|
||||||
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void sha3d_hash_4way( void *state, const void *input );
|
#elif defined(KECCAK_2WAY)
|
||||||
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
|
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
void keccakhash_2x64( void *state, const void *input );
|
||||||
|
int scanhash_keccak_2x64( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void keccakhash( void *state, const void *input );
|
void keccakhash( void *state, const void *input );
|
||||||
int scanhash_keccak( struct work *work, uint32_t max_nonce,
|
int scanhash_keccak( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(SHA3D_8WAY)
|
||||||
|
|
||||||
|
void sha3d_hash_8way( void *state, const void *input );
|
||||||
|
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
|
#elif defined(SHA3D_4WAY)
|
||||||
|
|
||||||
|
void sha3d_hash_4way( void *state, const void *input );
|
||||||
|
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
|
#elif defined(SHA3D_2WAY)
|
||||||
|
|
||||||
|
void sha3d_hash_2x64( void *state, const void *input );
|
||||||
|
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
void sha3d_hash( void *state, const void *input );
|
void sha3d_hash( void *state, const void *input );
|
||||||
int scanhash_sha3d( struct work *work, uint32_t max_nonce,
|
int scanhash_sha3d( struct work *work, uint32_t max_nonce,
|
||||||
|
@@ -57,7 +57,7 @@ static const uint64_t RC[] = {
|
|||||||
|
|
||||||
#define DO(x) x
|
#define DO(x) x
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
#define INPUT_BUF(size) do { \
|
#define INPUT_BUF(size) do { \
|
||||||
size_t j; \
|
size_t j; \
|
||||||
|
@@ -4,7 +4,7 @@
|
|||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include "simd-utils.h"
|
#include "simd-utils.h"
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(SIMD512)
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
|
@@ -4,7 +4,7 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "keccak-hash-4way.h"
|
#include "keccak-hash-4way.h"
|
||||||
|
|
||||||
#if defined(KECCAK_8WAY)
|
#if defined(SHA3D_8WAY)
|
||||||
|
|
||||||
void sha3d_hash_8way(void *state, const void *input)
|
void sha3d_hash_8way(void *state, const void *input)
|
||||||
{
|
{
|
||||||
@@ -64,7 +64,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#elif defined(KECCAK_4WAY)
|
#elif defined(SHA3D_4WAY)
|
||||||
|
|
||||||
void sha3d_hash_4way(void *state, const void *input)
|
void sha3d_hash_4way(void *state, const void *input)
|
||||||
{
|
{
|
||||||
@@ -122,4 +122,60 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#elif defined(SHA3D_2WAY)
|
||||||
|
|
||||||
|
void sha3d_hash_2x64(void *state, const void *input)
|
||||||
|
{
|
||||||
|
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
|
||||||
|
keccak256_2x64_context ctx;
|
||||||
|
|
||||||
|
keccak256_2x64_init( &ctx );
|
||||||
|
keccak256_2x64_update( &ctx, input, 80 );
|
||||||
|
keccak256_2x64_close( &ctx, buffer );
|
||||||
|
|
||||||
|
keccak256_2x64_init( &ctx );
|
||||||
|
keccak256_2x64_update( &ctx, buffer, 32 );
|
||||||
|
keccak256_2x64_close( &ctx, state );
|
||||||
|
}
|
||||||
|
|
||||||
|
int scanhash_sha3d_2x64( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
uint32_t vdata[24*2] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash[16*2] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t *hash7 = &(hash[13]); // 3*4+1
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
uint32_t n = pdata[19];
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const uint32_t last_nonce = max_nonce - 2;
|
||||||
|
v128_t *noncev = (v128_t*)vdata + 9;
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
const int thr_id = mythr->id;
|
||||||
|
const bool bench = opt_benchmark;
|
||||||
|
|
||||||
|
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||||
|
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
|
||||||
|
do {
|
||||||
|
sha3d_hash_2x64( hash, vdata );
|
||||||
|
|
||||||
|
for ( int lane = 0; lane < 2; lane++ )
|
||||||
|
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
|
||||||
|
{
|
||||||
|
extr_lane_2x64( lane_hash, hash, lane, 256 );
|
||||||
|
if ( valid_hash( lane_hash, ptarget ) )
|
||||||
|
{
|
||||||
|
pdata[19] = bswap_32( n + lane );
|
||||||
|
submit_solution( work, lane_hash, mythr );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
|
||||||
|
n += 2;
|
||||||
|
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
||||||
|
pdata[19] = n;
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@@ -59,7 +59,7 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
 };
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 #define cns4w(i) mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] )
 
@@ -524,8 +524,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
    a = _mm256_xor_si256( a, c0 ); \
    b = _mm256_xor_si256( b, c1 );
 
-//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define MULT2( a0, a1 ) \
 { \
@@ -51,7 +51,7 @@
 #define LIMIT_512 128
 /*********************************/
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 typedef struct {
    uint32_t buffer[8*4];
@@ -28,8 +28,7 @@
    a = v128_xor( a, c0 ); \
    b = v128_xor( b, c1 ); \
 
-#if defined(__AVX512VL__)
-//TODO enable for AVX10_512 AVX10_256
+#if defined(VL256)
 
 #define MULT2( a0, a1 ) \
 { \
@@ -48,43 +47,36 @@
   a1 = _mm_alignr_epi8( b, a1, 4 ); \
 }
 
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) || defined(__SSE2__)
 
 // { a1_0, 0, a1_0, a1_0 }
 #define MULT2( a0, a1 ) \
 { \
-  v128_t b = v128_xor( a0, v128_and( vdupq_laneq_u32( a1, 0 ), MASK ) ); \
+  v128_t b = v128_xor( a0, v128_and( v128_bcast32( a1 ), MASK ) ); \
   a0 = v128_alignr32( a1, b, 1 ); \
   a1 = v128_alignr32( b, a1, 1 ); \
 }
 
-#else   // assume SSE2
+#else
 
-#define MULT2( a0, a1 ) \
-{ \
-  v128_t b = v128_xor( a0, v128_and( _mm_shuffle_epi32( a1, 0 ), MASK ) ); \
-  a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
-  a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
-}
+#warning __FILE__ ":" __LINE__ " Unknown or unsupported CPU architecture."
 
 #endif
 
-#if defined(__AVX512VL__)
-//TODO enable for AVX10_512 AVX10_256
+#if defined(VL256)
 
 #define SUBCRUMB( a0, a1, a2, a3 ) \
 { \
    v128_t t = a0; \
-   a0 = mm128_xoror( a3, a0, a1 ); \
+   a0 = v128_xoror( a3, a0, a1 ); \
    a2 = v128_xor( a2, a3 ); \
    a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
-   a3 = mm128_xorand( a2, a3, t ); \
-   a2 = mm128_xorand( a1, a2, a0 ); \
+   a3 = v128_xorand( a2, a3, t ); \
+   a2 = v128_xorand( a1, a2, a0 ); \
    a1 = v128_or( a1, a3 ); \
    a3 = v128_xor( a3, a2 ); \
    t  = v128_xor( t, a1 ); \
    a2 = v128_and( a2, a1 ); \
-   a1 = mm128_xnor( a1, a0 ); \
+   a1 = v128_xnor( a1, a0 ); \
   a0 = t; \
 }
 
@@ -68,4 +68,4 @@ int update_and_final_luffa( hashState_luffa *state, void* output,
 
 int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
                 const void* data, size_t inlen );
-#endif // LUFFA_FOR_SSE2_H___
+#endif // LUFFA_FOR_SSE2_H__
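
The 0x87 immediate passed to _mm_ternarylogic_epi64 in SUBCRUMB above is simply the 8-bit truth table of the Boolean function it implements, a1 xnor (a3 & t), indexed by the three input bits. A small self-contained sketch (helper and main are illustrative, not part of the tree) that derives such immediates:

    #include <stdint.h>
    #include <stdio.h>

    int main()
    {
       // Bit i of the ternarylogic immediate is f(a,b,c) evaluated at
       // the bits of i = (a<<2)|(b<<1)|c. Here f = a XNOR (b AND c).
       uint8_t imm = 0;
       for ( int i = 0; i < 8; i++ )
       {
          int a = (i>>2)&1, b = (i>>1)&1, c = i&1;
          if ( !( a ^ ( b & c ) ) ) imm |= 1 << i;
       }
       printf( "0x%02x\n", imm );   // prints 0x87
       return 0;
    }
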
@@ -15,7 +15,7 @@
 #include "algo/groestl/sph_groestl.h"
 #endif
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define ALLIUM_16WAY 1
 #elif defined(__AVX2__)
 #define ALLIUM_8WAY 1
@@ -465,12 +465,8 @@ typedef union
 {
    keccak256_2x64_context  keccak;
   cubehashParam           cube;
-//#if defined(__x86_64__)
    skein256_2x64_context   skein;
-//#else
-//   sph_skein512_context   skein;
-//#endif
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    hashState_groestl256    groestl;
 #else
    sph_groestl256_context  groestl;
@@ -516,7 +512,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
    LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
    LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
 
-//#if defined(__x86_64__)
    intrlv_2x64( vhashA, hash0, hash1, 256 );
    skein256_2x64_init( &ctx.skein );
    skein256_2x64_update( &ctx.skein, vhashA, 32 );
@@ -527,23 +522,8 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
    skein256_2x64_update( &ctx.skein, vhashA, 32 );
    skein256_2x64_close( &ctx.skein, vhashA );
    dintrlv_2x64( hash2, hash3, vhashA, 256 );
-/*
-#else
-   sph_skein256_init( &ctx.skein );
-   sph_skein256( &ctx.skein, hash0, 32 );
-   sph_skein256_close( &ctx.skein, hash0 );
-   sph_skein256_init( &ctx.skein );
-   sph_skein256( &ctx.skein, hash1, 32 );
-   sph_skein256_close( &ctx.skein, hash1 );
-   sph_skein256_init( &ctx.skein );
-   sph_skein256( &ctx.skein, hash2, 32 );
-   sph_skein256_close( &ctx.skein, hash2 );
-   sph_skein256_init( &ctx.skein );
-   sph_skein256( &ctx.skein, hash3, 32 );
-   sph_skein256_close( &ctx.skein, hash3 );
-#endif
-*/
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    groestl256_full( &ctx.groestl, hash0, hash0, 256 );
    groestl256_full( &ctx.groestl, hash1, hash1, 256 );
    groestl256_full( &ctx.groestl, hash2, hash2, 256 );
@@ -5,7 +5,7 @@
 #include <stdint.h>
 #include "lyra2.h"
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define LYRA2REV3_16WAY 1
 #elif defined(__AVX2__)
 #define LYRA2REV3_8WAY 1
@@ -49,7 +49,7 @@ bool init_lyra2rev3_ctx();
 
 //////////////////////////////////
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define LYRA2REV2_16WAY 1
 #elif defined(__AVX2__)
 #define LYRA2REV2_8WAY 1
@@ -108,7 +108,7 @@ bool lyra2h_thread_init();
 
 /////////////////////////////////////////
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define PHI2_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
 #define PHI2_4WAY 1
@@ -41,7 +41,7 @@
 // lyra2z330, lyra2h,
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 /**
  * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
@@ -59,7 +59,7 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
 
 int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
                   uint64_t timeCost, uint64_t nRows, uint64_t nCols );
@@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
    lyra2h_4way_midstate( vdata );
 
    do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
       lyra2h_4way_hash( hash, vdata );
 
       for ( int i = 0; i < 4; i++ )
@@ -456,7 +456,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
 
    do
    {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
 
       lyra2rev2_4way_hash( hash, vdata );
 
@@ -3,7 +3,7 @@
 #include "lyra2.h"
 #include "algo/blake/blake256-hash.h"
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define LYRA2Z_16WAY 1
 #elif defined(__AVX2__)
 #define LYRA2Z_8WAY 1
@@ -4,7 +4,7 @@
 #include "algo/gost/sph_gost.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "lyra2.h"
-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(__VAES__) && defined(SIMD512)
 #include "algo/echo/echo-hash-4way.h"
 #elif defined(__AES__)
 #include "algo/echo/aes_ni/hash_api.h"
@@ -27,7 +27,7 @@
 #include "lyra2.h"
 #include "simd-utils.h"
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
 {
@@ -43,7 +43,7 @@ static const uint64_t blake2b_IV[8] =
    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
 };
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 #define G2W_4X64(a,b,c,d) \
    a = _mm512_add_epi64( a, b ); \
@@ -150,13 +150,13 @@ static const uint64_t blake2b_IV[8] =
 // returns void, all args updated
 #define G_2X64(a,b,c,d) \
    a = v128_add64( a, b ); \
-   d = v128_ror64( v128_xor( d, a), 32 ); \
+   d = v128_ror64xor( d, a, 32 ); \
   c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 24 ); \
+   b = v128_ror64xor( b, c, 24 ); \
   a = v128_add64( a, b ); \
-   d = v128_ror64( v128_xor( d, a ), 16 ); \
+   d = v128_ror64xor( d, a, 16 ); \
   c = v128_add64( c, d ); \
-   b = v128_ror64( v128_xor( b, c ), 63 );
+   b = v128_ror64xor( b, c, 63 );
 
 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
 { \
@@ -195,10 +195,6 @@ static const uint64_t blake2b_IV[8] =
 
 #endif // AVX2 else SSE2
 
-static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
-   return ( w >> c ) | ( w << ( 64 - c ) );
-}
-
 #define G( r, i, a, b, c, d ) \
 { \
    a = a + b; \
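
The v128_ror64xor( d, a, c ) helper introduced in G_2X64 folds the XOR and rotate of the Blake2b G function into one call; the change is purely a re-expression, since the helper is defined to equal the old two-step form (on targets with a fused xor-rotate instruction, such as the ARM SHA3 extension's XAR, it can compile to a single op). A scalar sketch of the contract it must satisfy (names here are illustrative, not the tree's):

    #include <stdint.h>

    static inline uint64_t ror64( uint64_t w, unsigned c )
    {  return ( w >> c ) | ( w << ( 64 - c ) );  }

    // Contract of the fused helper: ror64xor( d, a, c ) == ror64( d ^ a, c ),
    // so G_2X64 above is arithmetically unchanged, only re-expressed.
    static inline uint64_t ror64xor( uint64_t d, uint64_t a, unsigned c )
    {  return ror64( d ^ a, c );  }
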
@@ -222,7 +218,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    G( r, 7, v[ 3], v[ 4], v[ 9], v[14] );
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 
 union _ovly_512
 {
@@ -21,7 +21,7 @@
 #define EPS1 DBL_EPSILON
 #define EPS2 3.0e-11
 
-inline double exp_n( double xt )
+static inline double exp_n( double xt )
 {
    if ( xt < -700.0 )
       return 0;
@@ -33,7 +33,7 @@ inline double exp_n( double xt )
       return exp( xt );
 }
 
-inline double exp_n2( double x1, double x2 )
+static inline double exp_n2( double x1, double x2 )
 {
    double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
          p5 = 37., p6 = 700.;
@@ -306,7 +306,7 @@ bool register_m7m_algo( algo_gate_t *gate )
     applog( LOG_ERR, "M7M algo is not supported on MacOS");
     return false;
 #else
-     gate->optimizations = SHA_OPT;
+     gate->optimizations = SHA256_OPT;
     init_m7m_ctx();
     gate->scanhash              = (void*)&scanhash_m7m_hash;
     gate->build_stratum_request = (void*)&std_be_build_stratum_request;
@@ -1,75 +0,0 @@
-// Copyright (c) 2014 The Magi developers
-// Distributed under the MIT/X11 software license, see the accompanying
-// file COPYING or http://www.opensource.org/licenses/mit-license.php.
-
-#include <iostream>
-#include <cfloat>
-#include <limits>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-
-#include "magimath.h"
-
-#define EPS1 (std::numeric_limits<double>::epsilon())
-#define EPS2 3.0e-11
-
-static void gauleg(double x1, double x2, double x[], double w[], const int n)
-{
-    int m,j,i;
-    double z1, z, xm, xl, pp, p3, p2, p1;
-    m=(n+1)/2;
-    xm=0.5*(x2+x1);
-    xl=0.5*(x2-x1);
-    for (i=1;i<=m;i++) {
-        z=cos(3.141592654*(i-0.25)/(n+0.5));
-        do {
-            p1=1.0;
-            p2=0.0;
-            for (j=1;j<=n;j++) {
-                p3=p2;
-                p2=p1;
-                p1=((2.0*j-1.0)*z*p2-(j-1.0)*p3)/j;
-            }
-            pp=n*(z*p1-p2)/(z*z-1.0);
-            z1=z;
-            z=z1-p1/pp;
-        } while (fabs(z-z1) > EPS2);
-        x[i]=xm-xl*z;
-        x[n+1-i]=xm+xl*z;
-        w[i]=2.0*xl/((1.0-z*z)*pp*pp);
-        w[n+1-i]=w[i];
-    }
-}
-
-static double GaussianQuad_N(double func(const double), const double a2, const double b2, const int NptGQ)
-{
-    double s=0.0;
-#ifdef _MSC_VER
-#define SW_DIVS 23
-    double x[SW_DIVS+1], w[SW_DIVS+1];
-#else
-    double x[NptGQ+1], w[NptGQ+1];
-#endif
-
-    gauleg(a2, b2, x, w, NptGQ);
-
-    for (int j=1; j<=NptGQ; j++) {
-        s += w[j]*func(x[j]);
-    }
-
-    return s;
-}
-
-static double swit_(double wvnmb)
-{
-    return pow( (5.55243*(exp_n(-0.3*wvnmb/15.762) - exp_n(-0.6*wvnmb/15.762)))*wvnmb, 0.5)
-           / 1034.66 * pow(sin(wvnmb/65.), 2.);
-}
-
-uint32_t sw_(int nnounce, int divs)
-{
-    double wmax = ((sqrt((double)(nnounce))*(1.+EPS1))/450+100);
-    return ((uint32_t)(GaussianQuad_N(swit_, 0., wmax, divs)*(1.+EPS1)*1.e6));
-}
@@ -1,54 +0,0 @@
-// Copyright (c) 2014 The Magi developers
-// Distributed under the MIT/X11 software license, see the accompanying
-// file COPYING or http://www.opensource.org/licenses/mit-license.php.
-#ifndef MAGI_MATH_H
-#define MAGI_MATH_H
-
-#include <math.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-uint32_t sw_(int nnounce, int divs);
-
-#ifdef __cplusplus
-}
-#endif
-
-
-inline double exp_n(double xt)
-{
-    double p1 = -700.0, p3 = -0.8e-8, p4 = 0.8e-8, p6 = 700.0;
-    if(xt < p1)
-        return 0;
-    else if(xt > p6)
-        return 1e200;
-    else if(xt > p3 && xt < p4)
-        return (1.0 + xt);
-    else
-        return exp(xt);
-}
-
-// 1 / (1 + exp(x1-x2))
-inline double exp_n2(double x1, double x2)
-{
-    double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.;
-    double xt = x1 - x2;
-    if (xt < p1+1.e-200)
-        return 1.;
-    else if (xt > p1 && xt < p2 + 1.e-200)
-        return ( 1. - exp(xt) );
-    else if (xt > p2 && xt < p3 + 1.e-200)
-        return ( 1. / (1. + exp(xt)) );
-    else if (xt > p3 && xt < p4)
-        return ( 1. / (2. + xt) );
-    else if (xt > p4 - 1.e-200 && xt < p5)
-        return ( exp(-xt) / (1. + exp(-xt)) );
-    else if (xt > p5 - 1.e-200 && xt < p6)
-        return ( exp(-xt) );
-    else //if (xt > p6 - 1.e-200)
-        return 0.;
-}
-
-#endif
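
The deleted magimath helpers survive in m7m.c itself, now declared static inline so each translation unit gets its own copy. The +/-700 cutoffs they use are overflow guards: exp() on an IEEE-754 double overflows just past x = 709.78, so clamping at 700 keeps every branch finite. A quick check of that bound (a standalone sketch, not part of the tree):

    #include <float.h>
    #include <math.h>
    #include <stdio.h>

    int main()
    {
       // Why exp_n()/exp_n2() clamp at +/-700: exp() overflows a double
       // just past x = ln(DBL_MAX) ~ 709.78.
       printf( "exp(700) = %g\n", exp( 700.0 ) );   // ~1.0e304, still finite
       printf( "DBL_MAX  = %g\n", DBL_MAX );        // ~1.8e308
       printf( "exp(710) = %g\n", exp( 710.0 ) );   // inf (overflow)
       return 0;
    }
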
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define NIST5_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
 #define NIST5_4WAY 1
@@ -71,8 +71,7 @@ do { \
 } while (0)
 
 #define GAMMA_4W(n0, n1, n2, n4)   \
-   (g ## n0 = v128_xor( a ## n0, \
-                        v128_or( a ## n1, v128_not( a ## n2 ) ) ) )
+   (g ## n0 = v128_xor( a ## n0, v128_ornot( a ## n2, a ## n1 ) ) )
 
 #define PI_ALL_4W do { \
    a0 = g0; \
@@ -312,7 +311,7 @@ do { \
    BUPDATE1_8W( 7, 1 ); \
 } while (0)
 
-#if defined(__AVX512VL__)
+#if defined(VL256)
 
 #define GAMMA_8W(n0, n1, n2, n4)   \
    ( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )
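
The 0x4b immediate in GAMMA_8W is derived the same way as Luffa's 0x87 above: it is the truth table of g = a0 ^ (a1 | ~a2), with the operands passed in the order (a0, a2, a1), matching the v128_ornot() form of GAMMA_4W. A sketch of the check (standalone, illustrative only):

    #include <stdint.h>
    #include <stdio.h>

    int main()
    {
       // GAMMA as a three-input function of (a,b,c) = (a0, a2, a1):
       // g = a ^ (c | ~b). Bit i of the immediate is f at the bits of
       // i = (a<<2)|(b<<1)|c.
       uint8_t imm = 0;
       for ( int i = 0; i < 8; i++ )
       {
          int a = (i>>2)&1, b = (i>>1)&1, c = i&1;
          if ( a ^ ( c | !b ) ) imm |= 1 << i;
       }
       printf( "0x%02x\n", imm );   // prints 0x4b
       return 0;
    }
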
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define ANIME_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
 #define ANIME_4WAY 1
@@ -9,11 +9,11 @@ bool register_hmq1725_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_hmq1725_4way;
   gate->hash = (void*)&hmq1725_4way_hash;
 #else
-  init_hmq1725_ctx();
   gate->scanhash = (void*)&scanhash_hmq1725;
   gate->hash = (void*)&hmq1725hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
+                      | NEON_OPT;
  opt_target_factor = 65536.0;
  return true;
 };
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define HMQ1725_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
 #define HMQ1725_4WAY 1
@@ -29,7 +29,6 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
 void hmq1725hash( void *state, const void *input );
 int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
-void init_hmq1725_ctx();
 
 #endif
@@ -4,367 +4,273 @@
 
 #include <string.h>
 #include <stdint.h>
-#include "algo/blake/sph_blake.h"
+#include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
-#include "algo/groestl/sph_groestl.h"
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+  #include "algo/fugue/fugue-aesni.h"
+#else
+  #include "algo/fugue/sph_fugue.h"
+#endif
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+  #include "algo/groestl/aes_ni/hash-groestl.h"
+  #include "algo/echo/aes_ni/hash_api.h"
+#else
+  #include "algo/groestl/sph_groestl.h"
+  #include "algo/echo/sph_echo.h"
+#endif
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
-#include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/sph_shabal.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/haval/sph-haval.h"
 #include "algo/sha/sph_sha2.h"
-#if defined(__AES__)
-  #include "algo/groestl/aes_ni/hash-groestl.h"
-  #include "algo/echo/aes_ni/hash_api.h"
-  #include "algo/fugue/fugue-aesni.h"
-#else
-  #include "algo/groestl/sph_groestl.h"
-  #include "algo/echo/sph_echo.h"
-  #include "algo/fugue/sph_fugue.h"
-#endif
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
-  #include "algo/simd/sph_simd.h"
-#else
-  #include "algo/simd/nist.h"
-#endif
+#include "algo/simd/simd-hash-2way.h"
 
-typedef struct {
-   sph_blake512_context    blake1, blake2;
-   sph_bmw512_context      bmw1, bmw2, bmw3;
-   sph_skein512_context    skein1, skein2;
-   sph_jh512_context       jh1, jh2;
-   sph_keccak512_context   keccak1, keccak2;
-   hashState_luffa         luffa1, luffa2;
-   cubehashParam           cube;
-   sph_shavite512_context  shavite1, shavite2;
-#if defined(__aarch64__)
-   sph_simd512_context     simd1, simd2;
-#else
-   hashState_sd            simd1, simd2;
-#endif
-   sph_hamsi512_context    hamsi1;
-   sph_shabal512_context   shabal1;
-   sph_whirlpool_context   whirlpool1, whirlpool2, whirlpool3, whirlpool4;
-   sph_sha512_context      sha1, sha2;
-   sph_haval256_5_context  haval1, haval2;
-#if defined(__AES__)
-   hashState_echo          echo1, echo2;
-   hashState_groestl       groestl1, groestl2;
-   hashState_fugue         fugue1, fugue2;
-#else
-   sph_groestl512_context  groestl1, groestl2;
-   sph_echo512_context     echo1, echo2;
-   sph_fugue512_context    fugue1, fugue2;
-#endif
-} hmq1725_ctx_holder;
-
-static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
-static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
-
-void init_hmq1725_ctx()
+union _hmq1725_ctx_holder
 {
-   sph_blake512_init(&hmq1725_ctx.blake1);
-   sph_blake512_init(&hmq1725_ctx.blake2);
-
-   sph_bmw512_init(&hmq1725_ctx.bmw1);
-   sph_bmw512_init(&hmq1725_ctx.bmw2);
-   sph_bmw512_init(&hmq1725_ctx.bmw3);
-
-   sph_skein512_init(&hmq1725_ctx.skein1);
-   sph_skein512_init(&hmq1725_ctx.skein2);
-
-   sph_jh512_init(&hmq1725_ctx.jh1);
-   sph_jh512_init(&hmq1725_ctx.jh2);
-
-   sph_keccak512_init(&hmq1725_ctx.keccak1);
-   sph_keccak512_init(&hmq1725_ctx.keccak2);
-
-   init_luffa( &hmq1725_ctx.luffa1, 512 );
-   init_luffa( &hmq1725_ctx.luffa2, 512 );
-
-   cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 );
-
-   sph_shavite512_init(&hmq1725_ctx.shavite1);
-   sph_shavite512_init(&hmq1725_ctx.shavite2);
-
-#if defined(__aarch64__)
-   sph_simd512_init(&hmq1725_ctx.simd1);
-   sph_simd512_init(&hmq1725_ctx.simd2);
-#else
-   init_sd( &hmq1725_ctx.simd1, 512 );
-   init_sd( &hmq1725_ctx.simd2, 512 );
-#endif
-
-   sph_hamsi512_init(&hmq1725_ctx.hamsi1);
-
-#if defined(__AES__)
-   fugue512_Init( &hmq1725_ctx.fugue1, 512 );
-   fugue512_Init( &hmq1725_ctx.fugue2, 512 );
+   blake512_context        blake;
+   sph_bmw512_context      bmw;
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+   hashState_fugue         fugue;
 #else
-   sph_fugue512_init(&hmq1725_ctx.fugue1);
-   sph_fugue512_init(&hmq1725_ctx.fugue2);
+   sph_fugue512_context    fugue;
 #endif
-
-   sph_shabal512_init(&hmq1725_ctx.shabal1);
-
-   sph_whirlpool_init(&hmq1725_ctx.whirlpool1);
-   sph_whirlpool_init(&hmq1725_ctx.whirlpool2);
-   sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
-   sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
-
-   sph_sha512_init( &hmq1725_ctx.sha1 );
-   sph_sha512_init( &hmq1725_ctx.sha2 );
-
-   sph_haval256_5_init(&hmq1725_ctx.haval1);
-   sph_haval256_5_init(&hmq1725_ctx.haval2);
-
-#if defined(__AES__)
-   init_echo( &hmq1725_ctx.echo1, 512 );
-   init_echo( &hmq1725_ctx.echo2, 512 );
-   init_groestl( &hmq1725_ctx.groestl1, 64 );
-   init_groestl( &hmq1725_ctx.groestl2, 64 );
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+   hashState_groestl       groestl;
+   hashState_echo          echo;
 #else
-   sph_groestl512_init( &hmq1725_ctx.groestl1 );
-   sph_groestl512_init( &hmq1725_ctx.groestl2 );
-   sph_echo512_init( &hmq1725_ctx.echo1 );
-   sph_echo512_init( &hmq1725_ctx.echo2 );
+   sph_groestl512_context  groestl;
+   sph_echo512_context     echo;
 #endif
-}
-
-void hmq_bmw512_midstate( const void* input )
-{
-   memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, sizeof hmq_bmw_mid );
-   sph_bmw512( &hmq_bmw_mid, input, 64 );
-}
-
-__thread hmq1725_ctx_holder h_ctx __attribute__ ((aligned (64)));
+   sph_skein512_context    skein;
+   sph_jh512_context       jh;
+   sph_keccak512_context   keccak;
+   hashState_luffa         luffa;
+   cubehashParam           cube;
+   sph_shavite512_context  shavite;
+   simd512_context         simd;
+   sph_hamsi512_context    hamsi;
+   sph_shabal512_context   shabal;
+   sph_whirlpool_context   whirlpool;
+   sph_sha512_context      sha;
+   sph_haval256_5_context  haval;
+};
+typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;
 
 extern void hmq1725hash(void *state, const void *input)
 {
    const uint32_t mask = 24;
-   uint32_t hashA[32] __attribute__((aligned(64)));
-   uint32_t hashB[32] __attribute__((aligned(64)));
-   const int midlen = 64;            // bytes
-   const int tail   = 80 - midlen;   // 16
+   uint32_t hashA[32] __attribute__((aligned(32)));
+   uint32_t hashB[32] __attribute__((aligned(32)));
+   hmq1725_ctx_holder ctx __attribute__ ((aligned (64)));
 
-   memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx));
+   sph_bmw512_init( &ctx.bmw );
+   sph_bmw512( &ctx.bmw, input, 80 );
+   sph_bmw512_close( &ctx.bmw, hashA );   //1
 
-   memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid );
-   sph_bmw512( &h_ctx.bmw1, input + midlen, tail );
-   sph_bmw512_close(&h_ctx.bmw1, hashA);   //1
-
-   sph_whirlpool (&h_ctx.whirlpool1, hashA, 64);    //0
-   sph_whirlpool_close(&h_ctx.whirlpool1, hashB);   //1
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, hashA, 64 );    //0
+   sph_whirlpool_close( &ctx.whirlpool, hashB );  //1
 
    if ( hashB[0] & mask )   //1
    {
-#if defined(__AES__)
-      update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
-                                (const char*)hashB, 512 );
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+      groestl512_full( &ctx.groestl, hashA, hashB, 512 );
 #else
-      sph_groestl512 (&h_ctx.groestl1, hashB, 64);    //1
-      sph_groestl512_close(&h_ctx.groestl1, hashA);   //2
+      sph_groestl512_init( &ctx.groestl );
+      sph_groestl512( &ctx.groestl, hashB, 64 );    //1
+      sph_groestl512_close( &ctx.groestl, hashA );  //2
 #endif
   }
   else
   {
-      sph_skein512 (&h_ctx.skein1, hashB, 64);    //1
-      sph_skein512_close(&h_ctx.skein1, hashA);   //2
+      sph_skein512_init( &ctx.skein );
+      sph_skein512( &ctx.skein, hashB, 64 );    //1
+      sph_skein512_close( &ctx.skein, hashA );  //2
   }
 
-   sph_jh512 (&h_ctx.jh1, hashA, 64);    //3
-   sph_jh512_close(&h_ctx.jh1, hashB);   //4
+   sph_jh512_init( &ctx.jh );
+   sph_jh512( &ctx.jh, hashA, 64 );    //3
+   sph_jh512_close( &ctx.jh, hashB );  //4
 
-   sph_keccak512 (&h_ctx.keccak1, hashB, 64);    //2
-   sph_keccak512_close(&h_ctx.keccak1, hashA);   //3
+   sph_keccak512_init( &ctx.keccak );
+   sph_keccak512( &ctx.keccak, hashB, 64 );    //2
+   sph_keccak512_close( &ctx.keccak, hashA );  //3
 
    if ( hashA[0] & mask )   //4
    {
-      sph_blake512 (&h_ctx.blake1, hashA, 64);    //
-      sph_blake512_close(&h_ctx.blake1, hashB);   //5
+      blake512_init( &ctx.blake );
+      blake512_update( &ctx.blake, hashA, 64 );
+      blake512_close( &ctx.blake, hashB );
   }
   else
   {
-      sph_bmw512 (&h_ctx.bmw2, hashA, 64);    //4
-      sph_bmw512_close(&h_ctx.bmw2, hashB);   //5
+      sph_bmw512_init( &ctx.bmw );
+      sph_bmw512( &ctx.bmw, hashA, 64 );    //4
+      sph_bmw512_close( &ctx.bmw, hashB );  //5
   }
 
-   update_and_final_luffa( &h_ctx.luffa1, hashA, hashB, 64 );
+   luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
 
-   cubehashUpdateDigest( &h_ctx.cube, hashB, hashA, 64 );
+   cubehash_full( &ctx.cube, hashB, 512, hashA, 64 );
 
    if ( hashB[0] & mask )   //7
    {
-      sph_keccak512 (&h_ctx.keccak2, hashB, 64);    //
-      sph_keccak512_close(&h_ctx.keccak2, hashA);   //8
+      sph_keccak512_init( &ctx.keccak );
+      sph_keccak512( &ctx.keccak, hashB, 64 );    //
+      sph_keccak512_close( &ctx.keccak, hashA );  //8
   }
   else
   {
-      sph_jh512 (&h_ctx.jh2, hashB, 64);    //7
-      sph_jh512_close(&h_ctx.jh2, hashA);   //8
+      sph_jh512_init( &ctx.jh );
+      sph_jh512( &ctx.jh, hashB, 64 );    //7
+      sph_jh512_close( &ctx.jh, hashA );  //8
   }
 
-   sph_shavite512 (&h_ctx.shavite1, hashA, 64);    //3
-   sph_shavite512_close(&h_ctx.shavite1, hashB);   //4
+   sph_shavite512_init( &ctx.shavite );
+   sph_shavite512( &ctx.shavite, hashA, 64 );    //3
+   sph_shavite512_close( &ctx.shavite, hashB );  //4
 
-#if defined(__aarch64__)
-   sph_simd512 (&h_ctx.simd1, hashB, 64);    //3
-   sph_simd512_close(&h_ctx.simd1, hashA);   //4
-#else
-   update_final_sd( &h_ctx.simd1, (BitSequence *)hashA,
-                    (const BitSequence *)hashB, 512 );
-#endif
+   simd512_ctx( &ctx.simd, hashA, hashB, 64 );
 
   if ( hashA[0] & mask )   //4
   {
-      sph_whirlpool (&h_ctx.whirlpool2, hashA, 64);    //
-      sph_whirlpool_close(&h_ctx.whirlpool2, hashB);   //5
+      sph_whirlpool_init( &ctx.whirlpool );
+      sph_whirlpool( &ctx.whirlpool, hashA, 64 );    //
+      sph_whirlpool_close( &ctx.whirlpool, hashB );  //5
   }
  else
  {
-      sph_haval256_5 (&h_ctx.haval1, hashA, 64);    //4
-      sph_haval256_5_close(&h_ctx.haval1, hashB);   //5
+      sph_haval256_5_init( &ctx.haval );
+      sph_haval256_5( &ctx.haval, hashA, 64 );    //4
+      sph_haval256_5_close( &ctx.haval, hashB );  //5
      memset(&hashB[8], 0, 32);
  }
 
-#if defined(__AES__)
-   update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA,
-                       (const BitSequence *)hashB, 512 );
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+   echo_full( &ctx.echo, hashA, 512, hashB, 64 );
 #else
-   sph_echo512 (&h_ctx.echo1, hashB, 64);    //5
-   sph_echo512_close(&h_ctx.echo1, hashA);   //6
+   sph_echo512_init( &ctx.echo );
+   sph_echo512( &ctx.echo, hashB, 64 );    //5
+   sph_echo512_close( &ctx.echo, hashA );  //6
 #endif
 
-   sph_blake512 (&h_ctx.blake2, hashA, 64);    //6
-   sph_blake512_close(&h_ctx.blake2, hashB);   //7
+   blake512_init( &ctx.blake );
+   blake512_update( &ctx.blake, hashA, 64 );
+   blake512_close( &ctx.blake, hashB );
 
   if ( hashB[0] & mask )   //7
   {
-      sph_shavite512 (&h_ctx.shavite2, hashB, 64);    //
-      sph_shavite512_close(&h_ctx.shavite2, hashA);   //8
+      sph_shavite512_init( &ctx.shavite );
+      sph_shavite512( &ctx.shavite, hashB, 64 );    //
+      sph_shavite512_close( &ctx.shavite, hashA );  //8
  }
  else
-  {
-      update_and_final_luffa( &h_ctx.luffa2, hashA, hashB, 64 );
-  }
+      luffa_full( &ctx.luffa, hashA, 512, hashB, 64 );
 
-   sph_hamsi512 (&h_ctx.hamsi1, hashA, 64);    //3
-   sph_hamsi512_close(&h_ctx.hamsi1, hashB);   //4
+   sph_hamsi512_init( &ctx.hamsi );
+   sph_hamsi512( &ctx.hamsi, hashA, 64 );    //3
+   sph_hamsi512_close( &ctx.hamsi, hashB );  //4
 
-#if defined(__AES__)
-   fugue512_Update( &h_ctx.fugue1, hashB, 512 );    //2 ////
-   fugue512_Final( &h_ctx.fugue1, hashA );          //3
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+   fugue512_full( &ctx.fugue, hashA, hashB, 64 );
 #else
-   sph_fugue512 (&h_ctx.fugue1, hashB, 64);    //2 ////
-   sph_fugue512_close(&h_ctx.fugue1, hashA);   //3
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hashB, 64 );    //2 ////
+   sph_fugue512_close( &ctx.fugue, hashA );  //3
 #endif
 
   if ( hashA[0] & mask )   //4
   {
-#if defined(__AES__)
-      update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB,
-                          (const BitSequence *)hashA, 512 );
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+      echo_full( &ctx.echo, hashB, 512, hashA, 64 );
 #else
-      sph_echo512 (&h_ctx.echo2, hashA, 64);    //
-      sph_echo512_close(&h_ctx.echo2, hashB);   //5
+      sph_echo512_init( &ctx.echo );
+      sph_echo512( &ctx.echo, hashA, 64 );    //
+      sph_echo512_close( &ctx.echo, hashB );  //5
 #endif
  }
  else
-  {
-#if defined(__aarch64__)
-      sph_simd512(&h_ctx.simd2, hashA, 64);     //6
-      sph_simd512_close(&h_ctx.simd2, hashB);   //7
-#else
-      update_final_sd( &h_ctx.simd2, (BitSequence *)hashB,
-                       (const BitSequence *)hashA, 512 );
-#endif
-  }
+      simd512_ctx( &ctx.simd, hashB, hashA, 64 );
 
-   sph_shabal512 (&h_ctx.shabal1, hashB, 64);    //5
-   sph_shabal512_close(&h_ctx.shabal1, hashA);   //6
+   sph_shabal512_init( &ctx.shabal );
+   sph_shabal512( &ctx.shabal, hashB, 64 );    //5
+   sph_shabal512_close( &ctx.shabal, hashA );  //6
 
-   sph_whirlpool (&h_ctx.whirlpool3, hashA, 64);    //6
-   sph_whirlpool_close(&h_ctx.whirlpool3, hashB);   //7
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, hashA, 64 );    //6
+   sph_whirlpool_close( &ctx.whirlpool, hashB );  //7
 
  if ( hashB[0] & mask )   //7
  {
-#if defined(__AES__)
-      fugue512_Update( &h_ctx.fugue2, hashB, 512 );    //
-      fugue512_Final( &h_ctx.fugue2, hashA );          //8
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+      fugue512_full( &ctx.fugue, hashA, hashB, 64 );
 #else
-      sph_fugue512 (&h_ctx.fugue2, hashB, 64);    //
-      sph_fugue512_close(&h_ctx.fugue2, hashA);   //8
+      sph_fugue512_init( &ctx.fugue );
+      sph_fugue512( &ctx.fugue, hashB, 64 );    //
+      sph_fugue512_close( &ctx.fugue, hashA );  //8
 #endif
  }
 else
 {
-      sph_sha512( &h_ctx.sha1, hashB, 64 );
-      sph_sha512_close( &h_ctx.sha1, hashA );
+      sph_sha512_init( &ctx.sha );
+      sph_sha512( &ctx.sha, hashB, 64 );
+      sph_sha512_close( &ctx.sha, hashA );
 }
 
-#if defined(__AES__)
-   update_and_final_groestl( &h_ctx.groestl2, (char*)hashB,
-                             (const char*)hashA, 512 );
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+   groestl512_full( &ctx.groestl, hashB, hashA, 512 );
 #else
-   sph_groestl512 (&h_ctx.groestl2, hashA, 64);    //3
-   sph_groestl512_close(&h_ctx.groestl2, hashB);   //4
+   sph_groestl512_init( &ctx.groestl );
+   sph_groestl512( &ctx.groestl, hashA, 64 );    //3
+   sph_groestl512_close( &ctx.groestl, hashB );  //4
 #endif
 
-   sph_sha512( &h_ctx.sha2, hashB, 64 );
-   sph_sha512_close( &h_ctx.sha2, hashA );
+   sph_sha512_init( &ctx.sha );
+   sph_sha512( &ctx.sha, hashB, 64 );
+   sph_sha512_close( &ctx.sha, hashA );
 
 if ( hashA[0] & mask )   //4
 {
-      sph_haval256_5 (&h_ctx.haval2, hashA, 64);    //
-      sph_haval256_5_close(&h_ctx.haval2, hashB);   //5
-      memset(&hashB[8], 0, 32);
+      sph_haval256_5_init( &ctx.haval );
+      sph_haval256_5( &ctx.haval, hashA, 64 );    //
+      sph_haval256_5_close( &ctx.haval, hashB );  //5
+      memset( &hashB[8], 0, 32 );
 }
 else
 {
-      sph_whirlpool (&h_ctx.whirlpool4, hashA, 64);    //4
-      sph_whirlpool_close(&h_ctx.whirlpool4, hashB);   //5
+      sph_whirlpool_init( &ctx.whirlpool );
+      sph_whirlpool( &ctx.whirlpool, hashA, 64 );    //4
+      sph_whirlpool_close( &ctx.whirlpool, hashB );  //5
 }
 
-   sph_bmw512 (&h_ctx.bmw3, hashB, 64);    //5
-   sph_bmw512_close(&h_ctx.bmw3, hashA);   //6
+   sph_bmw512_init( &ctx.bmw );
+   sph_bmw512( &ctx.bmw, hashB, 64 );    //5
+   sph_bmw512_close( &ctx.bmw, hashA );  //6
 
-   memcpy(state, hashA, 32);
+   memcpy( state, hashA, 32 );
 }
 
 int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-// uint32_t endiandata[32] __attribute__((aligned(64)));
-   uint32_t endiandata[20] __attribute__((aligned(64)));
-   uint32_t hash64[8] __attribute__((aligned(64)));
+   uint32_t endiandata[20] __attribute__((aligned(32)));
+   uint32_t hash64[8] __attribute__((aligned(32)));
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19] - 1;
   const uint32_t first_nonce = pdata[19];
   int thr_id = mythr->id;  // thr_id arg is deprecated
-   //const uint32_t Htarg = ptarget[7];
 
    //we need bigendian data...
-// for (int k = 0; k < 32; k++)
   for (int k = 0; k < 20; k++)
      be32enc(&endiandata[k], pdata[k]);
 
-   hmq_bmw512_midstate( endiandata );
-
-// if (opt_debug)
-// {
-//    applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
-// }
-
-   /* I'm to lazy to put the loop in an inline function... so dirty copy'n'paste.... */
-   /* i know that i could set a variable, but i don't know how the compiler will optimize it, not that then the cpu needs to load the value *everytime* in a register */
  if (ptarget[7]==0) {
     do {
        pdata[19] = ++n;
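
The rewrite above trades the pre-initialized global context plus per-call memcpy for a stack union: hmq1725 runs its algorithms strictly one after another, so all per-algorithm states can share storage, and each is re-initialized immediately before use. A minimal sketch of the pattern (types abbreviated, names hypothetical):

    #include "algo/bmw/sph_bmw.h"
    #include "algo/keccak/sph_keccak.h"
    #include "algo/whirlpool/sph_whirlpool.h"

    // Only one member is live at a time, so the union is as large as the
    // biggest single state rather than the sum of ~20 of them, and no
    // memcpy of a prototype context is needed per hash.
    union example_ctx
    {
       sph_bmw512_context      bmw;
       sph_keccak512_context   keccak;
       sph_whirlpool_context   whirlpool;
       // ... one member per algorithm in the chain
    };

    void example_hash( void *out, const void *in )
    {
       union example_ctx ctx;            // lives on this thread's stack
       sph_bmw512_init( &ctx.bmw );      // init-on-use replaces init_hmq1725_ctx()
       sph_bmw512( &ctx.bmw, in, 80 );
       sph_bmw512_close( &ctx.bmw, out );
       // the next algorithm reuses the same bytes through another member
    }
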
@@ -14,7 +14,8 @@ bool register_quark_algo( algo_gate_t* gate )
  gate->scanhash = (void*)&scanhash_quark;
  gate->hash = (void*)&quark_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
+                      | NEON_OPT;
  return true;
 };
 
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define QUARK_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
 #define QUARK_4WAY 1
@@ -7,12 +7,12 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "algo/blake/sph_blake.h"
+#include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #else
 #include "algo/groestl/sph_groestl.h"
@@ -21,9 +21,9 @@
 void quark_hash(void *state, const void *input)
 {
     uint32_t hash[16] __attribute__((aligned(64)));
-    sph_blake512_context   ctx_blake;
+    blake512_context       ctx_blake;
     sph_bmw512_context     ctx_bmw;
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
     hashState_groestl      ctx_groestl;
 #else
     sph_groestl512_context ctx_groestl;
@@ -33,17 +33,15 @@ void quark_hash(void *state, const void *input)
     sph_keccak512_context  ctx_keccak;
     uint32_t mask = 8;
 
-    sph_blake512_init( &ctx_blake );
-    sph_blake512( &ctx_blake, input, 80 );
-    sph_blake512_close( &ctx_blake, hash );
+    blake512_full( &ctx_blake, hash, input, 80 );
 
     sph_bmw512_init( &ctx_bmw );
     sph_bmw512( &ctx_bmw, hash, 64 );
     sph_bmw512_close( &ctx_bmw, hash );
 
     if ( hash[0] & mask )
     {
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        init_groestl( &ctx_groestl, 64 );
        update_and_final_groestl( &ctx_groestl, (char*)hash,
                                  (const char*)hash, 512 );
@@ -60,7 +58,7 @@ void quark_hash(void *state, const void *input)
        sph_skein512_close( &ctx_skein, hash );
     }
 
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
     init_groestl( &ctx_groestl, 64 );
     update_and_final_groestl( &ctx_groestl, (char*)hash,
                               (const char*)hash, 512 );
@@ -76,9 +74,7 @@ void quark_hash(void *state, const void *input)
 
     if ( hash[0] & mask )
     {
-       sph_blake512_init( &ctx_blake );
-       sph_blake512( &ctx_blake, hash, 64 );
-       sph_blake512_close( &ctx_blake, hash );
+       blake512_full( &ctx_blake, hash, hash, 64 );
    }
    else
    {
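
blake512_full() collapses the old init/update/close triple into a single call against the tree's own blake512 implementation; its argument order (ctx, out, in, len) is visible in the hunks above. A sketch of the equivalence being relied on (assuming the sph API for the reference side; both compute BLAKE-512, so the 64-byte digests should match):

    #include "algo/blake/blake512-hash.h"
    #include "algo/blake/sph_blake.h"

    void blake512_both( void *out_a, void *out_b, const void *in, size_t len )
    {
       blake512_context ctx;               // one-shot path used by quark_hash
       blake512_full( &ctx, out_a, in, len );

       sph_blake512_context sctx;          // old three-call path
       sph_blake512_init( &sctx );
       sph_blake512( &sctx, in, len );
       sph_blake512_close( &sctx, out_b );
    }
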
@@ -83,7 +83,7 @@ int scanhash_deep_2way( struct work *work,uint32_t max_nonce,
 
    casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
    casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
-   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+   casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
 
    uint64_t *edata = (uint64_t*)endiandata;
    intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );
@@ -236,7 +236,7 @@ int scanhash_qubit_2way( struct work *work,uint32_t max_nonce,
 
    casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
    casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
-   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+   casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
 
    uint64_t *edata = (uint64_t*)endiandata;
    intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );
@@ -16,7 +16,8 @@ bool register_qubit_algo( algo_gate_t* gate )
  gate->scanhash = (void*)&scanhash_qubit;
  gate->hash = (void*)&qubit_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
+                      | NEON_OPT;
  return true;
 };
 
@@ -5,7 +5,7 @@
 #include <stdint.h>
 
 
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if defined(SIMD512)
 #define QUBIT_4WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
 #define QUBIT_2WAY 1
@@ -8,13 +8,9 @@
 #include <stdio.h>
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
-#include "algo/simd/sph_simd.h"
-#else
-#include "algo/simd/nist.h"
-#endif
+#include "algo/simd/simd-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
-#ifdef __AES__
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
 #include "algo/echo/aes_ni/hash_api.h"
 #else
 #include "algo/echo/sph_echo.h"
@@ -25,12 +21,8 @@ typedef struct
     hashState_luffa         luffa;
     cubehashParam           cubehash;
     sph_shavite512_context  shavite;
-#if defined(__aarch64__)
-    sph_simd512_context     simd;
-#else
-    hashState_sd            simd;
-#endif
-#ifdef __AES__
+    simd512_context         simd;
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
     hashState_echo          echo;
 #else
     sph_echo512_context     echo;
@@ -45,12 +37,7 @@ void init_qubit_ctx()
     init_luffa(&qubit_ctx.luffa,512);
     cubehashInit(&qubit_ctx.cubehash,512,16,32);
     sph_shavite512_init(&qubit_ctx.shavite);
-#if defined(__aarch64__)
-    sph_simd512_init( &qubit_ctx.simd );
-#else
-    init_sd( &qubit_ctx.simd, 512 );
-#endif
-#ifdef __AES__
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
     init_echo(&qubit_ctx.echo, 512);
 #else
     sph_echo512_init(&qubit_ctx.echo);
@@ -81,15 +68,9 @@ void qubit_hash(void *output, const void *input)
    sph_shavite512( &ctx.shavite, hash, 64);
    sph_shavite512_close( &ctx.shavite, hash);
 
-#if defined(__aarch64__)
-   sph_simd512(&ctx.simd, (const void*) hash, 64);
-   sph_simd512_close(&ctx.simd, hash);
-#else
-   update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
-   final_sd( &ctx.simd, (BitSequence *)hash );
-#endif
+   simd512_ctx( &ctx.simd, hash, hash, 64 );
 
-#ifdef __AES__
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    update_final_echo( &ctx.echo, (BitSequence *) hash,
                       (const BitSequence *) hash, 512 );
 #else
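
In the lbry hunks below, casti_v128u32( p, i ) indexes p as an array of 16-byte vectors, so the eight v128_bswap32 lines big-endian-swap 128 bytes of work data, four 32-bit words per vector. A scalar sketch of the same operation (reference names are illustrative):

    #include <stdint.h>

    static inline uint32_t bswap32_ref( uint32_t x )
    {
       return ( x >> 24 ) | ( ( x >> 8 ) & 0x0000ff00 )
            | ( ( x << 8 ) & 0x00ff0000 ) | ( x << 24 );
    }

    // Same effect as the eight casti_v128u32/v128_bswap32 statements:
    // byte-swap the first 32 32-bit words (128 bytes) of the header.
    static inline void bswap32_block_ref( uint32_t *edata, const uint32_t *pdata )
    {
       for ( int i = 0; i < 32; i++ ) edata[i] = bswap32_ref( pdata[i] );
    }
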
@@ -104,14 +104,14 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
    int thr_id = mythr->id;  // thr_id arg is deprecated

    // we need bigendian data...
-   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-   casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
-   casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
-   casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
+   casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
+   casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
+   casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
+   casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
+   casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
+   casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
+   casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
+   casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
    intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
                  edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
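The change in this hunk (and in the 8-way one below) is purely mechanical: the x86-only casti_m128i/mm128_bswap_32 helpers give way to the portable casti_v128u32/v128_bswap32 layer, so the big-endian conversion of the block header compiles unchanged on SSE and NEON targets. A minimal sketch of how such a pair can be defined, assuming SSSE3 on x86 and NEON on ARM; the repository's actual definitions may differ:

    #include <stdint.h>

    #if defined(__SSSE3__)
      #include <immintrin.h>
      typedef __m128i v128u32_t;
      static inline v128u32_t v128_bswap32( v128u32_t x )
      {   // reverse the bytes within each 32-bit lane via a byte shuffle
          return _mm_shuffle_epi8( x, _mm_set_epi8( 12,13,14,15, 8, 9,10,11,
                                                     4, 5, 6, 7, 0, 1, 2, 3 ) );
      }
    #elif defined(__ARM_NEON)
      #include <arm_neon.h>
      typedef uint32x4_t v128u32_t;
      static inline v128u32_t v128_bswap32( v128u32_t x )
      {   // vrev32q_u8 reverses the bytes within each 32-bit element
          return vreinterpretq_u32_u8( vrev32q_u8( vreinterpretq_u8_u32( x ) ) );
      }
    #endif

    // Index a byte buffer as an array of 128-bit vectors, as casti_v128u32 does.
    #define casti_v128u32( p, i )  ( ((v128u32_t*)(p))[i] )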
@@ -224,14 +224,14 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
    int thr_id = mythr->id;  // thr_id arg is deprecated

    // we need bigendian data...
-   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-   casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
-   casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
-   casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
+   casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
+   casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
+   casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
+   casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
+   casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
+   casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
+   casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
+   casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
    intrlv_8x32( vdata, edata, edata, edata, edata,
                 edata, edata, edata, edata, 1024 );
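In both scanhash variants the interleave calls are unchanged context: intrlv_16x32/intrlv_8x32 pack N copies of the 1024-bit padded header so that 32-bit word w of lane l lands at index w*N + l, letting a single vector load feed word w to all lanes at once (the per-lane nonces are patched in afterwards). A scalar reference of that layout, written here for illustration only; the repository uses vectorized equivalents:

    #include <stdint.h>

    // Scalar reference for N-way 32-bit interleaving. After this, one
    // N*32-bit vector load of dst fetches word w for all N lanes.
    static void intrlv_Nx32_ref( uint32_t *dst, const uint32_t *const *lanes,
                                 int nlanes, int bitlen )
    {
        const int nwords = bitlen / 32;    // 1024 bits -> 32 words per lane
        for ( int w = 0; w < nwords; w++ )
            for ( int l = 0; l < nlanes; l++ )
                dst[ w*nlanes + l ] = lanes[l][w];
    }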
@@ -51,7 +51,6 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }

 bool register_lbry_algo( algo_gate_t* gate )
 {
-//  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
 #if defined (LBRY_16WAY)
   gate->scanhash = (void*)&scanhash_lbry_16way;
   gate->hash = (void*)&lbry_16way_hash;

@@ -67,7 +66,7 @@ bool register_lbry_algo( algo_gate_t* gate )
 #else
   gate->scanhash = (void*)&scanhash_lbry;
   gate->hash = (void*)&lbry_hash;
-  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
+  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA256_OPT;
 #endif
   gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
   gate->build_extraheader = (void*)&lbry_build_extraheader;
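The gate hunks retire the SHA_OPT flag in favour of SHA256_OPT, presumably to keep SHA-256 hardware acceleration distinct from other SHA-family capability flags, and drop a stale commented-out line. For orientation, a simplified sketch of the algo-gate dispatch pattern visible here, trimmed to the members this diff touches (the real algo_gate_t carries many more, and the signatures are abbreviated):

    #include <stdint.h>

    struct work; struct thr_info;   // opaque to the sketch

    // Registration fills a table of function pointers once; the core miner
    // then calls through it, so adding an algorithm never touches the main
    // loop. Names below mirror the diff; the struct itself is illustrative.
    typedef struct
    {
        int  (*scanhash)( struct work*, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info* );
        void (*hash)( void *output, const void *input );
        uint32_t optimizations;   // e.g. AVX2_OPT | AVX512_OPT | SHA256_OPT
    } algo_gate_sketch_t;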
Some files were not shown because too many files have changed in this diff.