Compare commits


9 Commits

Author     SHA1        Message  Date
Jay D Dee  2b1037a7c7  v24.7    2024-12-16 19:17:19 -05:00
Jay D Dee  06624a0ff2  v24.6    2024-12-08 11:14:08 -05:00
Jay D Dee  8e91bfbe19  v24.5    2024-09-13 14:14:57 -04:00
Jay D Dee  47e24b50e8  v24.4    2024-07-01 00:33:19 -04:00
Jay D Dee  c47c4a8885  v24.3    2024-05-28 18:20:19 -04:00
Jay D Dee  042d13d1e1  v24.2    2024-05-20 23:08:50 -04:00
Jay D Dee  4f930574cc  v24.1    2024-04-16 21:31:35 -04:00
Jay D Dee  9d3a46c355  v23.15   2023-11-30 14:36:47 -05:00
Jay D Dee  4e3f1b926f  v23.14   2023-11-28 00:58:43 -05:00
194 changed files with 6396 additions and 4909 deletions

View File

@@ -16,6 +16,7 @@ bin_PROGRAMS = cpuminer
dist_man_MANS = cpuminer.1
cpuminer_SOURCES = \
dummy.cpp \
cpu-miner.c \
util.c \
api.c \
@@ -113,7 +114,6 @@ cpuminer_SOURCES = \
algo/lyra2/phi2-4way.c \
algo/lyra2/phi2.c \
algo/m7m/m7m.c \
algo/m7m/magimath.cpp \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
@@ -166,7 +166,6 @@ cpuminer_SOURCES = \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite-hash-2way.c \
algo/shavite/shavite-hash-4way.c \
algo/shavite/shavite.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/sph_simd.c \
@@ -289,14 +288,10 @@ if HAVE_WINDOWS
endif
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
if HAVE_WINDOWS
cpuminer_CFLAGS += -Wl,--stack,10485760
endif
if HAVE_WINDOWS
# use to profile an object
# gprof_cflags = -pg -g3

View File

@@ -87,7 +87,6 @@ Supported Algorithms
groestl Groestl coin
hex x16r-hex
hmq1725
hodl Hodlcoin
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
@@ -115,9 +114,11 @@ Supported Algorithms
scrypt:N scrypt(N, 1, 1)
scryptn2 scrypt(1048576, 1, 1)
sha256d Double SHA-256
sha256dt
sha256q Quad SHA-256
sha256t Triple SHA-256
sha3d Double keccak256 (BSHA3)
sha512256d
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
skunk Signatum (SIGT)
@@ -145,6 +146,7 @@ Supported Algorithms
x16rt-veil veil
x16s
x17
x20r
x21s
x22i
x25x

View File

@@ -75,6 +75,66 @@ If not what makes it happen or not happen?
Change Log
----------
v24.7
ARM: Compile works for Windows using MSys2 & MinGW; see the wiki for details.
v24.6
ARM: Fixed scryptn2, x16*, broken in v24.2.
ARM: Small improvement to interleaving.
Eliminated some potential compile errors in code that was dependent on
compiler optimisations.
x86_64: improved support for AVX10 compilation, needs GCC-14 or higher.
v24.5
Fix MinGW compile error after MSys2 upgrade to GCC-14.2.
#427: GBT: Improved handling of new work.
Removed shavite3 algo.
v24.4
x86_64: fixed a bug in ornot macro for AVX2 which broke some algos in v24.2.
x86_64: fixed a bug in alignr macros for SSE2.
ARM: CPU feature reporting enhancements.
Some code cleanup.
v24.3
ARM: CPU feature detection and reporting is now working.
ARM: Verthash is now working.
ARM: Small speedup for yescrypt, yespower & argon2d.
Code cleanup.
v24.2
x86_64: Fixed blake2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4.
x86_64: Initial support for CPUs with AVX10, needs GCC-14.
ARM NEON: Various code optimisations.
v24.1
#414: fix bug in merkle error handling.
#416: change $nproc to $(nproc) in build scripts.
#420: change some inline function definitions to static inline.
#413: Fix formatting error for share result log when using no-color.
Faster 2 way interleaving.
Clean up sha256 architecture targeting.
v23.15
Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
ARM: Fugue AES optimizations enabled.
ARM: quark, qubit, x11gost algos optimized with NEON & AES.
v23.14
ARM: Groestl AES optimizations enabled.
All: Small optimization to Shabal 4way.
x86_64: Extend Shabal 4way support to SSE2 from SSE4.1.
All: deleted some unused files.
v23.13
Added x20r algo.

View File

@@ -184,7 +184,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
//int scanhash_8way_64_64( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr )
@@ -263,8 +263,8 @@ static void init_algo_gate( algo_gate_t* gate )
gate->build_block_header = (void*)&std_build_block_header;
gate->build_extraheader = (void*)&std_build_extraheader;
gate->set_work_data_endian = (void*)&do_nothing;
gate->resync_threads = (void*)&do_nothing;
gate->do_this_thread = (void*)&return_true;
// gate->resync_threads = (void*)&do_nothing;
// gate->do_this_thread = (void*)&return_true;
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
gate->get_work_data_size = (void*)&std_get_work_data_size;
gate->optimizations = EMPTY_SET;
@@ -340,7 +340,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
case ALGO_SHA512256D: rc = register_sha512256d_algo ( gate ); break;
case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
case ALGO_SKUNK: rc = register_skunk_algo ( gate ); break;

View File

@@ -98,25 +98,27 @@ typedef uint32_t set_t;
#define AVX512_OPT 1 << 6 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
#define VAES_OPT 1 << 8 // Icelake, Zen3
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA256_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
#define NEON_OPT 1 << 11 // AArch64
#define AVX10_256 1 << 12
#define AVX10_512 1 << 13
// AVX10 does not have explicit algo features:
// AVX10_512 is compatible with AVX512 + VAES
// AVX10_256 is compatible with AVX2 + VAES
// return set containing all elements from sets a & b
inline set_t set_union ( set_t a, set_t b ) { return a | b; }
static inline set_t set_union ( set_t a, set_t b ) { return a | b; }
// return set contained common elements from sets a & b
inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
static inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
// all elements in set a are included in set b
inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
static inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
// no elements in set a are included in set b
inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
static inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
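A minimal standalone sketch of how these feature-set helpers combine with the flag bits above; the names cpu_features and gate_opts are hypothetical and only illustrate the intended use:

#include <stdbool.h>
#include <stdint.h>

typedef uint32_t set_t;

#define AVX512_OPT (1 << 6)
#define AES_OPT    (1 << 7)
#define VAES_OPT   (1 << 8)
#define SHA256_OPT (1 << 9)

static inline set_t set_union( set_t a, set_t b ) { return a | b; }
static inline bool  set_incl ( set_t a, set_t b ) { return (a & b) == a; }

int main(void)
{
   // What the running CPU reports vs. what an algorithm declares it can use.
   set_t cpu_features = set_union( AES_OPT, SHA256_OPT );
   set_t gate_opts    = set_union( AES_OPT, VAES_OPT );

   // Per the comment above, AVX10 has no algo-specific bits: an AVX10_512 CPU
   // can simply be folded into the existing sets as AVX512 + VAES
   // (hypothetical mapping, not taken from the diff).
   cpu_features = set_union( cpu_features, set_union( AVX512_OPT, VAES_OPT ) );

   // An algo is fully accelerated only when every optimisation it declares
   // is included in the CPU's feature set.
   bool fully_supported = set_incl( gate_opts, cpu_features );
   return fully_supported ? 0 : 1;
}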
typedef struct
{
@@ -163,10 +165,10 @@ char* ( *malloc_txs_request ) ( struct work* );
void ( *set_work_data_endian ) ( struct work* );
// Diverge mining threads
bool ( *do_this_thread ) ( int );
//bool ( *do_this_thread ) ( int );
// After do_this_thread
void ( *resync_threads ) ( int, struct work* );
//void ( *resync_threads ) ( int, struct work* );
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
@@ -246,7 +248,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
//int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -35,7 +35,7 @@
* @pre all block pointers must be valid
*/
#if defined(__AVX512F__)
#if defined(SIMD512)
static inline __m512i blamka( __m512i x, __m512i y )
{
@@ -237,7 +237,7 @@ void fill_segment(const argon2_instance_t *instance,
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index, i;
#if defined(__AVX512F__)
#if defined(SIMD512)
__m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
#elif defined(__AVX2__)
__m256i state[ARGON2_HWORDS_IN_BLOCK];

View File

@@ -21,7 +21,7 @@
#include "blake2-impl.h"
#include "simd-utils.h"
#if !defined(__AVX512F__)
#if !defined(SIMD512)
#if !defined(__AVX2__)

View File

@@ -39,7 +39,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
blakehash_4way( hash, vdata );

View File

@@ -1611,7 +1611,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
///////////////////////////////////////
//
@@ -2617,7 +2617,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
//Blake-256 16 way AVX512

View File

@@ -147,7 +147,7 @@ void blake256r8_8way_close(void *cc, void *dst);
#define blake256r8_8x32_update blake256r14_8way_update
#define blake256r8_8x32_close blake256r14_8way_close
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
///////////////////////////////////
//

View File

@@ -226,7 +226,7 @@ static const uint8_t sigma[12][16] =
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define B2B8W_G(a, b, c, d, x, y) \
{ \
@@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] =
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
}
static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last )
{
__m512i v[16], m[16];
@@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
}
int blake2b_8way_init( blake2b_8way_ctx *ctx )
int blake2b_8x64_init( blake2b_8x64_ctx *ctx )
{
size_t i;
@@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
}
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen )
{
__m512i* in =(__m512i*)input;
@@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_8way_compress( ctx, 0 );
blake2b_8x64_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
@@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
}
}
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
@@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
ctx->c += 8;
}
blake2b_8way_compress( ctx, 1 ); // final block flag = 1
blake2b_8x64_compress( ctx, 1 ); // final block flag = 1
casti_m512i( out, 0 ) = ctx->h[0];
casti_m512i( out, 1 ) = ctx->h[1];
@@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = {
};
*/
static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last )
{
__m256i v[16], m[16];
@@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
}
int blake2b_4way_init( blake2b_4way_ctx *ctx )
int blake2b_4x64_init( blake2b_4x64_ctx *ctx )
{
size_t i;
@@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
return 0;
}
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen )
{
__m256i* in =(__m256i*)input;
@@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_4way_compress( ctx, 0 );
blake2b_4x64_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
@@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
}
}
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
@@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
ctx->c += 8;
}
blake2b_4way_compress( ctx, 1 ); // final block flag = 1
blake2b_4x64_compress( ctx, 1 ); // final block flag = 1
casti_m256i( out, 0 ) = ctx->h[0];
casti_m256i( out, 1 ) = ctx->h[1];

View File

@@ -1,6 +1,6 @@
#pragma once
#ifndef __BLAKE2B_HASH_4WAY_H__
#define __BLAKE2B_HASH_4WAY_H__
#ifndef BLAKE2B_HASH_4WAY_H__
#define BLAKE2B_HASH_4WAY_H__
#include "simd-utils.h"
#include <stddef.h>
@@ -15,7 +15,7 @@
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct ALIGN( 64 ) {
__m512i b[16]; // input buffer
@@ -23,12 +23,17 @@ typedef struct ALIGN( 64 ) {
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_8way_ctx;
} blake2b_8x64_ctx;
int blake2b_8way_init( blake2b_8way_ctx *ctx );
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
int blake2b_8x64_init( blake2b_8x64_ctx *ctx );
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
#define blake2b_8way_ctx blake2b_8x64_ctx
#define blake2b_8way_init blake2b_8x64_init
#define blake2b_8way_update blake2b_8x64_update
#define blake2b_8way_final blake2b_8x64_final
#endif
@@ -41,12 +46,17 @@ typedef struct ALIGN( 64 ) {
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_4way_ctx;
} blake2b_4x64_ctx;
int blake2b_4way_init( blake2b_4way_ctx *ctx );
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
int blake2b_4x64_init( blake2b_4x64_ctx *ctx );
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
#define blake2b_4way_ctx blake2b_4x64_ctx
#define blake2b_4way_init blake2b_4x64_init
#define blake2b_4way_update blake2b_4x64_update
#define blake2b_4way_final blake2b_4x64_final
#endif

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include "blake2b-hash.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKE2B_8WAY
#elif defined(__AVX2__)
#define BLAKE2B_4WAY

View File

@@ -497,7 +497,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Blake2s-256 16 way

View File

@@ -11,8 +11,8 @@
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
//#pragma once
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
#ifndef BLAKE2S_HASH_4WAY_H__
#define BLAKE2S_HASH_4WAY_H__ 1
#if defined(__SSE2__) || defined(__ARM_NEON)
@@ -29,20 +29,20 @@
#define ALIGN(x) __attribute__((aligned(x)))
#endif
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_nway_param;
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_nway_param;
typedef struct ALIGN( 64 ) __blake2s_4way_state
{
@@ -67,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
typedef struct ALIGN( 64 ) __blake2s_8way_state
{
__m256i h[8];
uint8_t buf[ 32 * 8 ];
uint8_t buf[ 64 * 8 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
@@ -83,12 +83,12 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct ALIGN( 64 ) __blake2s_16way_state
{
__m512i h[8];
uint8_t buf[ 32 * 16 ];
uint8_t buf[ 64 * 16 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;

View File

@@ -3,7 +3,7 @@
#include <string.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKE2S_16WAY
#elif defined(__AVX2__)
#define BLAKE2S_8WAY

View File

@@ -349,16 +349,16 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( CBx( r, Sd ) ^ Mx( r, Sc ), \
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
Vd = v128_ror64xor( Vd, Va, 32 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
Vb = v128_ror64xor( Vb, Vc, 25 ); \
\
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( CBx( r, Sc ) ^ Mx( r, Sd ), \
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
Vd = v128_ror64xor( Vd, Va, 16 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
Vb = v128_ror64xor( Vb, Vc, 11 ); \
}
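Each before/after pair in this hunk folds a rotate of an xor into one helper; a scalar model of the assumed semantics, inferred from the replaced lines (the vector helper presumably applies this per 64-bit lane, and on AArch64 with the SHA3 extension it could map to a single XAR instruction):

#include <stdint.h>

// ror64xor( a, b, c ) == rotate-right( a ^ b, c ); the rotate counts used by
// these BLAKE-512 rounds are 11, 16, 25 and 32, so (64 - c) is never zero.
static inline uint64_t ror64xor( uint64_t a, uint64_t b, unsigned c )
{
    uint64_t x = a ^ b;
    return ( x >> c ) | ( x << ( 64 - c ) );
}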
#define BLAKE512_ROUND( R ) \
@@ -559,7 +559,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
#if defined(__AVX2__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
////////////////////////////////////
//
@@ -1887,13 +1887,13 @@ blake512_4x64_close(void *cc, void *dst)
#define GB_2X64( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = v128_add64( v128_add64( v128_xor( v128_64( c1 ), m0 ), b ), a ); \
d = v128_ror64( v128_xor( d, a ), 32 ); \
d = v128_ror64xor( d, a, 32 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 25 ); \
b = v128_ror64xor( b, c, 25 ); \
a = v128_add64( v128_add64( v128_xor( v128_64( c0 ), m1 ), b ), a ); \
d = v128_ror64( v128_xor( d, a ), 16 ); \
d = v128_ror64xor( d, a, 16 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 11 ); \
b = v128_ror64xor( b, c, 11 ); \
}
#define ROUND_B_2X64(r) \
@@ -2054,9 +2054,9 @@ void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc,
// G4 skip nonce
V0 = v128_add64( v128_add64( v128_xor( v128_64( CB9 ), sc->buf[ 8] ), V5 ),
V0 );
VF = v128_ror64( v128_xor( VF, V0 ), 32 );
VF = v128_ror64xor( VF, V0, 32 );
VA = v128_add64( VA, VF );
V5 = v128_ror64( v128_xor( V5, VA ), 25 );
V5 = v128_ror64xor( V5, VA, 25 );
V0 = v128_add64( V0, V5 );
GB_2X64( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
@@ -2137,9 +2137,9 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
// finish round 0, with the nonce now available
V0 = v128_add64( V0, v128_xor( v128_64( CB8 ), M9 ) );
VF = v128_ror64( v128_xor( VF, V0 ), 16 );
VF = v128_ror64xor( VF, V0, 16 );
VA = v128_add64( VA, VF );
V5 = v128_ror64( v128_xor( V5, VA ), 11 );
V5 = v128_ror64xor( V5, VA, 11 );
// Round 1
// G0
@@ -2147,34 +2147,34 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
// G1
V1 = v128_add64( V1, V5 );
VD = v128_ror64( v128_xor( VD, V1 ), 32 );
VD = v128_ror64xor( VD, V1, 32 );
V9 = v128_add64( V9, VD );
V5 = v128_ror64( v128_xor( V5, V9 ), 25 );
V5 = v128_ror64xor( V5, V9, 25 );
V1 = v128_add64( V1, v128_add64( v128_xor( v128_64( CBx(1,2) ), Mx(1,3) ),
V5 ) );
VD = v128_ror64( v128_xor( VD, V1 ), 16 );
VD = v128_ror64xor( VD, V1, 16 );
V9 = v128_add64( V9, VD );
V5 = v128_ror64( v128_xor( V5, V9 ), 11 );
V5 = v128_ror64xor( V5, V9, 11 );
// G2
V2 = v128_add64( V2, v128_xor( v128_64( CBF ), M9 ) );
VE = v128_ror64( v128_xor( VE, V2 ), 32 );
VE = v128_ror64xor( VE, V2, 32 );
VA = v128_add64( VA, VE );
V6 = v128_ror64( v128_xor( V6, VA ), 25 );
V6 = v128_ror64xor( V6, VA, 25 );
V2 = v128_add64( V2, v128_add64( v128_xor( v128_64( CB9 ), MF ), V6 ) );
VE = v128_ror64( v128_xor( VE, V2 ), 16 );
VE = v128_ror64xor( VE, V2, 16 );
VA = v128_add64( VA, VE );
V6 = v128_ror64( v128_xor( V6, VA ), 11 );
V6 = v128_ror64xor( V6, VA, 11 );
// G3
VF = v128_ror64( v128_xor( VF, V3 ), 32 );
VF = v128_ror64xor( VF, V3, 32 );
VB = v128_add64( VB, VF );
V7 = v128_ror64( v128_xor( V7, VB ), 25 );
V7 = v128_ror64xor( V7, VB, 25 );
V3 = v128_add64( V3, v128_add64( v128_xor( v128_64( CBx(1, 6) ), Mx(1, 7) ),
V7 ) );
VF = v128_ror64( v128_xor( VF, V3 ), 16 );
VF = v128_ror64xor( VF, V3, 16 );
VB = v128_add64( VB, VF );
V7 = v128_ror64( v128_xor( V7, VB ), 11 );
V7 = v128_ror64xor( V7, VB, 11 );
// G4, G5, G6, G7
GB_2X64(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);

View File

@@ -92,7 +92,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
#define blake512_4way_prehash_le blake512_4x64_prehash_le
#define blake512_4way_final_le blake512_4x64_final_le
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
////////////////////////////
//

View File

@@ -182,7 +182,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
pdata[19] = n;
blakecoin_4way_hash( hash, vdata );

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKECOIN_16WAY
#elif defined(__AVX2__)
#define BLAKECOIN_8WAY

View File

@@ -101,15 +101,15 @@
{ \
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
Vd = v128_ror64xor( Vd, Va, 32 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \
Vb = v128_ror64xor( Vb, Vc, 24 ); \
\
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
Vd = v128_ror64xor( Vd, Va, 16 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
Vb = v128_ror64xor( Vb, Vc, 63 ); \
}
#define BLAKE2B_ROUND( R ) \

View File

@@ -87,7 +87,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-256 16 way 32
@@ -157,7 +157,7 @@ void bmw512_4way_addbits_and_close(
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-512 64 bit 8 way
typedef struct

View File

@@ -1057,7 +1057,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-256 16 way 32

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BMW512_8WAY 1
#elif defined(__AVX2__)
#define BMW512_4WAY 1

View File

@@ -950,7 +950,7 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-512 8 WAY

View File

@@ -26,7 +26,7 @@ static const uint64_t IV512[] =
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// 4 way 128 is handy to avoid reinterleaving in many algos.
// If reinterleaving is necessary it may be more efficient to use

View File

@@ -6,7 +6,7 @@
#if defined(__AVX2__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
struct _cube_4way_context
{

View File

@@ -13,7 +13,7 @@ static void transform( cubehashParam *sp )
int r;
const int rounds = sp->rounds;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
register __m512i x0, x1;

View File

@@ -11,7 +11,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
};
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ECHO_SUBBYTES4(state, j) \
state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \

View File

@@ -5,7 +5,7 @@
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -15,237 +15,176 @@
*
*/
#if defined(__AES__)
#include <x86intrin.h>
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#include <memory.h>
#include "fugue-aesni.h"
static const v128u64_t _supermix1a __attribute__ ((aligned (16))) =
{ 0x0202010807020100, 0x0a05000f06010c0b };
MYALIGN const unsigned long long _supermix1a[] = {0x0202010807020100, 0x0a05000f06010c0b};
MYALIGN const unsigned long long _supermix1b[] = {0x0b0d080703060504, 0x0e0a090c050e0f0a};
MYALIGN const unsigned long long _supermix1c[] = {0x0402060c070d0003, 0x090a060580808080};
MYALIGN const unsigned long long _supermix1d[] = {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
MYALIGN const unsigned long long _supermix2a[] = {0x07020d0880808080, 0x0b06010c050e0f0a};
MYALIGN const unsigned long long _supermix4a[] = {0x000f0a050c0b0601, 0x0302020404030e09};
MYALIGN const unsigned long long _supermix4b[] = {0x07020d08080e0d0d, 0x07070908050e0f0a};
MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x0302000007060504};
MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104};
MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06};
//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
//MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
//MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
//MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
static const v128u64_t _supermix1b __attribute__ ((aligned (16))) =
{ 0x0b0d080703060504, 0x0e0a090c050e0f0a };
static const v128u64_t _supermix1c __attribute__ ((aligned (16))) =
{ 0x0402060c070d0003, 0x090a060580808080 };
MYALIGN const unsigned int _IV512[] = {
0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
static const v128u64_t _supermix1d __attribute__ ((aligned (16))) =
{ 0x808080800f0e0d0c, 0x0f0e0d0c80808080 };
static const v128u64_t _supermix2a __attribute__ ((aligned (16))) =
{ 0x07020d0880808080, 0x0b06010c050e0f0a };
static const v128u64_t _supermix4a __attribute__ ((aligned (16))) =
{ 0x000f0a050c0b0601, 0x0302020404030e09 };
static const v128u64_t _supermix4b __attribute__ ((aligned (16))) =
{ 0x07020d08080e0d0d, 0x07070908050e0f0a };
static const v128u64_t _supermix4c __attribute__ ((aligned (16))) =
{ 0x0706050403020000, 0x0302000007060504 };
static const v128u64_t _supermix7a __attribute__ ((aligned (16))) =
{ 0x010c0b060d080702, 0x0904030e03000104 };
static const v128u64_t _supermix7b __attribute__ ((aligned (16))) =
{ 0x8080808080808080, 0x0504070605040f06 };
static const v128u64_t _inv_shift_rows __attribute__ ((aligned (16))) =
{ 0x0b0e0104070a0d00, 0x0306090c0f020508 };
static const v128u64_t _mul2mask __attribute__ ((aligned (16))) =
{ 0x000000001b1b0000, 0x0000000000000000 };
static const v128u64_t _mul4mask __attribute__ ((aligned (16))) =
{ 0x000000002d361b00, 0x0000000000000000 };
static const v128u64_t _lsbmask2 __attribute__ ((aligned (16))) =
{ 0x0303030303030303, 0x0303030303030303 };
static const uint32_t _IV512[] __attribute__ ((aligned (32))) =
{ 0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000
};
#if defined(__SSE4_1__)
#if defined(__ARM_NEON)
#define PACK_S0(s0, s1, t1)\
s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
#define mask_1000(v) v128_put32( v, 0, 3 )
#define UNPACK_S0(s0, s1, t1)\
s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
s0 = mm128_mask_32( s0, 8 )
static const v128u32_t MASK_3321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x0f0e0d0c };
#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = s1;\
t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1);
static const v128u32_t MASK_3033 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c };
#else // SSE2
static const v128u32_t MASK_3303 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c, 0x0f0e0d0c };
#define PACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
s0 = _mm_xor_si128(s0, t1);
static const v128u32_t MASK_0321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100 };
#define UNPACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
s0 = mm128_mask_32( s0, 8 )
#define shuffle_3303(v) vqtbl1q_u8( v, MASK_3303 )
#define shuffle_0321(v) vqtbl1q_u8( v, MASK_0321 )
#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = _mm_shuffle_epi32(s1, 0xf9);\
t2 = _mm_shuffle_epi32(s2, 0xcf);\
t1 = _mm_xor_si128(t1, t2);\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1)
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = vqtbl1q_u8( s1, MASK_3321 ); \
t2 = vqtbl1q_u8( s2, MASK_3033 ); \
t1 = v128_xor( t1, t2 ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );
#elif defined(__SSE4_1__)
#define mask_1000(v) v128_mask32( v, 8 )
#define shuffle_3303(v) _mm_shuffle_epi32( v, 0xf3 )
#define shuffle_0321(v) _mm_shuffle_epi32( v, 0x39 )
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = s1; \
t1 = v128_shuffle2_32( t1, s2, _MM_SHUFFLE( 3, 0, 2, 1 ) ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );
#endif
#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s10 = _mm_xor_si128(s10, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1)
#define PACK_S0( s0, s1, t1 ) \
s0 = v128_movlane32( s0, 3, s1, 0 )
#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s16 = _mm_xor_si128(s16, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1)
#define UNPACK_S0( s0, s1, t1 ) \
s1 = v128_movlane32( s1, 0, s0, 3 ); \
s0 = mask_1000( s0 )
#define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s22 = _mm_xor_si128(s22, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s7 = _mm_xor_si128(s7, t1)
t1 = shuffle_3303( s0 ); \
s22 = v128_xor(s22, t1);\
t1 = v128_put32( v128_zero, *(uint32_t*)msg, 0 ); \
s0 = v128_movlane32( s0, 0, t1, 0 ); \
t1 = v128_alignr64( t1, v128_zero, 1 ); \
s8 = v128_xor(s8, t1);\
t1 = shuffle_3303( s24 ); \
s0 = v128_xor(s0, t1);\
t1 = shuffle_3303( s27 ); \
s4 = v128_xor(s4, t1);\
t1 = shuffle_3303( s30 ); \
s7 = v128_xor(s7, t1)
#define PRESUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
/*
#define PRESUPERMIX(x, t1, s1, s2, t2)\
s1 = x;\
s2 = _mm_add_epi8(x, x);\
t2 = _mm_add_epi8(s2, s2);\
t1 = _mm_srli_epi16(x, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
*/
#define SUBSTITUTE(r0, _t2 )\
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
_t2 = _mm_aesenclast_si128( _t2, v128_zero )
#define SUBSTITUTE( r0, _t2 ) \
_t2 = v128_shuffle8( r0, _inv_shift_rows ); \
_t2 = v128_aesenclast_nokey( _t2 )
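The rewritten SUBSTITUTE drops the explicit zero-key operand of _mm_aesenclast_si128; a plausible portable definition of the no-key helper, inferred from the line it replaces (the NEON branch and its lane type are assumptions):

#if defined(__AES__)
#include <immintrin.h>
// One final AES round with an all-zero round key: ShiftRows + SubBytes only,
// exactly what the replaced line computed by passing v128_zero as the key.
#define v128_aesenclast_nokey( x )  _mm_aesenclast_si128( x, _mm_setzero_si128() )
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_AES)
#include <arm_neon.h>
// Assumed NEON analogue: AESE with a zero key performs AddRoundKey(0) +
// SubBytes + ShiftRows, which gives the same result because SubBytes and
// ShiftRows commute. The 128-bit lane type is assumed to be uint8x16_t here.
#define v128_aesenclast_nokey( x )  vaeseq_u8( (x), vdupq_n_u8( 0 ) )
#endif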
#define SUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t2 = v128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t3 = v128_add8( t0, t0 ); \
t4 = v128_add8( t3, t3 ); \
t1 = v128_sr16( t0, 6 ); \
t1 = v128_and( t1, _lsbmask2 ); \
t0 = v128_xor( t4, v128_shuffle8( _mul4mask, t1 ) ); \
t4 = v128_shuffle8( t2, _supermix1b ); \
t3 = v128_xor( t3, v128_shuffle8( _mul2mask, t1 ) ); \
t1 = v128_shuffle8( t4, _supermix1c ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t4, _supermix1d ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t2, _supermix1a ); \
t2 = v128_xor3( t2, t3, t0 ); \
t2 = v128_shuffle8( t2, _supermix7a ); \
t4 = v128_xor3( t4, t1, t2 ); \
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t2 = v128_shuffle8( t2, _supermix7b ); \
t3 = v128_shuffle8( t3, _supermix2a ); \
t1 = v128_shuffle8( t0, _supermix4a ); \
t0 = v128_shuffle8( t0, _supermix4b ); \
t4 = v128_xor3( t4, t2, t1 ); \
t0 = _mm_xor_si128(t0, t3);\
t4 = v128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
/*
#define SUPERMIX(t0, t1, t2, t3, t4)\
PRESUPERMIX(t0, t1, t2, t3, t4);\
POSTSUPERMIX(t0, t1, t2, t3, t4)
*/
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t4 = t1;\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t4 = _mm_xor_si128(t4, t1);\
t2 = v128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = _mm_xor_si128(t4, t2);\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t4 = _mm_xor_si128(t4, t2);\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t4 = _mm_xor_si128(t4, t1);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t0 = _mm_xor_si128(t0, t3);\
t4 = _mm_xor_si128(t4, t0);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
t4 = _mm_xor_si128(t4, t0)
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE(r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE(r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
UNPACK_S0(r3c, r3a, _t3)
t0 = v128_xor( t0, t3 ); \
t4 = v128_xor3( t4, t0, v128_shuffle8( t0, _supermix4c ) );
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE( r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
_t0 = shuffle_0321( r1c ); \
r2c = v128_xor(r2c, _t0);\
_t0 = mask_1000( _t0 ); \
r2d = v128_xor(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
_t0 = shuffle_0321( r2c ); \
r3c = v128_xor(r3c, _t0);\
_t0 = mask_1000( _t0 ); \
r3d = v128_xor(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE( r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
_t0 = _mm_shuffle_epi32(r3c, 0x39);\
r4c = _mm_xor_si128(r4c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r4d = _mm_xor_si128(r4d, _t0);\
_t0 = shuffle_0321( r3c ); \
r4c = v128_xor(r4c, _t0);\
_t0 = mask_1000( _t0 ); \
r4d = v128_xor(r4d, _t0);\
UNPACK_S0(r3c, r3a, _t3);\
SUBSTITUTE( r4c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
@@ -256,18 +195,19 @@ MYALIGN const unsigned int _IV512[] = {
block[1] = col[(base + a + 1) % s];\
block[2] = col[(base + a + 2) % s];\
block[3] = col[(base + a + 3) % s];\
x = _mm_load_si128((__m128i*)block)
x = v128_load( (v128_t*)block )
#define STORECOLUMN(x, s)\
_mm_store_si128((__m128i*)block, x);\
v128_store((v128_t*)block, x );\
col[(base + 0) % s] = block[0];\
col[(base + 1) % s] = block[1];\
col[(base + 2) % s] = block[2];\
col[(base + 3) % s] = block[3]
void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
void Compress512( hashState_fugue *ctx, const unsigned char *pmsg,
unsigned int uBlockCount )
{
__m128i _t0, _t1, _t2, _t3;
v128_t _t0, _t1, _t2, _t3;
switch(ctx->base)
{
@@ -346,134 +286,133 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
pmsg += 4;
uBlockCount--;
}
}
void Final512(hashState_fugue *ctx, BitSequence *hashval)
void Final512( hashState_fugue *ctx, uint8_t *hashval )
{
unsigned int block[4] __attribute__ ((aligned (32)));
unsigned int col[36] __attribute__ ((aligned (16)));
unsigned int i, base;
__m128i r0, _t0, _t1, _t2, _t3;
v128_t r0, _t0, _t1, _t2, _t3;
for(i = 0; i < 12; i++)
for( i = 0; i < 12; i++ )
{
_mm_store_si128((__m128i*)block, ctx->state[i]);
v128_store( (v128_t*)block, ctx->state[i] );
col[3 * i + 0] = block[0];
col[3 * i + 1] = block[1];
col[3 * i + 2] = block[2];
}
base = (36 - (12 * ctx->base)) % 36;
base = ( 36 - (12 * ctx->base) ) % 36;
for(i = 0; i < 32; i++)
for( i = 0; i < 32; i++ )
{
// ROR3
base = (base + 33) % 36;
// CMIX
col[(base + 0) % 36] ^= col[(base + 4) % 36];
col[(base + 1) % 36] ^= col[(base + 5) % 36];
col[(base + 2) % 36] ^= col[(base + 6) % 36];
col[(base + 18) % 36] ^= col[(base + 4) % 36];
col[(base + 19) % 36] ^= col[(base + 5) % 36];
col[(base + 20) % 36] ^= col[(base + 6) % 36];
col[ (base + 0) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 1) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 2) % 36 ] ^= col[ (base + 6) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 20) % 36 ] ^= col[ (base + 6) % 36 ];
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}
for(i = 0; i < 13; i++)
for( i = 0; i < 13; i++ )
{
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S19 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 28) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 28) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR8
base = (base + 28) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// Transform to the standard basis and store output; S1 || S2 || S3 || S4
LOADCOLUMN(r0, 36, 1);
_mm_store_si128((__m128i*)hashval, r0);
LOADCOLUMN( r0, 36, 1 );
v128_store( (v128_t*)hashval, r0 );
// Transform to the standard basis and store output; S9 || S10 || S11 || S12
LOADCOLUMN(r0, 36, 9);
_mm_store_si128((__m128i*)hashval + 1, r0);
LOADCOLUMN( r0, 36, 9 );
v128_store( (v128_t*)hashval + 1, r0 );
// Transform to the standard basis and store output; S18 || S19 || S20 || S21
LOADCOLUMN(r0, 36, 18);
_mm_store_si128((__m128i*)hashval + 2, r0);
LOADCOLUMN( r0, 36, 18 );
v128_store( (v128_t*)hashval + 2, r0 );
// Transform to the standard basis and store output; S27 || S28 || S29 || S30
LOADCOLUMN(r0, 36, 27);
_mm_store_si128((__m128i*)hashval + 3, r0);
LOADCOLUMN( r0, 36, 27 );
v128_store( (v128_t*)hashval + 3, r0 );
}
HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
int fugue512_Init( hashState_fugue *ctx, int nHashSize )
{
int i;
ctx->processed_bits = 0;
@@ -487,18 +426,18 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
for(i = 0; i < 6; i++)
ctx->state[i] = v128_zero;
ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0);
ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1);
ctx->state[8] = _mm_load_si128((__m128i*)_IV512 + 2);
ctx->state[9] = _mm_load_si128((__m128i*)_IV512 + 3);
ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
ctx->state[6] = casti_v128( _IV512, 0 );
ctx->state[7] = casti_v128( _IV512, 1 );
ctx->state[8] = casti_v128( _IV512, 2 );
ctx->state[9] = casti_v128( _IV512, 3 );
ctx->state[10] = casti_v128( _IV512, 4 );
ctx->state[11] = casti_v128( _IV512, 5 );
return SUCCESS;
return 0;
}
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
int fugue512_Update( hashState_fugue *state, const void *data,
uint64_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -509,7 +448,8 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
if(state->uBufferBytes != 0)
{
// Fill the buffer
memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
memcpy( state->buffer + state->uBufferBytes, (void*)data,
state->uBlockLength - state->uBufferBytes );
// Process the buffer
Compress512(state, state->buffer, 1);
@@ -545,13 +485,13 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
state->uBufferBytes += uByteLength;
}
return SUCCESS;
return 0;
}
HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
int fugue512_Final( hashState_fugue *state, void *hashval )
{
unsigned int i;
BitSequence lengthbuf[8] __attribute__((aligned(64)));
uint8_t lengthbuf[8] __attribute__((aligned(64)));
// Update message bit count
state->processed_bits += state->uBufferBytes * 8;
@@ -575,16 +515,17 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
// Finalization
Final512(state, hashval);
return SUCCESS;
return 0;
}
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
uint64_t databitlen )
{
fugue512_Init(hs, 512);
fugue512_Update(hs, data, databitlen*8);
fugue512_Final(hs, hashval);
return SUCCESS;
fugue512_Init( hs, 512 );
fugue512_Update( hs, data, databitlen*8 );
fugue512_Final( hs, hashval );
return 0;
}
#endif // AES

View File

@@ -14,37 +14,31 @@
#ifndef FUGUE_HASH_API_H
#define FUGUE_HASH_API_H
#if defined(__AES__)
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#if !defined(__SSE4_1__)
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
#endif
#include "compat/sha3_common.h"
#include "simd-utils.h"
typedef struct
{
__m128i state[12];
v128_t state[12];
unsigned int base;
unsigned int uHashSize;
unsigned int uBlockLength;
unsigned int uBufferBytes;
DataLength processed_bits;
BitSequence buffer[4];
uint64_t processed_bits;
uint8_t buffer[4];
} hashState_fugue __attribute__ ((aligned (64)));
// These functions are deprecated, use the lower case macro aliases that use
// the standard interface. This will be cleaned up at a later date.
HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
int fugue512_Init( hashState_fugue *state, int hashbitlen );
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
int fugue512_Update( hashState_fugue *state, const void *data,
uint64_t databitlen );
HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
int fugue512_Final( hashState_fugue *state, void *hashval );
#define fugue512_init( state ) \
fugue512_Init( state, 512 )
@@ -54,7 +48,8 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
fugue512_Final
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
uint64_t databitlen);
#endif // AES
#endif // HASH_API_H

View File

@@ -696,7 +696,7 @@ static void AddModulo512(const void *a,const void *b,void *c)
static void AddXor512(const void *a,const void *b,void *c)
{
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
casti_m512i( b, 0 ) );
#elif defined(__AVX2__)
@@ -704,15 +704,15 @@ static void AddXor512(const void *a,const void *b,void *c)
casti_m256i( b, 0 ) );
casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
casti_m256i( b, 1 ) );
#elif defined(__SSE2__)
casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
casti_m128i( b, 0 ) );
casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
casti_m128i( b, 1 ) );
casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
casti_m128i( b, 2 ) );
casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
casti_m128i( b, 3 ) );
#elif defined(__SSE2__) || defined(__ARM_NEON)
casti_v128( c, 0 ) = v128_xor( casti_v128( a, 0 ),
casti_v128( b, 0 ) );
casti_v128( c, 1 ) = v128_xor( casti_v128( a, 1 ),
casti_v128( b, 1 ) );
casti_v128( c, 2 ) = v128_xor( casti_v128( a, 2 ),
casti_v128( b, 2 ) );
casti_v128( c, 3 ) = v128_xor( casti_v128( a, 3 ),
casti_v128( b, 3 ) );
#else
const unsigned long long *A=a, *B=b;
unsigned long long *C=c;

View File

@@ -60,54 +60,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
#if defined(__ARM_NEON)
// No fast shuffle on NEON
//static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
{ 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
#define gr_shuffle32( v ) v128_blendv( v128_qrev32( v ), v, BLEND_MASK )
/*
#define TRANSP_MASK \
0xd,0x5,0x9,0x1,0xc,0x4,0x8,0x0,0xf,0x7,0xb,0x3,0xe,0x6,0xa,0x2
#define SUBSH_MASK0 \
0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8
#define SUBSH_MASK1 \
0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9
#define SUBSH_MASK2 \
0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa
#define SUBSH_MASK3 \
0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb
#define SUBSH_MASK4 \
0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc
#define SUBSH_MASK5 \
0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd
#define SUBSH_MASK6 \
0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe
#define SUBSH_MASK7 \
0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3
//#define gr_shuffle8( v, c ) v128_shullfev8( v, c )
#define gr_shuffle8( v, c15, c14, c13, c12, c11, c10, c09, c08, \
c07, c06, c05, c04, c03, c02, c01, c00 ) \
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
v, 15, v, c15 ), 14, v, c14 ), 13, v, c13 ), 12, v, c12 ), \
11, v, c11 ), 10, v, c10 ), 9, v, c09 ), 8, v, c08 ), \
7, v, c07 ), 6, v, c06 ), 5, v, c05 ), 4, v, c04 ), \
3, v, c03 ), 2, v, c02 ), 1, v, c01 ), 0, v, c00 )
*/
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )
#else
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )
#endif
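Both definitions are meant to produce the same lane order: the x86 immediate 0xd8 selects 32-bit lanes 0, 2, 1, 3, while the NEON gr_mask treats each byte as a source byte index and picks bytes 0-3, 8-11, 4-7, 12-15. A scalar sketch of that equivalence, assuming little-endian storage of the mask constant:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    // Source vector as four 32-bit lanes.
    uint32_t v[4] = { 0x03020100, 0x13121110, 0x23222120, 0x33323130 };

    // x86 path: _mm_shuffle_epi32( v, 0xd8 ); 0xd8 = 0b11011000 -> lanes 0, 2, 1, 3.
    uint32_t x86[4] = { v[0], v[2], v[1], v[3] };

    // NEON path: vqtbl1q_u8 with gr_mask modelled as a scalar byte gather.
    uint8_t mask[16] = { 0x00,0x01,0x02,0x03, 0x08,0x09,0x0a,0x0b,
                         0x04,0x05,0x06,0x07, 0x0c,0x0d,0x0e,0x0f };
    uint8_t src[16], dst[16];
    memcpy( src, v, 16 );
    for ( int i = 0; i < 16; i++ ) dst[i] = src[ mask[i] ];

    printf( "%d\n", memcmp( dst, x86, 16 ) == 0 );   // prints 1
    return 0;
}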
#define tos(a) #a
#define tostr(a) tos(a)
@@ -140,7 +103,7 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#if defined(VL256)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
@@ -334,17 +297,16 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
*/
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* SubBytes */\
b0 = v128_xor(b0, b0);\
a0 = v128_aesenclast(a0, b0);\
a1 = v128_aesenclast(a1, b0);\
a2 = v128_aesenclast(a2, b0);\
a3 = v128_aesenclast(a3, b0);\
a4 = v128_aesenclast(a4, b0);\
a5 = v128_aesenclast(a5, b0);\
a6 = v128_aesenclast(a6, b0);\
a7 = v128_aesenclast(a7, b0);\
a0 = v128_aesenclast_nokey( a0 ); \
a1 = v128_aesenclast_nokey( a1 ); \
a2 = v128_aesenclast_nokey( a2 ); \
a3 = v128_aesenclast_nokey( a3 ); \
a4 = v128_aesenclast_nokey( a4 ); \
a5 = v128_aesenclast_nokey( a5 ); \
a6 = v128_aesenclast_nokey( a6 ); \
a7 = v128_aesenclast_nokey( a7 ); \
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7 ); \
}
#define ROUNDS_P(){\
@@ -362,10 +324,9 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
xmm13 = v128_shuffle8( xmm13, SUBSH_MASK5 ); \
xmm14 = v128_shuffle8( xmm14, SUBSH_MASK6 ); \
xmm15 = v128_shuffle8( xmm15, SUBSH_MASK7 ); \
/* SubBytes + MixBytes */\
/* SubBytes + MixBytes */\
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
\
/* AddRoundConstant P1024 */\
xmm0 = v128_xor( xmm0, \
casti_v128( round_const_p, round_counter+1 ) ); \
@@ -467,7 +428,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
t1 = v128_unpackhi16(t1, i3);\
i2 = v128_unpacklo16(i2, i3);\
i0 = v128_unpacklo16(i0, i1);\
\
/* shuffle with immediate */\
t0 = gr_shuffle32( t0 ); \
t1 = gr_shuffle32( t1 ); \
@@ -477,7 +437,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
i2 = gr_shuffle32( i2 ); \
i4 = gr_shuffle32( i4 ); \
i6 = gr_shuffle32( i6 ); \
\
/* continue with unpack */\
t4 = i0;\
i0 = v128_unpacklo32(i0, i2);\
@@ -584,7 +543,8 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
/* transpose done */\
}/**/
#if 0
// not used
void INIT( v128_t* chaining )
{
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -613,6 +573,7 @@ void INIT( v128_t* chaining )
chaining[6] = xmm14;
chaining[7] = xmm15;
}
#endif
void TF1024( v128_t* chaining, const v128_t* message )
{

View File

@@ -1,3 +1,6 @@
#if !defined GROESTL256_INTR_AES_H__
#define GROESTL256_INTR_AES_H__
/* groestl-intr-aes.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -50,18 +53,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
#if defined(__ARM_NEON)
// No fast shuffle on NEON
static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
{ 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )
#else
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )
#endif
#define tos(a) #a
#define tostr(a) tos(a)
@@ -93,7 +95,7 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#if defined(VL256)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
@@ -598,4 +600,4 @@ void OF512( v128_t* chaining )
chaining[3] = xmm11;
}
#endif

View File

@@ -146,7 +146,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
const int hash_offset = SIZE512 - hashlen_m128i;
uint64_t blocks = len / SIZE512;
v128_t* in = (v128_t*)input;
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
@@ -181,6 +181,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output

View File

@@ -87,6 +87,7 @@ int final_groestl( hashState_groestl*, void* );
int update_and_final_groestl( hashState_groestl*, void*, const void*, int );
int groestl512( hashState_groestl*, void*, const void*, uint64_t );
#define groestl512_full groestl512
#define groestl512_ctx groestl512
#endif /* __hash_h */

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#define GROESTL_4WAY_VAES 1
#endif

View File

@@ -17,7 +17,7 @@
#if defined(__AVX2__) && defined(__VAES__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )

View File

@@ -43,7 +43,7 @@
#define SIZE256 (SIZE_512/16)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE256];

View File

@@ -42,7 +42,7 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12,
@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =
#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = mm256_bcast_m128( mm128_mask_32( v128_neg1, 0x3 ) ); \
b1 = mm256_bcast_m128( v128_mask32( v128_neg1, 0x3 ) ); \
a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
a1 = _mm256_xor_si256( a1, b1 );\
a2 = _mm256_xor_si256( a2, b1 );\

View File

@@ -17,7 +17,7 @@
#if defined(__AVX2__) && defined(__VAES__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{

View File

@@ -33,7 +33,7 @@
#define SIZE512 (SIZE_1024/16)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE512];

View File

@@ -50,7 +50,7 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12,
@@ -239,7 +239,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
casti_m128i( round_const_p, round_counter ) ) ); \
casti_v128u32( round_const_p, round_counter ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK1 );\
@@ -254,7 +254,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
\
/* AddRoundConstant P1024 */\
xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
casti_m128i( round_const_p, round_counter+1 ) ) ); \
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
@@ -283,7 +283,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
casti_m128i( round_const_q, round_counter ) ) ); \
casti_v128u32( round_const_q, round_counter ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK3 );\
@@ -306,7 +306,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
casti_m128i( round_const_q, round_counter+1 ) ) ); \
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
@@ -812,7 +812,7 @@ static const __m256i SUBSH_MASK7_2WAY =
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
casti_m128i( round_const_p, round_counter ) ) ); \
casti_v128u32( round_const_p, round_counter ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\
@@ -827,7 +827,7 @@ static const __m256i SUBSH_MASK7_2WAY =
\
/* AddRoundConstant P1024 */\
xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
casti_m128i( round_const_p, round_counter+1 ) ) ); \
casti_v128u32( round_const_p, round_counter+1 ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
@@ -856,7 +856,7 @@ static const __m256i SUBSH_MASK7_2WAY =
xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
casti_m128i( round_const_q, round_counter ) ) ); \
casti_v128u32( round_const_q, round_counter ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\
@@ -879,7 +879,7 @@ static const __m256i SUBSH_MASK7_2WAY =
xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
casti_m128i( round_const_q, round_counter+1 ) ) ); \
casti_v128u32( round_const_q, round_counter+1 ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\

View File

@@ -213,7 +213,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3,n+2,n+1,n ) );
myriad_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate )
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriad_hash;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT;
#endif
return true;
};

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#define MYRGR_8WAY 1
#elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
#define MYRGR_4WAY 1

View File

@@ -382,12 +382,12 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
#define S1F MF
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Hamsi 8 way AVX512
// Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have same
// timig. However, when tested hashing X13 on i9-9940x using cmplt with zero
// timing. However, testing hashing X13 on i9-9940x using cmplt with zero
// had a 3% faster overall hashrate than using movepi.
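
Both forms mentioned in that comment produce the same per-lane sign mask; a minimal AVX-512 sketch of the two alternatives (illustration only, not part of the macro that follows):

#include <immintrin.h>

static inline __mmask8 sign_mask_movepi( __m512i x )
{
   return _mm512_movepi64_mask( x );                              // mask from the sign bits
}

static inline __mmask8 sign_mask_cmplt( __m512i x )
{
   return _mm512_cmplt_epi64_mask( x, _mm512_setzero_si512() );   // x < 0, same result
}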
#define INPUT_BIG8 \
@@ -418,13 +418,11 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
tb = mm512_xoror( b, d, a ); \
a = _mm512_xor_si512( a, c ); \
b = mm512_xoror( td, tb, a ); \
td = mm512_xorand( a, td, tb ); \
d = _mm512_ternarylogic_epi64( a, td, tb, 0x87 );/* not( xorand( a, td, tb ) ); */ \
a = c; \
c = mm512_xor3( tb, b, td ); \
d = mm512_not( td ); \
c = _mm512_ternarylogic_epi64( tb, b, d, 0x69 ); /* not( xor3( tb, b, d ) ); */ \
}
/*
#define SBOX8( a, b, c, d ) \
do { \
@@ -1122,7 +1120,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
// Hamsi 4 way AVX2
#if defined(__AVX512VL__)
#if defined(VL256)
#define INPUT_BIG \
do { \
@@ -1155,11 +1153,99 @@ do { \
b = mm256_xoror( td, tb, a ); \
d = _mm256_ternarylogic_epi64( a, td, tb, 0x87 );/* mm256_not( mm256_xorand( a, td, tb ) ); */ \
a = c; \
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /*mm256_not( mm256_xor3( tb, b, d ) );*/ \
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /* mm256_not( mm256_xor3( tb, b, d ) ); */ \
}
#else
#define INPUT_BIG_sub( db_i ) \
{ \
const __m256i dm = _mm256_cmpgt_epi64( zero, db_i ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
tp += 8; \
}
#define INPUT_BIG \
{ \
const __m256i db = *buf; \
const __m256i zero = m256_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
INPUT_BIG_sub( _mm256_slli_epi64( db,63 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,62 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,61 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,60 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,59 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,58 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,57 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,56 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,55 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,54 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,53 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,52 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,51 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,50 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,49 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,48 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,47 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,46 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,45 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,44 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,43 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,42 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,41 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,40 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,39 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,38 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,37 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,36 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,35 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,34 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,33 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,32 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,31 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,30 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,29 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,28 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,27 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,26 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,25 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,24 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,23 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,22 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,21 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,20 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,19 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,18 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,17 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,16 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,15 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,14 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,13 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,12 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,11 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db,10 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 9 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 8 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 7 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 6 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 5 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 4 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 3 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 2 ) ); \
INPUT_BIG_sub( _mm256_slli_epi64( db, 1 ) ); \
INPUT_BIG_sub( db ); \
}
#if 0
// dependent on the compiler unrolling the loop
#define INPUT_BIG \
do { \
__m256i db = *buf; \
@@ -1180,6 +1266,7 @@ do { \
tp += 8; \
} \
} while (0)
#endif
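
For orientation, a scalar model of the same table-driven expansion (a sketch only; it assumes T512 is laid out as 64 rows of eight 64-bit words, matching the (const uint64_t*) cast above, and that row i is keyed by message bit i, as the shift sequence 63..0 implies):

#include <stdint.h>

static void hamsi_input_ref( uint64_t m[8], uint64_t db,
                             const uint64_t T512ref[64][8] )
{
   for ( int k = 0; k < 8; k++ ) m[k] = 0;
   for ( int i = 0; i < 64; i++ )
   {
      // all-ones mask when bit i of the message word is set, else zero
      const uint64_t mask = -( ( db >> i ) & 1 );
      for ( int k = 0; k < 8; k++ )
         m[k] ^= T512ref[i][k] & mask;
   }
}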
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX( a, b, c, d ) \
@@ -1219,7 +1306,7 @@ do { \
do { \
a = mm256_rol_32( a, 13 ); \
c = mm256_rol_32( c, 3 ); \
b = mm256_xor3( a, b, c ); \
b = mm256_xor3( b, a, c ); \
d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
b = mm256_rol_32( b, 1 ); \
d = mm256_rol_32( d, 7 ); \
@@ -1501,7 +1588,7 @@ do { /* order is important */ \
sc->h[14] = CE; \
sc->h[15] = CF;
#if defined(__AVX512VL__)
#if defined(VL256)
#define INPUT_8X32 \
{ \
@@ -1961,6 +2048,94 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
sc->h[6] = c6; \
sc->h[7] = c7;
#define INPUT_2x64_sub( db_i ) \
{ \
const v128u64_t dm = v128_cmpgt64( zero, db_i ); \
m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
tp += 8; \
}
#define INPUT_2x64 \
{ \
const v128u64_t db = *buf; \
const v128u64_t zero = v128_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
INPUT_2x64_sub( v128_sl64( db,63 ) ); \
INPUT_2x64_sub( v128_sl64( db,62 ) ); \
INPUT_2x64_sub( v128_sl64( db,61 ) ); \
INPUT_2x64_sub( v128_sl64( db,60 ) ); \
INPUT_2x64_sub( v128_sl64( db,59 ) ); \
INPUT_2x64_sub( v128_sl64( db,58 ) ); \
INPUT_2x64_sub( v128_sl64( db,57 ) ); \
INPUT_2x64_sub( v128_sl64( db,56 ) ); \
INPUT_2x64_sub( v128_sl64( db,55 ) ); \
INPUT_2x64_sub( v128_sl64( db,54 ) ); \
INPUT_2x64_sub( v128_sl64( db,53 ) ); \
INPUT_2x64_sub( v128_sl64( db,52 ) ); \
INPUT_2x64_sub( v128_sl64( db,51 ) ); \
INPUT_2x64_sub( v128_sl64( db,50 ) ); \
INPUT_2x64_sub( v128_sl64( db,49 ) ); \
INPUT_2x64_sub( v128_sl64( db,48 ) ); \
INPUT_2x64_sub( v128_sl64( db,47 ) ); \
INPUT_2x64_sub( v128_sl64( db,46 ) ); \
INPUT_2x64_sub( v128_sl64( db,45 ) ); \
INPUT_2x64_sub( v128_sl64( db,44 ) ); \
INPUT_2x64_sub( v128_sl64( db,43 ) ); \
INPUT_2x64_sub( v128_sl64( db,42 ) ); \
INPUT_2x64_sub( v128_sl64( db,41 ) ); \
INPUT_2x64_sub( v128_sl64( db,40 ) ); \
INPUT_2x64_sub( v128_sl64( db,39 ) ); \
INPUT_2x64_sub( v128_sl64( db,38 ) ); \
INPUT_2x64_sub( v128_sl64( db,37 ) ); \
INPUT_2x64_sub( v128_sl64( db,36 ) ); \
INPUT_2x64_sub( v128_sl64( db,35 ) ); \
INPUT_2x64_sub( v128_sl64( db,34 ) ); \
INPUT_2x64_sub( v128_sl64( db,33 ) ); \
INPUT_2x64_sub( v128_sl64( db,32 ) ); \
INPUT_2x64_sub( v128_sl64( db,31 ) ); \
INPUT_2x64_sub( v128_sl64( db,30 ) ); \
INPUT_2x64_sub( v128_sl64( db,29 ) ); \
INPUT_2x64_sub( v128_sl64( db,28 ) ); \
INPUT_2x64_sub( v128_sl64( db,27 ) ); \
INPUT_2x64_sub( v128_sl64( db,26 ) ); \
INPUT_2x64_sub( v128_sl64( db,25 ) ); \
INPUT_2x64_sub( v128_sl64( db,24 ) ); \
INPUT_2x64_sub( v128_sl64( db,23 ) ); \
INPUT_2x64_sub( v128_sl64( db,22 ) ); \
INPUT_2x64_sub( v128_sl64( db,21 ) ); \
INPUT_2x64_sub( v128_sl64( db,20 ) ); \
INPUT_2x64_sub( v128_sl64( db,19 ) ); \
INPUT_2x64_sub( v128_sl64( db,18 ) ); \
INPUT_2x64_sub( v128_sl64( db,17 ) ); \
INPUT_2x64_sub( v128_sl64( db,16 ) ); \
INPUT_2x64_sub( v128_sl64( db,15 ) ); \
INPUT_2x64_sub( v128_sl64( db,14 ) ); \
INPUT_2x64_sub( v128_sl64( db,13 ) ); \
INPUT_2x64_sub( v128_sl64( db,12 ) ); \
INPUT_2x64_sub( v128_sl64( db,11 ) ); \
INPUT_2x64_sub( v128_sl64( db,10 ) ); \
INPUT_2x64_sub( v128_sl64( db, 9 ) ); \
INPUT_2x64_sub( v128_sl64( db, 8 ) ); \
INPUT_2x64_sub( v128_sl64( db, 7 ) ); \
INPUT_2x64_sub( v128_sl64( db, 6 ) ); \
INPUT_2x64_sub( v128_sl64( db, 5 ) ); \
INPUT_2x64_sub( v128_sl64( db, 4 ) ); \
INPUT_2x64_sub( v128_sl64( db, 3 ) ); \
INPUT_2x64_sub( v128_sl64( db, 2 ) ); \
INPUT_2x64_sub( v128_sl64( db, 1 ) ); \
INPUT_2x64_sub( db ); \
}
#if 0
// Dependent on the compiler unrolling the loop.
#define INPUT_2x64 \
{ \
v128u64_t db = *buf; \
@@ -1981,6 +2156,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
tp += 8; \
} \
}
#endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX_2x64( a, b, c, d ) \
@@ -2001,7 +2177,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
{ \
a = v128_rol32( a, 13 ); \
c = v128_rol32( c, 3 ); \
b = v128_xor3( a, b, c ); \
b = v128_xor3( c, a, b ); \
d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
b = v128_rol32( b, 1 ); \
d = v128_rol32( d, 7 ); \

View File

@@ -104,7 +104,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Hamsi-512 8x64

View File

@@ -53,7 +53,7 @@ extern "C"{
#define SPH_SMALL_FOOTPRINT_HAVAL 1
//#endif
#if defined(__AVX512VL__)
#if defined(VL256)
// ( ~( a ^ b ) ) & c
#define v128_andnotxor( a, b, c ) \
@@ -583,7 +583,7 @@ do { \
// Haval-256 8 way 32 bit avx2
#if defined (__AVX512VL__)
#if defined (VL256)
// ( ~( a ^ b ) ) & c
#define mm256_andnotxor( a, b, c ) \
@@ -882,7 +882,7 @@ do { \
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// ( ~( a ^ b ) ) & c
#define mm512_andnotxor( a, b, c ) \

View File

@@ -107,7 +107,7 @@ void haval256_5_8way_close( void *cc, void *dst );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__m512i buf[32];

View File

@@ -204,7 +204,7 @@ static const uint64_t IV512[] =
(state)->H[15] = h7l; \
} while (0)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define Sb_8W(x0, x1, x2, x3, c) \
{ \
@@ -364,8 +364,7 @@ static const uint64_t IV512[] =
#if defined(__AVX2__)
#if defined(__AVX512VL__)
//TODO enable for AVX10_256, not used with AVX512VL
#if defined(VL256)
#define notxorandnot( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x2d )
@@ -522,7 +521,7 @@ static const uint64_t IV512[] =
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
void jh256_8x64_init( jh_8x64_context *sc )
{

View File

@@ -55,7 +55,7 @@
* <code>memcpy()</code>).
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define KECCAK_8WAY 1
#elif defined(__AVX2__)
#define KECCAK_4WAY 1
@@ -12,7 +12,7 @@
#define KECCAK_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA3D_8WAY 1
#elif defined(__AVX2__)
#define SHA3D_4WAY 1

View File

@@ -57,7 +57,7 @@ static const uint64_t RC[] = {
#define DO(x) x
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define INPUT_BUF(size) do { \
size_t j; \

View File

@@ -4,7 +4,7 @@
#include <stddef.h>
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -59,7 +59,7 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define cns4w(i) mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] )
@@ -524,8 +524,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
a = _mm256_xor_si256( a, c0 ); \
b = _mm256_xor_si256( b, c1 );
//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512
#if defined(__AVX512VL__)
#if defined(VL256)
#define MULT2( a0, a1 ) \
{ \

View File

@@ -51,7 +51,7 @@
#define LIMIT_512 128
/*********************************/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
uint32_t buffer[8*4];

View File

@@ -28,8 +28,7 @@
a = v128_xor( a, c0 ); \
b = v128_xor( b, c1 ); \
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#if defined(VL256)
#define MULT2( a0, a1 ) \
{ \
@@ -48,29 +47,22 @@
a1 = _mm_alignr_epi8( b, a1, 4 ); \
}
#elif defined(__ARM_NEON)
#elif defined(__ARM_NEON) || defined(__SSE2__)
// { a1_0, 0, a1_0, a1_0 }
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( vdupq_laneq_u32( a1, 0 ), MASK ) ); \
v128_t b = v128_xor( a0, v128_and( v128_bcast32( a1 ), MASK ) ); \
a0 = v128_alignr32( a1, b, 1 ); \
a1 = v128_alignr32( b, a1, 1 ); \
}
#else // assume SSE2
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, v128_and( _mm_shuffle_epi32( a1, 0 ), MASK ) ); \
a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
}
#else
#warning __FILE__ ":" __LINE__ " Unknown or unsupported CPU architecture."
#endif
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#if defined(VL256)
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \

View File

@@ -68,4 +68,4 @@ int update_and_final_luffa( hashState_luffa *state, void* output,
int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
const void* data, size_t inlen );
#endif // LUFFA_FOR_SSE2_H___
#endif // LUFFA_FOR_SSE2_H__

View File

@@ -15,7 +15,7 @@
#include "algo/groestl/sph_groestl.h"
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ALLIUM_16WAY 1
#elif defined(__AVX2__)
#define ALLIUM_8WAY 1
@@ -465,12 +465,8 @@ typedef union
{
keccak256_2x64_context keccak;
cubehashParam cube;
//#if defined(__x86_64__)
skein256_2x64_context skein;
//#else
// sph_skein512_context skein;
//#endif
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl256 groestl;
#else
sph_groestl256_context groestl;
@@ -516,7 +512,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
//#if defined(__x86_64__)
intrlv_2x64( vhashA, hash0, hash1, 256 );
skein256_2x64_init( &ctx.skein );
skein256_2x64_update( &ctx.skein, vhashA, 32 );
@@ -527,23 +522,8 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
skein256_2x64_update( &ctx.skein, vhashA, 32 );
skein256_2x64_close( &ctx.skein, vhashA );
dintrlv_2x64( hash2, hash3, vhashA, 256 );
/*
#else
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash0, 32 );
sph_skein256_close( &ctx.skein, hash0 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash1, 32 );
sph_skein256_close( &ctx.skein, hash1 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash2, 32 );
sph_skein256_close( &ctx.skein, hash2 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash3, 32 );
sph_skein256_close( &ctx.skein, hash3 );
#endif
*/
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl256_full( &ctx.groestl, hash0, hash0, 256 );
groestl256_full( &ctx.groestl, hash1, hash1, 256 );
groestl256_full( &ctx.groestl, hash2, hash2, 256 );

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2REV3_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV3_8WAY 1
@@ -49,7 +49,7 @@ bool init_lyra2rev3_ctx();
//////////////////////////////////
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2REV2_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV2_8WAY 1
@@ -108,7 +108,7 @@ bool lyra2h_thread_init();
/////////////////////////////////////////
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define PHI2_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define PHI2_4WAY 1

View File

@@ -41,7 +41,7 @@
// lyra2z330, lyra2h,
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
/**
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords

View File

@@ -59,7 +59,7 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );

View File

@@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
lyra2h_4way_midstate( vdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2h_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )

View File

@@ -456,7 +456,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2rev2_4way_hash( hash, vdata );

View File

@@ -3,7 +3,7 @@
#include "lyra2.h"
#include "algo/blake/blake256-hash.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2Z_16WAY 1
#elif defined(__AVX2__)
#define LYRA2Z_8WAY 1

View File

@@ -4,7 +4,7 @@
#include "algo/gost/sph_gost.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "lyra2.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#include "algo/echo/echo-hash-4way.h"
#elif defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"

View File

@@ -27,7 +27,7 @@
#include "lyra2.h"
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
{

View File

@@ -43,7 +43,7 @@ static const uint64_t blake2b_IV[8] =
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define G2W_4X64(a,b,c,d) \
a = _mm512_add_epi64( a, b ); \
@@ -150,13 +150,13 @@ static const uint64_t blake2b_IV[8] =
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a), 32 ); \
d = v128_ror64xor( d, a, 32 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 24 ); \
b = v128_ror64xor( b, c, 24 ); \
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a ), 16 ); \
d = v128_ror64xor( d, a, 16 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 63 );
b = v128_ror64xor( b, c, 63 );
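
Judging from the substitution above, v128_ror64xor(a, b, n) fuses the XOR with the rotate, i.e. it behaves like ror64(a ^ b, n); that reading is an assumption based on this diff, not a verified definition. Scalar model:

#include <stdint.h>

static inline uint64_t ror64xor_ref( uint64_t a, uint64_t b, unsigned n )
{
   const uint64_t x = a ^ b;
   return ( x >> n ) | ( x << ( 64 - n ) );   // rotate right, valid for 1 <= n <= 63
}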
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
@@ -195,10 +195,6 @@ static const uint64_t blake2b_IV[8] =
#endif // AVX2 else SSE2
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
#define G( r, i, a, b, c, d ) \
{ \
a = a + b; \
@@ -222,7 +218,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
G( r, 7, v[ 3], v[ 4], v[ 9], v[14] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
union _ovly_512
{

View File

@@ -21,7 +21,7 @@
#define EPS1 DBL_EPSILON
#define EPS2 3.0e-11
inline double exp_n( double xt )
static inline double exp_n( double xt )
{
if ( xt < -700.0 )
return 0;
@@ -33,7 +33,7 @@ inline double exp_n( double xt )
return exp( xt );
}
inline double exp_n2( double x1, double x2 )
static inline double exp_n2( double x1, double x2 )
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
p5 = 37., p6 = 700.;
@@ -306,7 +306,7 @@ bool register_m7m_algo( algo_gate_t *gate )
applog( LOG_ERR, "M7M algo is not supported on MacOS");
return false;
#else
gate->optimizations = SHA_OPT;
gate->optimizations = SHA256_OPT;
init_m7m_ctx();
gate->scanhash = (void*)&scanhash_m7m_hash;
gate->build_stratum_request = (void*)&std_be_build_stratum_request;

View File

@@ -1,75 +0,0 @@
// Copyright (c) 2014 The Magi developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include <iostream>
#include <cfloat>
#include <limits>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "magimath.h"
#define EPS1 (std::numeric_limits<double>::epsilon())
#define EPS2 3.0e-11
static void gauleg(double x1, double x2, double x[], double w[], const int n)
{
int m,j,i;
double z1, z, xm, xl, pp, p3, p2, p1;
m=(n+1)/2;
xm=0.5*(x2+x1);
xl=0.5*(x2-x1);
for (i=1;i<=m;i++) {
z=cos(3.141592654*(i-0.25)/(n+0.5));
do {
p1=1.0;
p2=0.0;
for (j=1;j<=n;j++) {
p3=p2;
p2=p1;
p1=((2.0*j-1.0)*z*p2-(j-1.0)*p3)/j;
}
pp=n*(z*p1-p2)/(z*z-1.0);
z1=z;
z=z1-p1/pp;
} while (fabs(z-z1) > EPS2);
x[i]=xm-xl*z;
x[n+1-i]=xm+xl*z;
w[i]=2.0*xl/((1.0-z*z)*pp*pp);
w[n+1-i]=w[i];
}
}
static double GaussianQuad_N(double func(const double), const double a2, const double b2, const int NptGQ)
{
double s=0.0;
#ifdef _MSC_VER
#define SW_DIVS 23
double x[SW_DIVS+1], w[SW_DIVS+1];
#else
double x[NptGQ+1], w[NptGQ+1];
#endif
gauleg(a2, b2, x, w, NptGQ);
for (int j=1; j<=NptGQ; j++) {
s += w[j]*func(x[j]);
}
return s;
}
static double swit_(double wvnmb)
{
return pow( (5.55243*(exp_n(-0.3*wvnmb/15.762) - exp_n(-0.6*wvnmb/15.762)))*wvnmb, 0.5)
/ 1034.66 * pow(sin(wvnmb/65.), 2.);
}
uint32_t sw_(int nnounce, int divs)
{
double wmax = ((sqrt((double)(nnounce))*(1.+EPS1))/450+100);
return ((uint32_t)(GaussianQuad_N(swit_, 0., wmax, divs)*(1.+EPS1)*1.e6));
}

View File

@@ -1,54 +0,0 @@
// Copyright (c) 2014 The Magi developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef MAGI_MATH_H
#define MAGI_MATH_H
#include <math.h>
#ifdef __cplusplus
extern "C" {
#endif
uint32_t sw_(int nnounce, int divs);
#ifdef __cplusplus
}
#endif
inline double exp_n(double xt)
{
double p1 = -700.0, p3 = -0.8e-8, p4 = 0.8e-8, p6 = 700.0;
if(xt < p1)
return 0;
else if(xt > p6)
return 1e200;
else if(xt > p3 && xt < p4)
return (1.0 + xt);
else
return exp(xt);
}
// 1 / (1 + exp(x1-x2))
inline double exp_n2(double x1, double x2)
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.;
double xt = x1 - x2;
if (xt < p1+1.e-200)
return 1.;
else if (xt > p1 && xt < p2 + 1.e-200)
return ( 1. - exp(xt) );
else if (xt > p2 && xt < p3 + 1.e-200)
return ( 1. / (1. + exp(xt)) );
else if (xt > p3 && xt < p4)
return ( 1. / (2. + xt) );
else if (xt > p4 - 1.e-200 && xt < p5)
return ( exp(-xt) / (1. + exp(-xt)) );
else if (xt > p5 - 1.e-200 && xt < p6)
return ( exp(-xt) );
else //if (xt > p6 - 1.e-200)
return 0.;
}
#endif

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define NIST5_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define NIST5_4WAY 1

View File

@@ -71,8 +71,7 @@ do { \
} while (0)
#define GAMMA_4W(n0, n1, n2, n4) \
(g ## n0 = v128_xor( a ## n0, \
v128_or( a ## n1, v128_not( a ## n2 ) ) ) )
(g ## n0 = v128_xor( a ## n0, v128_ornot( a ## n2, a ## n1 ) ) )
#define PI_ALL_4W do { \
a0 = g0; \
@@ -312,7 +311,7 @@ do { \
BUPDATE1_8W( 7, 1 ); \
} while (0)
#if defined(__AVX512VL__)
#if defined(VL256)
#define GAMMA_8W(n0, n1, n2, n4) \
( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )
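
Both gamma forms compute g = a0 ^ (a1 | ~a2); the ternary-logic immediate 0x4b encodes exactly that boolean function for the operand order (a0, a2, a1) used here. Scalar model, illustration only:

#include <stdint.h>

static inline uint32_t gamma_ref( uint32_t a0, uint32_t a1, uint32_t a2 )
{
   return a0 ^ ( a1 | ~a2 );
}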

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ANIME_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define ANIME_4WAY 1

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define HMQ1725_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define HMQ1725_4WAY 1

View File

@@ -6,16 +6,16 @@
#include <stdint.h>
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#include "algo/jh/sph_jh.h"
@@ -33,18 +33,18 @@
union _hmq1725_ctx_holder
{
blake512_context blake;
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
hashState_groestl groestl;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_fugue512_context fugue;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
sph_skein512_context skein;
@@ -62,9 +62,6 @@ union _hmq1725_ctx_holder
};
typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;
//static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
//static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
extern void hmq1725hash(void *state, const void *input)
{
const uint32_t mask = 24;
@@ -82,7 +79,7 @@ extern void hmq1725hash(void *state, const void *input)
if ( hashB[0] & mask ) //1
{
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hashA, hashB, 512 );
#else
sph_groestl512_init( &ctx.groestl );
@@ -180,7 +177,7 @@ extern void hmq1725hash(void *state, const void *input)
sph_hamsi512( &ctx.hamsi, hashA, 64 ); //3
sph_hamsi512_close( &ctx.hamsi, hashB ); //4
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512_init( &ctx.fugue );
@@ -211,7 +208,7 @@ extern void hmq1725hash(void *state, const void *input)
if ( hashB[0] & mask ) //7
{
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512_init( &ctx.fugue );
@@ -226,7 +223,7 @@ extern void hmq1725hash(void *state, const void *input)
sph_sha512_close( &ctx.sha, hashA );
}
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hashB, hashA, 512 );
#else
sph_groestl512_init( &ctx.groestl );
@@ -262,30 +259,18 @@ extern void hmq1725hash(void *state, const void *input)
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
// uint32_t endiandata[32] __attribute__((aligned(64)));
uint32_t endiandata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
//const uint32_t Htarg = ptarget[7];
//we need bigendian data...
// for (int k = 0; k < 32; k++)
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
// hmq_bmw512_midstate( endiandata );
// if (opt_debug)
// {
// applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
// }
/* I'm to lazy to put the loop in an inline function... so dirty copy'n'paste.... */
/* i know that i could set a variable, but i don't know how the compiler will optimize it, not that then the cpu needs to load the value *everytime* in a register */
if (ptarget[7]==0) {
do {
pdata[19] = ++n;

View File

@@ -14,7 +14,8 @@ bool register_quark_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define QUARK_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define QUARK_4WAY 1

View File

@@ -7,12 +7,12 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
@@ -21,9 +21,9 @@
void quark_hash(void *state, const void *input)
{
uint32_t hash[16] __attribute__((aligned(64)));
sph_blake512_context ctx_blake;
blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl ctx_groestl;
#else
sph_groestl512_context ctx_groestl;
@@ -33,17 +33,15 @@ void quark_hash(void *state, const void *input)
sph_keccak512_context ctx_keccak;
uint32_t mask = 8;
sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, input, 80 );
sph_blake512_close( &ctx_blake, hash );
blake512_full( &ctx_blake, hash, input, 80 );
sph_bmw512_init( &ctx_bmw );
sph_bmw512( &ctx_bmw, hash, 64 );
sph_bmw512_close( &ctx_bmw, hash );
if ( hash[0] & mask )
{
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(const char*)hash, 512 );
@@ -60,7 +58,7 @@ void quark_hash(void *state, const void *input)
sph_skein512_close( &ctx_skein, hash );
}
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(const char*)hash, 512 );
@@ -76,9 +74,7 @@ void quark_hash(void *state, const void *input)
if ( hash[0] & mask )
{
sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, hash, 64 );
sph_blake512_close( &ctx_blake, hash );
blake512_full( &ctx_blake, hash, hash, 64 );
}
else
{

View File

@@ -83,7 +83,7 @@ int scanhash_deep_2way( struct work *work,uint32_t max_nonce,
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );

View File

@@ -236,7 +236,7 @@ int scanhash_qubit_2way( struct work *work,uint32_t max_nonce,
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );

View File

@@ -16,7 +16,8 @@ bool register_qubit_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_qubit;
gate->hash = (void*)&qubit_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define QUBIT_4WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define QUBIT_2WAY 1

View File

@@ -8,13 +8,9 @@
#include <stdio.h>
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#ifdef __AES__
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
@@ -25,12 +21,8 @@ typedef struct
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
#ifdef __AES__
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
@@ -45,12 +37,7 @@ void init_qubit_ctx()
init_luffa(&qubit_ctx.luffa,512);
cubehashInit(&qubit_ctx.cubehash,512,16,32);
sph_shavite512_init(&qubit_ctx.shavite);
#if defined(__aarch64__)
sph_simd512_init( &qubit_ctx.simd );
#else
init_sd( &qubit_ctx.simd, 512 );
#endif
#ifdef __AES__
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_echo(&qubit_ctx.echo, 512);
#else
sph_echo512_init(&qubit_ctx.echo);
@@ -81,15 +68,9 @@ void qubit_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
final_sd( &ctx.simd, (BitSequence *)hash );
#endif
#ifdef __AES__
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
update_final_echo( &ctx.echo, (BitSequence *) hash,
(const BitSequence *) hash, 512 );
#else

View File

@@ -104,14 +104,14 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
@@ -224,14 +224,14 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
casti_v128u32( edata, 0 ) = v128_bswap32( casti_v128u32( pdata, 0 ) );
casti_v128u32( edata, 1 ) = v128_bswap32( casti_v128u32( pdata, 1 ) );
casti_v128u32( edata, 2 ) = v128_bswap32( casti_v128u32( pdata, 2 ) );
casti_v128u32( edata, 3 ) = v128_bswap32( casti_v128u32( pdata, 3 ) );
casti_v128u32( edata, 4 ) = v128_bswap32( casti_v128u32( pdata, 4 ) );
casti_v128u32( edata, 5 ) = v128_bswap32( casti_v128u32( pdata, 5 ) );
casti_v128u32( edata, 6 ) = v128_bswap32( casti_v128u32( pdata, 6 ) );
casti_v128u32( edata, 7 ) = v128_bswap32( casti_v128u32( pdata, 7 ) );
intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );

View File

@@ -51,7 +51,6 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
bool register_lbry_algo( algo_gate_t* gate )
{
// gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#if defined (LBRY_16WAY)
gate->scanhash = (void*)&scanhash_lbry_16way;
gate->hash = (void*)&lbry_16way_hash;
@@ -67,7 +66,7 @@ bool register_lbry_algo( algo_gate_t* gate )
#else
gate->scanhash = (void*)&scanhash_lbry;
gate->hash = (void*)&lbry_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA256_OPT;
#endif
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
gate->build_extraheader = (void*)&lbry_build_extraheader;

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LBRY_16WAY 1
#elif defined(__AVX2__)
#define LBRY_8WAY 1

View File

@@ -35,20 +35,20 @@ static const uint32_t IV[5] =
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
#define F3(x, y, z) \
_mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z )
_mm_xor_si128( v128_ornot( y, x ), z )
#define F4(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
#define F5(x, y, z) \
_mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) )
_mm_xor_si128( x, v128_ornot( z, y ) )
#define RR(a, b, c, d, e, f, s, r, k) \
do{ \
a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
a = _mm_add_epi32( v128_rol32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( a, f( b ,c, d ) ), r ), \
_mm_set1_epi64x( k ) ), s ), e ); \
c = mm128_rol_32( c, 10 );\
c = v128_rol32( c, 10 );\
} while (0)
#define ROUND1(a, b, c, d, e, f, s, r, k) \
@@ -319,7 +319,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
ripemd160_4way_round( sc );
for (u = 0; u < 5; u ++)
casti_m128i( dst, u ) = sc->val[u];
casti_v128u32( dst, u ) = sc->val[u];
}
#endif
@@ -335,13 +335,13 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( y, z ), x ), z )
#define F8W_3(x, y, z) \
_mm256_xor_si256( _mm256_or_si256( x, mm256_not( y ) ), z )
_mm256_xor_si256( mm256_ornot( y, x ), z )
#define F8W_4(x, y, z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( x, y ), z ), y )
#define F8W_5(x, y, z) \
_mm256_xor_si256( x, _mm256_or_si256( y, mm256_not( z ) ) )
_mm256_xor_si256( x, mm256_ornot( z, y ) )
#define RR_8W(a, b, c, d, e, f, s, r, k) \
do{ \
@@ -625,7 +625,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// RIPEMD-160 16 way

View File

@@ -33,7 +33,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
size_t len );
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -745,7 +745,7 @@ do{ \
SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Tested OK but very slow
// 16 way parallel, requires 16x32 interleaving
@@ -2074,7 +2074,7 @@ void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N )
v128_ovly v;
for ( int l = 0; l < 4; l++ )
v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
X[i] = v128_xor( X[i], v.m128 );
X[i] = v128_xor( X[i], v.v128 );
}
xor_salsa8_4way( &X[ 0], &X[16] );
@@ -2211,10 +2211,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
// X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 }
// X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 }
y[0].m128 = X0;
y[1].m128 = X1;
y[2].m128 = X2;
y[3].m128 = X3;
y[0].v128 = X0;
y[1].v128 = X1;
y[2].v128 = X2;
y[3].v128 = X3;
z[0].u32[0] = y[0].u32[0];
z[0].u32[3] = y[1].u32[0];
@@ -2236,10 +2236,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
z[3].u32[1] = y[2].u32[3];
z[3].u32[0] = y[3].u32[3];
B[0] = v128_add32( B[0], z[0].m128 );
B[1] = v128_add32( B[1], z[1].m128 );
B[2] = v128_add32( B[2], z[2].m128 );
B[3] = v128_add32( B[3], z[3].m128 );
B[0] = v128_add32( B[0], z[0].v128 );
B[1] = v128_add32( B[1], z[1].v128 );
B[2] = v128_add32( B[2], z[2].v128 );
B[3] = v128_add32( B[3], z[3].v128 );
#endif
@@ -2404,14 +2404,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
/*
v128_ovly ya[4], za[4], yb[4], zb[4];
ya[0].m128 = XA[0];
yb[0].m128 = XB[0];
ya[1].m128 = XA[1];
yb[1].m128 = XB[1];
ya[2].m128 = XA[2];
yb[2].m128 = XB[2];
ya[3].m128 = XA[3];
yb[3].m128 = XB[3];
ya[0].v128 = XA[0];
yb[0].v128 = XB[0];
ya[1].v128 = XA[1];
yb[1].v128 = XB[1];
ya[2].v128 = XA[2];
yb[2].v128 = XB[2];
ya[3].v128 = XA[3];
yb[3].v128 = XB[3];
za[0].u32[0] = ya[0].u32[0];
zb[0].u32[0] = yb[0].u32[0];
@@ -2449,14 +2449,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
za[3].u32[3] = ya[0].u32[3];
zb[3].u32[3] = yb[0].u32[3];
XA[0] = za[0].m128;
XB[0] = zb[0].m128;
XA[1] = za[1].m128;
XB[1] = zb[1].m128;
XA[2] = za[2].m128;
XB[2] = zb[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XA[0] = za[0].v128;
XB[0] = zb[0].v128;
XA[1] = za[1].v128;
XB[1] = zb[1].v128;
XA[2] = za[2].v128;
XB[2] = zb[2].v128;
XA[3] = za[3].v128;
XB[3] = zb[3].v128;
*/
}
@@ -2487,7 +2487,7 @@ static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb,
XA3 = BA[3] = v128_xor( BA[3], CA[3] );
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
SALSA_8ROUNDS_SIMD128_2BUF;
@@ -2770,18 +2770,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
/*
v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];
ya[0].m128 = XA[0];
yb[0].m128 = XB[0];
yc[0].m128 = XC[0];
ya[1].m128 = XA[1];
yb[1].m128 = XB[1];
yc[1].m128 = XC[1];
ya[2].m128 = XA[2];
yb[2].m128 = XB[2];
yc[2].m128 = XC[2];
ya[3].m128 = XA[3];
yb[3].m128 = XB[3];
yc[3].m128 = XC[3];
ya[0].v128 = XA[0];
yb[0].v128 = XB[0];
yc[0].v128 = XC[0];
ya[1].v128 = XA[1];
yb[1].v128 = XB[1];
yc[1].v128 = XC[1];
ya[2].v128 = XA[2];
yb[2].v128 = XB[2];
yc[2].v128 = XC[2];
ya[3].v128 = XA[3];
yb[3].v128 = XB[3];
yc[3].v128 = XC[3];
za[0].u32[0] = ya[0].u32[0];
zb[0].u32[0] = yb[0].u32[0];
@@ -2835,18 +2835,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
zb[3].u32[3] = yb[0].u32[3];
zc[3].u32[3] = yc[0].u32[3];
XA[0] = za[0].m128;
XB[0] = zb[0].m128;
XC[0] = zc[0].m128;
XA[1] = za[1].m128;
XB[1] = zb[1].m128;
XC[1] = zc[1].m128;
XA[2] = za[2].m128;
XB[2] = zb[2].m128;
XC[2] = zc[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XC[3] = zc[3].m128;
XA[0] = za[0].v128;
XB[0] = zb[0].v128;
XC[0] = zc[0].v128;
XA[1] = za[1].v128;
XB[1] = zb[1].v128;
XC[1] = zc[1].v128;
XA[2] = za[2].v128;
XB[2] = zb[2].v128;
XC[2] = zc[2].v128;
XA[3] = za[3].v128;
XB[3] = zb[3].v128;
XC[3] = zc[3].v128;
*/
}
@@ -2886,7 +2886,7 @@ static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc,
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
XC3 = BC[3] = v128_xor( BC[3], CC[3] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
SALSA_8ROUNDS_SIMD128_3BUF;
@@ -3049,7 +3049,7 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
xf = (B[15] ^= C[15]);
#define ROL32( a, c ) ror32( a, c )
#define ROL32( a, c ) rol32( a, c )
#define ADD32( a, b ) ( (a)+(b) )
#define XOR( a, b ) ( (a)^(b) )
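
Context for the ROL32 fix above: Salsa20's quarter-round uses left rotations, so ROL32 must map to rol32. A scalar reference quarter-round (standard Salsa20, shown only for orientation, not the miner's vector code):

#include <stdint.h>

static inline uint32_t rol32_ref( uint32_t x, unsigned n )
{
   return ( x << n ) | ( x >> ( 32 - n ) );
}

static inline void salsa_qround_ref( uint32_t *a, uint32_t *b,
                                     uint32_t *c, uint32_t *d )
{
   *b ^= rol32_ref( *a + *d,  7 );
   *c ^= rol32_ref( *b + *a,  9 );
   *d ^= rol32_ref( *c + *b, 13 );
   *a ^= rol32_ref( *d + *c, 18 );
}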

View File

@@ -5,7 +5,7 @@
#include <stdlib.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );

View File

@@ -35,7 +35,7 @@
//#include <mm_malloc.h>
#include "malloc-huge.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SCRYPT_THROUGHPUT 16
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
#define SCRYPT_THROUGHPUT 2
@@ -592,7 +592,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
#endif /* HAVE_SHA256_8WAY */
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
static inline void sha256_16way_init_state( void *state )
{
@@ -1481,7 +1481,7 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
#else
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#endif
@@ -1494,7 +1494,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
// scrypt_throughput defined at compile time and used to replace
// MAX_WAYS to reduce memory usage.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// scrypt_throughput = 16;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf

View File

@@ -74,8 +74,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
memset( pad, 0x36, 64*4 );
for ( i = 0; i < Klen; i++ )
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
casti_m128i( K, i ) );
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->ictx, pad, 64 );
@@ -83,8 +83,8 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
sha256_4way_init( &ctx->octx );
memset( pad, 0x5c, 64*4 );
for ( i = 0; i < Klen/4; i++ )
casti_m128i( pad, i ) = _mm_xor_si128( casti_m128i( pad, i ),
casti_m128i( K, i ) );
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->octx, pad, 64 );
}
@@ -158,8 +158,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
/* ... xor U_j ... */
for ( k = 0; k < 8; k++ )
casti_m128i( T, k ) = _mm_xor_si128( casti_m128i( T, k ),
casti_m128i( U, k ) );
casti_v128u32( T, k ) = _mm_xor_si128( casti_v128u32( T, k ),
casti_v128u32( U, k ) );
}
/* Copy as many bytes as necessary into buf. */
@@ -306,7 +306,7 @@ pbkdf2_sha256_8way( uint8_t *buf, size_t dkLen, const uint8_t *passwd,
}
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// HMAC 16-way AVX512
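
The casti_m128i to casti_v128u32 renames above are mechanical; both are cast-and-index helpers that treat a byte buffer as an array of 128-bit vectors. A sketch of what such a helper typically expands to on x86_64 (assumed definition for illustration; the real macro is in the simd-utils headers and is portable to NEON):

#include <immintrin.h>

/* Sketch only: element i of buffer p viewed as an array of 128-bit vectors. */
#define casti_v128u32_sketch( p, i )  ( ((__m128i*)(p))[(i)] )

/* Conceptually, the HMAC key-pad XOR above is then just
   pad_vec[i] = pad_vec[i] ^ key_vec[i]  over 16-byte chunks.                */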

View File

@@ -84,7 +84,7 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t,
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct _hmac_sha256_16way_context
{

View File

@@ -205,7 +205,7 @@ void sha1_x86_sha_transform_be( uint32_t *state_out, const void *input,
#endif
#if defined(__aarch64__) && defined(__ARM_FEATURE_SHA2)
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#define sha1_neon_rounds( state_out, data, state_in ) \
{ \

View File

@@ -580,7 +580,7 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
// to avoid recalculating it as Y^Z. This optimization is not applicable
// when MAJ is optimized with ternary logic.
#if defined(__AVX512VL__)
#if defined(VL256)
#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )
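
The comment above refers to the standard CH/MAJ identities: with AVX-512VL each of CH and MAJ collapses to one vpternlog instruction (truth tables 0xca and 0xe8), while the SSE/AVX2 path instead reuses Y^Z from the previous round via the rolling X_xor_Y / Y_xor_Z variables seen in the hunks below. A minimal sketch of the two MAJ formulations (illustrative only, not the file's exact macros; all values are __m256i):

/* AVX-512VL: majority in a single ternary-logic instruction.           */
#define MAJ_TL( X, Y, Z )  _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 )

/* SSE2/AVX2: MAJ(X,Y,Z) = Y ^ ( (X^Y) & (Y^Z) ).
   X^Y computed this round is next round's Y^Z, so carrying the rolling
   X_xor_Y / Y_xor_Z values saves one XOR per round.                    */
X_xor_Y = _mm256_xor_si256( X, Y );
MAJ     = _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y, Y_xor_Z ) );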
@@ -788,7 +788,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
G = _mm256_load_si256( state_in + 6 );
H = _mm256_load_si256( state_in + 7 );
#if !defined(__AVX512VL__)
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
@@ -830,7 +830,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );
#if !defined(__AVX512VL__)
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
#endif
@@ -936,7 +936,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i IV7 = H;
const __m256i IV6 = G;
#if !defined(__AVX512VL__)
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
@@ -981,7 +981,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
W[11] = SHA256_8WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );
#if !defined(__AVX512VL__)
#if !defined(VL256)
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
@@ -1172,7 +1172,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
sha256_8way_close( &ctx, dst );
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// SHA-256 16 way

View File

@@ -1,6 +1,6 @@
#include "sha256-hash.h"
#if ( defined(__x86_64__) && defined(__SHA__) ) || defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#if ( defined(__x86_64__) && defined(__SHA__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) )
static const uint32_t SHA256_IV[8] =
{
@@ -10,6 +10,28 @@ static const uint32_t SHA256_IV[8] =
#if defined(__x86_64__) && defined(__SHA__)
/* common code used for rounds 12 through 51 */
#define sha256_generic_qround( s0, s1, m, a, b, c ) \
TMP = _mm_alignr_epi8( a, c, 4 ); \
s1 = _mm_sha256rnds2_epu32( s1, s0, m ); \
b = _mm_add_epi32( b, TMP ); \
b = _mm_sha256msg2_epu32( b, a ); \
m = _mm_shuffle_epi32( m, 0x0e ); \
s0 = _mm_sha256rnds2_epu32( s0, s1, m ); \
c = _mm_sha256msg1_epu32( c, a );
// r12-15
// sha256_generic_qround( s0, s1, m, t3, t0, t2 )
// r16-19
// sha256_generic_qround( s0, s1, m, t0, t1, t3 )
// r20-23
// sha256_generic_qround( s0, s1, m, t1, t2, t0 )
// r24-27
// sha256_generic_qround( s0, s1, m, t2, t3, t1 ) ...
#define sha256_opt_rounds( state_out, input, state_in ) \
{ \
__m128i STATE0, STATE1; \
@@ -189,7 +211,7 @@ static const uint32_t SHA256_IV[8] =
_mm_store_si128( (__m128i*) &state_out[4], STATE1 ); \
}
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
void sha256_x86_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) casti_v128( m, i )
@@ -197,7 +219,7 @@ void sha256_opt_transform_le( uint32_t *state_out, const void *input,
#undef load_msg
}
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
void sha256_x86_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) )
@@ -517,7 +539,7 @@ void sha256_opt_transform_be( uint32_t *state_out, const void *input,
_mm_store_si128( (__m128i*) &out_Y[4], STATE1_Y ); \
}
void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y,
void sha256_x86_x2sha_transform_le( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
@@ -526,7 +548,7 @@ void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y,
#undef load_msg
}
void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y,
void sha256_x86_x2sha_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
@@ -541,14 +563,14 @@ void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y,
// The goal is to avoid any redundant processing in final. Prehash is almost
// 4 rounds total, only missing the final addition of the nonce.
// Nonce must be set to zero for prehash.
void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
uint32_t *sstate, const uint32_t *istate )
{
__m128i STATE0, STATE1, MSG, TMP;
// Load initial values
TMP = casti_m128i( istate, 0 );
STATE1 = casti_m128i( istate, 1 );
TMP = casti_v128u32( istate, 0 );
STATE1 = casti_v128u32( istate, 1 );
TMP = _mm_shuffle_epi32( TMP, 0xB1 ); // CDAB
STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); // EFGH
@@ -556,20 +578,20 @@ void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH
// Save current hash
casti_m128i( sstate, 0 ) = STATE0;
casti_m128i( sstate, 1 ) = STATE1;
casti_v128u32( sstate, 0 ) = STATE0;
casti_v128u32( sstate, 1 ) = STATE1;
// Rounds 0 to 3
MSG = casti_m128i( msg, 0 );
MSG = casti_v128u32( msg, 0 );
TMP = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL );
MSG = _mm_add_epi32( MSG, TMP );
STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
MSG = _mm_shuffle_epi32( MSG, 0x0E );
casti_m128i( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
casti_m128i( ostate, 1 ) = STATE1;
casti_v128u32( ostate, 0 ) = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
casti_v128u32( ostate, 1 ) = STATE1;
}
void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y )
@@ -579,22 +601,22 @@ void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
STATE0_X = casti_m128i( state_mid_X, 0 );
STATE1_X = casti_m128i( state_mid_X, 1 );
STATE0_Y = casti_m128i( state_mid_Y, 0 );
STATE1_Y = casti_m128i( state_mid_Y, 1 );
STATE0_X = casti_v128u32( state_mid_X, 0 );
STATE1_X = casti_v128u32( state_mid_X, 1 );
STATE0_Y = casti_v128u32( state_mid_Y, 0 );
STATE1_Y = casti_v128u32( state_mid_Y, 1 );
// Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
TMSG0_X = casti_m128i( msg_X, 0 );
TMSG0_Y = casti_m128i( msg_Y, 0 );
TMP_X = mm128_xim_32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = mm128_xim_32( TMSG0_Y, TMSG0_Y, 0xd5 );
TMSG0_X = casti_v128u32( msg_X, 0 );
TMSG0_Y = casti_v128u32( msg_Y, 0 );
TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );
// Rounds 4 to 7
TMSG1_X = casti_m128i( msg_X, 1 );
TMSG1_Y = casti_m128i( msg_Y, 1 );
TMSG1_X = casti_v128u32( msg_X, 1 );
TMSG1_Y = casti_v128u32( msg_Y, 1 );
TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL );
MSG_X = _mm_add_epi32( TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X );
@@ -616,8 +638,8 @@ void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_X );
// Rounds 12 to 15
TMSG3_X = casti_m128i( msg_X, 3 );
TMSG3_Y = casti_m128i( msg_Y, 3 );
TMSG3_X = casti_v128u32( msg_X, 3 );
TMSG3_Y = casti_v128u32( msg_Y, 3 );
TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL );
MSG_X = _mm_add_epi32( TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X );
@@ -845,20 +867,20 @@ void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y );
// Add saved state to new state
STATE0_X = _mm_add_epi32( STATE0_X, casti_m128i( state_save_X, 0 ) );
STATE1_X = _mm_add_epi32( STATE1_X, casti_m128i( state_save_X, 1 ) );
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_m128i( state_save_Y, 0 ) );
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_m128i( state_save_Y, 1 ) );
STATE0_X = _mm_add_epi32( STATE0_X, casti_v128u32( state_save_X, 0 ) );
STATE1_X = _mm_add_epi32( STATE1_X, casti_v128u32( state_save_X, 1 ) );
STATE0_Y = _mm_add_epi32( STATE0_Y, casti_v128u32( state_save_Y, 0 ) );
STATE1_Y = _mm_add_epi32( STATE1_Y, casti_v128u32( state_save_Y, 1 ) );
// Unshuffle & save state
TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); // FEBA
TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B );
STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); // DCHG
STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 );
casti_m128i( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
casti_m128i( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
casti_m128i( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
casti_m128i( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
casti_v128u32( out_X, 0 ) = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA
casti_v128u32( out_Y, 0 ) = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 );
casti_v128u32( out_X, 1 ) = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF
casti_v128u32( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 );
}
#endif // SHA
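
The prehash / final-rounds pair above exploits the fact that, within one scan, only the nonce word of the second header block changes, so the first rounds can be computed once with the nonce zeroed and only the remainder redone per nonce. A sketch of the calling pattern using the signatures in this file (the loop body is illustrative, not the actual scanhash code; block2_X and block2_Y are assumed to be uint32_t[16] copies of the second 64-byte header block with the nonce word zeroed, and istate is the midstate after the first block):

uint32_t mid[8], saved[8], hash_X[8], hash_Y[8];

/* Once per work unit: 3-round prehash with the nonce word set to zero. */
sha256_x86_sha_prehash_3rounds( mid, block2_X, saved, istate );

for ( uint32_t n = first_nonce; n < last_nonce; n += 2 )
{
   block2_X[3] = n;          /* msg[0] lane 3 is the nonce */
   block2_Y[3] = n + 1;
   sha256_x86_x2sha_final_rounds( hash_X, hash_Y, block2_X, block2_Y,
                                  mid, mid, saved, saved );
   /* hash_X / hash_Y now hold the first SHA-256; hash again or test. */
}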
@@ -887,14 +909,14 @@ static const uint32_t K256[64] =
#define sha256_neon_rounds( state_out, input, state_in ) \
{ \
uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; \
uint32x4_t STATE0, STATE1, ABCD_SAVE, EFGH_SAVE; \
uint32x4_t MSG0, MSG1, MSG2, MSG3; \
uint32x4_t TMP0, TMP1, TMP2; \
\
STATE0 = vld1q_u32( state_in ); \
STATE1 = vld1q_u32( state_in+4 ); \
ABEF_SAVE = STATE0; \
CDGH_SAVE = STATE1; \
ABCD_SAVE = STATE0; \
EFGH_SAVE = STATE1; \
\
MSG0 = load_msg( input, 0 ); \
MSG1 = load_msg( input, 1 ); \
@@ -1004,8 +1026,8 @@ static const uint32_t K256[64] =
TMP2 = STATE0; \
STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \
STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \
STATE0 = vaddq_u32( STATE0, ABEF_SAVE ); \
STATE1 = vaddq_u32( STATE1, CDGH_SAVE ); \
STATE0 = vaddq_u32( STATE0, ABCD_SAVE ); \
STATE1 = vaddq_u32( STATE1, EFGH_SAVE ); \
vst1q_u32( state_out , STATE0 ); \
vst1q_u32( state_out+4, STATE1 ); \
}
@@ -1029,8 +1051,8 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
#define sha256_neon_x2sha_rounds( state_out_X, state_out_Y, input_X, \
input_Y, state_in_X, state_in_Y ) \
{ \
uint32x4_t STATE0_X, STATE1_X, ABEF_SAVE_X, CDGH_SAVE_X; \
uint32x4_t STATE0_Y, STATE1_Y, ABEF_SAVE_Y, CDGH_SAVE_Y; \
uint32x4_t STATE0_X, STATE1_X, ABCD_SAVE_X, EFGH_SAVE_X; \
uint32x4_t STATE0_Y, STATE1_Y, ABCD_SAVE_Y, EFGH_SAVE_Y; \
uint32x4_t MSG0_X, MSG1_X, MSG2_X, MSG3_X; \
uint32x4_t MSG0_Y, MSG1_Y, MSG2_Y, MSG3_Y; \
uint32x4_t TMP0_X, TMP1_X, TMP2_X; \
@@ -1040,10 +1062,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
STATE0_Y = vld1q_u32( state_in_Y ); \
STATE1_X = vld1q_u32( state_in_X+4 ); \
STATE1_Y = vld1q_u32( state_in_Y+4 ); \
ABEF_SAVE_X = STATE0_X; \
ABEF_SAVE_Y = STATE0_Y; \
CDGH_SAVE_X = STATE1_X; \
CDGH_SAVE_Y = STATE1_Y; \
ABCD_SAVE_X = STATE0_X; \
ABCD_SAVE_Y = STATE0_Y; \
EFGH_SAVE_X = STATE1_X; \
EFGH_SAVE_Y = STATE1_Y; \
\
MSG0_X = load_msg( input_X, 0 ); \
MSG0_Y = load_msg( input_Y, 0 ); \
@@ -1245,10 +1267,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \
STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \
STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \
STATE0_X = vaddq_u32( STATE0_X, ABEF_SAVE_X ); \
STATE0_Y = vaddq_u32( STATE0_Y, ABEF_SAVE_Y ); \
STATE1_X = vaddq_u32( STATE1_X, CDGH_SAVE_X ); \
STATE1_Y = vaddq_u32( STATE1_Y, CDGH_SAVE_Y ); \
STATE0_X = vaddq_u32( STATE0_X, ABCD_SAVE_X ); \
STATE0_Y = vaddq_u32( STATE0_Y, ABCD_SAVE_Y ); \
STATE1_X = vaddq_u32( STATE1_X, EFGH_SAVE_X ); \
STATE1_Y = vaddq_u32( STATE1_Y, EFGH_SAVE_Y ); \
vst1q_u32( state_out_X , STATE0_X ); \
vst1q_u32( state_out_Y , STATE0_Y ); \
vst1q_u32( state_out_X+4, STATE1_X ); \

View File

@@ -5,27 +5,21 @@
#include "simd-utils.h"
#include "cpuminer-config.h"
// generic interface
static const uint32_t SHA256_IV[8];
#if defined(__x86_64__) && defined(__SHA__)
typedef struct
{
unsigned char buf[64]; /* first field, for alignment */
unsigned char buf[64];
uint32_t state[8];
uint64_t count;
} sha256_context __attribute__((aligned(64)));
static const uint32_t SHA256_IV[8];
void sha256_full( void *hash, const void *data, size_t len );
void sha256_update( sha256_context *ctx, const void *data, size_t len );
void sha256_final( sha256_context *ctx, void *hash );
void sha256_ctx_init( sha256_context *ctx );
void sha256_transform_le( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
void sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if defined(__x86_64__) && defined(__SHA__)
void sha256_x86_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
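
Given the generic single-lane declarations above, a minimal sketch of hashing an 80-byte block header with this API (buffer names are illustrative; the one-shot and incremental forms are equivalent):

#include <stdint.h>
#include "sha256-hash.h"

void example_hash_header( uint8_t hash[32], const uint8_t header[80] )
{
   /* One-shot. */
   sha256_full( hash, header, 80 );

   /* Or incrementally. */
   sha256_context ctx;
   sha256_ctx_init( &ctx );
   sha256_update( &ctx, header, 80 );
   sha256_final( &ctx, hash );
}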
@@ -50,14 +44,6 @@ void sha256_x86_x2sha_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y );
// Temporary during name transition
#define sha256_opt_transform_le sha256_x86_sha_transform_le
#define sha256_opt_transform_be sha256_x86_sha_transform_be
#define sha256_ni2x_transform_le sha256_x86_x2sha_transform_le
#define sha256_ni2x_transform_be sha256_x86_x2sha_transform_be
#define sha256_ni_prehash_3rounds sha256_x86_sha_prehash_3rounds
#define sha256_ni2x_final_rounds sha256_x86_x2sha_final_rounds
// generic API
#define sha256_transform_le sha256_x86_sha_transform_le
#define sha256_transform_be sha256_x86_sha_transform_be
@@ -68,6 +54,20 @@ void sha256_x86_x2sha_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
// SHA-256 AArch64 with NEON & SHA2
typedef struct
{
unsigned char buf[64];
uint32_t state[8];
uint64_t count;
} sha256_context __attribute__((aligned(64)));
void sha256_full( void *hash, const void *data, size_t len );
void sha256_update( sha256_context *ctx, const void *data, size_t len );
void sha256_final( sha256_context *ctx, void *hash );
void sha256_ctx_init( sha256_context *ctx );
void sha256_neon_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
@@ -89,14 +89,6 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y );
// Temporary during name transition
#define sha256_transform_le sha256_neon_sha_transform_le
#define sha256_transform_be sha256_neon_sha_transform_be
#define sha256_2x_transform_le sha256_neon_x2sha_transform_le
#define sha256_2x_transform_be sha256_neon_x2sha_transform_be
#define sha256_prehash_3rounds sha256_neon_sha_prehash_3rounds
#define sha256_2x_final_rounds sha256_neon_x2sha_final_rounds
// generic API
#define sha256_transform_le sha256_neon_sha_transform_le
#define sha256_transform_be sha256_neon_sha_transform_be
@@ -106,9 +98,11 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
#define sha256_2x_final_rounds sha256_neon_x2sha_final_rounds
#else
// without HW acceleration...
#include "sph_sha2.h"
#define sha256_context sph_sha256_context
#define sha256_full sph_sha256_full
#define sha256_ctx_init sph_sha256_init
#define sha256_update sph_sha256
@@ -117,12 +111,11 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
#define sha256_transform_be sph_sha256_transform_be
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// SHA-256 16 way
// SHA-256 16 way x86_64
typedef struct
{
@@ -147,7 +140,7 @@ void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data,
int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target );
#define sha256_16way_context sha256_16x32_context
#define sha256_16way_context sha256_16x32_context
#define sha256_16way_init sha256_16x32_init
#define sha256_16way_update sha256_16x32_update
#define sha256_16way_close sha256_16x32_close
@@ -162,7 +155,7 @@ int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
#if defined (__AVX2__)
// SHA-256 8 way
// SHA-256 8 way x86_64
typedef struct
{
@@ -201,7 +194,7 @@ int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
#endif // AVX2
// SHA-256 4 way
// SHA-256 4 way x86_64 with SSE2 or AArch64 with NEON
typedef struct
{

Some files were not shown because too many files have changed in this diff.