Compare commits

...

6 Commits

Author       SHA1         Message   Date
Jay D Dee    c47c4a8885   v24.3     2024-05-28 18:20:19 -04:00
Jay D Dee    042d13d1e1   v24.2     2024-05-20 23:08:50 -04:00
Jay D Dee    4f930574cc   v24.1     2024-04-16 21:31:35 -04:00
Jay D Dee    9d3a46c355   v23.15    2023-11-30 14:36:47 -05:00
Jay D Dee    4e3f1b926f   v23.14    2023-11-28 00:58:43 -05:00
Jay D Dee    045b42babf   v23.13    2023-11-21 14:18:15 -05:00
194 changed files with 5714 additions and 4587 deletions

View File

@@ -16,6 +16,7 @@ bin_PROGRAMS = cpuminer
dist_man_MANS = cpuminer.1
cpuminer_SOURCES = \
dummy.cpp \
cpu-miner.c \
util.c \
api.c \
@@ -113,7 +114,6 @@ cpuminer_SOURCES = \
algo/lyra2/phi2-4way.c \
algo/lyra2/phi2.c \
algo/m7m/m7m.c \
algo/m7m/magimath.cpp \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
@@ -250,6 +250,7 @@ cpuminer_SOURCES = \
algo/x16/x16rt.c \
algo/x16/x16rt-4way.c \
algo/x16/hex.c \
algo/x16/x20r.c \
algo/x16/x21s-4way.c \
algo/x16/x21s.c \
algo/x16/minotaur.c \
@@ -288,7 +289,7 @@ if HAVE_WINDOWS
endif
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)

View File

@@ -87,7 +87,6 @@ Supported Algorithms
groestl Groestl coin
hex x16r-hex
hmq1725
hodl Hodlcoin
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
@@ -115,9 +114,11 @@ Supported Algorithms
scrypt:N scrypt(N, 1, 1)
scryptn2 scrypt(1048576, 1, 1)
sha256d Double SHA-256
sha256dt
sha256q Quad SHA-256
sha256t Triple SHA-256
sha3d Double keccak256 (BSHA3)
sha512256d
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
skunk Signatum (SIGT)
@@ -145,6 +146,7 @@ Supported Algorithms
x16rt-veil veil
x16s
x17
x20r
x21s
x22i
x25x

View File

@@ -75,6 +75,46 @@ If not what makes it happen or not happen?
Change Log
----------
v24.3
ARM: CPU feature detection and reporting is now working.
ARM: Verthash is now working.
ARM: Small speedup for yescrypt, yespower & argon2d.
Code cleanup.
v24.2
x86_64: Fixed blake2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4.
x86_64: Initial support for CPUs with AVX10, needs GCC-14.
ARM NEON: Various code optimisations.
v24.1
#414: fix bug in merkle error handling.
#416: change $nproc to $(nproc) in build scripts.
#420: change some inline function definitions to static inline.
#413: Fix formatting error for share result log when using no-color.
Faster 2 way interleaving.
Cleanup sha256 architecture targeting.
v23.15
Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
ARM: Fugue AES optimizations enabled.
ARM: quark, qubit, x11gost algos optimized with NEON & AES.
v23.14
ARM: Groestl AES optimizations enabled.
All: Small optimization to Shabal 4way.
x86_64: Extend Shabal 4way support to SSE2 from SSE4.1.
All: deleted some unused files.
v23.13
Added x20r algo.
Eliminated redundant hash order calculations for x16r family.
v23.12
Several bug fixes and speed improvements for the x16r family for all CPU architectures.

View File

@@ -184,7 +184,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
//int scanhash_8way_64_64( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr )
@@ -368,6 +368,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X16RT_VEIL: rc = register_x16rt_veil_algo ( gate ); break;
case ALGO_X16S: rc = register_x16s_algo ( gate ); break;
case ALGO_X17: rc = register_x17_algo ( gate ); break;
case ALGO_X20R: rc = register_x20r_algo ( gate ); break;
case ALGO_X21S: rc = register_x21s_algo ( gate ); break;
case ALGO_X22I: rc = register_x22i_algo ( gate ); break;
case ALGO_X25X: rc = register_x25x_algo ( gate ); break;
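
Note on a rename that recurs throughout this comparison: the four-flag AVX512 test
(__AVX512F__ && __AVX512VL__ && __AVX512DQ__ && __AVX512BW__) is collapsed into a
single SIMD512 test. A minimal sketch of how such a macro could be defined — an
assumption, since the header that actually defines SIMD512 is not part of this diff:

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
   #define SIMD512 1
#endif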

View File

@@ -98,25 +98,27 @@ typedef uint32_t set_t;
#define AVX512_OPT 1 << 6 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
#define VAES_OPT 1 << 8 // Icelake, Zen3
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA256_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
#define NEON_OPT 1 << 11 // AArch64
#define AVX10_256 1 << 12
#define AVX10_512 1 << 13
// AVX10 does not have explicit algo features:
// AVX10_512 is compatible with AVX512 + VAES
// AVX10_256 is compatible with AVX2 + VAES
// return set containing all elements from sets a & b
inline set_t set_union ( set_t a, set_t b ) { return a | b; }
static inline set_t set_union ( set_t a, set_t b ) { return a | b; }
// return set containing the common elements of sets a & b
inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
static inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
// all elements in set a are included in set b
inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
static inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
// no elements in set a are included in set b
inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
static inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
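
The feature-flag sets above are tested with these helpers when deciding whether a
CPU can run an algorithm's optimized code paths (the gates later in this comparison,
e.g. myr-gr, declare their optimizations with these flags). A small self-contained
sketch of such a check — the flag values and helper bodies are taken from this
header, while the cpu_supports_algo() wrapper is illustrative only:

#include <stdbool.h>
#include <stdint.h>

typedef uint32_t set_t;
#define AVX512_OPT (1 << 6)   // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
#define AES_OPT    (1 << 7)   // Intel Westmere, AArch64
#define VAES_OPT   (1 << 8)   // Icelake, Zen3
#define SHA256_OPT (1 << 9)   // Zen1, Icelake, AArch64

static inline set_t set_union( set_t a, set_t b ) { return a | b; }
static inline bool  set_incl ( set_t a, set_t b ) { return (a & b) == a; }

// true if every feature the algo wants is present in the detected CPU set
static inline bool cpu_supports_algo( set_t cpu_features, set_t algo_features )
{
   return set_incl( algo_features, cpu_features );
}

// example: an algo wanting AES+AVX512 runs on a CPU reporting AES+AVX512+VAES+SHA256:
// cpu_supports_algo( AES_OPT | AVX512_OPT | VAES_OPT | SHA256_OPT,
//                    set_union( AES_OPT, AVX512_OPT ) )  -> true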
typedef struct
{
@@ -246,7 +248,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
//int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -35,7 +35,7 @@
* @pre all block pointers must be valid
*/
#if defined(__AVX512F__)
#if defined(SIMD512)
static inline __m512i blamka( __m512i x, __m512i y )
{
@@ -237,7 +237,7 @@ void fill_segment(const argon2_instance_t *instance,
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index, i;
#if defined(__AVX512F__)
#if defined(SIMD512)
__m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
#elif defined(__AVX2__)
__m256i state[ARGON2_HWORDS_IN_BLOCK];

View File

@@ -21,7 +21,7 @@
#include "blake2-impl.h"
#include "simd-utils.h"
#if !defined(__AVX512F__)
#if !defined(SIMD512)
#if !defined(__AVX2__)

View File

@@ -39,7 +39,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
blakehash_4way( hash, vdata );

View File

@@ -1611,7 +1611,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
///////////////////////////////////////
//
@@ -2617,7 +2617,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
//Blake-256 16 way AVX512

View File

@@ -147,7 +147,7 @@ void blake256r8_8way_close(void *cc, void *dst);
#define blake256r8_8x32_update blake256r14_8way_update
#define blake256r8_8x32_close blake256r14_8way_close
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
///////////////////////////////////
//

View File

@@ -226,7 +226,7 @@ static const uint8_t sigma[12][16] =
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define B2B8W_G(a, b, c, d, x, y) \
{ \
@@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] =
v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
}
static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last )
{
__m512i v[16], m[16];
@@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
}
int blake2b_8way_init( blake2b_8way_ctx *ctx )
int blake2b_8x64_init( blake2b_8x64_ctx *ctx )
{
size_t i;
@@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
}
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen )
{
__m512i* in =(__m512i*)input;
@@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_8way_compress( ctx, 0 );
blake2b_8x64_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
@@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
}
}
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
@@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
ctx->c += 8;
}
blake2b_8way_compress( ctx, 1 ); // final block flag = 1
blake2b_8x64_compress( ctx, 1 ); // final block flag = 1
casti_m512i( out, 0 ) = ctx->h[0];
casti_m512i( out, 1 ) = ctx->h[1];
@@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = {
};
*/
static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last )
{
__m256i v[16], m[16];
@@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
}
int blake2b_4way_init( blake2b_4way_ctx *ctx )
int blake2b_4x64_init( blake2b_4x64_ctx *ctx )
{
size_t i;
@@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
return 0;
}
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen )
{
__m256i* in =(__m256i*)input;
@@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
ctx->t[0] += ctx->c;
if ( ctx->t[0] < ctx->c )
ctx->t[1]++;
blake2b_4way_compress( ctx, 0 );
blake2b_4x64_compress( ctx, 0 );
ctx->c = 0;
}
ctx->b[ c++ ] = in[i];
@@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
}
}
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out )
{
size_t c;
c = ctx->c >> 3;
@@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
ctx->c += 8;
}
blake2b_4way_compress( ctx, 1 ); // final block flag = 1
blake2b_4x64_compress( ctx, 1 ); // final block flag = 1
casti_m256i( out, 0 ) = ctx->h[0];
casti_m256i( out, 1 ) = ctx->h[1];

View File

@@ -1,6 +1,6 @@
#pragma once
#ifndef __BLAKE2B_HASH_4WAY_H__
#define __BLAKE2B_HASH_4WAY_H__
#ifndef BLAKE2B_HASH_4WAY_H__
#define BLAKE2B_HASH_4WAY_H__
#include "simd-utils.h"
#include <stddef.h>
@@ -15,7 +15,7 @@
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct ALIGN( 64 ) {
__m512i b[16]; // input buffer
@@ -23,12 +23,17 @@ typedef struct ALIGN( 64 ) {
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_8way_ctx;
} blake2b_8x64_ctx;
int blake2b_8way_init( blake2b_8way_ctx *ctx );
void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
int blake2b_8x64_init( blake2b_8x64_ctx *ctx );
void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out );
void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );
#define blake2b_8way_ctx blake2b_8x64_ctx
#define blake2b_8way_init blake2b_8x64_init
#define blake2b_8way_update blake2b_8x64_update
#define blake2b_8way_final blake2b_8x64_final
#endif
@@ -41,12 +46,17 @@ typedef struct ALIGN( 64 ) {
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_4way_ctx;
} blake2b_4x64_ctx;
int blake2b_4way_init( blake2b_4way_ctx *ctx );
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
int blake2b_4x64_init( blake2b_4x64_ctx *ctx );
void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
size_t inlen );
void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );
#define blake2b_4way_ctx blake2b_4x64_ctx
#define blake2b_4way_init blake2b_4x64_init
#define blake2b_4way_update blake2b_4x64_update
#define blake2b_4way_final blake2b_4x64_final
#endif

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include "blake2b-hash.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKE2B_8WAY
#elif defined(__AVX2__)
#define BLAKE2B_4WAY

View File

@@ -497,7 +497,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Blake2s-256 16 way

View File

@@ -11,8 +11,8 @@
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
//#pragma once
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
#ifndef BLAKE2S_HASH_4WAY_H__
#define BLAKE2S_HASH_4WAY_H__ 1
#if defined(__SSE2__) || defined(__ARM_NEON)
@@ -29,20 +29,20 @@
#define ALIGN(x) __attribute__((aligned(x)))
#endif
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_nway_param;
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_nway_param;
typedef struct ALIGN( 64 ) __blake2s_4way_state
{
@@ -67,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
typedef struct ALIGN( 64 ) __blake2s_8way_state
{
__m256i h[8];
uint8_t buf[ 32 * 8 ];
uint8_t buf[ 64 * 8 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
@@ -83,12 +83,12 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct ALIGN( 64 ) __blake2s_16way_state
{
__m512i h[8];
uint8_t buf[ 32 * 16 ];
uint8_t buf[ 64 * 16 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;

View File

@@ -3,7 +3,7 @@
#include <string.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKE2S_16WAY
#elif defined(__AVX2__)
#define BLAKE2S_8WAY

View File

@@ -349,16 +349,16 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( CBx( r, Sd ) ^ Mx( r, Sc ), \
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
Vd = v128_ror64xor( Vd, Va, 32 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
Vb = v128_ror64xor( Vb, Vc, 25 ); \
\
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( CBx( r, Sc ) ^ Mx( r, Sd ), \
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
Vd = v128_ror64xor( Vd, Va, 16 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
Vb = v128_ror64xor( Vb, Vc, 11 ); \
}
#define BLAKE512_ROUND( R ) \
@@ -559,7 +559,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
#if defined(__AVX2__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
////////////////////////////////////
//
@@ -1887,13 +1887,13 @@ blake512_4x64_close(void *cc, void *dst)
#define GB_2X64( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = v128_add64( v128_add64( v128_xor( v128_64( c1 ), m0 ), b ), a ); \
d = v128_ror64( v128_xor( d, a ), 32 ); \
d = v128_ror64xor( d, a, 32 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 25 ); \
b = v128_ror64xor( b, c, 25 ); \
a = v128_add64( v128_add64( v128_xor( v128_64( c0 ), m1 ), b ), a ); \
d = v128_ror64( v128_xor( d, a ), 16 ); \
d = v128_ror64xor( d, a, 16 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 11 ); \
b = v128_ror64xor( b, c, 11 ); \
}
#define ROUND_B_2X64(r) \
@@ -2054,9 +2054,9 @@ void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc,
// G4 skip nonce
V0 = v128_add64( v128_add64( v128_xor( v128_64( CB9 ), sc->buf[ 8] ), V5 ),
V0 );
VF = v128_ror64( v128_xor( VF, V0 ), 32 );
VF = v128_ror64xor( VF, V0, 32 );
VA = v128_add64( VA, VF );
V5 = v128_ror64( v128_xor( V5, VA ), 25 );
V5 = v128_ror64xor( V5, VA, 25 );
V0 = v128_add64( V0, V5 );
GB_2X64( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
@@ -2137,9 +2137,9 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
// finish round 0, with the nonce now available
V0 = v128_add64( V0, v128_xor( v128_64( CB8 ), M9 ) );
VF = v128_ror64( v128_xor( VF, V0 ), 16 );
VF = v128_ror64xor( VF, V0, 16 );
VA = v128_add64( VA, VF );
V5 = v128_ror64( v128_xor( V5, VA ), 11 );
V5 = v128_ror64xor( V5, VA, 11 );
// Round 1
// G0
@@ -2147,34 +2147,34 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
// G1
V1 = v128_add64( V1, V5 );
VD = v128_ror64( v128_xor( VD, V1 ), 32 );
VD = v128_ror64xor( VD, V1, 32 );
V9 = v128_add64( V9, VD );
V5 = v128_ror64( v128_xor( V5, V9 ), 25 );
V5 = v128_ror64xor( V5, V9, 25 );
V1 = v128_add64( V1, v128_add64( v128_xor( v128_64( CBx(1,2) ), Mx(1,3) ),
V5 ) );
VD = v128_ror64( v128_xor( VD, V1 ), 16 );
VD = v128_ror64xor( VD, V1, 16 );
V9 = v128_add64( V9, VD );
V5 = v128_ror64( v128_xor( V5, V9 ), 11 );
V5 = v128_ror64xor( V5, V9, 11 );
// G2
V2 = v128_add64( V2, v128_xor( v128_64( CBF ), M9 ) );
VE = v128_ror64( v128_xor( VE, V2 ), 32 );
VE = v128_ror64xor( VE, V2, 32 );
VA = v128_add64( VA, VE );
V6 = v128_ror64( v128_xor( V6, VA ), 25 );
V6 = v128_ror64xor( V6, VA, 25 );
V2 = v128_add64( V2, v128_add64( v128_xor( v128_64( CB9 ), MF ), V6 ) );
VE = v128_ror64( v128_xor( VE, V2 ), 16 );
VE = v128_ror64xor( VE, V2, 16 );
VA = v128_add64( VA, VE );
V6 = v128_ror64( v128_xor( V6, VA ), 11 );
V6 = v128_ror64xor( V6, VA, 11 );
// G3
VF = v128_ror64( v128_xor( VF, V3 ), 32 );
VF = v128_ror64xor( VF, V3, 32 );
VB = v128_add64( VB, VF );
V7 = v128_ror64( v128_xor( V7, VB ), 25 );
V7 = v128_ror64xor( V7, VB, 25 );
V3 = v128_add64( V3, v128_add64( v128_xor( v128_64( CBx(1, 6) ), Mx(1, 7) ),
V7 ) );
VF = v128_ror64( v128_xor( VF, V3 ), 16 );
VF = v128_ror64xor( VF, V3, 16 );
VB = v128_add64( VB, VF );
V7 = v128_ror64( v128_xor( V7, VB ), 11 );
V7 = v128_ror64xor( V7, VB, 11 );
// G4, G5, G6, G7
GB_2X64(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
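
The change repeated throughout this file — v128_ror64( v128_xor( x, y ), n ) becoming
v128_ror64xor( x, y, n ) — implies a fused xor-then-rotate helper. A sketch of the
presumed generic fallback, inferred from the one-for-one replacements above (the real
definition lives in simd-utils and is not shown in this diff):

#include "simd-utils.h"

// presumed semantics: rotate each 64-bit lane of (a ^ b) right by c bits
static inline v128_t v128_ror64xor_sketch( const v128_t a, const v128_t b, const int c )
{
   return v128_ror64( v128_xor( a, b ), c );
}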

View File

@@ -92,7 +92,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
#define blake512_4way_prehash_le blake512_4x64_prehash_le
#define blake512_4way_final_le blake512_4x64_final_le
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
////////////////////////////
//

View File

@@ -182,7 +182,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
pdata[19] = n;
blakecoin_4way_hash( hash, vdata );

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKECOIN_16WAY
#elif defined(__AVX2__)
#define BLAKECOIN_8WAY

View File

@@ -101,15 +101,15 @@
{ \
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
Vd = v128_ror64xor( Vd, Va, 32 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \
Vb = v128_ror64xor( Vb, Vc, 24 ); \
\
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
Vd = v128_ror64xor( Vd, Va, 16 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
Vb = v128_ror64xor( Vb, Vc, 63 ); \
}
#define BLAKE2B_ROUND( R ) \

View File

@@ -87,7 +87,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-256 16 way 32
@@ -157,7 +157,7 @@ void bmw512_4way_addbits_and_close(
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-512 64 bit 8 way
typedef struct

View File

@@ -1057,7 +1057,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-256 16 way 32

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BMW512_8WAY 1
#elif defined(__AVX2__)
#define BMW512_4WAY 1

View File

@@ -950,7 +950,7 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-512 8 WAY

View File

@@ -26,7 +26,7 @@ static const uint64_t IV512[] =
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// 4 way 128 is handy to avoid reinterleaving in many algos.
// If reinterleaving is necessary it may be more efficient to use

View File

@@ -6,7 +6,7 @@
#if defined(__AVX2__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
struct _cube_4way_context
{

View File

@@ -13,7 +13,7 @@ static void transform( cubehashParam *sp )
int r;
const int rounds = sp->rounds;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
register __m512i x0, x1;

View File

@@ -11,7 +11,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
};
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ECHO_SUBBYTES4(state, j) \
state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \

View File

@@ -5,7 +5,7 @@
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -15,237 +15,176 @@
*
*/
#if defined(__AES__)
#include <x86intrin.h>
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#include <memory.h>
#include "fugue-aesni.h"
static const v128u64_t _supermix1a __attribute__ ((aligned (16))) =
{ 0x0202010807020100, 0x0a05000f06010c0b };
MYALIGN const unsigned long long _supermix1a[] = {0x0202010807020100, 0x0a05000f06010c0b};
MYALIGN const unsigned long long _supermix1b[] = {0x0b0d080703060504, 0x0e0a090c050e0f0a};
MYALIGN const unsigned long long _supermix1c[] = {0x0402060c070d0003, 0x090a060580808080};
MYALIGN const unsigned long long _supermix1d[] = {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
MYALIGN const unsigned long long _supermix2a[] = {0x07020d0880808080, 0x0b06010c050e0f0a};
MYALIGN const unsigned long long _supermix4a[] = {0x000f0a050c0b0601, 0x0302020404030e09};
MYALIGN const unsigned long long _supermix4b[] = {0x07020d08080e0d0d, 0x07070908050e0f0a};
MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x0302000007060504};
MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104};
MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06};
//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
//MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
//MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
//MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
static const v128u64_t _supermix1b __attribute__ ((aligned (16))) =
{ 0x0b0d080703060504, 0x0e0a090c050e0f0a };
static const v128u64_t _supermix1c __attribute__ ((aligned (16))) =
{ 0x0402060c070d0003, 0x090a060580808080 };
MYALIGN const unsigned int _IV512[] = {
0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
static const v128u64_t _supermix1d __attribute__ ((aligned (16))) =
{ 0x808080800f0e0d0c, 0x0f0e0d0c80808080 };
static const v128u64_t _supermix2a __attribute__ ((aligned (16))) =
{ 0x07020d0880808080, 0x0b06010c050e0f0a };
static const v128u64_t _supermix4a __attribute__ ((aligned (16))) =
{ 0x000f0a050c0b0601, 0x0302020404030e09 };
static const v128u64_t _supermix4b __attribute__ ((aligned (16))) =
{ 0x07020d08080e0d0d, 0x07070908050e0f0a };
static const v128u64_t _supermix4c __attribute__ ((aligned (16))) =
{ 0x0706050403020000, 0x0302000007060504 };
static const v128u64_t _supermix7a __attribute__ ((aligned (16))) =
{ 0x010c0b060d080702, 0x0904030e03000104 };
static const v128u64_t _supermix7b __attribute__ ((aligned (16))) =
{ 0x8080808080808080, 0x0504070605040f06 };
static const v128u64_t _inv_shift_rows __attribute__ ((aligned (16))) =
{ 0x0b0e0104070a0d00, 0x0306090c0f020508 };
static const v128u64_t _mul2mask __attribute__ ((aligned (16))) =
{ 0x000000001b1b0000, 0x0000000000000000 };
static const v128u64_t _mul4mask __attribute__ ((aligned (16))) =
{ 0x000000002d361b00, 0x0000000000000000 };
static const v128u64_t _lsbmask2 __attribute__ ((aligned (16))) =
{ 0x0303030303030303, 0x0303030303030303 };
static const uint32_t _IV512[] __attribute__ ((aligned (32))) =
{ 0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000
};
#if defined(__SSE4_1__)
#if defined(__ARM_NEON)
#define PACK_S0(s0, s1, t1)\
s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
#define mask_1000(v) v128_put32( v, 0, 3 )
#define UNPACK_S0(s0, s1, t1)\
s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
s0 = mm128_mask_32( s0, 8 )
static const v128u32_t MASK_3321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x0f0e0d0c };
#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = s1;\
t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1);
static const v128u32_t MASK_3033 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c };
#else // SSE2
static const v128u32_t MASK_3303 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c, 0x0f0e0d0c };
#define PACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
s0 = _mm_xor_si128(s0, t1);
static const v128u32_t MASK_0321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100 };
#define UNPACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
s0 = mm128_mask_32( s0, 8 )
#define shuffle_3303(v) vqtbl1q_u8( v, MASK_3303 )
#define shuffle_0321(v) vqtbl1q_u8( v, MASK_0321 )
#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = _mm_shuffle_epi32(s1, 0xf9);\
t2 = _mm_shuffle_epi32(s2, 0xcf);\
t1 = _mm_xor_si128(t1, t2);\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1)
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = vqtbl1q_u8( s1, MASK_3321 ); \
t2 = vqtbl1q_u8( s2, MASK_3033 ); \
t1 = v128_xor( t1, t2 ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );
#elif defined(__SSE4_1__)
#define mask_1000(v) v128_mask32( v, 8 )
#define shuffle_3303(v) _mm_shuffle_epi32( v, 0xf3 )
#define shuffle_0321(v) _mm_shuffle_epi32( v, 0x39 )
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = s1; \
t1 = v128_shuffle2_32( t1, s2, _MM_SHUFFLE( 3, 0, 2, 1 ) ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );
#endif
#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s10 = _mm_xor_si128(s10, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1)
#define PACK_S0( s0, s1, t1 ) \
s0 = v128_movlane32( s0, 3, s1, 0 )
#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s16 = _mm_xor_si128(s16, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1)
#define UNPACK_S0( s0, s1, t1 ) \
s1 = v128_movlane32( s1, 0, s0, 3 ); \
s0 = mask_1000( s0 )
#define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s22 = _mm_xor_si128(s22, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s7 = _mm_xor_si128(s7, t1)
t1 = shuffle_3303( s0 ); \
s22 = v128_xor(s22, t1);\
t1 = v128_put32( v128_zero, *(uint32_t*)msg, 0 ); \
s0 = v128_movlane32( s0, 0, t1, 0 ); \
t1 = v128_alignr64( t1, v128_zero, 1 ); \
s8 = v128_xor(s8, t1);\
t1 = shuffle_3303( s24 ); \
s0 = v128_xor(s0, t1);\
t1 = shuffle_3303( s27 ); \
s4 = v128_xor(s4, t1);\
t1 = shuffle_3303( s30 ); \
s7 = v128_xor(s7, t1)
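
The rewritten PACK_S0, UNPACK_S0 and TIX512 above use v128_movlane32 where the old
code used _mm_insert_ps / _mm_move_ss. Judging from those replaced intrinsics,
v128_movlane32( d, dl, s, sl ) copies 32-bit lane sl of s into lane dl of d. A
portable sketch of that presumed behaviour (assumption only; the actual helper is
defined in simd-utils with SSE4.1 and NEON fast paths):

#include <stdint.h>
#include <string.h>
#include "simd-utils.h"   // v128_t

// generic fallback: replace lane dl of d with lane sl of s, other lanes unchanged
static inline v128_t v128_movlane32_sketch( v128_t d, int dl, v128_t s, int sl )
{
   uint32_t dv[4], sv[4];
   memcpy( dv, &d, sizeof dv );
   memcpy( sv, &s, sizeof sv );
   dv[ dl ] = sv[ sl ];
   memcpy( &d, dv, sizeof dv );
   return d;
}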
#define PRESUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
/*
#define PRESUPERMIX(x, t1, s1, s2, t2)\
s1 = x;\
s2 = _mm_add_epi8(x, x);\
t2 = _mm_add_epi8(s2, s2);\
t1 = _mm_srli_epi16(x, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
*/
#define SUBSTITUTE(r0, _t2 )\
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
_t2 = _mm_aesenclast_si128( _t2, v128_zero )
#define SUBSTITUTE( r0, _t2 ) \
_t2 = v128_shuffle8( r0, _inv_shift_rows ); \
_t2 = v128_aesenclast_nokey( _t2 )
#define SUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t2 = v128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t3 = v128_add8( t0, t0 ); \
t4 = v128_add8( t3, t3 ); \
t1 = v128_sr16( t0, 6 ); \
t1 = v128_and( t1, _lsbmask2 ); \
t0 = v128_xor( t4, v128_shuffle8( _mul4mask, t1 ) ); \
t4 = v128_shuffle8( t2, _supermix1b ); \
t3 = v128_xor( t3, v128_shuffle8( _mul2mask, t1 ) ); \
t1 = v128_shuffle8( t4, _supermix1c ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t4, _supermix1d ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t2, _supermix1a ); \
t2 = v128_xor3( t2, t3, t0 ); \
t2 = v128_shuffle8( t2, _supermix7a ); \
t4 = v128_xor3( t4, t1, t2 ); \
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t2 = v128_shuffle8( t2, _supermix7b ); \
t3 = v128_shuffle8( t3, _supermix2a ); \
t1 = v128_shuffle8( t0, _supermix4a ); \
t0 = v128_shuffle8( t0, _supermix4b ); \
t4 = v128_xor3( t4, t2, t1 ); \
t0 = _mm_xor_si128(t0, t3);\
t4 = v128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
/*
#define SUPERMIX(t0, t1, t2, t3, t4)\
PRESUPERMIX(t0, t1, t2, t3, t4);\
POSTSUPERMIX(t0, t1, t2, t3, t4)
*/
#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t4 = t1;\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t4 = _mm_xor_si128(t4, t1);\
t2 = v128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = _mm_xor_si128(t4, t2);\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t4 = _mm_xor_si128(t4, t2);\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t4 = _mm_xor_si128(t4, t1);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t0 = _mm_xor_si128(t0, t3);\
t4 = _mm_xor_si128(t4, t0);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
t4 = _mm_xor_si128(t4, t0)
#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE(r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE(r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
UNPACK_S0(r3c, r3a, _t3)
t0 = v128_xor( t0, t3 ); \
t4 = v128_xor3( t4, t0, v128_shuffle8( t0, _supermix4c ) );
#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE( r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
_t0 = shuffle_0321( r1c ); \
r2c = v128_xor(r2c, _t0);\
_t0 = mask_1000( _t0 ); \
r2d = v128_xor(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
_t0 = shuffle_0321( r2c ); \
r3c = v128_xor(r3c, _t0);\
_t0 = mask_1000( _t0 ); \
r3d = v128_xor(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE( r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
_t0 = _mm_shuffle_epi32(r3c, 0x39);\
r4c = _mm_xor_si128(r4c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r4d = _mm_xor_si128(r4d, _t0);\
_t0 = shuffle_0321( r3c ); \
r4c = v128_xor(r4c, _t0);\
_t0 = mask_1000( _t0 ); \
r4d = v128_xor(r4d, _t0);\
UNPACK_S0(r3c, r3a, _t3);\
SUBSTITUTE( r4c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
@@ -256,18 +195,19 @@ MYALIGN const unsigned int _IV512[] = {
block[1] = col[(base + a + 1) % s];\
block[2] = col[(base + a + 2) % s];\
block[3] = col[(base + a + 3) % s];\
x = _mm_load_si128((__m128i*)block)
x = v128_load( (v128_t*)block )
#define STORECOLUMN(x, s)\
_mm_store_si128((__m128i*)block, x);\
v128_store((v128_t*)block, x );\
col[(base + 0) % s] = block[0];\
col[(base + 1) % s] = block[1];\
col[(base + 2) % s] = block[2];\
col[(base + 3) % s] = block[3]
void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
void Compress512( hashState_fugue *ctx, const unsigned char *pmsg,
unsigned int uBlockCount )
{
__m128i _t0, _t1, _t2, _t3;
v128_t _t0, _t1, _t2, _t3;
switch(ctx->base)
{
@@ -346,134 +286,133 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
pmsg += 4;
uBlockCount--;
}
}
void Final512(hashState_fugue *ctx, BitSequence *hashval)
void Final512( hashState_fugue *ctx, uint8_t *hashval )
{
unsigned int block[4] __attribute__ ((aligned (32)));
unsigned int col[36] __attribute__ ((aligned (16)));
unsigned int i, base;
__m128i r0, _t0, _t1, _t2, _t3;
v128_t r0, _t0, _t1, _t2, _t3;
for(i = 0; i < 12; i++)
for( i = 0; i < 12; i++ )
{
_mm_store_si128((__m128i*)block, ctx->state[i]);
v128_store( (v128_t*)block, ctx->state[i] );
col[3 * i + 0] = block[0];
col[3 * i + 1] = block[1];
col[3 * i + 2] = block[2];
}
base = (36 - (12 * ctx->base)) % 36;
base = ( 36 - (12 * ctx->base) ) % 36;
for(i = 0; i < 32; i++)
for( i = 0; i < 32; i++ )
{
// ROR3
base = (base + 33) % 36;
// CMIX
col[(base + 0) % 36] ^= col[(base + 4) % 36];
col[(base + 1) % 36] ^= col[(base + 5) % 36];
col[(base + 2) % 36] ^= col[(base + 6) % 36];
col[(base + 18) % 36] ^= col[(base + 4) % 36];
col[(base + 19) % 36] ^= col[(base + 5) % 36];
col[(base + 20) % 36] ^= col[(base + 6) % 36];
col[ (base + 0) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 1) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 2) % 36 ] ^= col[ (base + 6) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 20) % 36 ] ^= col[ (base + 6) % 36 ];
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}
for(i = 0; i < 13; i++)
for( i = 0; i < 13; i++ )
{
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S19 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR9
base = (base + 27) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 28) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 28) % 36 ] ^= col[ (base + 0) % 36 ];
// ROR8
base = (base + 28) % 36;
// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];
// Transform to the standard basis and store output; S1 || S2 || S3 || S4
LOADCOLUMN(r0, 36, 1);
_mm_store_si128((__m128i*)hashval, r0);
LOADCOLUMN( r0, 36, 1 );
v128_store( (v128_t*)hashval, r0 );
// Transform to the standard basis and store output; S9 || S10 || S11 || S12
LOADCOLUMN(r0, 36, 9);
_mm_store_si128((__m128i*)hashval + 1, r0);
LOADCOLUMN( r0, 36, 9 );
v128_store( (v128_t*)hashval + 1, r0 );
// Transform to the standard basis and store output; S18 || S19 || S20 || S21
LOADCOLUMN(r0, 36, 18);
_mm_store_si128((__m128i*)hashval + 2, r0);
LOADCOLUMN( r0, 36, 18 );
v128_store( (v128_t*)hashval + 2, r0 );
// Transform to the standard basis and store output; S27 || S28 || S29 || S30
LOADCOLUMN(r0, 36, 27);
_mm_store_si128((__m128i*)hashval + 3, r0);
LOADCOLUMN( r0, 36, 27 );
v128_store( (v128_t*)hashval + 3, r0 );
}
HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
int fugue512_Init( hashState_fugue *ctx, int nHashSize )
{
int i;
ctx->processed_bits = 0;
@@ -487,18 +426,18 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
for(i = 0; i < 6; i++)
ctx->state[i] = v128_zero;
ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0);
ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1);
ctx->state[8] = _mm_load_si128((__m128i*)_IV512 + 2);
ctx->state[9] = _mm_load_si128((__m128i*)_IV512 + 3);
ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
ctx->state[6] = casti_v128( _IV512, 0 );
ctx->state[7] = casti_v128( _IV512, 1 );
ctx->state[8] = casti_v128( _IV512, 2 );
ctx->state[9] = casti_v128( _IV512, 3 );
ctx->state[10] = casti_v128( _IV512, 4 );
ctx->state[11] = casti_v128( _IV512, 5 );
return SUCCESS;
return 0;
}
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
int fugue512_Update( hashState_fugue *state, const void *data,
uint64_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
@@ -509,7 +448,8 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
if(state->uBufferBytes != 0)
{
// Fill the buffer
memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
memcpy( state->buffer + state->uBufferBytes, (void*)data,
state->uBlockLength - state->uBufferBytes );
// Process the buffer
Compress512(state, state->buffer, 1);
@@ -545,13 +485,13 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
state->uBufferBytes += uByteLength;
}
return SUCCESS;
return 0;
}
HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
int fugue512_Final( hashState_fugue *state, void *hashval )
{
unsigned int i;
BitSequence lengthbuf[8] __attribute__((aligned(64)));
uint8_t lengthbuf[8] __attribute__((aligned(64)));
// Update message bit count
state->processed_bits += state->uBufferBytes * 8;
@@ -575,16 +515,17 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
// Finalization
Final512(state, hashval);
return SUCCESS;
return 0;
}
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
uint64_t databitlen )
{
fugue512_Init(hs, 512);
fugue512_Update(hs, data, databitlen*8);
fugue512_Final(hs, hashval);
return SUCCESS;
fugue512_Init( hs, 512 );
fugue512_Update( hs, data, databitlen*8 );
fugue512_Final( hs, hashval );
return 0;
}
#endif // AES
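
The SUBSTITUTE macro above replaces _mm_aesenclast_si128( x, v128_zero ) with
v128_aesenclast_nokey( x ), and the Groestl SUBMIX later in this comparison makes the
same substitution. The helper presumably performs one final AES round with an
all-zero round key; a sketch of the x86 path under that assumption (the NEON path
enabled by the new __ARM_FEATURE_AES condition is omitted here):

#include <immintrin.h>   // requires -maes

// presumed x86 definition: ShiftRows + SubBytes with a zero round key,
// equivalent to the old _mm_aesenclast_si128( x, zero ) calls
static inline __m128i v128_aesenclast_nokey_sketch( __m128i x )
{
   return _mm_aesenclast_si128( x, _mm_setzero_si128() );
}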

View File

@@ -14,37 +14,31 @@
#ifndef FUGUE_HASH_API_H
#define FUGUE_HASH_API_H
#if defined(__AES__)
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#if !defined(__SSE4_1__)
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
#endif
#include "compat/sha3_common.h"
#include "simd-utils.h"
typedef struct
{
__m128i state[12];
v128_t state[12];
unsigned int base;
unsigned int uHashSize;
unsigned int uBlockLength;
unsigned int uBufferBytes;
DataLength processed_bits;
BitSequence buffer[4];
uint64_t processed_bits;
uint8_t buffer[4];
} hashState_fugue __attribute__ ((aligned (64)));
// These functions are deprecated, use the lower case macro aliases that use
// the standard interface. This will be cleaned up at a later date.
HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
int fugue512_Init( hashState_fugue *state, int hashbitlen );
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
int fugue512_Update( hashState_fugue *state, const void *data,
uint64_t databitlen );
HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
int fugue512_Final( hashState_fugue *state, void *hashval );
#define fugue512_init( state ) \
fugue512_Init( state, 512 )
@@ -54,7 +48,8 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
fugue512_Final
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
uint64_t databitlen);
#endif // AES
#endif // HASH_API_H

View File

@@ -696,7 +696,7 @@ static void AddModulo512(const void *a,const void *b,void *c)
static void AddXor512(const void *a,const void *b,void *c)
{
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
casti_m512i( b, 0 ) );
#elif defined(__AVX2__)
@@ -704,15 +704,15 @@ static void AddXor512(const void *a,const void *b,void *c)
casti_m256i( b, 0 ) );
casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
casti_m256i( b, 1 ) );
#elif defined(__SSE2__)
casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
casti_m128i( b, 0 ) );
casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
casti_m128i( b, 1 ) );
casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
casti_m128i( b, 2 ) );
casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
casti_m128i( b, 3 ) );
#elif defined(__SSE2__) || defined(__ARM_NEON)
casti_v128( c, 0 ) = v128_xor( casti_v128( a, 0 ),
casti_v128( b, 0 ) );
casti_v128( c, 1 ) = v128_xor( casti_v128( a, 1 ),
casti_v128( b, 1 ) );
casti_v128( c, 2 ) = v128_xor( casti_v128( a, 2 ),
casti_v128( b, 2 ) );
casti_v128( c, 3 ) = v128_xor( casti_v128( a, 3 ),
casti_v128( b, 3 ) );
#else
const unsigned long long *A=a, *B=b;
unsigned long long *C=c;

View File

@@ -60,54 +60,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
#if defined(__ARM_NEON)
// No fast shuffle on NEON
//static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
{ 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
#define gr_shuffle32( v ) v128_blendv( v128_qrev32( v ), v, BLEND_MASK )
/*
#define TRANSP_MASK \
0xd,0x5,0x9,0x1,0xc,0x4,0x8,0x0,0xf,0x7,0xb,0x3,0xe,0x6,0xa,0x2
#define SUBSH_MASK0 \
0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8
#define SUBSH_MASK1 \
0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9
#define SUBSH_MASK2 \
0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa
#define SUBSH_MASK3 \
0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb
#define SUBSH_MASK4 \
0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc
#define SUBSH_MASK5 \
0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd
#define SUBSH_MASK6 \
0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe
#define SUBSH_MASK7 \
0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3
//#define gr_shuffle8( v, c ) v128_shullfev8( v, c )
#define gr_shuffle8( v, c15, c14, c13, c12, c11, c10, c09, c08, \
c07, c06, c05, c04, c03, c02, c01, c00 ) \
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
v, 15, v, c15 ), 14, v, c14 ), 13, v, c13 ), 12, v, c12 ), \
11, v, c11 ), 10, v, c10 ), 9, v, c09 ), 8, v, c08 ), \
7, v, c07 ), 6, v, c06 ), 5, v, c05 ), 4, v, c04 ), \
3, v, c03 ), 2, v, c02 ), 1, v, c01 ), 0, v, c00 )
*/
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )
#else
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )
#endif
#define tos(a) #a
#define tostr(a) tos(a)
@@ -140,7 +103,7 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#if defined(VL256)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
@@ -334,17 +297,16 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
*/
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* SubBytes */\
b0 = v128_xor(b0, b0);\
a0 = v128_aesenclast(a0, b0);\
a1 = v128_aesenclast(a1, b0);\
a2 = v128_aesenclast(a2, b0);\
a3 = v128_aesenclast(a3, b0);\
a4 = v128_aesenclast(a4, b0);\
a5 = v128_aesenclast(a5, b0);\
a6 = v128_aesenclast(a6, b0);\
a7 = v128_aesenclast(a7, b0);\
a0 = v128_aesenclast_nokey( a0 ); \
a1 = v128_aesenclast_nokey( a1 ); \
a2 = v128_aesenclast_nokey( a2 ); \
a3 = v128_aesenclast_nokey( a3 ); \
a4 = v128_aesenclast_nokey( a4 ); \
a5 = v128_aesenclast_nokey( a5 ); \
a6 = v128_aesenclast_nokey( a6 ); \
a7 = v128_aesenclast_nokey( a7 ); \
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7 ); \
}
#define ROUNDS_P(){\
@@ -362,10 +324,9 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
xmm13 = v128_shuffle8( xmm13, SUBSH_MASK5 ); \
xmm14 = v128_shuffle8( xmm14, SUBSH_MASK6 ); \
xmm15 = v128_shuffle8( xmm15, SUBSH_MASK7 ); \
/* SubBytes + MixBytes */\
/* SubBytes + MixBytes */\
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
\
/* AddRoundConstant P1024 */\
xmm0 = v128_xor( xmm0, \
casti_v128( round_const_p, round_counter+1 ) ); \
@@ -467,7 +428,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
t1 = v128_unpackhi16(t1, i3);\
i2 = v128_unpacklo16(i2, i3);\
i0 = v128_unpacklo16(i0, i1);\
\
/* shuffle with immediate */\
t0 = gr_shuffle32( t0 ); \
t1 = gr_shuffle32( t1 ); \
@@ -477,7 +437,6 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
i2 = gr_shuffle32( i2 ); \
i4 = gr_shuffle32( i4 ); \
i6 = gr_shuffle32( i6 ); \
\
/* continue with unpack */\
t4 = i0;\
i0 = v128_unpacklo32(i0, i2);\
@@ -584,7 +543,8 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
/* transpose done */\
}/**/
#if 0
// not used
void INIT( v128_t* chaining )
{
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -613,6 +573,7 @@ void INIT( v128_t* chaining )
chaining[6] = xmm14;
chaining[7] = xmm15;
}
#endif
void TF1024( v128_t* chaining, const v128_t* message )
{

View File

@@ -1,3 +1,6 @@
#if !defined GROESTL256_INTR_AES_H__
#define GROESTL256_INTR_AES_H__
/* groestl-intr-aes.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -50,18 +53,17 @@ static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
#if defined(__ARM_NEON)
// No fast shuffle on NEON
static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
{ 0x03020100, 0x0b0a0908, 0x07060504, 0x0f0e0d0c };
#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
#define gr_shuffle32(v) vqtbl1q_u8( v, gr_mask )
#else
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
#define gr_shuffle32(v) _mm_shuffle_epi32( v, 0xd8 )
#endif
#define tos(a) #a
#define tostr(a) tos(a)
@@ -93,7 +95,7 @@ static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#if defined(VL256)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
@@ -598,4 +600,4 @@ void OF512( v128_t* chaining )
chaining[3] = xmm11;
}
#endif
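
Similar to SIMD512, the #if defined(__AVX512VL__) guards around the faster MixBytes
variant are replaced with #if defined(VL256). VL256 presumably means "AVX512VL-class
instructions usable on 256-bit vectors", which would also let AVX10/256 parts qualify
per the AVX10 notes in algo-gate-api.h earlier in this comparison. A sketch under that
assumption (the exact AVX10 compiler test is not shown in this diff, so only the
AVX512VL case is spelled out):

#if defined(__AVX512VL__)
   #define VL256 1
#endif
// an additional AVX10-based condition may belong here as well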

View File

@@ -146,7 +146,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
const int hash_offset = SIZE512 - hashlen_m128i;
uint64_t blocks = len / SIZE512;
v128_t* in = (v128_t*)input;
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
@@ -181,6 +181,7 @@ int groestl512( hashState_groestl* ctx, void* output, const void* input,
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output

View File

@@ -87,6 +87,7 @@ int final_groestl( hashState_groestl*, void* );
int update_and_final_groestl( hashState_groestl*, void*, const void*, int );
int groestl512( hashState_groestl*, void*, const void*, uint64_t );
#define groestl512_full groestl512
#define groestl512_ctx groestl512
#endif /* __hash_h */

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#define GROESTL_4WAY_VAES 1
#endif

View File

@@ -17,7 +17,7 @@
#if defined(__AVX2__) && defined(__VAES__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )

View File

@@ -43,7 +43,7 @@
#define SIZE256 (SIZE_512/16)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE256];

View File

@@ -42,7 +42,7 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12,
@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =
#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = mm256_bcast_m128( mm128_mask_32( v128_neg1, 0x3 ) ); \
b1 = mm256_bcast_m128( v128_mask32( v128_neg1, 0x3 ) ); \
a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
a1 = _mm256_xor_si256( a1, b1 );\
a2 = _mm256_xor_si256( a2, b1 );\

View File

@@ -17,7 +17,7 @@
#if defined(__AVX2__) && defined(__VAES__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{

View File

@@ -33,7 +33,7 @@
#define SIZE512 (SIZE_1024/16)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE512];

View File

@@ -50,7 +50,7 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12,

View File

@@ -213,7 +213,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3,n+2,n+1,n ) );
myriad_4way_hash( hash, vdata );
pdata[19] = n;

View File

@@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate )
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriad_hash;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT;
#endif
return true;
};

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#define MYRGR_8WAY 1
#elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
#define MYRGR_4WAY 1

View File

@@ -382,7 +382,7 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
#define S1F MF
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Hamsi 8 way AVX512
@@ -1122,7 +1122,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
// Hamsi 4 way AVX2
#if defined(__AVX512VL__)
#if defined(VL256)
#define INPUT_BIG \
do { \
@@ -1501,7 +1501,7 @@ do { /* order is important */ \
sc->h[14] = CE; \
sc->h[15] = CF;
#if defined(__AVX512VL__)
#if defined(VL256)
#define INPUT_8X32 \
{ \

View File

@@ -104,7 +104,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Hamsi-512 8x64

View File

@@ -53,7 +53,7 @@ extern "C"{
#define SPH_SMALL_FOOTPRINT_HAVAL 1
//#endif
#if defined(__AVX512VL__)
#if defined(VL256)
// ( ~( a ^ b ) ) & c
#define v128_andnotxor( a, b, c ) \
@@ -583,7 +583,7 @@ do { \
// Haval-256 8 way 32 bit avx2
#if defined (__AVX512VL__)
#if defined (VL256)
// ( ~( a ^ b ) ) & c
#define mm256_andnotxor( a, b, c ) \
@@ -882,7 +882,7 @@ do { \
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// ( ~( a ^ b ) ) & c
#define mm512_andnotxor( a, b, c ) \

View File

@@ -107,7 +107,7 @@ void haval256_5_8way_close( void *cc, void *dst );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__m512i buf[32];

View File

@@ -204,7 +204,7 @@ static const uint64_t IV512[] =
(state)->H[15] = h7l; \
} while (0)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define Sb_8W(x0, x1, x2, x3, c) \
{ \
@@ -364,8 +364,7 @@ static const uint64_t IV512[] =
#if defined(__AVX2__)
#if defined(__AVX512VL__)
//TODO enable for AVX10_256, not used with AVX512VL
#if defined(VL256)
#define notxorandnot( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x2d )
@@ -522,7 +521,7 @@ static const uint64_t IV512[] =
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
void jh256_8x64_init( jh_8x64_context *sc )
{

View File

@@ -55,7 +55,7 @@
* <code>memcpy()</code>).
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define KECCAK_8WAY 1
#elif defined(__AVX2__)
#define KECCAK_4WAY 1
@@ -12,7 +12,7 @@
#define KECCAK_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA3D_8WAY 1
#elif defined(__AVX2__)
#define SHA3D_4WAY 1

View File

@@ -57,7 +57,7 @@ static const uint64_t RC[] = {
#define DO(x) x
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define INPUT_BUF(size) do { \
size_t j; \

View File

@@ -4,7 +4,7 @@
#include <stddef.h>
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -59,7 +59,7 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define cns4w(i) mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] )
@@ -524,8 +524,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
a = _mm256_xor_si256( a, c0 ); \
b = _mm256_xor_si256( b, c1 );
//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512
#if defined(__AVX512VL__)
#if defined(VL256)
#define MULT2( a0, a1 ) \
{ \

View File

@@ -51,7 +51,7 @@
#define LIMIT_512 128
/*********************************/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
uint32_t buffer[8*4];

View File

@@ -28,8 +28,7 @@
a = v128_xor( a, c0 ); \
b = v128_xor( b, c1 ); \
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#if defined(VL256)
#define MULT2( a0, a1 ) \
{ \
@@ -69,8 +68,7 @@
#endif
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#if defined(VL256)
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \

View File

@@ -68,4 +68,4 @@ int update_and_final_luffa( hashState_luffa *state, void* output,
int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
const void* data, size_t inlen );
#endif // LUFFA_FOR_SSE2_H___
#endif // LUFFA_FOR_SSE2_H__

View File

@@ -15,7 +15,7 @@
#include "algo/groestl/sph_groestl.h"
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ALLIUM_16WAY 1
#elif defined(__AVX2__)
#define ALLIUM_8WAY 1
@@ -465,12 +465,8 @@ typedef union
{
keccak256_2x64_context keccak;
cubehashParam cube;
//#if defined(__x86_64__)
skein256_2x64_context skein;
//#else
// sph_skein512_context skein;
//#endif
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl256 groestl;
#else
sph_groestl256_context groestl;
@@ -516,7 +512,6 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
//#if defined(__x86_64__)
intrlv_2x64( vhashA, hash0, hash1, 256 );
skein256_2x64_init( &ctx.skein );
skein256_2x64_update( &ctx.skein, vhashA, 32 );
@@ -527,23 +522,8 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
skein256_2x64_update( &ctx.skein, vhashA, 32 );
skein256_2x64_close( &ctx.skein, vhashA );
dintrlv_2x64( hash2, hash3, vhashA, 256 );
/*
#else
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash0, 32 );
sph_skein256_close( &ctx.skein, hash0 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash1, 32 );
sph_skein256_close( &ctx.skein, hash1 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash2, 32 );
sph_skein256_close( &ctx.skein, hash2 );
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash3, 32 );
sph_skein256_close( &ctx.skein, hash3 );
#endif
*/
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl256_full( &ctx.groestl, hash0, hash0, 256 );
groestl256_full( &ctx.groestl, hash1, hash1, 256 );
groestl256_full( &ctx.groestl, hash2, hash2, 256 );

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2REV3_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV3_8WAY 1
@@ -49,7 +49,7 @@ bool init_lyra2rev3_ctx();
//////////////////////////////////
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2REV2_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV2_8WAY 1
@@ -108,7 +108,7 @@ bool lyra2h_thread_init();
/////////////////////////////////////////
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define PHI2_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define PHI2_4WAY 1

View File

@@ -41,7 +41,7 @@
// lyra2z330, lyra2h,
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
/**
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords

View File

@@ -59,7 +59,7 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );

View File

@@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
lyra2h_4way_midstate( vdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2h_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )

View File

@@ -456,7 +456,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
*noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2rev2_4way_hash( hash, vdata );

View File

@@ -3,7 +3,7 @@
#include "lyra2.h"
#include "algo/blake/blake256-hash.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2Z_16WAY 1
#elif defined(__AVX2__)
#define LYRA2Z_8WAY 1

View File

@@ -4,7 +4,7 @@
#include "algo/gost/sph_gost.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "lyra2.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#include "algo/echo/echo-hash-4way.h"
#elif defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"

View File

@@ -27,7 +27,7 @@
#include "lyra2.h"
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
{

View File

@@ -43,7 +43,7 @@ static const uint64_t blake2b_IV[8] =
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define G2W_4X64(a,b,c,d) \
a = _mm512_add_epi64( a, b ); \
@@ -150,13 +150,13 @@ static const uint64_t blake2b_IV[8] =
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a), 32 ); \
d = v128_ror64xor( d, a, 32 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 24 ); \
b = v128_ror64xor( b, c, 24 ); \
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a ), 16 ); \
d = v128_ror64xor( d, a, 16 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 63 );
b = v128_ror64xor( b, c, 63 );
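The G_2X64 rewrite above folds the xor into the rotate with a fused helper; its assumed per-lane semantics, as a plain scalar model (an inference from the before/after expressions, not the vector implementation):

#include <stdint.h>

/* assumed: v128_ror64xor( a, b, c ) == rotate-right( a ^ b, c ) in each 64-bit lane */
static inline uint64_t ror64xor( uint64_t a, uint64_t b, unsigned c )
{
   const uint64_t x = a ^ b;
   return ( x >> c ) | ( x << ( 64 - c ) );
}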
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
@@ -195,10 +195,6 @@ static const uint64_t blake2b_IV[8] =
#endif // AVX2 else SSE2
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
#define G( r, i, a, b, c, d ) \
{ \
a = a + b; \
@@ -222,7 +218,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
G( r, 7, v[ 3], v[ 4], v[ 9], v[14] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
union _ovly_512
{

View File

@@ -21,7 +21,7 @@
#define EPS1 DBL_EPSILON
#define EPS2 3.0e-11
inline double exp_n( double xt )
static inline double exp_n( double xt )
{
if ( xt < -700.0 )
return 0;
@@ -33,7 +33,7 @@ inline double exp_n( double xt )
return exp( xt );
}
inline double exp_n2( double x1, double x2 )
static inline double exp_n2( double x1, double x2 )
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
p5 = 37., p6 = 700.;
@@ -306,7 +306,7 @@ bool register_m7m_algo( algo_gate_t *gate )
applog( LOG_ERR, "M7M algo is not supported on MacOS");
return false;
#else
gate->optimizations = SHA_OPT;
gate->optimizations = SHA256_OPT;
init_m7m_ctx();
gate->scanhash = (void*)&scanhash_m7m_hash;
gate->build_stratum_request = (void*)&std_be_build_stratum_request;

View File

@@ -1,75 +0,0 @@
// Copyright (c) 2014 The Magi developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include <iostream>
#include <cfloat>
#include <limits>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "magimath.h"
#define EPS1 (std::numeric_limits<double>::epsilon())
#define EPS2 3.0e-11
static void gauleg(double x1, double x2, double x[], double w[], const int n)
{
int m,j,i;
double z1, z, xm, xl, pp, p3, p2, p1;
m=(n+1)/2;
xm=0.5*(x2+x1);
xl=0.5*(x2-x1);
for (i=1;i<=m;i++) {
z=cos(3.141592654*(i-0.25)/(n+0.5));
do {
p1=1.0;
p2=0.0;
for (j=1;j<=n;j++) {
p3=p2;
p2=p1;
p1=((2.0*j-1.0)*z*p2-(j-1.0)*p3)/j;
}
pp=n*(z*p1-p2)/(z*z-1.0);
z1=z;
z=z1-p1/pp;
} while (fabs(z-z1) > EPS2);
x[i]=xm-xl*z;
x[n+1-i]=xm+xl*z;
w[i]=2.0*xl/((1.0-z*z)*pp*pp);
w[n+1-i]=w[i];
}
}
static double GaussianQuad_N(double func(const double), const double a2, const double b2, const int NptGQ)
{
double s=0.0;
#ifdef _MSC_VER
#define SW_DIVS 23
double x[SW_DIVS+1], w[SW_DIVS+1];
#else
double x[NptGQ+1], w[NptGQ+1];
#endif
gauleg(a2, b2, x, w, NptGQ);
for (int j=1; j<=NptGQ; j++) {
s += w[j]*func(x[j]);
}
return s;
}
static double swit_(double wvnmb)
{
return pow( (5.55243*(exp_n(-0.3*wvnmb/15.762) - exp_n(-0.6*wvnmb/15.762)))*wvnmb, 0.5)
/ 1034.66 * pow(sin(wvnmb/65.), 2.);
}
uint32_t sw_(int nnounce, int divs)
{
double wmax = ((sqrt((double)(nnounce))*(1.+EPS1))/450+100);
return ((uint32_t)(GaussianQuad_N(swit_, 0., wmax, divs)*(1.+EPS1)*1.e6));
}

View File

@@ -1,54 +0,0 @@
// Copyright (c) 2014 The Magi developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef MAGI_MATH_H
#define MAGI_MATH_H
#include <math.h>
#ifdef __cplusplus
extern "C" {
#endif
uint32_t sw_(int nnounce, int divs);
#ifdef __cplusplus
}
#endif
inline double exp_n(double xt)
{
double p1 = -700.0, p3 = -0.8e-8, p4 = 0.8e-8, p6 = 700.0;
if(xt < p1)
return 0;
else if(xt > p6)
return 1e200;
else if(xt > p3 && xt < p4)
return (1.0 + xt);
else
return exp(xt);
}
// 1 / (1 + exp(x1-x2))
inline double exp_n2(double x1, double x2)
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.;
double xt = x1 - x2;
if (xt < p1+1.e-200)
return 1.;
else if (xt > p1 && xt < p2 + 1.e-200)
return ( 1. - exp(xt) );
else if (xt > p2 && xt < p3 + 1.e-200)
return ( 1. / (1. + exp(xt)) );
else if (xt > p3 && xt < p4)
return ( 1. / (2. + xt) );
else if (xt > p4 - 1.e-200 && xt < p5)
return ( exp(-xt) / (1. + exp(-xt)) );
else if (xt > p5 - 1.e-200 && xt < p6)
return ( exp(-xt) );
else //if (xt > p6 - 1.e-200)
return 0.;
}
#endif

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define NIST5_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define NIST5_4WAY 1

View File

@@ -71,8 +71,7 @@ do { \
} while (0)
#define GAMMA_4W(n0, n1, n2, n4) \
(g ## n0 = v128_xor( a ## n0, \
v128_or( a ## n1, v128_not( a ## n2 ) ) ) )
(g ## n0 = v128_xor( a ## n0, v128_ornot( a ## n2, a ## n1 ) ) )
#define PI_ALL_4W do { \
a0 = g0; \
@@ -312,7 +311,7 @@ do { \
BUPDATE1_8W( 7, 1 ); \
} while (0)
#if defined(__AVX512VL__)
#if defined(VL256)
#define GAMMA_8W(n0, n1, n2, n4) \
( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )
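The GAMMA_4W change above uses a combined or-not helper in place of an explicit v128_not plus v128_or, and the AVX512VL GAMMA_8W performs the same operation in one ternary-logic instruction. The apparent scalar semantics, inferred from the before/after expressions (not quoted from the source):

#include <stdint.h>

/* assumed: v128_ornot( a, b ) == ( ~a ) | b, applied lane-wise */
static inline uint32_t ornot32( uint32_t a, uint32_t b )
{
   return ~a | b;
}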

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ANIME_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define ANIME_4WAY 1

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define HMQ1725_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define HMQ1725_4WAY 1

View File

@@ -6,16 +6,16 @@
#include <stdint.h>
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#include "algo/jh/sph_jh.h"
@@ -33,18 +33,18 @@
union _hmq1725_ctx_holder
{
blake512_context blake;
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
hashState_groestl groestl;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_groestl512_context groestl;
sph_fugue512_context fugue;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
sph_skein512_context skein;
@@ -62,9 +62,6 @@ union _hmq1725_ctx_holder
};
typedef union _hmq1725_ctx_holder hmq1725_ctx_holder;
//static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
//static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
extern void hmq1725hash(void *state, const void *input)
{
const uint32_t mask = 24;
@@ -82,7 +79,7 @@ extern void hmq1725hash(void *state, const void *input)
if ( hashB[0] & mask ) //1
{
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hashA, hashB, 512 );
#else
sph_groestl512_init( &ctx.groestl );
@@ -180,7 +177,7 @@ extern void hmq1725hash(void *state, const void *input)
sph_hamsi512( &ctx.hamsi, hashA, 64 ); //3
sph_hamsi512_close( &ctx.hamsi, hashB ); //4
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512_init( &ctx.fugue );
@@ -211,7 +208,7 @@ extern void hmq1725hash(void *state, const void *input)
if ( hashB[0] & mask ) //7
{
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512_init( &ctx.fugue );
@@ -226,7 +223,7 @@ extern void hmq1725hash(void *state, const void *input)
sph_sha512_close( &ctx.sha, hashA );
}
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hashB, hashA, 512 );
#else
sph_groestl512_init( &ctx.groestl );
@@ -262,30 +259,18 @@ extern void hmq1725hash(void *state, const void *input)
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
// uint32_t endiandata[32] __attribute__((aligned(64)));
uint32_t endiandata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
//const uint32_t Htarg = ptarget[7];
//we need bigendian data...
// for (int k = 0; k < 32; k++)
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
// hmq_bmw512_midstate( endiandata );
// if (opt_debug)
// {
// applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
// }
/* I'm to lazy to put the loop in an inline function... so dirty copy'n'paste.... */
/* i know that i could set a variable, but i don't know how the compiler will optimize it, not that then the cpu needs to load the value *everytime* in a register */
if (ptarget[7]==0) {
do {
pdata[19] = ++n;

View File

@@ -14,7 +14,8 @@ bool register_quark_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define QUARK_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define QUARK_4WAY 1

View File

@@ -7,12 +7,12 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
@@ -21,9 +21,9 @@
void quark_hash(void *state, const void *input)
{
uint32_t hash[16] __attribute__((aligned(64)));
sph_blake512_context ctx_blake;
blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl ctx_groestl;
#else
sph_groestl512_context ctx_groestl;
@@ -33,17 +33,15 @@ void quark_hash(void *state, const void *input)
sph_keccak512_context ctx_keccak;
uint32_t mask = 8;
sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, input, 80 );
sph_blake512_close( &ctx_blake, hash );
blake512_full( &ctx_blake, hash, input, 80 );
sph_bmw512_init( &ctx_bmw );
sph_bmw512( &ctx_bmw, hash, 64 );
sph_bmw512_close( &ctx_bmw, hash );
if ( hash[0] & mask )
{
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(const char*)hash, 512 );
@@ -60,7 +58,7 @@ void quark_hash(void *state, const void *input)
sph_skein512_close( &ctx_skein, hash );
}
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(const char*)hash, 512 );
@@ -76,9 +74,7 @@ void quark_hash(void *state, const void *input)
if ( hash[0] & mask )
{
sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, hash, 64 );
sph_blake512_close( &ctx_blake, hash );
blake512_full( &ctx_blake, hash, hash, 64 );
}
else
{

View File

@@ -83,7 +83,7 @@ int scanhash_deep_2way( struct work *work,uint32_t max_nonce,
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );

View File

@@ -236,7 +236,7 @@ int scanhash_qubit_2way( struct work *work,uint32_t max_nonce,
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );

View File

@@ -16,7 +16,8 @@ bool register_qubit_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_qubit;
gate->hash = (void*)&qubit_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
| NEON_OPT;
return true;
};

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define QUBIT_4WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define QUBIT_2WAY 1

View File

@@ -8,13 +8,9 @@
#include <stdio.h>
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#ifdef __AES__
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
@@ -25,12 +21,8 @@ typedef struct
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
#ifdef __AES__
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
@@ -45,12 +37,7 @@ void init_qubit_ctx()
init_luffa(&qubit_ctx.luffa,512);
cubehashInit(&qubit_ctx.cubehash,512,16,32);
sph_shavite512_init(&qubit_ctx.shavite);
#if defined(__aarch64__)
sph_simd512_init( &qubit_ctx.simd );
#else
init_sd( &qubit_ctx.simd, 512 );
#endif
#ifdef __AES__
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_echo(&qubit_ctx.echo, 512);
#else
sph_echo512_init(&qubit_ctx.echo);
@@ -81,15 +68,9 @@ void qubit_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
final_sd( &ctx.simd, (BitSequence *)hash );
#endif
#ifdef __AES__
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
update_final_echo( &ctx.echo, (BitSequence *) hash,
(const BitSequence *) hash, 512 );
#else

View File

@@ -104,14 +104,14 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
casti_m128i( edata, 0 ) = v128_bswap32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = v128_bswap32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = v128_bswap32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = v128_bswap32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = v128_bswap32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = v128_bswap32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = v128_bswap32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = v128_bswap32( casti_m128i( pdata, 7 ) );
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
@@ -224,14 +224,14 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
casti_m128i( edata, 0 ) = v128_bswap32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = v128_bswap32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = v128_bswap32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = v128_bswap32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = v128_bswap32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = v128_bswap32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = v128_bswap32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = v128_bswap32( casti_m128i( pdata, 7 ) );
intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );

View File

@@ -51,7 +51,6 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
bool register_lbry_algo( algo_gate_t* gate )
{
// gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#if defined (LBRY_16WAY)
gate->scanhash = (void*)&scanhash_lbry_16way;
gate->hash = (void*)&lbry_16way_hash;
@@ -67,7 +66,7 @@ bool register_lbry_algo( algo_gate_t* gate )
#else
gate->scanhash = (void*)&scanhash_lbry;
gate->hash = (void*)&lbry_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA256_OPT;
#endif
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
gate->build_extraheader = (void*)&lbry_build_extraheader;

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LBRY_16WAY 1
#elif defined(__AVX2__)
#define LBRY_8WAY 1

View File

@@ -35,20 +35,20 @@ static const uint32_t IV[5] =
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
#define F3(x, y, z) \
_mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z )
_mm_xor_si128( v128_ornot( y, x ), z )
#define F4(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
#define F5(x, y, z) \
_mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) )
_mm_xor_si128( x, v128_ornot( z, y ) )
#define RR(a, b, c, d, e, f, s, r, k) \
do{ \
a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
a = _mm_add_epi32( v128_rol32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( a, f( b ,c, d ) ), r ), \
_mm_set1_epi64x( k ) ), s ), e ); \
c = mm128_rol_32( c, 10 );\
c = v128_rol32( c, 10 );\
} while (0)
#define ROUND1(a, b, c, d, e, f, s, r, k) \
@@ -335,13 +335,13 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( y, z ), x ), z )
#define F8W_3(x, y, z) \
_mm256_xor_si256( _mm256_or_si256( x, mm256_not( y ) ), z )
_mm256_xor_si256( mm256_ornot( y, x ), z )
#define F8W_4(x, y, z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( x, y ), z ), y )
#define F8W_5(x, y, z) \
_mm256_xor_si256( x, _mm256_or_si256( y, mm256_not( z ) ) )
_mm256_xor_si256( x, mm256_ornot( z, y ) )
#define RR_8W(a, b, c, d, e, f, s, r, k) \
do{ \
@@ -625,7 +625,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// RIPEMD-160 16 way

View File

@@ -33,7 +33,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
size_t len );
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -745,7 +745,7 @@ do{ \
SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Tested OK but very slow
// 16 way parallel, requires 16x32 interleaving
@@ -2074,7 +2074,7 @@ void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N )
v128_ovly v;
for ( int l = 0; l < 4; l++ )
v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
X[i] = v128_xor( X[i], v.m128 );
X[i] = v128_xor( X[i], v.v128 );
}
xor_salsa8_4way( &X[ 0], &X[16] );
@@ -2211,10 +2211,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
// X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 }
// X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 }
y[0].m128 = X0;
y[1].m128 = X1;
y[2].m128 = X2;
y[3].m128 = X3;
y[0].v128 = X0;
y[1].v128 = X1;
y[2].v128 = X2;
y[3].v128 = X3;
z[0].u32[0] = y[0].u32[0];
z[0].u32[3] = y[1].u32[0];
@@ -2236,10 +2236,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
z[3].u32[1] = y[2].u32[3];
z[3].u32[0] = y[3].u32[3];
B[0] = v128_add32( B[0], z[0].m128 );
B[1] = v128_add32( B[1], z[1].m128 );
B[2] = v128_add32( B[2], z[2].m128 );
B[3] = v128_add32( B[3], z[3].m128 );
B[0] = v128_add32( B[0], z[0].v128 );
B[1] = v128_add32( B[1], z[1].v128 );
B[2] = v128_add32( B[2], z[2].v128 );
B[3] = v128_add32( B[3], z[3].v128 );
#endif
@@ -2404,14 +2404,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
/*
v128_ovly ya[4], za[4], yb[4], zb[4];
ya[0].m128 = XA[0];
yb[0].m128 = XB[0];
ya[1].m128 = XA[1];
yb[1].m128 = XB[1];
ya[2].m128 = XA[2];
yb[2].m128 = XB[2];
ya[3].m128 = XA[3];
yb[3].m128 = XB[3];
ya[0].v128 = XA[0];
yb[0].v128 = XB[0];
ya[1].v128 = XA[1];
yb[1].v128 = XB[1];
ya[2].v128 = XA[2];
yb[2].v128 = XB[2];
ya[3].v128 = XA[3];
yb[3].v128 = XB[3];
za[0].u32[0] = ya[0].u32[0];
zb[0].u32[0] = yb[0].u32[0];
@@ -2449,14 +2449,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
za[3].u32[3] = ya[0].u32[3];
zb[3].u32[3] = yb[0].u32[3];
XA[0] = za[0].m128;
XB[0] = zb[0].m128;
XA[1] = za[1].m128;
XB[1] = zb[1].m128;
XA[2] = za[2].m128;
XB[2] = zb[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XA[0] = za[0].v128;
XB[0] = zb[0].v128;
XA[1] = za[1].v128;
XB[1] = zb[1].v128;
XA[2] = za[2].v128;
XB[2] = zb[2].v128;
XA[3] = za[3].v128;
XB[3] = zb[3].v128;
*/
}
@@ -2487,7 +2487,7 @@ static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb,
XA3 = BA[3] = v128_xor( BA[3], CA[3] );
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
SALSA_8ROUNDS_SIMD128_2BUF;
@@ -2770,18 +2770,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
/*
v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];
ya[0].m128 = XA[0];
yb[0].m128 = XB[0];
yc[0].m128 = XC[0];
ya[1].m128 = XA[1];
yb[1].m128 = XB[1];
yc[1].m128 = XC[1];
ya[2].m128 = XA[2];
yb[2].m128 = XB[2];
yc[2].m128 = XC[2];
ya[3].m128 = XA[3];
yb[3].m128 = XB[3];
yc[3].m128 = XC[3];
ya[0].v128 = XA[0];
yb[0].v128 = XB[0];
yc[0].v128 = XC[0];
ya[1].v128 = XA[1];
yb[1].v128 = XB[1];
yc[1].v128 = XC[1];
ya[2].v128 = XA[2];
yb[2].v128 = XB[2];
yc[2].v128 = XC[2];
ya[3].v128 = XA[3];
yb[3].v128 = XB[3];
yc[3].v128 = XC[3];
za[0].u32[0] = ya[0].u32[0];
zb[0].u32[0] = yb[0].u32[0];
@@ -2835,18 +2835,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
zb[3].u32[3] = yb[0].u32[3];
zc[3].u32[3] = yc[0].u32[3];
XA[0] = za[0].m128;
XB[0] = zb[0].m128;
XC[0] = zc[0].m128;
XA[1] = za[1].m128;
XB[1] = zb[1].m128;
XC[1] = zc[1].m128;
XA[2] = za[2].m128;
XB[2] = zb[2].m128;
XC[2] = zc[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XC[3] = zc[3].m128;
XA[0] = za[0].v128;
XB[0] = zb[0].v128;
XC[0] = zc[0].v128;
XA[1] = za[1].v128;
XB[1] = zb[1].v128;
XC[1] = zc[1].v128;
XA[2] = za[2].v128;
XB[2] = zb[2].v128;
XC[2] = zc[2].v128;
XA[3] = za[3].v128;
XB[3] = zb[3].v128;
XC[3] = zc[3].v128;
*/
}
@@ -2886,7 +2886,7 @@ static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc,
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
XC3 = BC[3] = v128_xor( BC[3], CC[3] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
SALSA_8ROUNDS_SIMD128_3BUF;
@@ -3049,7 +3049,7 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
xf = (B[15] ^= C[15]);
#define ROL32( a, c ) ror32( a, c )
#define ROL32( a, c ) rol32( a, c )
#define ADD32( a, b ) ( (a)+(b) )
#define XOR( a, b ) ( (a)^(b) )

View File

@@ -5,7 +5,7 @@
#include <stdlib.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );

View File

@@ -35,7 +35,7 @@
//#include <mm_malloc.h>
#include "malloc-huge.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SCRYPT_THROUGHPUT 16
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
#define SCRYPT_THROUGHPUT 2
@@ -592,7 +592,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
#endif /* HAVE_SHA256_8WAY */
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
static inline void sha256_16way_init_state( void *state )
{
@@ -1481,7 +1481,7 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT;
#else
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#endif
@@ -1494,7 +1494,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
// scrypt_throughput defined at compile time and used to replace
// MAX_WAYS to reduce memory usage.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// scrypt_throughput = 16;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf

View File

@@ -306,7 +306,7 @@ pbkdf2_sha256_8way( uint8_t *buf, size_t dkLen, const uint8_t *passwd,
}
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// HMAC 16-way AVX512

View File

@@ -84,7 +84,7 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t,
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct _hmac_sha256_16way_context
{

View File

@@ -205,7 +205,7 @@ void sha1_x86_sha_transform_be( uint32_t *state_out, const void *input,
#endif
#if defined(__aarch64__) && defined(__ARM_FEATURE_SHA2)
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#define sha1_neon_rounds( state_out, data, state_in ) \
{ \

View File

@@ -580,7 +580,7 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
// to avoid recalculating it as Y^Z. This optimization is not applicable
// when MAJ is optimized with ternary logic.
#if defined(__AVX512VL__)
#if defined(VL256)
#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )
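The comment above refers to carrying Y^Z forward between rounds; the identity behind that trick, as a scalar illustration (assumed, not quoted from this file):

#include <stdint.h>

/* maj(x,y,z) = y ^ ( (x ^ y) & (y ^ z) ).  Because the working variables rotate
   each round, this round's (x ^ y) becomes the next round's (y ^ z), so it can
   be reused instead of recomputed -- unless maj is folded into a single
   ternary-logic instruction, as noted above. */
static inline uint32_t maj32( uint32_t x, uint32_t y, uint32_t z )
{
   return y ^ ( ( x ^ y ) & ( y ^ z ) );
}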
@@ -788,7 +788,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
G = _mm256_load_si256( state_in + 6 );
H = _mm256_load_si256( state_in + 7 );
#if !defined(__AVX512VL__)
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
@@ -830,7 +830,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );
#if !defined(__AVX512VL__)
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
#endif
@@ -936,7 +936,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i IV7 = H;
const __m256i IV6 = G;
#if !defined(__AVX512VL__)
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
@@ -981,7 +981,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
W[11] = SHA256_8WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );
#if !defined(__AVX512VL__)
#if !defined(VL256)
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
@@ -1172,7 +1172,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
sha256_8way_close( &ctx, dst );
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// SHA-256 16 way

View File

@@ -1,6 +1,6 @@
#include "sha256-hash.h"
#if ( defined(__x86_64__) && defined(__SHA__) ) || defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#if ( defined(__x86_64__) && defined(__SHA__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) )
static const uint32_t SHA256_IV[8] =
{
@@ -10,6 +10,28 @@ static const uint32_t SHA256_IV[8] =
#if defined(__x86_64__) && defined(__SHA__)
/* common code used for rounds 12 through 51 */
#define sha256_generic_qround( s0, s1, m, a, b, c ) \
TMP = _mm_alignr_epi8( a, c, 4 ); \
s1 = _mm_sha256rnds2_epu32( s1, s0, m ); \
b = _mm_add_epi32( b, TMP ); \
b = _mm_sha256msg2_epu32( b, a ); \
m = _mm_shuffle_epi32( m, 0x0e ); \
s0 = _mm_sha256rnds2_epu32( s0, s1, m ); \
c = _mm_sha256msg1_epu32( c, a );
// r12-15
// sha256_generic_qround( s0, s1, m, t3, t0, t2 )
// r16-19
// sha256_generic_qround( s0, s1, m, t0, t1, t3 )
// r20-23
// sha256_generic_qround( s0, s1, m, t1, t2, t0 )
// r24-27
// sha256_generic_qround( s0, s1, m, t2, t3, t1 ) ...
#define sha256_opt_rounds( state_out, input, state_in ) \
{ \
__m128i STATE0, STATE1; \
@@ -189,7 +211,7 @@ static const uint32_t SHA256_IV[8] =
_mm_store_si128( (__m128i*) &state_out[4], STATE1 ); \
}
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
void sha256_x86_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) casti_v128( m, i )
@@ -197,7 +219,7 @@ void sha256_opt_transform_le( uint32_t *state_out, const void *input,
#undef load_msg
}
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
void sha256_x86_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) )
@@ -517,7 +539,7 @@ void sha256_opt_transform_be( uint32_t *state_out, const void *input,
_mm_store_si128( (__m128i*) &out_Y[4], STATE1_Y ); \
}
void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y,
void sha256_x86_x2sha_transform_le( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
@@ -526,7 +548,7 @@ void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y,
#undef load_msg
}
void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y,
void sha256_x86_x2sha_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
@@ -541,7 +563,7 @@ void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y,
// The goal is to avoid any redundant processing in final. Prehash is almost
// 4 rounds total, only missing the final addition of the nonce.
// Nonce must be set to zero for prehash.
void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
uint32_t *sstate, const uint32_t *istate )
{
__m128i STATE0, STATE1, MSG, TMP;
@@ -569,7 +591,7 @@ void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
casti_m128i( ostate, 1 ) = STATE1;
}
void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y )
@@ -587,8 +609,8 @@ void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
// Add the nonces (msg[0] lane 3) to A & E (STATE0 lanes 1 & 3)
TMSG0_X = casti_m128i( msg_X, 0 );
TMSG0_Y = casti_m128i( msg_Y, 0 );
TMP_X = mm128_xim_32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = mm128_xim_32( TMSG0_Y, TMSG0_Y, 0xd5 );
TMP_X = v128_xim32( TMSG0_X, TMSG0_X, 0xd5 );
TMP_Y = v128_xim32( TMSG0_Y, TMSG0_Y, 0xd5 );
STATE0_X = _mm_add_epi32( STATE0_X, TMP_X );
STATE0_Y = _mm_add_epi32( STATE0_Y, TMP_Y );
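Given the renamed prehash/final-rounds functions, the assumed calling pattern is roughly the following. The argument roles follow the declarations and comments in this diff, but the driver itself is invented for illustration:

#include <stdint.h>

/* hypothetical two-lane scan step, sketch only */
static void sha256_scan_pair_sketch( uint32_t hashX[8], uint32_t hashY[8],
                                     uint32_t blockX[16], uint32_t blockY[16],
                                     const uint32_t istate[8], uint32_t n )
{
   uint32_t mid[8], save[8];

   blockX[3] = 0;                       /* nonce (msg[0] lane 3) zeroed for prehash */
   sha256_x86_sha_prehash_3rounds( mid, blockX, save, istate );

   blockX[3] = n;                       /* real nonces for the two-lane */
   blockY[3] = n + 1;                   /* final rounds                 */
   sha256_x86_x2sha_final_rounds( hashX, hashY, blockX, blockY,
                                  mid, mid, save, save );
}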
@@ -887,14 +909,14 @@ static const uint32_t K256[64] =
#define sha256_neon_rounds( state_out, input, state_in ) \
{ \
uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; \
uint32x4_t STATE0, STATE1, ABCD_SAVE, EFGH_SAVE; \
uint32x4_t MSG0, MSG1, MSG2, MSG3; \
uint32x4_t TMP0, TMP1, TMP2; \
\
STATE0 = vld1q_u32( state_in ); \
STATE1 = vld1q_u32( state_in+4 ); \
ABEF_SAVE = STATE0; \
CDGH_SAVE = STATE1; \
ABCD_SAVE = STATE0; \
EFGH_SAVE = STATE1; \
\
MSG0 = load_msg( input, 0 ); \
MSG1 = load_msg( input, 1 ); \
@@ -1004,8 +1026,8 @@ static const uint32_t K256[64] =
TMP2 = STATE0; \
STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \
STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \
STATE0 = vaddq_u32( STATE0, ABEF_SAVE ); \
STATE1 = vaddq_u32( STATE1, CDGH_SAVE ); \
STATE0 = vaddq_u32( STATE0, ABCD_SAVE ); \
STATE1 = vaddq_u32( STATE1, EFGH_SAVE ); \
vst1q_u32( state_out , STATE0 ); \
vst1q_u32( state_out+4, STATE1 ); \
}
@@ -1029,8 +1051,8 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
#define sha256_neon_x2sha_rounds( state_out_X, state_out_Y, input_X, \
input_Y, state_in_X, state_in_Y ) \
{ \
uint32x4_t STATE0_X, STATE1_X, ABEF_SAVE_X, CDGH_SAVE_X; \
uint32x4_t STATE0_Y, STATE1_Y, ABEF_SAVE_Y, CDGH_SAVE_Y; \
uint32x4_t STATE0_X, STATE1_X, ABCD_SAVE_X, EFGH_SAVE_X; \
uint32x4_t STATE0_Y, STATE1_Y, ABCD_SAVE_Y, EFGH_SAVE_Y; \
uint32x4_t MSG0_X, MSG1_X, MSG2_X, MSG3_X; \
uint32x4_t MSG0_Y, MSG1_Y, MSG2_Y, MSG3_Y; \
uint32x4_t TMP0_X, TMP1_X, TMP2_X; \
@@ -1040,10 +1062,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
STATE0_Y = vld1q_u32( state_in_Y ); \
STATE1_X = vld1q_u32( state_in_X+4 ); \
STATE1_Y = vld1q_u32( state_in_Y+4 ); \
ABEF_SAVE_X = STATE0_X; \
ABEF_SAVE_Y = STATE0_Y; \
CDGH_SAVE_X = STATE1_X; \
CDGH_SAVE_Y = STATE1_Y; \
ABCD_SAVE_X = STATE0_X; \
ABCD_SAVE_Y = STATE0_Y; \
EFGH_SAVE_X = STATE1_X; \
EFGH_SAVE_Y = STATE1_Y; \
\
MSG0_X = load_msg( input_X, 0 ); \
MSG0_Y = load_msg( input_Y, 0 ); \
@@ -1245,10 +1267,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \
STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \
STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \
STATE0_X = vaddq_u32( STATE0_X, ABEF_SAVE_X ); \
STATE0_Y = vaddq_u32( STATE0_Y, ABEF_SAVE_Y ); \
STATE1_X = vaddq_u32( STATE1_X, CDGH_SAVE_X ); \
STATE1_Y = vaddq_u32( STATE1_Y, CDGH_SAVE_Y ); \
STATE0_X = vaddq_u32( STATE0_X, ABCD_SAVE_X ); \
STATE0_Y = vaddq_u32( STATE0_Y, ABCD_SAVE_Y ); \
STATE1_X = vaddq_u32( STATE1_X, EFGH_SAVE_X ); \
STATE1_Y = vaddq_u32( STATE1_Y, EFGH_SAVE_Y ); \
vst1q_u32( state_out_X , STATE0_X ); \
vst1q_u32( state_out_Y , STATE0_Y ); \
vst1q_u32( state_out_X+4, STATE1_X ); \

View File

@@ -5,27 +5,21 @@
#include "simd-utils.h"
#include "cpuminer-config.h"
// generic interface
static const uint32_t SHA256_IV[8];
#if defined(__x86_64__) && defined(__SHA__)
typedef struct
{
unsigned char buf[64]; /* first field, for alignment */
unsigned char buf[64];
uint32_t state[8];
uint64_t count;
} sha256_context __attribute__((aligned(64)));
static const uint32_t SHA256_IV[8];
void sha256_full( void *hash, const void *data, size_t len );
void sha256_update( sha256_context *ctx, const void *data, size_t len );
void sha256_final( sha256_context *ctx, void *hash );
void sha256_ctx_init( sha256_context *ctx );
void sha256_transform_le( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
void sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if defined(__x86_64__) && defined(__SHA__)
void sha256_x86_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
@@ -50,14 +44,6 @@ void sha256_x86_x2sha_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y );
// Temporary during name transition
#define sha256_opt_transform_le sha256_x86_sha_transform_le
#define sha256_opt_transform_be sha256_x86_sha_transform_be
#define sha256_ni2x_transform_le sha256_x86_x2sha_transform_le
#define sha256_ni2x_transform_be sha256_x86_x2sha_transform_be
#define sha256_ni_prehash_3rounds sha256_x86_sha_prehash_3rounds
#define sha256_ni2x_final_rounds sha256_x86_x2sha_final_rounds
// generic API
#define sha256_transform_le sha256_x86_sha_transform_le
#define sha256_transform_be sha256_x86_sha_transform_be
@@ -68,6 +54,20 @@ void sha256_x86_x2sha_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
// SHA-256 AArch64 with NEON & SHA2
typedef struct
{
unsigned char buf[64];
uint32_t state[8];
uint64_t count;
} sha256_context __attribute__((aligned(64)));
void sha256_full( void *hash, const void *data, size_t len );
void sha256_update( sha256_context *ctx, const void *data, size_t len );
void sha256_final( sha256_context *ctx, void *hash );
void sha256_ctx_init( sha256_context *ctx );
void sha256_neon_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
@@ -89,14 +89,6 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y );
// Temporary during name transition
#define sha256_transform_le sha256_neon_sha_transform_le
#define sha256_transform_be sha256_neon_sha_transform_be
#define sha256_2x_transform_le sha256_neon_x2sha_transform_le
#define sha256_2x_transform_be sha256_neon_x2sha_transform_be
#define sha256_prehash_3rounds sha256_neon_sha_prehash_3rounds
#define sha256_2x_final_rounds sha256_neon_x2sha_final_rounds
// generic API
#define sha256_transform_le sha256_neon_sha_transform_le
#define sha256_transform_be sha256_neon_sha_transform_be
@@ -106,9 +98,11 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
#define sha256_2x_final_rounds sha256_neon_x2sha_final_rounds
#else
// without HW acceleration...
#include "sph_sha2.h"
#define sha256_context sph_sha256_context
#define sha256_full sph_sha256_full
#define sha256_ctx_init sph_sha256_init
#define sha256_update sph_sha256
@@ -117,12 +111,11 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
#define sha256_transform_be sph_sha256_transform_be
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#endif
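For reference, a minimal use of the generic single-lane API declared in this header; sha256_full is declared above, the double-hash wrapper name is illustrative:

#include <stdint.h>

/* sha256d of an 80-byte block header via the generic API */
static void sha256d_header_sketch( uint8_t out[32], const uint8_t header[80] )
{
   uint8_t t[32];
   sha256_full( t, header, 80 );
   sha256_full( out, t, 32 );
}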
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// SHA-256 16 way
// SHA-256 16 way x86_64
typedef struct
{
@@ -147,7 +140,7 @@ void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data,
int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target );
#define sha256_16way_context sha256_16x32_context
#define sha256_16way_context sha256_16x32_context
#define sha256_16way_init sha256_16x32_init
#define sha256_16way_update sha256_16x32_update
#define sha256_16way_close sha256_16x32_close
@@ -162,7 +155,7 @@ int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
#if defined (__AVX2__)
// SHA-256 8 way
// SHA-256 8 way x86_64
typedef struct
{
@@ -201,7 +194,7 @@ int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
#endif // AVX2
// SHA-256 4 way
// SHA-256 4 way x86_64 with SSE2 or AArch64 with NEON
typedef struct
{

Some files were not shown because too many files have changed in this diff.