Compare commits


2 Commits

Author     SHA1        Message  Date
Jay D Dee  042d13d1e1  v24.2    2024-05-20 23:08:50 -04:00
Jay D Dee  4f930574cc  v24.1    2024-04-16 21:31:35 -04:00
147 changed files with 1570 additions and 1125 deletions

View File

@@ -16,6 +16,7 @@ bin_PROGRAMS = cpuminer
dist_man_MANS = cpuminer.1
cpuminer_SOURCES = \
dummy.cpp \
cpu-miner.c \
util.c \
api.c \
@@ -113,7 +114,6 @@ cpuminer_SOURCES = \
algo/lyra2/phi2-4way.c \
algo/lyra2/phi2.c \
algo/m7m/m7m.c \
algo/m7m/magimath.cpp \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
@@ -289,7 +289,7 @@ if HAVE_WINDOWS
endif
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lgmp
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
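# Plausible reason for the dummy.cpp addition above (an assumption, not stated
# in the diff): with algo/m7m/magimath.cpp deleted below, an empty C++ source
# is kept in the build so automake still links cpuminer with the C++ driver.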

View File

@@ -75,6 +75,21 @@ If not what makes it happen or not happen?
Change Log
----------
v24.2
x86_64: Fixed blake2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4.
x86_64: Initial support for CPUs with AVX10, needs GCC-14.
ARM NEON: Various code optimisations.
v24.1
#414: fix bug in merkle error handling.
#416: change $nproc to $(nproc) in build scripts.
#420: change some inline function definitions to static inline.
#413: Fix formatting error for share result log when using no-color.
Faster 2 way interleaving.
Cleanup sha256 architecture targeting.
v23.15
Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.

View File

@@ -184,7 +184,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
//int scanhash_8way_64_64( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr )
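Throughout this diff the long AVX512 feature test above is collapsed into the single SIMD512 gate. Its definition is not shown here; a plausible central form (an assumption, presumably living in simd-utils, with the new AVX10/512 support also able to set it) is:
#if defined(__AVX512F__) && defined(__AVX512VL__) && \
    defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define SIMD512 1
#endif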

View File

@@ -99,24 +99,27 @@ typedef uint32_t set_t;
#define AES_OPT 1 << 7 // Intel Westmere, AArch64
#define VAES_OPT 1 << 8 // Icelake, Zen3
#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA256_OPT 1 << 9 // Zen1, Icelake, AArch64
#define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64
#define NEON_OPT 1 << 11 // AArch64
#define AVX10_256 1 << 12
#define AVX10_512 1 << 13
// AVX10 does not have explicit algo features:
// AVX10_512 is compatible with AVX512 + VAES
// AVX10_256 is compatible with AVX2 + VAES
// return set containing all elements from sets a & b
inline set_t set_union ( set_t a, set_t b ) { return a | b; }
static inline set_t set_union ( set_t a, set_t b ) { return a | b; }
// return set containing the elements common to sets a & b
inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
static inline set_t set_intsec ( set_t a, set_t b) { return a & b; }
// all elements in set a are included in set b
inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
static inline bool set_incl ( set_t a, set_t b ) { return (a & b) == a; }
// no elements in set a are included in set b
inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
static inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
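A minimal usage sketch of these helpers (the function and variable names below are hypothetical, not part of the source):
static inline bool algo_runs_on_cpu( set_t cpu_features, set_t algo_features )
{
   // true when every feature the algo needs is present on the CPU
   return set_incl( algo_features, cpu_features );
}
// e.g. an algo needing VAES plus 512-bit vectors could declare
// set_t algo_features = set_union( VAES_OPT, AVX10_512 );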
typedef struct
{
@@ -246,7 +249,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
//int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -35,7 +35,7 @@
* @pre all block pointers must be valid
*/
#if defined(__AVX512F__)
#if defined(SIMD512)
static inline __m512i blamka( __m512i x, __m512i y )
{
@@ -237,7 +237,7 @@ void fill_segment(const argon2_instance_t *instance,
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index, i;
#if defined(__AVX512F__)
#if defined(SIMD512)
__m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
#elif defined(__AVX2__)
__m256i state[ARGON2_HWORDS_IN_BLOCK];

View File

@@ -21,7 +21,7 @@
#include "blake2-impl.h"
#include "simd-utils.h"
#if !defined(__AVX512F__)
#if !defined(SIMD512)
#if !defined(__AVX2__)

View File

@@ -1611,7 +1611,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
///////////////////////////////////////
//
@@ -2617,7 +2617,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
//Blake-256 16 way AVX512

View File

@@ -147,7 +147,7 @@ void blake256r8_8way_close(void *cc, void *dst);
#define blake256r8_8x32_update blake256r14_8way_update
#define blake256r8_8x32_close blake256r14_8way_close
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
///////////////////////////////////
//

View File

@@ -226,7 +226,7 @@ static const uint8_t sigma[12][16] =
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define B2B8W_G(a, b, c, d, x, y) \
{ \

View File

@@ -15,7 +15,7 @@
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct ALIGN( 64 ) {
__m512i b[16]; // input buffer

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include "blake2b-hash.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKE2B_8WAY
#elif defined(__AVX2__)
#define BLAKE2B_4WAY

View File

@@ -497,7 +497,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Blake2s-256 16 way

View File

@@ -29,20 +29,20 @@
#define ALIGN(x) __attribute__((aligned(x)))
#endif
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_nway_param;
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_nway_param;
typedef struct ALIGN( 64 ) __blake2s_4way_state
{
@@ -67,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
typedef struct ALIGN( 64 ) __blake2s_8way_state
{
__m256i h[8];
uint8_t buf[ 32 * 8 ];
uint8_t buf[ 64 * 8 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
@@ -83,12 +83,12 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct ALIGN( 64 ) __blake2s_16way_state
{
__m512i h[8];
uint8_t buf[ 32 * 16 ];
uint8_t buf[ 64 * 16 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;

View File

@@ -3,7 +3,7 @@
#include <string.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKE2S_16WAY
#elif defined(__AVX2__)
#define BLAKE2S_8WAY

View File

@@ -349,16 +349,16 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( CBx( r, Sd ) ^ Mx( r, Sc ), \
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
Vd = v128_ror64xor( Vd, Va, 32 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
Vb = v128_ror64xor( Vb, Vc, 25 ); \
\
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( CBx( r, Sc ) ^ Mx( r, Sd ), \
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
Vd = v128_ror64xor( Vd, Va, 16 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
Vb = v128_ror64xor( Vb, Vc, 11 ); \
}
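v128_ror64xor fuses the xor and rotate of the old form into one helper. A generic fallback consistent with this substitution (an assumption; on targets with a fused instruction, e.g. AArch64 XAR, it could map to that instead) is:
#define v128_ror64xor( v, w, c )   v128_ror64( v128_xor( v, w ), c )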
#define BLAKE512_ROUND( R ) \
@@ -559,7 +559,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
#if defined(__AVX2__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
////////////////////////////////////
//
@@ -1887,13 +1887,13 @@ blake512_4x64_close(void *cc, void *dst)
#define GB_2X64( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = v128_add64( v128_add64( v128_xor( v128_64( c1 ), m0 ), b ), a ); \
d = v128_ror64( v128_xor( d, a ), 32 ); \
d = v128_ror64xor( d, a, 32 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 25 ); \
b = v128_ror64xor( b, c, 25 ); \
a = v128_add64( v128_add64( v128_xor( v128_64( c0 ), m1 ), b ), a ); \
d = v128_ror64( v128_xor( d, a ), 16 ); \
d = v128_ror64xor( d, a, 16 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 11 ); \
b = v128_ror64xor( b, c, 11 ); \
}
#define ROUND_B_2X64(r) \
@@ -2054,9 +2054,9 @@ void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc,
// G4 skip nonce
V0 = v128_add64( v128_add64( v128_xor( v128_64( CB9 ), sc->buf[ 8] ), V5 ),
V0 );
VF = v128_ror64( v128_xor( VF, V0 ), 32 );
VF = v128_ror64xor( VF, V0, 32 );
VA = v128_add64( VA, VF );
V5 = v128_ror64( v128_xor( V5, VA ), 25 );
V5 = v128_ror64xor( V5, VA, 25 );
V0 = v128_add64( V0, V5 );
GB_2X64( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
@@ -2137,9 +2137,9 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
// finish round 0, with the nonce now available
V0 = v128_add64( V0, v128_xor( v128_64( CB8 ), M9 ) );
VF = v128_ror64( v128_xor( VF, V0 ), 16 );
VF = v128_ror64xor( VF, V0, 16 );
VA = v128_add64( VA, VF );
V5 = v128_ror64( v128_xor( V5, VA ), 11 );
V5 = v128_ror64xor( V5, VA, 11 );
// Round 1
// G0
@@ -2147,34 +2147,34 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,
// G1
V1 = v128_add64( V1, V5 );
VD = v128_ror64( v128_xor( VD, V1 ), 32 );
VD = v128_ror64xor( VD, V1, 32 );
V9 = v128_add64( V9, VD );
V5 = v128_ror64( v128_xor( V5, V9 ), 25 );
V5 = v128_ror64xor( V5, V9, 25 );
V1 = v128_add64( V1, v128_add64( v128_xor( v128_64( CBx(1,2) ), Mx(1,3) ),
V5 ) );
VD = v128_ror64( v128_xor( VD, V1 ), 16 );
VD = v128_ror64xor( VD, V1, 16 );
V9 = v128_add64( V9, VD );
V5 = v128_ror64( v128_xor( V5, V9 ), 11 );
V5 = v128_ror64xor( V5, V9, 11 );
// G2
V2 = v128_add64( V2, v128_xor( v128_64( CBF ), M9 ) );
VE = v128_ror64( v128_xor( VE, V2 ), 32 );
VE = v128_ror64xor( VE, V2, 32 );
VA = v128_add64( VA, VE );
V6 = v128_ror64( v128_xor( V6, VA ), 25 );
V6 = v128_ror64xor( V6, VA, 25 );
V2 = v128_add64( V2, v128_add64( v128_xor( v128_64( CB9 ), MF ), V6 ) );
VE = v128_ror64( v128_xor( VE, V2 ), 16 );
VE = v128_ror64xor( VE, V2, 16 );
VA = v128_add64( VA, VE );
V6 = v128_ror64( v128_xor( V6, VA ), 11 );
V6 = v128_ror64xor( V6, VA, 11 );
// G3
VF = v128_ror64( v128_xor( VF, V3 ), 32 );
VF = v128_ror64xor( VF, V3, 32 );
VB = v128_add64( VB, VF );
V7 = v128_ror64( v128_xor( V7, VB ), 25 );
V7 = v128_ror64xor( V7, VB, 25 );
V3 = v128_add64( V3, v128_add64( v128_xor( v128_64( CBx(1, 6) ), Mx(1, 7) ),
V7 ) );
VF = v128_ror64( v128_xor( VF, V3 ), 16 );
VF = v128_ror64xor( VF, V3, 16 );
VB = v128_add64( VB, VF );
V7 = v128_ror64( v128_xor( V7, VB ), 11 );
V7 = v128_ror64xor( V7, VB, 11 );
// G4, G5, G6, G7
GB_2X64(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);

View File

@@ -92,7 +92,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
#define blake512_4way_prehash_le blake512_4x64_prehash_le
#define blake512_4way_final_le blake512_4x64_final_le
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
////////////////////////////
//

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKECOIN_16WAY
#elif defined(__AVX2__)
#define BLAKECOIN_8WAY

View File

@@ -101,15 +101,15 @@
{ \
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
Vd = v128_ror64xor( Vd, Va, 32 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \
Vb = v128_ror64xor( Vb, Vc, 24 ); \
\
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
Vd = v128_ror64xor( Vd, Va, 16 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
Vb = v128_ror64xor( Vb, Vc, 63 ); \
}
#define BLAKE2B_ROUND( R ) \

View File

@@ -87,7 +87,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-256 16 way 32
@@ -157,7 +157,7 @@ void bmw512_4way_addbits_and_close(
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-512 64 bit 8 way
typedef struct

View File

@@ -1057,7 +1057,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-256 16 way 32

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BMW512_8WAY 1
#elif defined(__AVX2__)
#define BMW512_4WAY 1

View File

@@ -950,7 +950,7 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// BMW-512 8 WAY

View File

@@ -26,7 +26,7 @@ static const uint64_t IV512[] =
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// 4 way 128 is handy to avoid reinterleaving in many algos.
// If reinterleaving is necessary it may be more efficient to use

View File

@@ -6,7 +6,7 @@
#if defined(__AVX2__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
struct _cube_4way_context
{

View File

@@ -13,7 +13,7 @@ static void transform( cubehashParam *sp )
int r;
const int rounds = sp->rounds;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
register __m512i x0, x1;

View File

@@ -11,7 +11,7 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
};
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ECHO_SUBBYTES4(state, j) \
state[0][j] = _mm512_aesenc_epi128( state[0][j], k1 ); \

View File

@@ -5,7 +5,7 @@
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -696,7 +696,7 @@ static void AddModulo512(const void *a,const void *b,void *c)
static void AddXor512(const void *a,const void *b,void *c)
{
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
casti_m512i( b, 0 ) );
#elif defined(__AVX2__)

View File

@@ -103,7 +103,7 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#if defined(VL256)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\

View File

@@ -95,7 +95,7 @@ static const v128u32_t gr_mask __attribute__ ((aligned (16))) =
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#if defined(__AVX512VL__)
#if defined(VL256)
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#define GROESTL_4WAY_VAES 1
#endif

View File

@@ -17,7 +17,7 @@
#if defined(__AVX2__) && defined(__VAES__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )

View File

@@ -43,7 +43,7 @@
#define SIZE256 (SIZE_512/16)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE256];

View File

@@ -42,7 +42,7 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12,

View File

@@ -17,7 +17,7 @@
#if defined(__AVX2__) && defined(__VAES__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{

View File

@@ -33,7 +33,7 @@
#define SIZE512 (SIZE_1024/16)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE512];

View File

@@ -50,7 +50,7 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12,

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#define MYRGR_8WAY 1
#elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
#define MYRGR_4WAY 1

View File

@@ -382,7 +382,7 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
#define S1F MF
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Hamsi 8 way AVX512
@@ -1122,7 +1122,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
// Hamsi 4 way AVX2
#if defined(__AVX512VL__)
#if defined(VL256)
#define INPUT_BIG \
do { \
@@ -1501,7 +1501,7 @@ do { /* order is important */ \
sc->h[14] = CE; \
sc->h[15] = CF;
#if defined(__AVX512VL__)
#if defined(VL256)
#define INPUT_8X32 \
{ \

View File

@@ -104,7 +104,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Hamsi-512 8x64

View File

@@ -53,7 +53,7 @@ extern "C"{
#define SPH_SMALL_FOOTPRINT_HAVAL 1
//#endif
#if defined(__AVX512VL__)
#if defined(VL256)
// ( ~( a ^ b ) ) & c
#define v128_andnotxor( a, b, c ) \
@@ -583,7 +583,7 @@ do { \
// Haval-256 8 way 32 bit avx2
#if defined (__AVX512VL__)
#if defined (VL256)
// ( ~( a ^ b ) ) & c
#define mm256_andnotxor( a, b, c ) \
@@ -882,7 +882,7 @@ do { \
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// ( ~( a ^ b ) ) & c
#define mm512_andnotxor( a, b, c ) \

View File

@@ -107,7 +107,7 @@ void haval256_5_8way_close( void *cc, void *dst );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__m512i buf[32];

View File

@@ -204,7 +204,7 @@ static const uint64_t IV512[] =
(state)->H[15] = h7l; \
} while (0)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define Sb_8W(x0, x1, x2, x3, c) \
{ \
@@ -364,8 +364,7 @@ static const uint64_t IV512[] =
#if defined(__AVX2__)
#if defined(__AVX512VL__)
//TODO enable for AVX10_256, not used with AVX512VL
#if defined(VL256)
#define notxorandnot( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x2d )
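// VL256 appears to be the 256-bit counterpart of SIMD512: a single gate
// replacing the bare __AVX512VL__ test so that, as the removed TODO above
// asked, these ternary-logic paths can also be enabled on AVX10/256 targets.
// A plausible central definition (assumption, not shown in this diff):
// #if defined(__AVX512VL__)   /* or an AVX10/256 build */
//   #define VL256 1
// #endif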
@@ -522,7 +521,7 @@ static const uint64_t IV512[] =
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
void jh256_8x64_init( jh_8x64_context *sc )
{

View File

@@ -55,7 +55,7 @@
* <code>memcpy()</code>).
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define KECCAK_8WAY 1
#elif defined(__AVX2__)
#define KECCAK_4WAY 1
@@ -12,7 +12,7 @@
#define KECCAK_2WAY 1
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA3D_8WAY 1
#elif defined(__AVX2__)
#define SHA3D_4WAY 1

View File

@@ -57,7 +57,7 @@ static const uint64_t RC[] = {
#define DO(x) x
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define INPUT_BUF(size) do { \
size_t j; \

View File

@@ -4,7 +4,7 @@
#include <stddef.h>
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -59,7 +59,7 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define cns4w(i) mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] )
@@ -524,8 +524,7 @@ int luffa_4way_update_close( luffa_4way_context *state,
a = _mm256_xor_si256( a, c0 ); \
b = _mm256_xor_si256( b, c1 );
//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512
#if defined(__AVX512VL__)
#if defined(VL256)
#define MULT2( a0, a1 ) \
{ \

View File

@@ -51,7 +51,7 @@
#define LIMIT_512 128
/*********************************/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
uint32_t buffer[8*4];

View File

@@ -28,8 +28,7 @@
a = v128_xor( a, c0 ); \
b = v128_xor( b, c1 ); \
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#if defined(VL256)
#define MULT2( a0, a1 ) \
{ \
@@ -69,8 +68,7 @@
#endif
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#if defined(VL256)
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \

View File

@@ -68,4 +68,4 @@ int update_and_final_luffa( hashState_luffa *state, void* output,
int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
const void* data, size_t inlen );
#endif // LUFFA_FOR_SSE2_H___
#endif // LUFFA_FOR_SSE2_H__

View File

@@ -15,7 +15,7 @@
#include "algo/groestl/sph_groestl.h"
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ALLIUM_16WAY 1
#elif defined(__AVX2__)
#define ALLIUM_8WAY 1

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2REV3_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV3_8WAY 1
@@ -49,7 +49,7 @@ bool init_lyra2rev3_ctx();
//////////////////////////////////
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2REV2_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV2_8WAY 1
@@ -108,7 +108,7 @@ bool lyra2h_thread_init();
/////////////////////////////////////////
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define PHI2_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define PHI2_4WAY 1

View File

@@ -41,7 +41,7 @@
// lyra2z330, lyra2h,
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
/**
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords

View File

@@ -59,7 +59,7 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );

View File

@@ -3,7 +3,7 @@
#include "lyra2.h"
#include "algo/blake/blake256-hash.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LYRA2Z_16WAY 1
#elif defined(__AVX2__)
#define LYRA2Z_8WAY 1

View File

@@ -4,7 +4,7 @@
#include "algo/gost/sph_gost.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "lyra2.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
#include "algo/echo/echo-hash-4way.h"
#elif defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"

View File

@@ -27,7 +27,7 @@
#include "lyra2.h"
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
{

View File

@@ -43,7 +43,7 @@ static const uint64_t blake2b_IV[8] =
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define G2W_4X64(a,b,c,d) \
a = _mm512_add_epi64( a, b ); \
@@ -150,13 +150,13 @@ static const uint64_t blake2b_IV[8] =
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a), 32 ); \
d = v128_ror64xor( d, a, 32 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 24 ); \
b = v128_ror64xor( b, c, 24 ); \
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a ), 16 ); \
d = v128_ror64xor( d, a, 16 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 63 );
b = v128_ror64xor( b, c, 63 );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
@@ -222,7 +222,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
G( r, 7, v[ 3], v[ 4], v[ 9], v[14] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
union _ovly_512
{

View File

@@ -21,7 +21,7 @@
#define EPS1 DBL_EPSILON
#define EPS2 3.0e-11
inline double exp_n( double xt )
static inline double exp_n( double xt )
{
if ( xt < -700.0 )
return 0;
@@ -33,7 +33,7 @@ inline double exp_n( double xt )
return exp( xt );
}
inline double exp_n2( double x1, double x2 )
static inline double exp_n2( double x1, double x2 )
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
p5 = 37., p6 = 700.;

View File

@@ -1,75 +0,0 @@
// Copyright (c) 2014 The Magi developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include <iostream>
#include <cfloat>
#include <limits>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "magimath.h"
#define EPS1 (std::numeric_limits<double>::epsilon())
#define EPS2 3.0e-11
static void gauleg(double x1, double x2, double x[], double w[], const int n)
{
int m,j,i;
double z1, z, xm, xl, pp, p3, p2, p1;
m=(n+1)/2;
xm=0.5*(x2+x1);
xl=0.5*(x2-x1);
for (i=1;i<=m;i++) {
z=cos(3.141592654*(i-0.25)/(n+0.5));
do {
p1=1.0;
p2=0.0;
for (j=1;j<=n;j++) {
p3=p2;
p2=p1;
p1=((2.0*j-1.0)*z*p2-(j-1.0)*p3)/j;
}
pp=n*(z*p1-p2)/(z*z-1.0);
z1=z;
z=z1-p1/pp;
} while (fabs(z-z1) > EPS2);
x[i]=xm-xl*z;
x[n+1-i]=xm+xl*z;
w[i]=2.0*xl/((1.0-z*z)*pp*pp);
w[n+1-i]=w[i];
}
}
static double GaussianQuad_N(double func(const double), const double a2, const double b2, const int NptGQ)
{
double s=0.0;
#ifdef _MSC_VER
#define SW_DIVS 23
double x[SW_DIVS+1], w[SW_DIVS+1];
#else
double x[NptGQ+1], w[NptGQ+1];
#endif
gauleg(a2, b2, x, w, NptGQ);
for (int j=1; j<=NptGQ; j++) {
s += w[j]*func(x[j]);
}
return s;
}
static double swit_(double wvnmb)
{
return pow( (5.55243*(exp_n(-0.3*wvnmb/15.762) - exp_n(-0.6*wvnmb/15.762)))*wvnmb, 0.5)
/ 1034.66 * pow(sin(wvnmb/65.), 2.);
}
uint32_t sw_(int nnounce, int divs)
{
double wmax = ((sqrt((double)(nnounce))*(1.+EPS1))/450+100);
return ((uint32_t)(GaussianQuad_N(swit_, 0., wmax, divs)*(1.+EPS1)*1.e6));
}

View File

@@ -1,54 +0,0 @@
// Copyright (c) 2014 The Magi developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef MAGI_MATH_H
#define MAGI_MATH_H
#include <math.h>
#ifdef __cplusplus
extern "C" {
#endif
uint32_t sw_(int nnounce, int divs);
#ifdef __cplusplus
}
#endif
inline double exp_n(double xt)
{
double p1 = -700.0, p3 = -0.8e-8, p4 = 0.8e-8, p6 = 700.0;
if(xt < p1)
return 0;
else if(xt > p6)
return 1e200;
else if(xt > p3 && xt < p4)
return (1.0 + xt);
else
return exp(xt);
}
// 1 / (1 + exp(x1-x2))
inline double exp_n2(double x1, double x2)
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.;
double xt = x1 - x2;
if (xt < p1+1.e-200)
return 1.;
else if (xt > p1 && xt < p2 + 1.e-200)
return ( 1. - exp(xt) );
else if (xt > p2 && xt < p3 + 1.e-200)
return ( 1. / (1. + exp(xt)) );
else if (xt > p3 && xt < p4)
return ( 1. / (2. + xt) );
else if (xt > p4 - 1.e-200 && xt < p5)
return ( exp(-xt) / (1. + exp(-xt)) );
else if (xt > p5 - 1.e-200 && xt < p6)
return ( exp(-xt) );
else //if (xt > p6 - 1.e-200)
return 0.;
}
#endif
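// The exp_n / exp_n2 helpers from this deleted header survive as the static
// inline definitions shown in the hunk above (presumably m7m.c), in line with
// changelog item #420 on static inline definitions.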

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define NIST5_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define NIST5_4WAY 1

View File

@@ -71,8 +71,7 @@ do { \
} while (0)
#define GAMMA_4W(n0, n1, n2, n4) \
(g ## n0 = v128_xor( a ## n0, \
v128_or( a ## n1, v128_not( a ## n2 ) ) ) )
(g ## n0 = v128_xor( a ## n0, v128_ornot( a ## n2, a ## n1 ) ) )
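// From the substitution above, v128_ornot( x, y ) is evidently ~x | y; a
// portable fallback consistent with this usage (an assumption) would be:
// #define v128_ornot( x, y )   v128_or( v128_not( x ), y )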
#define PI_ALL_4W do { \
a0 = g0; \
@@ -312,7 +311,7 @@ do { \
BUPDATE1_8W( 7, 1 ); \
} while (0)
#if defined(__AVX512VL__)
#if defined(VL256)
#define GAMMA_8W(n0, n1, n2, n4) \
( g ## n0 = _mm256_ternarylogic_epi32( a ## n0, a ## n2, a ## n1, 0x4b ) )
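// 0x4b is the truth table of A ^ ( C | ~B ) for operand order (A,B,C), i.e.
// a0 ^ ( a1 | ~a2 ) here -- the same gamma function as the generic form above,
// computed in a single vpternlog instruction.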

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define ANIME_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define ANIME_4WAY 1

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define HMQ1725_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define HMQ1725_4WAY 1

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define QUARK_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define QUARK_4WAY 1

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define QUBIT_4WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define QUBIT_2WAY 1

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define LBRY_16WAY 1
#elif defined(__AVX2__)
#define LBRY_8WAY 1

View File

@@ -35,13 +35,13 @@ static const uint32_t IV[5] =
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
#define F3(x, y, z) \
_mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z )
_mm_xor_si128( v128_ornot( y, x ), z )
#define F4(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
#define F5(x, y, z) \
_mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) )
_mm_xor_si128( x, v128_ornot( z, y ) )
#define RR(a, b, c, d, e, f, s, r, k) \
do{ \
@@ -335,13 +335,13 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( y, z ), x ), z )
#define F8W_3(x, y, z) \
_mm256_xor_si256( _mm256_or_si256( x, mm256_not( y ) ), z )
_mm256_xor_si256( mm256_ornot( y, x ), z )
#define F8W_4(x, y, z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( x, y ), z ), y )
#define F8W_5(x, y, z) \
_mm256_xor_si256( x, _mm256_or_si256( y, mm256_not( z ) ) )
_mm256_xor_si256( x, mm256_ornot( z, y ) )
#define RR_8W(a, b, c, d, e, f, s, r, k) \
do{ \
@@ -625,7 +625,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// RIPEMD-160 16 way

View File

@@ -33,7 +33,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
size_t len );
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -745,7 +745,7 @@ do{ \
SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// Tested OK but very slow
// 16 way parallel, requires 16x32 interleaving
@@ -2487,7 +2487,7 @@ static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb,
XA3 = BA[3] = v128_xor( BA[3], CA[3] );
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
SALSA_8ROUNDS_SIMD128_2BUF;
@@ -2886,7 +2886,7 @@ static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc,
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
XC3 = BC[3] = v128_xor( BC[3], CC[3] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
SALSA_8ROUNDS_SIMD128_3BUF;

View File

@@ -5,7 +5,7 @@
#include <stdlib.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );

View File

@@ -35,7 +35,7 @@
//#include <mm_malloc.h>
#include "malloc-huge.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SCRYPT_THROUGHPUT 16
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
#define SCRYPT_THROUGHPUT 2
@@ -592,7 +592,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate,
#endif /* HAVE_SHA256_8WAY */
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
static inline void sha256_16way_init_state( void *state )
{
@@ -1494,7 +1494,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
// scrypt_throughput defined at compile time and used to replace
// MAX_WAYS to reduce memory usage.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// scrypt_throughput = 16;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf

View File

@@ -306,7 +306,7 @@ pbkdf2_sha256_8way( uint8_t *buf, size_t dkLen, const uint8_t *passwd,
}
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// HMAC 16-way AVX512

View File

@@ -84,7 +84,7 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t,
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct _hmac_sha256_16way_context
{

View File

@@ -205,7 +205,7 @@ void sha1_x86_sha_transform_be( uint32_t *state_out, const void *input,
#endif
#if defined(__aarch64__) && defined(__ARM_FEATURE_SHA2)
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#define sha1_neon_rounds( state_out, data, state_in ) \
{ \

View File

@@ -580,7 +580,7 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
// to avoid recalculating it as Y^Z. This optimization is not applicable
// when MAJ is optimized with ternary logic.
#if defined(__AVX512VL__)
#if defined(VL256)
#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )
@@ -788,7 +788,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
G = _mm256_load_si256( state_in + 6 );
H = _mm256_load_si256( state_in + 7 );
#if !defined(__AVX512VL__)
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
@@ -830,7 +830,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
G = _mm256_load_si256( state_mid + 6 );
H = _mm256_load_si256( state_mid + 7 );
#if !defined(__AVX512VL__)
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
#endif
@@ -936,7 +936,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i IV7 = H;
const __m256i IV6 = G;
#if !defined(__AVX512VL__)
#if !defined(VL256)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
@@ -981,7 +981,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
W[11] = SHA256_8WAY_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );
#if !defined(__AVX512VL__)
#if !defined(VL256)
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
@@ -1172,7 +1172,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
sha256_8way_close( &ctx, dst );
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// SHA-256 16 way

View File

@@ -1,6 +1,6 @@
#include "sha256-hash.h"
#if ( defined(__x86_64__) && defined(__SHA__) ) || defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#if ( defined(__x86_64__) && defined(__SHA__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) )
static const uint32_t SHA256_IV[8] =
{
@@ -10,6 +10,28 @@ static const uint32_t SHA256_IV[8] =
#if defined(__x86_64__) && defined(__SHA__)
/* common code used for rounds 12 through 51 */
#define sha256_generic_qround( s0, s1, m, a, b, c ) \
TMP = _mm_alignr_epi8( a, c, 4 ); \
s1 = _mm_sha256rnds2_epu32( s1, s0, m ); \
b = _mm_add_epi32( b, TMP ); \
b = _mm_sha256msg2_epu32( b, a ); \
m = _mm_shuffle_epi32( m, 0x0e ); \
s0 = _mm_sha256rnds2_epu32( s0, s1, m ); \
c = _mm_sha256msg1_epu32( c, a );
// r12-15
// sha256_generic_qround( s0, s1, m, t3, t0, t2 )
// r16-19
// sha256_generic_qround( s0, s1, m, t0, t1, t3 )
// r20-23
// sha256_generic_qround( s0, s1, m, t1, t2, t0 )
// r24-27
// sha256_generic_qround( s0, s1, m, t2, t3, t1 ) ...
#define sha256_opt_rounds( state_out, input, state_in ) \
{ \
__m128i STATE0, STATE1; \
@@ -189,7 +211,7 @@ static const uint32_t SHA256_IV[8] =
_mm_store_si128( (__m128i*) &state_out[4], STATE1 ); \
}
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
void sha256_x86_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) casti_v128( m, i )
@@ -197,7 +219,7 @@ void sha256_opt_transform_le( uint32_t *state_out, const void *input,
#undef load_msg
}
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
void sha256_x86_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) )
@@ -517,7 +539,7 @@ void sha256_opt_transform_be( uint32_t *state_out, const void *input,
_mm_store_si128( (__m128i*) &out_Y[4], STATE1_Y ); \
}
void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y,
void sha256_x86_x2sha_transform_le( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
@@ -526,7 +548,7 @@ void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y,
#undef load_msg
}
void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y,
void sha256_x86_x2sha_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
@@ -541,7 +563,7 @@ void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y,
// The goal is to avoid any redundant processing in final. Prehash is almost
// 4 rounds total, only missing the final addition of the nonce.
// Nonce must be set to zero for prehash.
void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
void sha256_x86_sha_prehash_3rounds( uint32_t *ostate, const void *msg,
uint32_t *sstate, const uint32_t *istate )
{
__m128i STATE0, STATE1, MSG, TMP;
@@ -569,7 +591,7 @@ void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
casti_m128i( ostate, 1 ) = STATE1;
}
void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y,
void sha256_x86_x2sha_final_rounds( uint32_t *out_X, uint32_t *out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y )
@@ -887,14 +909,14 @@ static const uint32_t K256[64] =
#define sha256_neon_rounds( state_out, input, state_in ) \
{ \
uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; \
uint32x4_t STATE0, STATE1, ABCD_SAVE, EFGH_SAVE; \
uint32x4_t MSG0, MSG1, MSG2, MSG3; \
uint32x4_t TMP0, TMP1, TMP2; \
\
STATE0 = vld1q_u32( state_in ); \
STATE1 = vld1q_u32( state_in+4 ); \
ABEF_SAVE = STATE0; \
CDGH_SAVE = STATE1; \
ABCD_SAVE = STATE0; \
EFGH_SAVE = STATE1; \
\
MSG0 = load_msg( input, 0 ); \
MSG1 = load_msg( input, 1 ); \
@@ -1004,8 +1026,8 @@ static const uint32_t K256[64] =
TMP2 = STATE0; \
STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \
STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \
STATE0 = vaddq_u32( STATE0, ABEF_SAVE ); \
STATE1 = vaddq_u32( STATE1, CDGH_SAVE ); \
STATE0 = vaddq_u32( STATE0, ABCD_SAVE ); \
STATE1 = vaddq_u32( STATE1, EFGH_SAVE ); \
vst1q_u32( state_out , STATE0 ); \
vst1q_u32( state_out+4, STATE1 ); \
}
@@ -1029,8 +1051,8 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
#define sha256_neon_x2sha_rounds( state_out_X, state_out_Y, input_X, \
input_Y, state_in_X, state_in_Y ) \
{ \
uint32x4_t STATE0_X, STATE1_X, ABEF_SAVE_X, CDGH_SAVE_X; \
uint32x4_t STATE0_Y, STATE1_Y, ABEF_SAVE_Y, CDGH_SAVE_Y; \
uint32x4_t STATE0_X, STATE1_X, ABCD_SAVE_X, EFGH_SAVE_X; \
uint32x4_t STATE0_Y, STATE1_Y, ABCD_SAVE_Y, EFGH_SAVE_Y; \
uint32x4_t MSG0_X, MSG1_X, MSG2_X, MSG3_X; \
uint32x4_t MSG0_Y, MSG1_Y, MSG2_Y, MSG3_Y; \
uint32x4_t TMP0_X, TMP1_X, TMP2_X; \
@@ -1040,10 +1062,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
STATE0_Y = vld1q_u32( state_in_Y ); \
STATE1_X = vld1q_u32( state_in_X+4 ); \
STATE1_Y = vld1q_u32( state_in_Y+4 ); \
ABEF_SAVE_X = STATE0_X; \
ABEF_SAVE_Y = STATE0_Y; \
CDGH_SAVE_X = STATE1_X; \
CDGH_SAVE_Y = STATE1_Y; \
ABCD_SAVE_X = STATE0_X; \
ABCD_SAVE_Y = STATE0_Y; \
EFGH_SAVE_X = STATE1_X; \
EFGH_SAVE_Y = STATE1_Y; \
\
MSG0_X = load_msg( input_X, 0 ); \
MSG0_Y = load_msg( input_Y, 0 ); \
@@ -1245,10 +1267,10 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \
STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \
STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \
STATE0_X = vaddq_u32( STATE0_X, ABEF_SAVE_X ); \
STATE0_Y = vaddq_u32( STATE0_Y, ABEF_SAVE_Y ); \
STATE1_X = vaddq_u32( STATE1_X, CDGH_SAVE_X ); \
STATE1_Y = vaddq_u32( STATE1_Y, CDGH_SAVE_Y ); \
STATE0_X = vaddq_u32( STATE0_X, ABCD_SAVE_X ); \
STATE0_Y = vaddq_u32( STATE0_Y, ABCD_SAVE_Y ); \
STATE1_X = vaddq_u32( STATE1_X, EFGH_SAVE_X ); \
STATE1_Y = vaddq_u32( STATE1_Y, EFGH_SAVE_Y ); \
vst1q_u32( state_out_X , STATE0_X ); \
vst1q_u32( state_out_Y , STATE0_Y ); \
vst1q_u32( state_out_X+4, STATE1_X ); \

View File

@@ -5,27 +5,21 @@
#include "simd-utils.h"
#include "cpuminer-config.h"
// generic interface
static const uint32_t SHA256_IV[8];
#if defined(__x86_64__) && defined(__SHA__)
typedef struct
{
unsigned char buf[64]; /* first field, for alignment */
unsigned char buf[64];
uint32_t state[8];
uint64_t count;
} sha256_context __attribute__((aligned(64)));
static const uint32_t SHA256_IV[8];
void sha256_full( void *hash, const void *data, size_t len );
void sha256_update( sha256_context *ctx, const void *data, size_t len );
void sha256_final( sha256_context *ctx, void *hash );
void sha256_ctx_init( sha256_context *ctx );
void sha256_transform_le( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
void sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if defined(__x86_64__) && defined(__SHA__)
void sha256_x86_sha_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
@@ -50,14 +44,6 @@ void sha256_x86_x2sha_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y );
// Temporary during name transition
#define sha256_opt_transform_le sha256_x86_sha_transform_le
#define sha256_opt_transform_be sha256_x86_sha_transform_be
#define sha256_ni2x_transform_le sha256_x86_x2sha_transform_le
#define sha256_ni2x_transform_be sha256_x86_x2sha_transform_be
#define sha256_ni_prehash_3rounds sha256_x86_sha_prehash_3rounds
#define sha256_ni2x_final_rounds sha256_x86_x2sha_final_rounds
// generic API
#define sha256_transform_le sha256_x86_sha_transform_le
#define sha256_transform_be sha256_x86_sha_transform_be
@@ -68,6 +54,20 @@ void sha256_x86_x2sha_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
// SHA-256 AArch64 with NEON & SHA2
typedef struct
{
unsigned char buf[64];
uint32_t state[8];
uint64_t count;
} sha256_context __attribute__((aligned(64)));
void sha256_full( void *hash, const void *data, size_t len );
void sha256_update( sha256_context *ctx, const void *data, size_t len );
void sha256_final( sha256_context *ctx, void *hash );
void sha256_ctx_init( sha256_context *ctx );
void sha256_neon_sha_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
@@ -89,14 +89,6 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y );
// Temporary during name transition
#define sha256_transform_le sha256_neon_sha_transform_le
#define sha256_transform_be sha256_neon_sha_transform_be
#define sha256_2x_transform_le sha256_neon_x2sha_transform_le
#define sha256_2x_transform_be sha256_neon_x2sha_transform_be
#define sha256_prehash_3rounds sha256_neon_sha_prehash_3rounds
#define sha256_2x_final_rounds sha256_neon_x2sha_final_rounds
// generic API
#define sha256_transform_le sha256_neon_sha_transform_le
#define sha256_transform_be sha256_neon_sha_transform_be
@@ -106,9 +98,11 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
#define sha256_2x_final_rounds sha256_neon_x2sha_final_rounds
#else
// without HW acceleration...
#include "sph_sha2.h"
#define sha256_context sph_sha256_context
#define sha256_full sph_sha256_full
#define sha256_ctx_init sph_sha256_init
#define sha256_update sph_sha256
@@ -117,12 +111,11 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
#define sha256_transform_be sph_sha256_transform_be
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#endif
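Whichever branch is selected, the generic single-lane API declared above is the same on x86_64, AArch64 and the sph fallback; a minimal usage sketch (function name and buffers are illustrative only):
static void example_sha256( const void *data, size_t data_len, uint8_t *digest )
{
   sha256_context ctx;
   sha256_ctx_init( &ctx );
   sha256_update( &ctx, data, data_len );   // may be called repeatedly
   sha256_final( &ctx, digest );            // digest receives 32 bytes
   // or the one-shot form:
   // sha256_full( digest, data, data_len );
}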
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// SHA-256 16 way
// SHA-256 16 way x86_64
typedef struct
{
@@ -147,7 +140,7 @@ void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data,
int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target );
#define sha256_16way_context sha256_16x32_context
#define sha256_16way_context sha256_16x32_context
#define sha256_16way_init sha256_16x32_init
#define sha256_16way_update sha256_16x32_update
#define sha256_16way_close sha256_16x32_close
@@ -162,7 +155,7 @@ int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
#if defined (__AVX2__)
// SHA-256 8 way
// SHA-256 8 way x86_64
typedef struct
{
@@ -201,7 +194,7 @@ int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
#endif // AVX2
// SHA-256 4 way
// SHA-256 4 way x86_64 with SSE2 or AArch64 with NEON
typedef struct
{

View File

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA256D_16WAY 1
#elif defined(__SHA__)
#define SHA256D_SHA 1

View File

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA256D_16WAY 1
#elif defined(__SHA__)
#define SHA256D_SHA 1

View File

@@ -6,7 +6,7 @@
#include "sha256-hash.h"
#include "sph_sha2.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA256DT_16X32 1
#elif defined(__x86_64__) && defined(__SHA__)
#define SHA256DT_X86_SHA256 1

View File

@@ -4,7 +4,7 @@
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA256T_16WAY 1
#elif defined(__SHA__)
#define SHA256T_SHA 1

View File

@@ -73,29 +73,10 @@ static const uint64_t K512[80] =
// Experimental. Not tested. Not reviewed. Compile tested only.
// Needs GCC-13 for compilation.
// Needs Intel Lunar lake or Arrow Lake CPU, or AMD Zen-{5,6}? for execution.
// Needs GCC-14 for compilation.
// Needs Intel Lunarlake or Arrowlake CPU, or AMD Zen-6? for execution.
// Modelled after noloader sha256 implementation.
// It's not clear how SHA512 will be supported before AVX10 considering how
// dependent it is on _mm256_alignr_epi64 which is only available with AVX512VL
// until AVX10-256.
#if defined(__AVX512VL__)
#define mm256_alignr_1x64( v1, v0 ) _mm256_alignr_epi64( v1, v0, 1 )
#else
// Ugly workaround to make it work with AVX2
static const __m256i mask __attribute__ ((aligned (32)))
= { 0xffffffffffffffffull, 0ull, 0ull, 0ull };
#define mm256_alignr_1x64( v1, v0 ) \
_mm256_or_si256( _mm256_and_si256( mm256_shuflr_64( v1 ), mask ), \
_mm256_and_si256( mm256_shuflr_64( v0 ), mm256_not(mask) ) );
#endif
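// Note: the workaround above is removed; the updated rounds below call
// mm256_alignr64( v1, v0, c ) instead, which presumably (assumption) carries
// its own AVX2 fallback inside simd-utils, so this file no longer needs one.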
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in )
@@ -109,7 +90,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f,
0x0001020304050607 ) )
0x0001020304050607 ) );
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
@@ -123,153 +104,233 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128 (MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_shuffle2_64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi32( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 16-19
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 20-23
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 24-27
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi32( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 28-31
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG) ;
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 32-35
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 36-39
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 40-43
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 44-47
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 48-51
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 52-55
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 56-59
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 ) ;
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 60-63
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
STATE1 = _mm256_sha512nds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 64-67
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 16 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 68-71
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 17 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
// Rounds 72-75
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 18 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
// Rounds 76-79
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 19 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1,
_mm256_castsi256_si128( MSG ) );
// Add initial state
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
@@ -289,7 +350,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
const uint64_t *state_in )
{
__m256i STATE0, STATE1;
__m256i MSG, TMP, BSWAP64;
__m256i MSG, TMP;
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
__m256i ABEF_SAVE, CDGH_SAVE;
@@ -308,141 +369,190 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
// Rounds 0-3
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_shuffle2_64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi32( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 16-19
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 20-23
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 24-27
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi32( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 28-31
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG) ;
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 32-35
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 36-39
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 40-43
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 44-47
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 48-51
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 52-55
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, _mm256_castsi256_si128( TMSG1 ) );
// Rounds 56-59
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 ) ;
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, _mm256_castsi256_si128( TMSG2 ) );
// Rounds 60-63
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
STATE1 = _mm256_sha512nds2_epi64( STATE1, STATE0, MSG );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, _mm256_castsi256_si128( TMSG3 ) );
// Rounds 64-67
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 16 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG0, TMSG3, 1 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, _mm256_castsi256_si128( TMSG0 ) );
// Rounds 68-71
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 17 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG1, TMSG0, 1 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
// Rounds 72-75
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 18 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
TMP = mm256_alignr64( TMSG2, TMSG1, 1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
// Rounds 76-79
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 19 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, _mm256_castsi256_si128( MSG ) );
// Add initial state
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
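The recurring edit in the two transforms above reflects the operand types of the SHA-512 extension intrinsics: _mm256_sha512rnds2_epi64 takes its message-plus-constant operand as __m128i, and _mm256_sha512msg1_epi64 takes its second operand as __m128i, hence the _mm256_castsi256_si128 wrappers. A minimal sketch of one four-round step, assuming a compiler that provides these intrinsics (e.g. GCC-14 with -msha512); the helper name is illustrative, not from the source:

#include <immintrin.h>

// One four-round step in the style used above: two rounds on the low half of
// the message+K words, move the high half down, then two more rounds.
static inline void sha512_rounds4( __m256i *state0, __m256i *state1,
                                   __m256i msg_plus_k )
{
   *state1 = _mm256_sha512rnds2_epi64( *state1, *state0,
                                       _mm256_castsi256_si128( msg_plus_k ) );
   msg_plus_k = _mm256_permute4x64_epi64( msg_plus_k, 0x0E ); // high words down
   *state0 = _mm256_sha512rnds2_epi64( *state0, *state1,
                                       _mm256_castsi256_si128( msg_plus_k ) );
}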
@@ -462,7 +572,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// SHA-512 8 way 64 bit
@@ -664,8 +774,7 @@ void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data,
mm256_ror_64( x, 61 ), \
_mm256_srli_epi64( x, 6 ) )
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#if defined(VL256)
// 4 way is not used with AVX512 but will be with AVX10_256 when it
// becomes available.
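The SIMD512 and VL256 guards that replace the long __AVX512*__ tests throughout this diff look like convenience macros covering both classic AVX-512 and AVX10 builds. A rough sketch of how they could be derived, assuming GCC-14's __AVX10_1_256__/__AVX10_1_512__ macros (the real definitions live in simd-utils and may differ):

// Hypothetical sketch, not the repository's actual definitions.
// SIMD512: full 512-bit vectors (classic AVX512 F/VL/DQ/BW or AVX10/512).
// VL256:   AVX512VL-style features on 256-bit vectors (AVX512VL or AVX10/256).
#if ( defined(__AVX512F__) && defined(__AVX512VL__) && \
      defined(__AVX512DQ__) && defined(__AVX512BW__) ) || defined(__AVX10_1_512__)
   #define SIMD512 1
#endif
#if defined(SIMD512) || defined(__AVX512VL__) || defined(__AVX10_1_256__)
   #define VL256 1
#endif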
@@ -717,7 +826,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] )
int i;
register __m256i A, B, C, D, E, F, G, H;
#if !defined(__AVX512VL__)
#if !defined(VL256)
// Disable for AVX10_256
__m256i X_xor_Y, Y_xor_Z;
#endif
@@ -754,7 +863,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] )
H = v256_64( 0x5BE0CD19137E2179 );
}
#if !defined(__AVX512VL__)
#if !defined(VL256)
// Disable for AVX10_256
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
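Why the Y_xor_Z temporaries are compiled out when VL256 is defined: without AVX512VL the usual trick computes Maj(X,Y,Z) as Y ^ ((X^Y) & (Y^Z)) and carries X^Y into the next round, whereas AVX512VL/AVX10 can evaluate the whole function with one ternary-logic instruction. Illustrative helpers, not code from the source:

// Maj(X,Y,Z) = Y ^ ((X^Y) & (Y^Z)); X^Y of this round is Y^Z of the next,
// which is what the carried Y_xor_Z variable exploits.
static inline __m256i maj_avx2( __m256i x, __m256i y, __m256i z )
{
   return _mm256_xor_si256( y, _mm256_and_si256( _mm256_xor_si256( x, y ),
                                                 _mm256_xor_si256( y, z ) ) );
}

// With AVX512VL (or AVX10/256) a single instruction suffices: 0xE8 is the
// majority truth table (0x96 would give a three-way XOR).
static inline __m256i maj_vl256( __m256i x, __m256i y, __m256i z )
{
   return _mm256_ternarylogic_epi64( x, y, z, 0xE8 );
}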

View File

@@ -25,7 +25,7 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input,
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// SHA-512 8 way

View File

@@ -4,7 +4,7 @@
#include <string.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SHA512256D_8WAY 1
#elif defined(__AVX2__)
#define SHA512256D_4WAY 1

View File

@@ -34,7 +34,7 @@
#include <string.h>
#include "shabal-hash-4way.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define DECL_STATE16 \
__m512i A0, A1, A2, A3, A4, A5, A6, A7, \

View File

@@ -8,7 +8,7 @@
#define SPH_SIZE_shabal512 512
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct {
__m512i buf[16];

View File

@@ -30,8 +30,7 @@ static const uint32_t IV512[] =
#endif
#if defined (__AVX512VL__)
//TODO Enable for AVX10_256
#if defined (VL256)
#define DECL_m256i_count \
const __m256i count = \

View File

@@ -1,7 +1,7 @@
#include "shavite-hash-4way.h"
#include <stdint.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__) && defined(SIMD512)
static const uint32_t IV512[] =
{

View File

@@ -1,10 +1,10 @@
#ifndef SHAVITE_HASH_4WAY_H__
#define SHAVITE_HASH_4WAY_H__ 1
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#include "simd-utils.h"
#if defined(__VAES__) && defined(SIMD512)
typedef struct {
unsigned char buf[128<<2];
uint32_t h[16<<2];

View File

@@ -803,8 +803,7 @@ static const m256_v16 FFT256_Twiddle[] =
#define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x)
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#if defined(VL256)
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_maskz_mov_epi8( 0x55555555, x ), \
@@ -1500,7 +1499,7 @@ int simd512_2way( void *hashval, const void *data, int datalen )
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
////////////////////////////////////
//

View File

@@ -52,7 +52,7 @@ int simd512_2way( void *hashval, const void *data, int datalen );
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define SKEIN_8WAY 1
#elif defined(__AVX2__)
#define SKEIN_4WAY 1

View File

@@ -298,7 +298,7 @@ static const uint64_t IV512[] = {
sc->bcount = bcount;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), \
@@ -511,7 +511,7 @@ do { \
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
void skein256_8way_init( skein256_8way_context *sc )
{

View File

@@ -44,7 +44,7 @@
#include <stddef.h>
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
typedef struct
{

View File

@@ -687,7 +687,7 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
#undef ADD_SUB
#if defined (__AVX512VL__) && defined(__AVX512BW__)
#if defined(VL256)
#define Q_REDUCE( a ) \
_mm256_sub_epi32( _mm256_maskz_mov_epi8( 0x11111111, a ), \
@@ -1233,7 +1233,7 @@ void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output,
swift_int32_t result[N] __attribute__ ((aligned (64)));
register swift_int16_t carry = 0;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
__m512i *res = (__m512i*)result;
for ( j = 0; j < N/16; ++j )

View File

@@ -152,7 +152,7 @@ void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen )
return md;
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
void sha3_8way_keccakf( __m512i st[25] )
{

View File

@@ -37,7 +37,7 @@ int sha3_4way_final( void *md, sha3_4way_ctx_t *c ); // digest goes to md
void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
// state context
typedef struct

Some files were not shown because too many files have changed in this diff.