This commit is contained in:
Jay D Dee
2019-12-21 13:19:29 -05:00
parent a17ff6f189
commit c65b0ff7a6
72 changed files with 9090 additions and 1336 deletions

View File

@@ -1,12 +1,14 @@
Requirements:
1. Requirements:
---------------
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
A 64-bit Linux operating system is required. Apple is not supported.
Building on linux prerequisites:
2. Building on linux prerequisites:
-----------------------------------
It is assumed users know how to install packages on their system and
are able to compile standard source packages. This is basic Linux and
@@ -20,41 +22,74 @@ http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu
Install any additional dependencies needed by cpuminer-opt. The list below
are some of the ones that may not be in the default install and need to
be installed manually. There may be others, read the error messages they
will give a clue as to the missing package.
be installed manually. There may be others; read the compiler error messages,
they will give a clue as to the missing package.
The following command should install everything you need on Debian based
distributions such as Ubuntu. Fedora and other distributions may have similar
but different package names.
sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev
$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
openssl 1.1.0e or higher. Add one of the following, depending on the
compiler version, to CFLAGS:
"-march=native" or "-march=znver1" or "-msha".
openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
support depending on your CPU and compiler version:
"-march=native" is always the best choice
"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
"-msha" Add SHA to other tuning options
Additional instructions for static compilation can be found here:
https://lxadm.com/Static_compilation_of_cpuminer
Static builds should only be considered in a homogeneous HW and SW environment.
Local builds will always have the best performance and compatibility.
Extract cpuminer source.
3. Download cpuminer-opt
------------------------
tar xvzf cpuminer-opt-x.y.z.tar.gz
cd cpuminer-opt-x.y.z
Download the source code for the latest release from the official repository.
Run ./build.sh to build on Linux or execute the following commands.
https://github.com/JayDDee/cpuminer-opt/releases
./autogen.sh
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
make
Extract the source code.
Start mining.
$ tar xvzf cpuminer-opt-x.y.z.tar.gz
Alternatively it can be cloned from git.
$ git clone https://github.com/JayDDee/cpuminer-opt.git
4. Build cpuminer-opt
---------------------
It is recommended to build with default options; this will usually
produce the best results.
$ ./build.sh
or execute the following commands:
$ ./autogen.sh
$ CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
$ make -j n
n is the number of compile threads, typically the number of CPU cores.
5. Start mining.
----------------
$ ./cpuminer -a algo -o url -u username -p password
./cpuminer -a algo -o url -u username -p password
Windows
-------
See also INSTALL_WINDOWS
The following procedure is obsolete and uses an old compiler.
Precompiled Windows binaries are built on a Linux host using Mingw
with a more recent compiler than the following Windows hosted procedure.

View File

@@ -124,6 +124,8 @@ cpuminer_SOURCES = \
algo/luffa/luffa-hash-2way.c \
algo/lyra2/lyra2.c \
algo/lyra2/sponge.c \
algo/lyra2/sponge-2way.c \
algo/lyra2/lyra2-hash-2way.c \
algo/lyra2/lyra2-gate.c \
algo/lyra2/lyra2rev2.c \
algo/lyra2/lyra2rev2-4way.c \

View File

@@ -1,6 +1,8 @@
cpuminer-opt is a console program run from the command line using the
keyboard, not the mouse.
See also README.md for a list of supported algorithms.
Security warning
----------------
@@ -31,7 +33,21 @@ not supported. FreeBSD YMMV.
Change Log
----------
v3.10.2
v3.10.5
AVX512 for x17, sonoa, xevan, hmq1725, lyra2rev3, lyra2rev2.
Faster hmq1725 AVX2.
v3.10.4
AVX512 for x16r, x16rv2, x16rt, x16s, x16rt-veil (veil).
v3.10.3
AVX512 for x12, x13, x14, x15.
Fixed x12 AVX2 invalid shares.
v3.10.2
AVX512 added for bmw512, c11, phi1612 (phi), qubit, skunk, x11, x11gost (sib).
Fixed c11 AVX2 invalid shares.

View File

@@ -184,9 +184,9 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
#include <immintrin.h>
#define rotr32 mm256_swap32_64
#define rotr24 mm256_ror3x8_64
#define rotr16 mm256_ror1x16_64
#define rotr32( x ) mm256_ror_64( x, 32 )
#define rotr24( x ) mm256_ror_64( x, 24 )
#define rotr16( x ) mm256_ror_64( x, 16 )
#define rotr63( x ) mm256_rol_64( x, 1 )
//#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
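/* Illustrative sketch, not the project's actual macro: a generic 64-bit lane
   rotate such as mm256_ror_64 can be composed from two shifts and an OR with
   plain AVX2.  The real macro may use cheaper byte shuffles for the
   byte-aligned counts (32, 24, 16). */
static inline __m256i ror64_avx2_sketch( __m256i x, const int c )
{
   return _mm256_or_si256( _mm256_srli_epi64( x, c ),
                           _mm256_slli_epi64( x, 64 - c ) );
}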

View File

@@ -70,19 +70,22 @@ typedef struct {
// Default 14 rounds
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *ctx);
void blake256_4way(void *ctx, const void *data, size_t len);
void blake256_4way_update(void *ctx, const void *data, size_t len);
#define blake256_4way blake256_4way_update
void blake256_4way_close(void *ctx, void *dst);
// 14 rounds, blake, decred
typedef blake_4way_small_context blake256r14_4way_context;
void blake256r14_4way_init(void *cc);
void blake256r14_4way(void *cc, const void *data, size_t len);
void blake256r14_4way_update(void *cc, const void *data, size_t len);
#define blake256r14_4way blake256r14_4way_update
void blake256r14_4way_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_4way_small_context blake256r8_4way_context;
void blake256r8_4way_init(void *cc);
void blake256r8_4way(void *cc, const void *data, size_t len);
void blake256r8_4way_update(void *cc, const void *data, size_t len);
#define blake256r8_4way blake256r8_4way_update
void blake256r8_4way_close(void *cc, void *dst);
#ifdef __AVX2__
@@ -100,19 +103,21 @@ typedef struct {
// Default 14 rounds
typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way(void *cc, const void *data, size_t len);
void blake256_8way_update(void *cc, const void *data, size_t len);
#define blake256_8way blake256_8way_update
void blake256_8way_close(void *cc, void *dst);
// 14 rounds, blake, decred
typedef blake_8way_small_context blake256r14_8way_context;
void blake256r14_8way_init(void *cc);
void blake256r14_8way(void *cc, const void *data, size_t len);
void blake256r14_8way_update(void *cc, const void *data, size_t len);
void blake256r14_8way_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_8way_small_context blake256r8_8way_context;
void blake256r8_8way_init(void *cc);
void blake256r8_8way(void *cc, const void *data, size_t len);
void blake256r8_8way_update(void *cc, const void *data, size_t len);
#define blake256r8_8way blake256r8_8way_update
void blake256r8_8way_close(void *cc, void *dst);
// Blake-512 4 way

View File

@@ -634,7 +634,7 @@ do { \
m256_const1_64( 0x082EFA98082EFA98 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
shuf_bswap32 = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
@@ -1184,7 +1184,7 @@ blake256_16way_update(void *cc, const void *data, size_t len)
}
void
blake256_16way_close_update(void *cc, void *dst)
blake256_16way_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
@@ -1259,7 +1259,7 @@ blake256_8way_init(void *cc)
}
void
blake256_8way(void *cc, const void *data, size_t len)
blake256_8way_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
@@ -1279,7 +1279,7 @@ void blake256r14_4way_init(void *cc)
}
void
blake256r14_4way(void *cc, const void *data, size_t len)
blake256r14_4way_update(void *cc, const void *data, size_t len)
{
blake32_4way(cc, data, len);
}
@@ -1298,7 +1298,7 @@ void blake256r14_8way_init(void *cc)
}
void
blake256r14_8way(void *cc, const void *data, size_t len)
blake256r14_8way_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
@@ -1318,7 +1318,7 @@ void blake256r8_4way_init(void *cc)
}
void
blake256r8_4way(void *cc, const void *data, size_t len)
blake256r8_4way_update(void *cc, const void *data, size_t len)
{
blake32_4way(cc, data, len);
}
@@ -1337,7 +1337,7 @@ void blake256r8_8way_init(void *cc)
}
void
blake256r8_8way(void *cc, const void *data, size_t len)
blake256r8_8way_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}

View File

@@ -874,6 +874,57 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
#define DH1L( m, sl, sr, a, b, c ) \
_mm256_add_epi32( \
_mm256_xor_si256( M[m], \
_mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
_mm256_srli_epi32( qt[a], sr ) ) ), \
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
#define DH1R( m, sl, sr, a, b, c ) \
_mm256_add_epi32( \
_mm256_xor_si256( M[m], \
_mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \
_mm256_slli_epi32( qt[a], sr ) ) ), \
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
#define DH2L( m, rl, sl, h, a, b, c ) \
_mm256_add_epi32( _mm256_add_epi32( \
mm256_rol_32( dH[h], rl ), \
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
_mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
_mm256_xor_si256( qt[b], qt[c] ) ) );
#define DH2R( m, rl, sr, h, a, b, c ) \
_mm256_add_epi32( _mm256_add_epi32( \
mm256_rol_32( dH[h], rl ), \
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
_mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
_mm256_xor_si256( qt[b], qt[c] ) ) );
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
dH[ 2] = DH1R( 2, 5, 5, 18, 26, 2 );
dH[ 3] = DH1R( 3, 1, 5, 19, 27, 3 );
dH[ 4] = DH1R( 4, 3, 0, 20, 28, 4 );
dH[ 5] = DH1L( 5, 6, 6, 21, 29, 5 );
dH[ 6] = DH1R( 6, 4, 6, 22, 30, 6 );
dH[ 7] = DH1R( 7, 11, 2, 23, 31, 7 );
dH[ 8] = DH2L( 8, 9, 8, 4, 24, 23, 8 );
dH[ 9] = DH2R( 9, 10, 6, 5, 25, 16, 9 );
dH[10] = DH2L( 10, 11, 6, 6, 26, 17, 10 );
dH[11] = DH2L( 11, 12, 4, 7, 27, 18, 11 );
dH[12] = DH2R( 12, 13, 3, 0, 28, 19, 12 );
dH[13] = DH2R( 13, 14, 4, 1, 29, 20, 13 );
dH[14] = DH2R( 14, 15, 7, 2, 30, 21, 14 );
dH[15] = DH2R( 15, 16, 2, 3, 31, 22, 15 );
#undef DH1L
#undef DH1R
#undef DH2L
#undef DH2R
/*
dH[ 0] = _mm256_add_epi32(
_mm256_xor_si256( M[0],
_mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
@@ -954,6 +1005,7 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
_mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
_mm256_xor_si256( qt[22], qt[15] ) ) );
*/
}
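/* Worked example of the DH1L macro above, per 32-bit lane:
   dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 ) expands to
   dH[ 0] = ( M[0] ^ ( (xh << 5) ^ (qt[16] >> 5) ) ) + ( xl ^ qt[24] ^ qt[0] ),
   matching the first expression in the commented-out original code above. */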
static const __m256i final_s8[16] =

View File

@@ -64,10 +64,10 @@ static void transform_4way( cube_4way_context *sp )
x1 = _mm512_xor_si512( x1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap64_128( x4 );
x5 = mm512_swap64_128( x5 );
x6 = mm512_swap64_128( x6 );
x7 = mm512_swap64_128( x7 );
x4 = mm512_swap128_64( x4 );
x5 = mm512_swap128_64( x5 );
x6 = mm512_swap128_64( x6 );
x7 = mm512_swap128_64( x7 );
x4 = _mm512_add_epi32( x0, x4 );
x5 = _mm512_add_epi32( x1, x5 );
x6 = _mm512_add_epi32( x2, x6 );
@@ -82,10 +82,10 @@ static void transform_4way( cube_4way_context *sp )
x1 = _mm512_xor_si512( x1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap32_64( x4 );
x5 = mm512_swap32_64( x5 );
x6 = mm512_swap32_64( x6 );
x7 = mm512_swap32_64( x7 );
x4 = mm512_swap64_32( x4 );
x5 = mm512_swap64_32( x5 );
x6 = mm512_swap64_32( x6 );
x7 = mm512_swap64_32( x7 );
}
_mm512_store_si512( (__m512i*)sp->h, x0 );
@@ -239,10 +239,10 @@ static void transform_2way( cube_2way_context *sp )
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap64_128( x4 );
x5 = mm256_swap64_128( x5 );
x6 = mm256_swap64_128( x6 );
x7 = mm256_swap64_128( x7 );
x4 = mm256_swap128_64( x4 );
x5 = mm256_swap128_64( x5 );
x6 = mm256_swap128_64( x6 );
x7 = mm256_swap128_64( x7 );
x4 = _mm256_add_epi32( x0, x4 );
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
@@ -257,10 +257,10 @@ static void transform_2way( cube_2way_context *sp )
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap32_64( x4 );
x5 = mm256_swap32_64( x5 );
x6 = mm256_swap32_64( x6 );
x7 = mm256_swap32_64( x7 );
x4 = mm256_swap64_32( x4 );
x5 = mm256_swap64_32( x5 );
x6 = mm256_swap64_32( x6 );
x7 = mm256_swap64_32( x7 );
}
_mm256_store_si256( (__m256i*)sp->h, x0 );
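/* Naming note with assumed AVX2 equivalents (the project's own swap macros
   are presumably defined in its simd utility headers): mm256_swap128_64 swaps
   the two 64-bit halves inside each 128-bit lane, and mm256_swap64_32 swaps
   the two 32-bit halves inside each 64-bit element, e.g.
      swap128_64(x) ~ _mm256_shuffle_epi32( x, 0x4e )
      swap64_32(x)  ~ _mm256_shuffle_epi32( x, 0xb1 )                        */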

View File

@@ -39,8 +39,8 @@ static void transform( cubehashParam *sp )
x1 = mm256_rol_32( y0, 7 );
x0 = _mm256_xor_si256( x0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = mm256_swap64_128( x2 );
x3 = mm256_swap64_128( x3 );
x2 = mm256_swap128_64( x2 );
x3 = mm256_swap128_64( x3 );
x2 = _mm256_add_epi32( x0, x2 );
x3 = _mm256_add_epi32( x1, x3 );
y0 = mm256_swap_128( x0 );
@@ -49,8 +49,8 @@ static void transform( cubehashParam *sp )
x1 = mm256_rol_32( y1, 11 );
x0 = _mm256_xor_si256( x0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = mm256_swap32_64( x2 );
x3 = mm256_swap32_64( x3 );
x2 = mm256_swap64_32( x2 );
x3 = mm256_swap64_32( x3 );
}
_mm256_store_si256( (__m256i*)sp->x, x0 );

View File

@@ -528,6 +528,346 @@ static const sph_u32 T512[64][16] = {
SPH_C32(0xe7e00a94) }
};
#define s0 m0
#define s1 c0
#define s2 m1
#define s3 c1
#define s4 c2
#define s5 m2
#define s6 c3
#define s7 m3
#define s8 m4
#define s9 c4
#define sA m5
#define sB c5
#define sC c6
#define sD m6
#define sE c7
#define sF m7
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Hamsi 8 way
#define INPUT_BIG8 \
do { \
__m512i db = *buf; \
const uint64_t *tp = (uint64_t*)&T512[0][0]; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
__m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
dm = mm512_negate_32( _mm512_or_si512( dm, \
_mm512_slli_epi64( dm, 32 ) ) ); \
m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \
m512_const1_64( tp[0] ) ) ); \
m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \
m512_const1_64( tp[1] ) ) ); \
m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \
m512_const1_64( tp[2] ) ) ); \
m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \
m512_const1_64( tp[3] ) ) ); \
m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \
m512_const1_64( tp[4] ) ) ); \
m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \
m512_const1_64( tp[5] ) ) ); \
m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \
m512_const1_64( tp[6] ) ) ); \
m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \
m512_const1_64( tp[7] ) ) ); \
tp += 8; \
db = _mm512_srli_epi64( db, 1 ); \
} \
} while (0)
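/* Scalar sketch (illustrative names only) of what INPUT_BIG8 computes per
   64-bit message lane: every set bit u selects row u of T512 (8 64-bit words)
   and XORs it into m0..m7.  m[] is assumed zeroed by the caller, as in the
   macro. */
static inline void hamsi_input_sketch( uint64_t m[8], uint64_t db,
                                       const uint64_t *tp )
{
   for ( int u = 0; u < 64; u++ )
   {
      const uint64_t mask = (uint64_t)0 - ( db & 1 );  // all ones if bit set
      for ( int k = 0; k < 8; k++ )
         m[k] ^= mask & tp[k];
      tp += 8;
      db >>= 1;
   }
}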
#define SBOX8( a, b, c, d ) \
do { \
__m512i t; \
t = a; \
a = _mm512_and_si512( a, c ); \
a = _mm512_xor_si512( a, d ); \
c = _mm512_xor_si512( c, b ); \
c = _mm512_xor_si512( c, a ); \
d = _mm512_or_si512( d, t ); \
d = _mm512_xor_si512( d, b ); \
t = _mm512_xor_si512( t, c ); \
b = d; \
d = _mm512_or_si512( d, t ); \
d = _mm512_xor_si512( d, a ); \
a = _mm512_and_si512( a, b ); \
t = _mm512_xor_si512( t, a ); \
b = _mm512_xor_si512( b, d ); \
b = _mm512_xor_si512( b, t ); \
a = c; \
c = b; \
b = d; \
d = mm512_not( t ); \
} while (0)
#define L8( a, b, c, d ) \
do { \
a = mm512_rol_32( a, 13 ); \
c = mm512_rol_32( c, 3 ); \
b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \
d = _mm512_xor_si512( d, _mm512_xor_si512( c, \
_mm512_slli_epi32( a, 3 ) ) ); \
b = mm512_rol_32( b, 1 ); \
d = mm512_rol_32( d, 7 ); \
a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \
c = _mm512_xor_si512( c, _mm512_xor_si512( d, \
_mm512_slli_epi32( b, 7 ) ) ); \
a = mm512_rol_32( a, 5 ); \
c = mm512_rol_32( c, 22 ); \
} while (0)
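/* Scalar view of the L8 diffusion step above, per 32-bit lane:
      a = rol32( a, 13 );  c = rol32( c, 3 );
      b ^= a ^ c;          d ^= c ^ ( a << 3 );
      b = rol32( b, 1 );   d = rol32( d, 7 );
      a ^= b ^ d;          c ^= d ^ ( b << 7 );
      a = rol32( a, 5 );   c = rol32( c, 22 );                               */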
#define DECL_STATE_BIG8 \
__m512i c0, c1, c2, c3, c4, c5, c6, c7; \
#define READ_STATE_BIG8(sc) \
do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
} while (0)
#define WRITE_STATE_BIG8(sc) \
do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
} while (0)
#define ROUND_BIG8(rc, alpha) \
do { \
__m512i t0, t1, t2, t3; \
s0 = _mm512_xor_si512( s0, m512_const1_64( \
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
\
SBOX8( s0, s4, s8, sC ); \
SBOX8( s1, s5, s9, sD ); \
SBOX8( s2, s6, sA, sE ); \
SBOX8( s3, s7, sB, sF ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \
_mm512_bslli_epi128( s5, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \
_mm512_bslli_epi128( sE, 4 ) ); \
L8( s0, t1, s9, t3 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \
s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
_mm512_bslli_epi128( s6, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \
_mm512_bslli_epi128( sF, 4 ) ); \
L8( s1, t1, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \
sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \
_mm512_bslli_epi128( s7, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \
_mm512_bslli_epi128( sC, 4 ) ); \
L8( s2, t1, sB, t3 ); \
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \
sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \
_mm512_bslli_epi128( s4, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \
_mm512_bslli_epi128( sD, 4 ) ); \
L8( s3, t1, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \
s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \
_mm512_bslli_epi128( sB, 4 ) ); \
L8( t0, t1, t2, t3 ); \
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
_mm512_bslli_epi128( sD, 4 ) ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
L8( t0, t1, t2, t3 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
} while (0)
#define P_BIG8 \
do { \
ROUND_BIG8(0, alpha_n); \
ROUND_BIG8(1, alpha_n); \
ROUND_BIG8(2, alpha_n); \
ROUND_BIG8(3, alpha_n); \
ROUND_BIG8(4, alpha_n); \
ROUND_BIG8(5, alpha_n); \
} while (0)
#define PF_BIG8 \
do { \
ROUND_BIG8( 0, alpha_f); \
ROUND_BIG8( 1, alpha_f); \
ROUND_BIG8( 2, alpha_f); \
ROUND_BIG8( 3, alpha_f); \
ROUND_BIG8( 4, alpha_f); \
ROUND_BIG8( 5, alpha_f); \
ROUND_BIG8( 6, alpha_f); \
ROUND_BIG8( 7, alpha_f); \
ROUND_BIG8( 8, alpha_f); \
ROUND_BIG8( 9, alpha_f); \
ROUND_BIG8(10, alpha_f); \
ROUND_BIG8(11, alpha_f); \
} while (0)
#define T_BIG8 \
do { /* order is important */ \
c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \
c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \
c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \
c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \
c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \
c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \
c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \
c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \
} while (0)
void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
{
DECL_STATE_BIG8
uint32_t tmp = num << 6;
sc->count_low = SPH_T32( sc->count_low + tmp );
sc->count_high += (sph_u32)( (num >> 13) >> 13 );
if ( sc->count_low < tmp )
sc->count_high++;
READ_STATE_BIG8( sc );
while ( num-- > 0 )
{
__m512i m0, m1, m2, m3, m4, m5, m6, m7;
INPUT_BIG8;
P_BIG8;
T_BIG8;
buf++;
}
WRITE_STATE_BIG8( sc );
}
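/* The two 32-bit counters updated above track the total number of bits
   absorbed; each 8-byte block adds 64 bits.  Equivalent 64-bit view (sketch):
      uint64_t bits = ( (uint64_t)sc->count_high << 32 ) | sc->count_low;
      bits += (uint64_t)num << 6;                                            */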
void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
{
__m512i m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_BIG8
READ_STATE_BIG8( sc );
INPUT_BIG8;
PF_BIG8;
T_BIG8;
WRITE_STATE_BIG8( sc );
}
void hamsi512_8way_init( hamsi_8way_big_context *sc )
{
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
sc->h[0] = m512_const1_64( 0x6c70617273746565 );
sc->h[1] = m512_const1_64( 0x656e62656b204172 );
sc->h[2] = m512_const1_64( 0x302c206272672031 );
sc->h[3] = m512_const1_64( 0x3434362c75732032 );
sc->h[4] = m512_const1_64( 0x3030312020422d33 );
sc->h[5] = m512_const1_64( 0x656e2d484c657576 );
sc->h[6] = m512_const1_64( 0x6c65652c65766572 );
sc->h[7] = m512_const1_64( 0x6769756d2042656c );
}
void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
hamsi_8way_big( sc, vdata, len>>3 );
vdata += ( (len& ~(size_t)7) >> 3 );
len &= (size_t)7;
memcpy_512( sc->buf, vdata, len>>3 );
sc->partial_len = len;
}
void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
{
__m512i pad[1];
int ch, cl;
sph_enc32be( &ch, sc->count_high );
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
pad[0] = _mm512_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch,
cl, ch, cl, ch, cl, ch, cl, ch );
// pad[0] = m512_const2_32( cl, ch );
sc->buf[0] = m512_const1_64( 0x80 );
hamsi_8way_big( sc, sc->buf, 1 );
hamsi_8way_big_final( sc, pad );
mm512_block_bswap_32( (__m512i*)dst, sc->h );
}
#endif // AVX512
// Hamsi 4 way
#define INPUT_BIG \
do { \
@@ -627,6 +967,7 @@ do { \
sc->h[0x7] = c7; \
} while (0)
/*
#define s0 m0
#define s1 c0
#define s2 m1
@@ -643,42 +984,28 @@ do { \
#define sD m6
#define sE c7
#define sF m7
*/
#define ROUND_BIG(rc, alpha) \
do { \
__m256i t0, t1, t2, t3; \
s0 = _mm256_xor_si256( s0, m256_const1_64( \
( ( (uint64_t)( (rc) ^ alpha[1] ) << 32 ) ) | (uint64_t)alpha[0] ) ); \
s1 = _mm256_xor_si256( s1, m256_const1_64( \
( (uint64_t)alpha[ 3] << 32 ) | (uint64_t)alpha[ 2] ) ); \
s2 = _mm256_xor_si256( s2, m256_const1_64( \
( (uint64_t)alpha[ 5] << 32 ) | (uint64_t)alpha[ 4] ) ); \
s3 = _mm256_xor_si256( s3, m256_const1_64( \
( (uint64_t)alpha[ 7] << 32 ) | (uint64_t)alpha[ 6] ) ); \
s4 = _mm256_xor_si256( s4, m256_const1_64( \
( (uint64_t)alpha[ 9] << 32 ) | (uint64_t)alpha[ 8] ) ); \
s5 = _mm256_xor_si256( s5, m256_const1_64( \
( (uint64_t)alpha[11] << 32 ) | (uint64_t)alpha[10] ) ); \
s6 = _mm256_xor_si256( s6, m256_const1_64( \
( (uint64_t)alpha[13] << 32 ) | (uint64_t)alpha[12] ) ); \
s7 = _mm256_xor_si256( s7, m256_const1_64( \
( (uint64_t)alpha[15] << 32 ) | (uint64_t)alpha[14] ) ); \
s8 = _mm256_xor_si256( s8, m256_const1_64( \
( (uint64_t)alpha[17] << 32 ) | (uint64_t)alpha[16] ) ); \
s9 = _mm256_xor_si256( s9, m256_const1_64( \
( (uint64_t)alpha[19] << 32 ) | (uint64_t)alpha[18] ) ); \
sA = _mm256_xor_si256( sA, m256_const1_64( \
( (uint64_t)alpha[21] << 32 ) | (uint64_t)alpha[20] ) ); \
sB = _mm256_xor_si256( sB, m256_const1_64( \
( (uint64_t)alpha[23] << 32 ) | (uint64_t)alpha[22] ) ); \
sC = _mm256_xor_si256( sC, m256_const1_64( \
( (uint64_t)alpha[25] << 32 ) | (uint64_t)alpha[24] ) ); \
sD = _mm256_xor_si256( sD, m256_const1_64( \
( (uint64_t)alpha[27] << 32 ) | (uint64_t)alpha[26] ) ); \
sE = _mm256_xor_si256( sE, m256_const1_64( \
( (uint64_t)alpha[29] << 32 ) | (uint64_t)alpha[28] ) ); \
sF = _mm256_xor_si256( sF, m256_const1_64( \
( (uint64_t)alpha[31] << 32 ) | (uint64_t)alpha[30] ) ); \
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
\
SBOX( s0, s4, s8, sC ); \
SBOX( s1, s5, s9, sD ); \

View File

@@ -60,9 +60,32 @@ typedef struct {
typedef hamsi_4way_big_context hamsi512_4way_context;
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
size_t len );
#define hamsi512_4way hamsi512_4way_update
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
__m512i h[8];
__m512i buf[1];
size_t partial_len;
sph_u32 count_high, count_low;
} hamsi_8way_big_context;
typedef hamsi_8way_big_context hamsi512_8way_context;
void hamsi512_8way_init( hamsi512_8way_context *sc );
void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
size_t len );
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
#endif
#ifdef __cplusplus
}
#endif

View File

@@ -0,0 +1,115 @@
/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
/*
* Helper code, included (three times !) by HAVAL implementation.
*
* TODO: try to merge this with md_helper.c.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#undef SPH_XCAT
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
#undef SPH_XCAT_
#define SPH_XCAT_(a, b) a ## b
static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_update)
( haval_8way_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
unsigned current;
current = (unsigned)sc->count_low & 127U;
while ( len > 0 )
{
unsigned clen;
uint32_t clow, clow2;
clen = 128U - current;
if ( clen > len )
clen = len;
memcpy_256( sc->buf + (current>>2), vdata, clen>>2 );
vdata += clen>>2;
current += clen;
len -= clen;
if ( current == 128U )
{
DSTATE_8W;
IN_PREPARE_8W(sc->buf);
RSTATE_8W;
SPH_XCAT(CORE_8W, PASSES)(INW_8W);
WSTATE_8W;
current = 0;
}
clow = sc->count_low;
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high ++;
}
}
static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
void *dst)
{
unsigned current;
DSTATE_8W;
current = (unsigned)sc->count_low & 127UL;
sc->buf[ current>>2 ] = m256_one_32;
current += 4;
RSTATE_8W;
if ( current > 116UL )
{
memset_zero_256( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
do
{
IN_PREPARE_8W(sc->buf);
SPH_XCAT(CORE_8W, PASSES)(INW_8W);
} while (0);
current = 0;
}
uint32_t t1, t2;
memset_zero_256( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
t1 = 0x01 | (PASSES << 3);
t2 = sc->olen << 3;
sc->buf[ 116>>2 ] = _mm256_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
sc->buf[ 120>>2 ] = _mm256_set1_epi32( sc->count_low << 3 );
sc->buf[ 124>>2 ] = _mm256_set1_epi32( (sc->count_high << 3)
| (sc->count_low >> 29) );
do
{
IN_PREPARE_8W(sc->buf);
SPH_XCAT(CORE_8W, PASSES)(INW_8W);
} while (0);
WSTATE_8W;
haval_8way_out( sc, dst );
}
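/* Worked example of the tail encoding above for HAVAL-256/5 (PASSES = 5,
   olen = 8 32-bit words): t1 = 0x01 | (5 << 3) = 0x29 and t2 = 8 << 3 = 0x40,
   so the word written at byte offset 116 is (0x29 << 16) | (0x40 << 24)
   = 0x40290000, followed by the 64-bit message bit count in the last two
   words of the block. */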

View File

@@ -40,7 +40,7 @@
#include <string.h>
#include "haval-hash-4way.h"
// won't compile with sse4.2
// won't compile with sse4.2; not a problem, it's only used with AVX2 4 way.
//#if defined (__SSE4_2__)
#if defined(__AVX__)
@@ -518,6 +518,301 @@ do { \
#define INMSG(i) msg[i]
#if defined(__AVX2__)
// Haval-256 8 way 32 bit avx2
#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( x0, \
_mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \
_mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
_mm256_and_si256( x3, x6 ) ) ) ) \
#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_and_si256( x2, \
_mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \
_mm256_xor_si256( _mm256_and_si256( x4, x5 ), \
_mm256_xor_si256( x6, x0 ) ) ) ), \
_mm256_xor_si256( \
_mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \
_mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \
#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_and_si256( x3, \
_mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
_mm256_xor_si256( x6, x0 ) ) ), \
_mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \
_mm256_and_si256( x2, x5 ) ), x0 ) )
#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_xor_si256( \
_mm256_and_si256( x3, \
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
_mm256_or_si256( x4, x6 ) ), x5 ) ), \
_mm256_and_si256( x4, \
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \
_mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \
_mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) )
#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_and_si256( x0, \
mm256_not( _mm256_xor_si256( \
_mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
_mm256_and_si256( x2, x5 ) ), \
_mm256_and_si256( x3, x6 ) ) )
#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
F1_8W(x1, x0, x3, x5, x6, x2, x4)
#define FP3_2_8W(x6, x5, x4, x3, x2, x1, x0) \
F2_8W(x4, x2, x1, x0, x5, x3, x6)
#define FP3_3_8W(x6, x5, x4, x3, x2, x1, x0) \
F3_8W(x6, x1, x2, x3, x4, x5, x0)
#define FP4_1_8W(x6, x5, x4, x3, x2, x1, x0) \
F1_8W(x2, x6, x1, x4, x5, x3, x0)
#define FP4_2_8W(x6, x5, x4, x3, x2, x1, x0) \
F2_8W(x3, x5, x2, x0, x1, x6, x4)
#define FP4_3_8W(x6, x5, x4, x3, x2, x1, x0) \
F3_8W(x1, x4, x3, x6, x0, x2, x5)
#define FP4_4_8W(x6, x5, x4, x3, x2, x1, x0) \
F4_8W(x6, x4, x0, x5, x2, x1, x3)
#define FP5_1_8W(x6, x5, x4, x3, x2, x1, x0) \
F1_8W(x3, x4, x1, x0, x5, x2, x6)
#define FP5_2_8W(x6, x5, x4, x3, x2, x1, x0) \
F2_8W(x6, x2, x1, x0, x3, x4, x5)
#define FP5_3_8W(x6, x5, x4, x3, x2, x1, x0) \
F3_8W(x2, x6, x0, x4, x3, x1, x5)
#define FP5_4_8W(x6, x5, x4, x3, x2, x1, x0) \
F4_8W(x1, x5, x3, x2, x0, x4, x6)
#define FP5_5_8W(x6, x5, x4, x3, x2, x1, x0) \
F5_8W(x2, x5, x0, x6, x4, x3, x1)
#define STEP_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
do { \
__m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
mm256_ror_32( x7, 11 ) ), \
_mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
} while (0)
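/* Scalar view of one HAVAL step as performed by STEP_8W, per 32-bit lane:
      t  = Fn_p( x6, x5, x4, x3, x2, x1, x0 );   // pass dependent function
      x7 = ror32( t, 7 ) + ror32( x7, 11 ) + w + c;
   where w is the message word and c the round constant. */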
#define PASS1_8W(n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7), SPH_C32(0x00000000)); \
} \
} while (0)
#define PASSG_8W(p, n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP_8W(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
in(MP ## p[pass_count + 0]), \
RK ## p[pass_count + 0]); \
STEP_8W(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
in(MP ## p[pass_count + 1]), \
RK ## p[pass_count + 1]); \
STEP_8W(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
in(MP ## p[pass_count + 2]), \
RK ## p[pass_count + 2]); \
STEP_8W(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
in(MP ## p[pass_count + 3]), \
RK ## p[pass_count + 3]); \
STEP_8W(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
in(MP ## p[pass_count + 4]), \
RK ## p[pass_count + 4]); \
STEP_8W(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
in(MP ## p[pass_count + 5]), \
RK ## p[pass_count + 5]); \
STEP_8W(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
in(MP ## p[pass_count + 6]), \
RK ## p[pass_count + 6]); \
STEP_8W(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
in(MP ## p[pass_count + 7]), \
RK ## p[pass_count + 7]); \
} \
} while (0)
#define PASS2_8W(n, in) PASSG_8W(2, n, in)
#define PASS3_8W(n, in) PASSG_8W(3, n, in)
#define PASS4_8W(n, in) PASSG_8W(4, n, in)
#define PASS5_8W(n, in) PASSG_8W(5, n, in)
#define SAVE_STATE_8W \
__m256i u0, u1, u2, u3, u4, u5, u6, u7; \
do { \
u0 = s0; \
u1 = s1; \
u2 = s2; \
u3 = s3; \
u4 = s4; \
u5 = s5; \
u6 = s6; \
u7 = s7; \
} while (0)
#define UPDATE_STATE_8W \
do { \
s0 = _mm256_add_epi32( s0, u0 ); \
s1 = _mm256_add_epi32( s1, u1 ); \
s2 = _mm256_add_epi32( s2, u2 ); \
s3 = _mm256_add_epi32( s3, u3 ); \
s4 = _mm256_add_epi32( s4, u4 ); \
s5 = _mm256_add_epi32( s5, u5 ); \
s6 = _mm256_add_epi32( s6, u6 ); \
s7 = _mm256_add_epi32( s7, u7 ); \
} while (0)
#define CORE_8W5(in) do { \
SAVE_STATE_8W; \
PASS1_8W(5, in); \
PASS2_8W(5, in); \
PASS3_8W(5, in); \
PASS4_8W(5, in); \
PASS5_8W(5, in); \
UPDATE_STATE_8W; \
} while (0)
#define DSTATE_8W __m256i s0, s1, s2, s3, s4, s5, s6, s7
#define RSTATE_8W \
do { \
s0 = sc->s0; \
s1 = sc->s1; \
s2 = sc->s2; \
s3 = sc->s3; \
s4 = sc->s4; \
s5 = sc->s5; \
s6 = sc->s6; \
s7 = sc->s7; \
} while (0)
#define WSTATE_8W \
do { \
sc->s0 = s0; \
sc->s1 = s1; \
sc->s2 = s2; \
sc->s3 = s3; \
sc->s4 = s4; \
sc->s5 = s5; \
sc->s6 = s6; \
sc->s7 = s7; \
} while (0)
static void
haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
{
sc->s0 = m256_const1_32( 0x243F6A88UL );
sc->s1 = m256_const1_32( 0x85A308D3UL );
sc->s2 = m256_const1_32( 0x13198A2EUL );
sc->s3 = m256_const1_32( 0x03707344UL );
sc->s4 = m256_const1_32( 0xA4093822UL );
sc->s5 = m256_const1_32( 0x299F31D0UL );
sc->s6 = m256_const1_32( 0x082EFA98UL );
sc->s7 = m256_const1_32( 0xEC4E6C89UL );
sc->olen = olen;
sc->passes = passes;
sc->count_high = 0;
sc->count_low = 0;
}
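/* The eight constants above are HAVAL's standard initial values (the first
   words of the fractional part of pi), broadcast across all eight lanes. */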
#define IN_PREPARE_8W(indata) const __m256i *const load_ptr_8w = (indata)
#define INW_8W(i) load_ptr_8w[ i ]
static void
haval_8way_out( haval_8way_context *sc, void *dst )
{
__m256i *buf = (__m256i*)dst;
DSTATE_8W;
RSTATE_8W;
buf[0] = s0;
buf[1] = s1;
buf[2] = s2;
buf[3] = s3;
buf[4] = s4;
buf[5] = s5;
buf[6] = s6;
buf[7] = s7;
}
#undef PASSES
#define PASSES 5
#include "haval-8way-helper.c"
#define API_8W(xxx, y) \
void \
haval ## xxx ## _ ## y ## _8way_init(void *cc) \
{ \
haval_8way_init(cc, xxx >> 5, y); \
} \
\
void \
haval ## xxx ## _ ## y ## _8way_update (void *cc, const void *data, size_t len) \
{ \
haval ## y ## _8way_update(cc, data, len); \
} \
\
void \
haval ## xxx ## _ ## y ## _8way_close(void *cc, void *dst) \
{ \
haval ## y ## _8way_close(cc, dst); \
} \
API_8W(256, 5)
#define RVAL_8W \
do { \
s0 = val[0]; \
s1 = val[1]; \
s2 = val[2]; \
s3 = val[3]; \
s4 = val[4]; \
s5 = val[5]; \
s6 = val[6]; \
s7 = val[7]; \
} while (0)
#define WVAL_8W \
do { \
val[0] = s0; \
val[1] = s1; \
val[2] = s2; \
val[3] = s3; \
val[4] = s4; \
val[5] = s5; \
val[6] = s6; \
val[7] = s7; \
} while (0)
#define INMSG_8W(i) msg[i]
#endif // AVX2
#ifdef __cplusplus
}
#endif

View File

@@ -59,7 +59,7 @@
*/
#ifndef HAVAL_HASH_4WAY_H__
#define HAVAL_HASH_4WAY_H__
#define HAVAL_HASH_4WAY_H__ 1
#if defined(__AVX__)
@@ -84,10 +84,30 @@ typedef haval_4way_context haval256_5_4way_context;
void haval256_5_4way_init( void *cc );
void haval256_5_4way( void *cc, const void *data, size_t len );
void haval256_5_4way_update( void *cc, const void *data, size_t len );
#define haval256_5_4way haval256_5_4way_update
void haval256_5_4way_close( void *cc, void *dst );
#if defined(__AVX2__)
typedef struct {
__m256i buf[32];
__m256i s0, s1, s2, s3, s4, s5, s6, s7;
unsigned olen, passes;
uint32_t count_high, count_low;
} haval_8way_context __attribute__ ((aligned (64)));
typedef haval_8way_context haval256_5_8way_context;
void haval256_5_8way_init( void *cc );
void haval256_5_8way_update( void *cc, const void *data, size_t len );
void haval256_5_8way_close( void *cc, void *dst );
#endif // AVX2
#ifdef __cplusplus
}
#endif

View File

@@ -44,8 +44,13 @@ bool lyra2rev3_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int size = ROW_LEN_BYTES * 4; // nRows;
int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
#if defined(LYRA2REV3_16WAY)
// l2v3_wholeMatrix = _mm_malloc( 2*size, 128 );
l2v3_wholeMatrix = _mm_malloc( 2*size, 64 );
init_lyra2rev3_16way_ctx();;
#else
l2v3_wholeMatrix = _mm_malloc( size, 64 );
#if defined (LYRA2REV3_8WAY)
init_lyra2rev3_8way_ctx();;
@@ -53,13 +58,17 @@ bool lyra2rev3_thread_init()
init_lyra2rev3_4way_ctx();;
#else
init_lyra2rev3_ctx();
#endif
#endif
return l2v3_wholeMatrix;
}
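/* Worked size example, assuming BLOCK_LEN_INT64 is 12 as in the reference
   Lyra2 code: ROW_LEN_INT64 = 12 * 4 = 48, ROW_LEN_BYTES = 384, so a 4 row
   matrix is 1536 bytes per lane.  The 16-way build allocates 2 * size because
   the low level code works on 2-way interleaved rows (see the
   2 * ROW_LEN_INT64 indexing in lyra2.c below). */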
bool register_lyra2rev3_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV3_8WAY)
#if defined(LYRA2REV3_16WAY)
gate->scanhash = (void*)&scanhash_lyra2rev3_16way;
gate->hash = (void*)&lyra2rev3_16way_hash;
#elif defined (LYRA2REV3_8WAY)
gate->scanhash = (void*)&scanhash_lyra2rev3_8way;
gate->hash = (void*)&lyra2rev3_8way_hash;
#elif defined (LYRA2REV3_4WAY)
@@ -69,7 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2rev3;
gate->hash = (void*)&lyra2rev3_hash;
#endif
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
opt_target_factor = 256.0;
return true;
@@ -85,10 +94,14 @@ bool lyra2rev2_thread_init()
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
#if defined (LYRA2REV2_8WAY)
l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 ); // 2 way
init_lyra2rev2_8way_ctx();;
#elif defined (LYRA2REV2_4WAY)
l2v2_wholeMatrix = _mm_malloc( size, 64 );
#if defined (LYRA2REV2_4WAY)
init_lyra2rev2_4way_ctx();;
#else
l2v2_wholeMatrix = _mm_malloc( size, 64 );
init_lyra2rev2_ctx();
#endif
return l2v2_wholeMatrix;
@@ -96,14 +109,17 @@ bool lyra2rev2_thread_init()
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV2_4WAY)
#if defined (LYRA2REV2_8WAY)
gate->scanhash = (void*)&scanhash_lyra2rev2_8way;
gate->hash = (void*)&lyra2rev2_8way_hash;
#elif defined (LYRA2REV2_4WAY)
gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
gate->hash = (void*)&lyra2rev2_4way_hash;
#else
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
opt_target_factor = 256.0;
return true;

View File

@@ -5,18 +5,27 @@
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX2__)
#define LYRA2REV3_8WAY
#endif
#if defined(__SSE2__)
#define LYRA2REV3_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2REV3_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV3_8WAY 1
#elif defined(__SSE2__)
#define LYRA2REV3_4WAY 1
#endif
extern __thread uint64_t* l2v3_wholeMatrix;
bool register_lyra2rev3_algo( algo_gate_t* gate );
#if defined(LYRA2REV3_8WAY)
#if defined(LYRA2REV3_16WAY)
void lyra2rev3_16way_hash( void *state, const void *input );
int scanhash_lyra2rev3_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev3_16way_ctx();
#elif defined(LYRA2REV3_8WAY)
void lyra2rev3_8way_hash( void *state, const void *input );
int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
@@ -41,15 +50,24 @@ bool init_lyra2rev3_ctx();
//////////////////////////////////
#if defined(__AVX2__)
#define LYRA2REV2_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2REV2_8WAY 1
#elif defined(__AVX2__)
#define LYRA2REV2_4WAY 1
#endif
extern __thread uint64_t* l2v2_wholeMatrix;
bool register_lyra2rev2_algo( algo_gate_t* gate );
#if defined(LYRA2REV2_4WAY)
#if defined(LYRA2REV2_8WAY)
void lyra2rev2_8way_hash( void *state, const void *input );
int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev2_8way_ctx();
#elif defined(LYRA2REV2_4WAY)
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,

View File

@@ -26,6 +26,19 @@
#include "lyra2.h"
#include "sponge.h"
// LYRA2RE 8 cols 8 rows used by lyra2re, allium, phi2, x22i, x25x.
//
// LYRA2REV2 4 cols 4 rows used by lyra2rev2.
//
// LYRA2REV3 4 cols 4 rows with an extra twist in calculating
// rowa in the wandering phase. Used by lyra2rev3.
//
// LYRA2Z various cols & rows and supports 80 byte input. Used by lyra2z,
// lyra2z330, lyra2h.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
/**
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
* whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
@@ -46,176 +59,137 @@
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
*/
int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
// For lyra2rev3.
// Convert a simple offset to an index into 2-way interleaved data.
// Good for the state and the 4 row matrix.
// index = ( ( off / 4 ) * 8 ) + ( off mod 4 )
#define offset_to_index( o ) \
( ( ( (uint64_t)( (o) & 0xf) / 4 ) * 8 ) + ( (o) % 4 ) )
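/* Worked examples of offset_to_index for 2-way interleaved data, where each
   4-word (256-bit) block of lane 0 is followed by the matching block of
   lane 1:  offset 3 -> (0*8)+3 = 3,  offset 5 -> (1*8)+1 = 9,
   offset 13 -> (3*8)+1 = 25. */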
int LYRA2REV2_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
const void *pwd, const uint64_t pwdlen, const uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
// int64_t i; //auxiliary iteration counter
int64_t v64; // 64bit var for memcpy
uint64_t _ALIGN(256) state[32];
int64_t row = 2;
int64_t prev = 1;
int64_t rowa0 = 0;
int64_t rowa1 = 0;
int64_t tau;
int64_t step = 1;
int64_t window = 2;
int64_t gap = 1;
//====================================================================/
//=== Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// for Lyra2REv2, nCols = 4, v1 was using 8
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
uint64_t *ptrWord = wholeMatrix;
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
//=== Getting the password + salt + basil padded with 10*1 ==========//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
byte *ptrByte = (byte*) wholeMatrix;
uint64_t *ptr = wholeMatrix;
uint64_t *pw = (uint64_t*)pwd;
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
memcpy( ptr, pw, 2*pwdlen ); // password
ptr += pwdlen>>2;
memcpy( ptr, pw, 2*pwdlen ); // password lane 1
ptr += pwdlen>>2;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
// now build the rest interleaving on the fly.
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
- (saltlen + pwdlen) );
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = pwdlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = saltlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = timeCost;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nRows;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nCols;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
// from here on it's all simd access to state and matrix
// define vector pointers and adjust sizes and pointer offsets
//================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
// initState( state );
//========================= Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptr[0] = ptr[ 4] = kLen;
ptr[1] = ptr[ 5] = pwdlen;
ptr[2] = ptr[ 6] = pwdlen; // saltlen
ptr[3] = ptr[ 7] = timeCost;
ptr[8] = ptr[12] = nRows;
ptr[9] = ptr[13] = nCols;
ptr[10] = ptr[14] = 0x80;
ptr[11] = ptr[15] = 0x0100000000000000;
ptrWord = wholeMatrix;
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
/*
for (i = 0; i < nBlocksInput; i++)
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
*/
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
//Initializes M[0] and M[1]
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
reducedDuplexRow1_2way( state, &wholeMatrix[0],
&wholeMatrix[ 2 * ROW_LEN_INT64 ], nCols );
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64],
&wholeMatrix[ 2* row*ROW_LEN_INT64],
nCols );
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
rowa0 = (rowa0 + step) & (window - 1);
prev = row;
//updates row: goes to the next row to be computed
row++;
//Checks if all rows in the window where visited.
if (rowa == 0)
if ( rowa0 == 0 )
{
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
step = window + gap;
window *= 2;
gap = -gap;
}
} while (row < nRows);
} while ( row < nRows );
//===================== Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
for (tau = 1; tau <= timeCost; tau++)
row = 0;
for ( tau = 1; tau <= timeCost; tau++ )
{
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1;
do
{
//Selects a pseudorandom index row*
//-----------------------------------------------
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//-------------------------------------------
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
//update prev: it now points to the last row ever computed
reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* row *ROW_LEN_INT64 ],
nCols );
prev = row;
//updates row: goes to the next row to be computed
//----------------------------------------------------
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//----------------------------------------------------
} while (row != 0);
}
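/* In the 2-way wandering phase each lane derives its own pseudorandom row:
   state[0] feeds lane 0 and state[4] feeds lane 1, since the sponge state is
   interleaved as blocks of 4 uint64_t per lane. */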
//===================== Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64 ],
&wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64 ] );
//Squeezes the key
squeeze(state, K, (unsigned int) kLen);
squeeze_2way( state, K, (unsigned int) kLen );
return 0;
}
// This version is currently only used by REv3 and has some hard coding
// specific to v3 such as input data size of 32 bytes.
//
// Similarly with REv2. The difference with REv3 isn't clear and maybe
// they can be merged.
//
// RE is used by RE, allium. The main difference between RE and REv2
// is the matrix size.
//
// Z also needs to support 80 byte input as well as 32 byte, and odd
// matrix sizes like 330 rows. It is used by lyra2z330, lyra2z, lyra2h.
/////////////////////////////////////////////////
// 2 way 256
@@ -223,22 +197,29 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
// Data is interleaved 2x256.
int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
const void *pwd, const uint64_t pwdlen, const void *salt,
const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
const void *pwd, uint64_t pwdlen, uint64_t timeCost,
uint64_t nRows, uint64_t nCols )
// hard coded for 32 byte input as well as the matrix size.
// Other required versions include 80 byte input and different block
// sizes.
//int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
// const void *pwd, const uint64_t pwdlen, const void *salt,
// const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
// const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
// int64_t i; //auxiliary iteration counter
int64_t v64; // 64bit var for memcpy
uint64_t instance0 = 0; // Separate instance for each lane
uint64_t _ALIGN(256) state[32];
int64_t row = 2;
int64_t prev = 1;
int64_t rowa0 = 0;
int64_t rowa1 = 0;
int64_t tau;
int64_t step = 1;
int64_t window = 2;
int64_t gap = 1;
uint64_t instance0 = 0;
uint64_t instance1 = 0;
//====================================================================/
@@ -248,7 +229,9 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
uint64_t *ptrWord = wholeMatrix;
// 2 way 256 rewrite. Salt always == password, and data is interleaved,
// need to build in parallel:
// need to build in parallel as pw is already interleaved.
// { password, (64 or 80 bytes)
// salt, (64 or 80 bytes) = same as password
// Klen, (u64) = 32 bytes
@@ -262,73 +245,54 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
// 1 (byte)
// }
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
// input is usually 32, sometimes 64 bytes; both are aligned to a 256 bit vector.
// 80 byte input is not aligned, complicating matters for lyra2z.
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
byte *ptrByte = (byte*) wholeMatrix;
uint64_t *ptr = wholeMatrix;
uint64_t *pw = (uint64_t*)pwd;
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
memcpy( ptr, pw, 2*pwdlen ); // password
ptr += pwdlen>>2;
memcpy( ptr, pw, 2*pwdlen ); // password lane 1
ptr += pwdlen>>2;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
// now build the rest interleaving on the fly.
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
- (saltlen + pwdlen) );
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = pwdlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = saltlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = timeCost;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nRows;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nCols;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
// from here on it's all SIMD access to state and matrix
// define vector pointers and adjust sizes and pointer offsets
ptr[0] = ptr[ 4] = kLen;
ptr[1] = ptr[ 5] = pwdlen;
ptr[2] = ptr[ 6] = pwdlen; // saltlen
ptr[3] = ptr[ 7] = timeCost;
ptr[8] = ptr[12] = nRows;
ptr[9] = ptr[13] = nCols;
ptr[10] = ptr[14] = 0x80;
ptr[11] = ptr[15] = 0x0100000000000000;
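// The basil just built is interleaved in 256 bit (4 x u64) blocks, both lanes
// identical: lane 0 gets ptr[0..3] = { kLen, pwdlen, saltlen, timeCost } and
// ptr[8..11] = { nRows, nCols, 0x80, 0x01<<56 (last pad byte) }, lane 1 gets
// the adjacent ptr[4..7] and ptr[12..15] copies, so each 512 bit load picks
// up one block from each lane.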
ptrWord = wholeMatrix;
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
reducedSqueezeRow0( state, &wholeMatrix[0], nCols );
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
reducedDuplexRow1_2way( state, &wholeMatrix[0],
&wholeMatrix[2*ROW_LEN_INT64], nCols );
do
{
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
&wholeMatrix[ 2* row*ROW_LEN_INT64 ],
nCols );
rowa = (rowa + step) & (window - 1);
rowa0 = (rowa0 + step) & (window - 1);
prev = row;
row++;
if (rowa == 0)
if (rowa0 == 0)
{
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
@@ -340,37 +304,22 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
row = 0;
for (tau = 1; tau <= timeCost; tau++)
{
step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1;
do
{
// This part is not parallel, rowa will be different for each lane.
// state (u64[16]) is interleaved 2x256, need to extract separately.
instance0 = state[ offset_to_index( instance0 ) ];
instance1 = (&state[4])[ offset_to_index( instance1 ) ];
// index = 2 * instance / 4 * 4 + instance % 4
uint64_t index0 = ( ( (instance0 & 0xf) >> 3 ) << 2 )
+ ( instance0 & 0x3 )
uint64_t index1 = ( ( (instance1 & 0xf) >> 3 ) << 2 )
+ ( instance1 & 0x3 )
rowa0 = state[ offset_to_index( instance0 ) ]
& (unsigned int)(nRows-1);
rowa1 = (state+4)[ offset_to_index( instance1 ) ]
& (unsigned int)(nRows-1);
instance0 = state[ index0 ] & 0xf;
instance1 = (state+4)[ index1 ] & 0xf;
rowa0 = state[ instance0 ];
rowa1 = (state+4)[ instance1 ];
reducedDuplexRow_2way( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa0*ROW_LEN_INT64],
&wholeMatrix[rowa1*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
/*
instance = state[instance & 0xF];
rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
*/
// End of divergence.
reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* row*ROW_LEN_INT64 ],
nCols );
prev = row;
row = (row + step) & (unsigned int)(nRows-1);
@@ -378,13 +327,17 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
} while ( row != 0 );
}
absorbBlock( state, &wholeMatrix[rowa*ROW_LEN_INT64] );
squeeze( state, K, (unsigned int) kLen );
absorbBlock_2way( state, &wholeMatrix[2*rowa0*ROW_LEN_INT64],
&wholeMatrix[2*rowa1*ROW_LEN_INT64] );
squeeze_2way( state, K, (unsigned int) kLen );
return 0;
}
#endif // AVX512
#if 0
//////////////////////////////////////////////////
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
@@ -532,22 +485,26 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
return 0;
}
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Lyra2RE doesn't like the new wholeMatrix implementation
int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
const void *salt, const uint64_t saltlen, const uint64_t timeCost,
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
const uint64_t pwdlen, const uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t rowa0 = 0;
int64_t rowa1 = 0;
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
int64_t i; //auxiliary iteration counter
int64_t v64; // 64bit var for memcpy
//====================================================================/
//=== Initializing the Memory Matrix and pointers to it =============//
@@ -573,15 +530,36 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
#endif
uint64_t *ptrWord = wholeMatrix;
uint64_t *pw = (uint64_t*)pwd;
//=== Getting the password + salt + basil padded with 10*1 ==========//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
uint64_t *ptr = wholeMatrix;
memcpy( ptr, pw, 2*pwdlen ); // password
ptr += pwdlen>>2;
memcpy( ptr, pw, 2*pwdlen ); // password lane 1
ptr += pwdlen>>2;
// now build the rest interleaving on the fly.
ptr[0] = ptr[ 4] = kLen;
ptr[1] = ptr[ 5] = pwdlen;
ptr[2] = ptr[ 6] = pwdlen; // saltlen
ptr[3] = ptr[ 7] = timeCost;
ptr[8] = ptr[12] = nRows;
ptr[9] = ptr[13] = nCols;
ptr[10] = ptr[14] = 0x80;
ptr[11] = ptr[15] = 0x0100000000000000;
/*
byte *ptrByte = (byte*) wholeMatrix;
//Prepends the password
@@ -630,7 +608,9 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
ptrWord = wholeMatrix;
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
*/
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
/*
for (i = 0; i < nBlocksInput; i++)
{
@@ -639,21 +619,22 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
}
*/
//Initializes M[0] and M[1]
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
reducedDuplexRow1_2way( state, &wholeMatrix[0],
&wholeMatrix[ 2 * ROW_LEN_INT64], nCols );
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
&wholeMatrix[ 2* row*ROW_LEN_INT64 ],
nCols );
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
rowa0 = (rowa0 + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
@@ -661,7 +642,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
row++;
//Checks if all rows in the window were visited.
if (rowa == 0)
if (rowa0 == 0)
{
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
@@ -674,21 +655,18 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
row = 0; //Resets the visitation to the first row of the memory matrix
for (tau = 1; tau <= timeCost; tau++)
{
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
do
{
//Selects a pseudorandom index row*
//-----------------------------------------------
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//-------------------------------------------
reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* row *ROW_LEN_INT64 ],
nCols );
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
//update prev: it now points to the last row ever computed
prev = row;
@@ -703,9 +681,10 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
//===================== Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64],
&wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64] );
//Squeezes the key
squeeze(state, K, (unsigned int) kLen);
squeeze_2way( state, K, (unsigned int) kLen );
//================== Freeing the memory =============================//
_mm_free(wholeMatrix);
@@ -713,3 +692,4 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
return 0;
}
#endif

View File

@@ -327,7 +327,6 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)

View File

@@ -60,4 +60,15 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
#endif
#endif /* LYRA2_H_ */

View File

@@ -1,13 +1,150 @@
#include "lyra2-gate.h"
#include <memory.h>
#if defined (LYRA2REV2_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#if defined (LYRA2REV2_8WAY)
typedef struct {
blake256_8way_context blake;
keccak256_8way_context keccak;
cube_4way_context cube;
skein256_8way_context skein;
bmw256_8way_context bmw;
} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));
static lyra2v2_8way_ctx_holder l2v2_8way_ctx;
bool init_lyra2rev2_8way_ctx()
{
keccak256_8way_init( &l2v2_8way_ctx.keccak );
cube_4way_init( &l2v2_8way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &l2v2_8way_ctx.skein );
bmw256_8way_init( &l2v2_8way_ctx.bmw );
return true;
}
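// Hash chain for lyra2rev2 8 way: blake256 -> keccak256 -> cubehash ->
// Lyra2 (four 2 way calls) -> skein256 -> cubehash -> bmw256, re-interleaving
// the data between the different lane widths at each step.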
void lyra2rev2_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (128)));
uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );
blake256_8way( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhash );
rintrlv_8x32_8x64( vhashA, vhash, 256 );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id;
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256_8way_init( &l2v2_8way_ctx.blake );
blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
do
{
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
lyra2rev2_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (LYRA2REV2_4WAY)
typedef struct {
blake256_4way_context blake;

View File

@@ -4,8 +4,180 @@
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#if defined (LYRA2REV3_8WAY)
#if defined (LYRA2REV3_16WAY)
typedef struct {
blake256_16way_context blake;
cube_4way_context cube;
bmw256_16way_context bmw;
} lyra2v3_16way_ctx_holder;
static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;
bool init_lyra2rev3_16way_ctx()
{
blake256_16way_init( &l2v3_16way_ctx.blake );
cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
bmw256_16way_init( &l2v3_16way_ctx.bmw );
return true;
}
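// Hash chain for lyra2rev3 16 way: blake256 -> Lyra2 (eight 2 way calls) ->
// cubehash (four 4 way calls) -> Lyra2 again -> bmw256, re-interleaving the
// data between lane widths at each step.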
void lyra2rev3_16way_hash( void *state, const void *input )
{
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );
blake256_16way_update( &ctx.blake, input + (64*16), 16 );
blake256_16way_close( &ctx.blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_2x256( vhash, hash8, hash9, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash8, hash9, vhash, 256 );
intrlv_2x256( vhash, hash10, hash11, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash10, hash11, vhash, 256 );
intrlv_2x256( vhash, hash12, hash13, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash12, hash13, vhash, 256 );
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash8, hash9, hash10, hash11, vhash, 256 );
intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash12, hash13, hash14, hash15, vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_2x256( vhash, hash8, hash9, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash8, hash9, vhash, 256 );
intrlv_2x256( vhash, hash10, hash11, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash10, hash11, vhash, 256 );
intrlv_2x256( vhash, hash12, hash13, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash12, hash13, vhash, 256 );
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
intrlv_16x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
bmw256_16way_update( &ctx.bmw, vhash, 32 );
bmw256_16way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t *hash7 = &hash[7<<4];
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
const uint32_t Htarg = ptarget[7];
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake256_16way_init( &l2v3_16way_ctx.blake );
blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
do
{
*noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n ) );
lyra2rev3_16way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_16x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (LYRA2REV3_8WAY)
typedef struct {
blake256_8way_context blake;

View File

@@ -19,7 +19,7 @@
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "algo-gate.h"
//#include "algo-gate.h"
#include <string.h>
#include <stdio.h>
#include <time.h>
@@ -40,18 +40,25 @@ inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
//Squeezes full blocks
for ( i = 0; i < fullBlocks; i++ )
{
memcpy_512( out, state, BLOCK_LEN_M256I*2 );
LYRA_ROUND_2WAY_AVX2( state[0], state[1], state[2], state[3] );
out += BLOCK_LEN_M256I*2;
memcpy_512( out, state, BLOCK_LEN_M256I );
LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] );
out += BLOCK_LEN_M256I;
}
//Squeezes remaining bytes
memcpy_512( out, state, ( (len_m256i % BLOCK_LEN_M256I) * 2 ) );
memcpy_512( out, state, len_m256i % BLOCK_LEN_M256I );
}
inline void absorbBlock_2way( uint64_t *State, const uint64_t *In )
inline void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
const uint64_t *In1 )
{
register __m512i state0, state1, state2, state3;
__m512i *in = (__m512i*)In;
__m512i in[3];
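// Gather the even 256 bit blocks from lane 0's row and the odd blocks from
// lane 1's row into one contiguous 2x256 interleaved block before absorbing.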
casti_m256i( in, 0 ) = casti_m256i( In0, 0 );
casti_m256i( in, 1 ) = casti_m256i( In1, 1 );
casti_m256i( in, 2 ) = casti_m256i( In0, 2 );
casti_m256i( in, 3 ) = casti_m256i( In1, 3 );
casti_m256i( in, 4 ) = casti_m256i( In0, 4 );
casti_m256i( in, 5 ) = casti_m256i( In1, 5 );
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -90,7 +97,7 @@ inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
state1 = _mm512_xor_si512( state1, in[1] );
LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
In += block_len * 2;
In += block_len*2;
}
_mm512_store_si512( (__m512i*)State, state0 );
@@ -109,7 +116,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
register __m512i state0, state1, state2, state3;
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -132,7 +139,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
out[2] = state2;
//Goes to next block (column) that will receive the squeezed data
out -= BLOCK_LEN_M256I * 2;
out -= BLOCK_LEN_M256I;
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
}
@@ -143,15 +150,14 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
_mm512_store_si512( (__m512i*)State + 3, state3 );
}
// This function has to deal with gathering two 256 bit rowIn vectors from
// non-contiguous memory: extra work and a performance penalty.
inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowOut, uint64_t nCols )
{
int i;
register __m512i state0, state1, state2, state3;
__m512i *in = (__m256i*)rowIn;
__m512i *in = (__m512i*)rowIn;
__m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -171,28 +177,25 @@ inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
out[2] = _mm512_xor_si512( state2, in[2] );
//Input: next column (i.e., next block in sequence)
in0 += BLOCK_LEN_M256I;
in1 += BLOCK_LEN_M256I;
in += BLOCK_LEN_M256I;
//Output: goes to previous column
out -= BLOCK_LEN_M256I * 2;
out -= BLOCK_LEN_M256I;
}
_mm512_store_si256( (__m512i*)State, state0 );
_mm512_store_si256( (__m512i*)State + 1, state1 );
_mm512_store_si256( (__m512i*)State + 2, state2 );
_mm512_store_si256( (__m512i*)State + 3, state3 );
}
_mm512_store_si512( (__m512i*)State, state0 );
_mm512_store_si512( (__m512i*)State + 1, state1 );
_mm512_store_si512( (__m512i*)State + 2, state2 );
_mm512_store_si512( (__m512i*)State + 3, state3 );
}
inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols )
{
int i;
register __m512i state0, state1, state2, state3;
__m512i* in = (__m512i*)rowIn;
__m512i* inout = (__m512i*)rowInOut;
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m512i t0, t1, t2;
state0 = _mm512_load_si512( (__m512i*)State );
@@ -209,7 +212,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
state2 = _mm512_xor_si512( state2,
_mm512_add_epi64( in[2], inout[2] ) );
LYRA_ROUND_2WAY AVX512( state0, state1, state2, state3 );
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
out[0] = _mm512_xor_si512( state0, in[0] );
out[1] = _mm512_xor_si512( state1, in[1] );
@@ -221,17 +224,18 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
t2 = _mm512_permutex_epi64( state2, 0x93 );
inout[0] = _mm512_xor_si512( inout[0],
_mm512_mask_blend_epi32( t0, t2, 0x03 ) );
_mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
inout[1] = _mm512_xor_si512( inout[1],
_mm512_mask_blend_epi32( t1, t0, 0x03 ) );
_mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
inout[2] = _mm512_xor_si512( inout[2],
_mm512_mask_blend_epi32( t2, t1, 0x03 ) );
_mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
//Inputs: next column (i.e., next block in sequence)
in += BLOCK_LEN_M256I * 2;
inout += BLOCK_LEN_M256I * 2;
in += BLOCK_LEN_M256I;
inout += BLOCK_LEN_M256I;
//Output: goes to previous column
out -= BLOCK_LEN_M256I * 2;
out -= BLOCK_LEN_M256I;
}
_mm512_store_si512( (__m512i*)State, state0 );
@@ -240,49 +244,61 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
_mm512_store_si512( (__m512i*)State + 3, state3 );
}
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut,
uint64_t nCols )
// big ugly workaround for pointer aliasing, use a union of pointers.
// Access the matrix using __m512i for in and out, __m256i for inout.
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols)
{
int i;
register __m512i state0, state1, state2, state3;
__m256i *in0 = (__m256i*)rowIn0;
__m256i *in0 = (__m256i*)rowIn0;
__m2512* in = (__m512i*)rowIn;
__m2512* inout = (__m512i*)rowInOut;
__m512i* out = (__m512i*)rowOut;
__m512i *in = (__m512i*)rowIn;
__m256i *inout0 = (__m256i*)rowInOut0;
__m256i *inout1 = (__m256i*)rowInOut1;
__m512i *out = (__m512i*)rowOut;
__m512i io[3];
povly inout;
inout.v512 = &io[0];
__m512i t0, t1, t2;
_mm_prefetch( in0, _MM_HINT_T0 );
_mm_prefetch( in1, _MM_HINT_T0 );
_mm_prefetch( in0 + 2, _MM_HINT_T0 );
_mm_prefetch( in1 + 2, _MM_HINT_T0 );
_mm_prefetch( in0 + 4, _MM_HINT_T0 );
_mm_prefetch( in1 + 4, _MM_HINT_T0 );
_mm_prefetch( in0 + 6, _MM_HINT_T0 );
_mm_prefetch( in1 + 6, _MM_HINT_T0 );
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
state2 = _mm512_load_si512( (__m512i*)State + 2 );
state3 = _mm512_load_si512( (__m512i*)State + 3 );
//Absorbing "M[prev] [+] M[row*]"
_mm_prefetch( in, _MM_HINT_T0 );
_mm_prefetch( inout0, _MM_HINT_T0 );
_mm_prefetch( inout1, _MM_HINT_T0 );
_mm_prefetch( in + 2, _MM_HINT_T0 );
_mm_prefetch( inout0 + 2, _MM_HINT_T0 );
_mm_prefetch( inout1 + 2, _MM_HINT_T0 );
_mm_prefetch( in + 4, _MM_HINT_T0 );
_mm_prefetch( inout0 + 4, _MM_HINT_T0 );
_mm_prefetch( inout1 + 4, _MM_HINT_T0 );
_mm_prefetch( in + 6, _MM_HINT_T0 );
_mm_prefetch( inout0 + 6, _MM_HINT_T0 );
_mm_prefetch( inout1 + 6, _MM_HINT_T0 );
// state0 = _mm512_xor_si512( state0, mm512_concat_256( in1[0], in0[0] );
// state1 = _mm512_xor_si512( state1, mm512_concat_256( in1[1], in0[1] );
// state2 = _mm512_xor_si512( state2, mm512_concat_256( in1[2], in0[2] );
t0 = mm512_concat_256( in1[0], in0[0] );
t1 = mm512_concat_256( in1[1], in0[1] );
t2 = mm512_concat_256( in1[2], in0[2] );
for ( i = 0; i < nCols; i++ )
{
//Absorbing "M[prev] [+] M[row*]"
inout.v256[0] = inout0[0];
inout.v256[1] = inout1[1];
inout.v256[2] = inout0[2];
inout.v256[3] = inout1[3];
inout.v256[4] = inout0[4];
inout.v256[5] = inout1[5];
state0 = _mm512_xor_si512( state0,
_mm512_add_epi64( t0, inout[0] ) );
_mm512_add_epi64( in[0], inout.v512[0] ) );
state1 = _mm512_xor_si512( state1,
_mm512_add_epi64( t1, inout[1] ) );
_mm512_add_epi64( in[1], inout.v512[1] ) );
state2 = _mm512_xor_si512( state2,
_mm512_add_epi64( t2, inout[2] ) );
_mm512_add_epi64( in[2], inout.v512[2] ) );
//Applies the reduced-round transformation f to the sponge's state
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
@@ -292,22 +308,44 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
out[1] = _mm512_xor_si512( out[1], state1 );
out[2] = _mm512_xor_si512( out[2], state2 );
// if inout is the same row as out it was just overwritten, reload.
if ( rowOut == rowInOut0 )
{
inout.v256[0] = inout0[0];
inout.v256[2] = inout0[2];
inout.v256[4] = inout0[4];
}
if ( rowOut == rowInOut1 )
{
inout.v256[1] = inout1[1];
inout.v256[3] = inout1[3];
inout.v256[5] = inout1[5];
}
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
t0 = _mm512_permutex_epi64( state0, 0x93 );
t1 = _mm512_permutex_epi64( state1, 0x93 );
t2 = _mm512_permutex_epi64( state2, 0x93 );
inout[0] = _mm512_xor_si512( inout[0],
_mm512_mask_blend_epi32( t0, t2, 0x03 ) );
inout[1] = _mm512_xor_si512( inout[1],
_mm512_mask_blend_epi32( t1, t0, 0x03 ) );
inout[2] = _mm512_xor_si512( inout[2],
_mm512_mask_blend_epi32( t2, t1, 0x03 ) );
inout.v512[0] = _mm512_xor_si512( inout.v512[0],
_mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
inout.v512[1] = _mm512_xor_si512( inout.v512[1],
_mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
inout.v512[2] = _mm512_xor_si512( inout.v512[2],
_mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
inout0[0] = inout.v256[0];
inout1[1] = inout.v256[1];
inout0[2] = inout.v256[2];
inout1[3] = inout.v256[3];
inout0[4] = inout.v256[4];
inout1[5] = inout.v256[5];
//Goes to next block
in += BLOCK_LEN_M256I * 2;
out += BLOCK_LEN_M256I * 2;
inout += BLOCK_LEN_M256I * 2;
in += BLOCK_LEN_M256I;
inout0 += BLOCK_LEN_M256I * 2;
inout1 += BLOCK_LEN_M256I * 2;
out += BLOCK_LEN_M256I;
}
_mm512_store_si512( (__m512i*)State, state0 );

View File

@@ -376,6 +376,9 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
_mm_prefetch( out - 9, _MM_HINT_T0 );
_mm_prefetch( out - 11, _MM_HINT_T0 );
//printf("S RSR0 col= %d, out= %x\n",i,out);
out[0] = state0;
out[1] = state1;
out[2] = state2;
@@ -706,11 +709,34 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
out[1] = _mm256_xor_si256( state1, in[1] );
out[2] = _mm256_xor_si256( state2, in[2] );
/*
printf("s duplexsetup col= %d\n",i);
uint64_t * o = (uint64_t*)out;
printf("S out %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
printf("S out %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
printf("S out %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
printf("S out %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
printf("S out %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
printf("S out %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
*/
//M[row*][col] = M[row*][col] XOR rotW(rand)
t0 = _mm256_permute4x64_epi64( state0, 0x93 );
t1 = _mm256_permute4x64_epi64( state1, 0x93 );
t2 = _mm256_permute4x64_epi64( state2, 0x93 );
/*
uint64_t *t = (uint64_t*)&t0;
printf("S t0 %016lx %016lx %016lx %016lx\n",t[0],t[1],t[2],t[3]);
o = (uint64_t*)inout;
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
*/
inout[0] = _mm256_xor_si256( inout[0],
_mm256_blend_epi32( t0, t2, 0x03 ) );
inout[1] = _mm256_xor_si256( inout[1],
@@ -718,7 +744,17 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
inout[2] = _mm256_xor_si256( inout[2],
_mm256_blend_epi32( t2, t1, 0x03 ) );
//Inputs: next column (i.e., next block in sequence)
/*
o = (uint64_t*)inout;
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
*/
//Inputs: next column (i.e., next block in sequence)
in += BLOCK_LEN_M256I;
inout += BLOCK_LEN_M256I;
//Output: goes to previous column
@@ -949,6 +985,22 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
_mm_prefetch( inout + 9, _MM_HINT_T0 );
_mm_prefetch( inout + 11, _MM_HINT_T0 );
/*
uint64_t *io = (uint64_t*)inout;
uint64_t *ii = (uint64_t*)in;
printf("RDRS1 col= %d\n", i);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[15]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[0],ii[1],ii[2],ii[3]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[4],ii[5],ii[6],ii[7]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[8],ii[9],ii[10],ii[11]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[12],ii[13],ii[14],ii[15]);
*/
//Absorbing "M[prev] [+] M[row*]"
state0 = _mm256_xor_si256( state0,
_mm256_add_epi64( in[0], inout[0] ) );

View File

@@ -65,14 +65,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 );
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm512_ror_1x64( s1); \
s2 = mm512_swap128_256( s2 ); \
s3 = mm512_rol1x64_256( s3 ); \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm512_rol1x64_256( s1 ); \
s2 = mm512_swap128_256( s2 ); \
s3 = mm512_ror1x64_256( s3 );
G2W_4X64( s0, s1, s2, s3 ); \
s1 = mm512_ror256_64( s1); \
s2 = mm512_swap256_128( s2 ); \
s3 = mm512_rol256_64( s3 ); \
G2W_4X64( s0, s1, s2, s3 ); \
s1 = mm512_rol256_64( s1 ); \
s2 = mm512_swap256_128( s2 ); \
s3 = mm512_ror256_64( s3 );
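// LYRA_ROUND_2WAY_AVX512: one reduced Blake2b round applied to both
// interleaved lanes: a column step, diagonalisation by rotating rows within
// each 256 bit lane, a second G application, then the inverse rotation to
// undo the diagonalisation.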
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_ror1x64_256( s2, s3 ); \
mm128_swap128_256( s4, s5 ); \
mm128_rol1x64_256( s6, s7 ); \
mm128_ror256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_rol256_64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rol1x64_256( s2, s3 ); \
mm128_swap128_256( s4, s5 ); \
mm128_ror1x64_256( s6, s7 );
mm128_rol256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_ror256_64( s6, s7 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
@@ -203,24 +203,36 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
union _povly
{
__m512i *v512;
__m256i *v256;
uint64_t *u64;
};
typedef union _povly povly;
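/* Minimal usage sketch (illustration only, not part of the header): the
   union of pointers lets the same 512 bit block be addressed either as a
   whole __m512i or as its two 256 bit lane halves.

       __m512i block[3];
       povly p;
       p.v512 = block;              // whole 512 bit view
       __m256i lane0 = p.v256[0];   // low 256 bits of block[0]
       __m256i lane1 = p.v256[1];   // high 256 bits of block[0]
*/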
//---- Housekeeping
void initState_2way( uint64_t state[/*16*/] );
void initState_2way( uint64_t State[/*16*/] );
//---- Squeezes
void squeeze_2way( uint64_t *state, unsigned char *out, unsigned int len );
void squeeze_2way( uint64_t *State, unsigned char *out, unsigned int len );
void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row, uint64_t nCols );
//---- Absorbs
void absorbBlock_2way( uint64_t *state, const uint64_t *in );
void absorbBlockBlake2Safe_2way( uint64_t *state, const uint64_t *in,
void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
const uint64_t *In1 );
void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
const uint64_t nBlocks, const uint64_t block_len );
//---- Duplexes
void reducedDuplexRow1_2way( uint64_t *state, uint64_t *rowIn,
void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn,
void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
void reducedDuplexRow_2way(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols);
#endif

File diff suppressed because it is too large

View File

@@ -2,7 +2,10 @@
bool register_hmq1725_algo( algo_gate_t* gate )
{
#if defined(HMQ1725_4WAY)
#if defined(HMQ1725_8WAY)
gate->scanhash = (void*)&scanhash_hmq1725_8way;
gate->hash = (void*)&hmq1725_8way_hash;
#elif defined(HMQ1725_4WAY)
gate->scanhash = (void*)&scanhash_hmq1725_4way;
gate->hash = (void*)&hmq1725_4way_hash;
#else
@@ -10,7 +13,7 @@ bool register_hmq1725_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -4,13 +4,21 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
// #define HMQ1725_4WAY 1
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define HMQ1725_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define HMQ1725_4WAY 1
#endif
bool register_hmq1725_algo( algo_gate_t* gate );
#if defined(HMQ1725_4WAY)
#if defined(HMQ1725_8WAY)
void hmq1725_8way_hash( void *state, const void *input );
int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(HMQ1725_4WAY)
void hmq1725_4way_hash( void *state, const void *input );
int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,

View File

@@ -333,6 +333,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFFFFF)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -346,6 +347,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFFFF0)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -359,6 +361,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFFF00)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -372,6 +375,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFF000)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -386,6 +390,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFF0000)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -399,6 +404,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
hmq1725hash(hash64, endiandata);
if (fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);

View File

@@ -63,20 +63,6 @@ void quark_8way_hash( void *state, const void *input )
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
// AVX 512 cmpeq returns a bit mask instead of a vector mask.
// This should simplify things but the logic doesn't seem to be working.
// The problem appears to be related to the test to skip a hash if it isn't
// to be used. Skipping the test for all 8 way hashes seems to have
// fixed it. The hash selection blending works if the hash is produced
// but the hash wasn't being produced when it should.
// Both decisions are based on the same data, the __mmask8. It works
// as a blend mask but not in a logical comparison, maybe the type is the
// problem. Maybe a cast to int or movm is needed to make it work.
// It's now moot because the hash can only be skipped 1 in 256 iterations
// when hashing parallel 8 ways.
// The performance impact of the workaround should be negligible.
// It's a problem for another day.
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );

View File

@@ -92,7 +92,6 @@ int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];

View File

@@ -56,7 +56,7 @@ typedef struct {
__m128i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_4way_context;
} sha256_4way_context __attribute__ ((aligned (64)));
void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
@@ -71,7 +71,7 @@ typedef struct {
__m256i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_8way_context;
} sha256_8way_context __attribute__ ((aligned (128)));
void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way( sha256_8way_context *sc, const void *data, size_t len );
@@ -86,30 +86,32 @@ typedef struct {
__m256i val[8];
uint64_t count;
bool initialized;
} sha512_4way_context;
} sha512_4way_context __attribute__ ((aligned (128)));
void sha512_4way_init( sha512_4way_context *sc);
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
void sha512_4way_update( sha512_4way_context *sc, const void *data,
size_t len );
#define sha512_4way sha512_4way_update
void sha512_4way_close( sha512_4way_context *sc, void *dst );
// SHA-256 11 way hybrid
// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way
typedef struct {
__m256i bufx[64>>2];
__m256i valx[8];
__m64 bufy[64>>2];
__m64 valy[8];
uint32_t bufz[64>>2];
uint32_t valz[8];
uint32_t count_high, count_low;
} sha256_11way_context;
__m512i buf[128>>3];
__m512i val[8];
uint64_t count;
bool initialized;
} sha512_8way_context __attribute__ ((aligned (128)));
void sha256_11way_init( sha256_11way_context *ctx );
void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
const void *datay, const void *dataz, size_t len );
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
void *dstz );
void sha512_8way_init( sha512_8way_context *sc);
void sha512_8way_update( sha512_8way_context *sc, const void *data,
size_t len );
void sha512_8way_close( sha512_8way_context *sc, void *dst );
#endif // AVX512
#endif // __AVX2__
#endif // __SSE2__
#endif // SHA256_4WAY_H__

View File

@@ -36,8 +36,6 @@
#include <string.h>
#include "sha-hash-4way.h"
// SHA-512 4 way 64 bit
/*
static const sph_u64 H512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
@@ -90,6 +88,236 @@ static const sph_u64 K512[80] = {
SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way 64 bit
#define CH8W(X, Y, Z) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z )
#define MAJ8W(X, Y, Z) \
_mm512_or_si512( _mm512_and_si512( X, Y ), \
_mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
#define BSG8W_5_0(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) )
#define BSG8W_5_1(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) )
#define SSG8W_5_0(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 1), mm512_ror_64(x, 8) ), _mm512_srli_epi64(x, 7) )
#define SSG8W_5_1(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) )
static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 )
{
__m512i w0a, w1a, w0b, w1b;
w0a = mm512_ror_64( w0, 1 );
w1a = mm512_ror_64( w1,19 );
w0b = mm512_ror_64( w0, 8 );
w1b = mm512_ror_64( w1,61 );
w0a = _mm512_xor_si512( w0a, w0b );
w1a = _mm512_xor_si512( w1a, w1b );
w0b = _mm512_srli_epi64( w0, 7 );
w1b = _mm512_srli_epi64( w1, 6 );
w0a = _mm512_xor_si512( w0a, w0b );
w1a = _mm512_xor_si512( w1a, w1b );
return _mm512_add_epi64( w0a, w1a );
}
#define SSG8W_512x2_0( w0, w1, i ) do \
{ \
__m512i X0a, X1a, X0b, X1b; \
X0a = mm512_ror_64( W[i-15], 1 ); \
X1a = mm512_ror_64( W[i-14], 1 ); \
X0b = mm512_ror_64( W[i-15], 8 ); \
X1b = mm512_ror_64( W[i-14], 8 ); \
X0a = _mm512_xor_si512( X0a, X0b ); \
X1a = _mm512_xor_si512( X1a, X1b ); \
X0b = _mm512_srli_epi64( W[i-15], 7 ); \
X1b = _mm512_srli_epi64( W[i-14], 7 ); \
w0 = _mm512_xor_si512( X0a, X0b ); \
w1 = _mm512_xor_si512( X1a, X1b ); \
} while(0)
#define SSG8W_512x2_1( w0, w1, i ) do \
{ \
__m512i X0a, X1a, X0b, X1b; \
X0a = mm512_ror_64( W[i-2],19 ); \
X1a = mm512_ror_64( W[i-1],19 ); \
X0b = mm512_ror_64( W[i-2],61 ); \
X1b = mm512_ror_64( W[i-1],61 ); \
X0a = _mm512_xor_si512( X0a, X0b ); \
X1a = _mm512_xor_si512( X1a, X1b ); \
X0b = _mm512_srli_epi64( W[i-2], 6 ); \
X1b = _mm512_srli_epi64( W[i-1], 6 ); \
w0 = _mm512_xor_si512( X0a, X0b ); \
w1 = _mm512_xor_si512( X1a, X1b ); \
} while(0)
#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m512i T1, T2; \
__m512i K = _mm512_set1_epi64( K512[ i ] ); \
T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \
K, W[i] ) ); \
T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \
D = _mm512_add_epi64( D, T1 ); \
H = _mm512_add_epi64( T1, T2 ); \
} while (0)
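// Standard SHA-512 round applied to all 8 lanes at once:
// T1 = h + BSG1(e) + Ch(e,f,g) + K[i] + W[i], T2 = BSG0(a) + Maj(a,b,c),
// d += T1, h = T1 + T2.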
static void
sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
{
int i;
register __m512i A, B, C, D, E, F, G, H;
__m512i W[80];
mm512_block_bswap_64( W , in );
mm512_block_bswap_64( W+8, in+8 );
for ( i = 16; i < 80; i++ )
W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ),
_mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
if ( ctx->initialized )
{
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
}
else
{
A = m512_const1_64( 0x6A09E667F3BCC908 );
B = m512_const1_64( 0xBB67AE8584CAA73B );
C = m512_const1_64( 0x3C6EF372FE94F82B );
D = m512_const1_64( 0xA54FF53A5F1D36F1 );
E = m512_const1_64( 0x510E527FADE682D1 );
F = m512_const1_64( 0x9B05688C2B3E6C1F );
G = m512_const1_64( 0x1F83D9ABFB41BD6B );
H = m512_const1_64( 0x5BE0CD19137E2179 );
}
for ( i = 0; i < 80; i += 8 )
{
SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
SHA3_8WAY_STEP( H, A, B, C, D, E, F, G, i + 1 );
SHA3_8WAY_STEP( G, H, A, B, C, D, E, F, i + 2 );
SHA3_8WAY_STEP( F, G, H, A, B, C, D, E, i + 3 );
SHA3_8WAY_STEP( E, F, G, H, A, B, C, D, i + 4 );
SHA3_8WAY_STEP( D, E, F, G, H, A, B, C, i + 5 );
SHA3_8WAY_STEP( C, D, E, F, G, H, A, B, i + 6 );
SHA3_8WAY_STEP( B, C, D, E, F, G, H, A, i + 7 );
}
if ( ctx->initialized )
{
r[0] = _mm512_add_epi64( r[0], A );
r[1] = _mm512_add_epi64( r[1], B );
r[2] = _mm512_add_epi64( r[2], C );
r[3] = _mm512_add_epi64( r[3], D );
r[4] = _mm512_add_epi64( r[4], E );
r[5] = _mm512_add_epi64( r[5], F );
r[6] = _mm512_add_epi64( r[6], G );
r[7] = _mm512_add_epi64( r[7], H );
}
else
{
ctx->initialized = true;
r[0] = _mm512_add_epi64( A, m512_const1_64( 0x6A09E667F3BCC908 ) );
r[1] = _mm512_add_epi64( B, m512_const1_64( 0xBB67AE8584CAA73B ) );
r[2] = _mm512_add_epi64( C, m512_const1_64( 0x3C6EF372FE94F82B ) );
r[3] = _mm512_add_epi64( D, m512_const1_64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm512_add_epi64( E, m512_const1_64( 0x510E527FADE682D1 ) );
r[5] = _mm512_add_epi64( F, m512_const1_64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm512_add_epi64( G, m512_const1_64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm512_add_epi64( H, m512_const1_64( 0x5BE0CD19137E2179 ) );
}
}
void sha512_8way_init( sha512_8way_context *sc )
{
sc->initialized = false;
sc->count = 0;
}
void sha512_8way_update( sha512_8way_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
size_t ptr;
const int buf_size = 128;
ptr = (unsigned)sc->count & (buf_size - 1U);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_512( sc->buf + (ptr>>3), vdata, clen>>3 );
vdata = vdata + (clen>>3);
ptr += clen;
len -= clen;
if ( ptr == buf_size )
{
sha512_8way_round( sc, sc->buf, sc->val );
ptr = 0;
}
sc->count += clen;
}
}
void sha512_8way_close( sha512_8way_context *sc, void *dst )
{
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
const __m512i shuff_bswap64 = m512_const_64(
0x38393a3b3c3d3e3f, 0x3031323334353637,
0x28292a2b2c2d2e2f, 0x2021222324252627,
0x18191a1b1c1d1e1f, 0x1011121314151617,
0x08090a0b0c0d0e0f, 0x0001020304050607 );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 );
ptr += 8;
if ( ptr > pad )
{
memset_zero_512( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );
sha512_8way_round( sc, sc->buf, sc->val );
memset_zero_512( sc->buf, pad >> 3 );
}
else
memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
_mm512_set1_epi64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
_mm512_set1_epi64( sc->count << 3 ), shuff_bswap64 );
sha512_8way_round( sc, sc->buf, sc->val );
mm512_block_bswap_64( dst, sc->val );
}
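/* Minimal usage sketch (illustration only): hash eight independent 80 byte
   messages in parallel. The input must already be interleaved as 8 lanes of
   64 bit words (e.g. with intrlv_8x64) and the 512 bit digest of each lane
   comes back interleaved the same way; len is bytes per lane.

       __m512i vdata[10] __attribute__ ((aligned (64)));   // 8 x 80 bytes, interleaved
       __m512i vhash[8]  __attribute__ ((aligned (64)));   // 8 x 64 byte digests
       sha512_8way_context ctx;
       sha512_8way_init( &ctx );
       sha512_8way_update( &ctx, vdata, 80 );
       sha512_8way_close( &ctx, vhash );
*/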
#endif // AVX512
// SHA-512 4 way 64 bit
#define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
@@ -254,7 +482,7 @@ void sha512_4way_init( sha512_4way_context *sc )
sc->count = 0;
}
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
void sha512_4way_update( sha512_4way_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
size_t ptr;

View File

@@ -33,7 +33,7 @@
#include <stddef.h>
#include <string.h>
#ifdef __AVX2__
#ifdef __SSE4_1__
#include "shabal-hash-4way.h"
#ifdef __cplusplus
@@ -58,6 +58,599 @@ extern "C"{
#define O2 9
#define O3 6
#if defined(__AVX2__)
#define DECL_STATE8 \
__m256i A00, A01, A02, A03, A04, A05, A06, A07, \
A08, A09, A0A, A0B; \
__m256i B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \
__m256i C0, C1, C2, C3, C4, C5, C6, C7, \
C8, C9, CA, CB, CC, CD, CE, CF; \
__m256i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u32 Wlow, Whigh;
#define READ_STATE8(state) do \
{ \
if ( (state)->state_loaded ) \
{ \
A00 = (state)->A[0]; \
A01 = (state)->A[1]; \
A02 = (state)->A[2]; \
A03 = (state)->A[3]; \
A04 = (state)->A[4]; \
A05 = (state)->A[5]; \
A06 = (state)->A[6]; \
A07 = (state)->A[7]; \
A08 = (state)->A[8]; \
A09 = (state)->A[9]; \
A0A = (state)->A[10]; \
A0B = (state)->A[11]; \
B0 = (state)->B[0]; \
B1 = (state)->B[1]; \
B2 = (state)->B[2]; \
B3 = (state)->B[3]; \
B4 = (state)->B[4]; \
B5 = (state)->B[5]; \
B6 = (state)->B[6]; \
B7 = (state)->B[7]; \
B8 = (state)->B[8]; \
B9 = (state)->B[9]; \
BA = (state)->B[10]; \
BB = (state)->B[11]; \
BC = (state)->B[12]; \
BD = (state)->B[13]; \
BE = (state)->B[14]; \
BF = (state)->B[15]; \
C0 = (state)->C[0]; \
C1 = (state)->C[1]; \
C2 = (state)->C[2]; \
C3 = (state)->C[3]; \
C4 = (state)->C[4]; \
C5 = (state)->C[5]; \
C6 = (state)->C[6]; \
C7 = (state)->C[7]; \
C8 = (state)->C[8]; \
C9 = (state)->C[9]; \
CA = (state)->C[10]; \
CB = (state)->C[11]; \
CC = (state)->C[12]; \
CD = (state)->C[13]; \
CE = (state)->C[14]; \
CF = (state)->C[15]; \
} \
else \
{ \
(state)->state_loaded = true; \
A00 = m256_const1_64( 0x20728DFD20728DFD ); \
A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
A02 = m256_const1_64( 0xE782B699E782B699 ); \
A03 = m256_const1_64( 0x5530463255304632 ); \
A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
A09 = m256_const1_64( 0x8BD144108BD14410 ); \
A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m256_const1_64( 0x07B385F307B385F3 ); \
B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
B3 = m256_const1_64( 0xCC8AD640CC8AD640 ); \
B4 = m256_const1_64( 0xEB6F56C7EB6F56C7 ); \
B5 = m256_const1_64( 0x1EA81AA91EA81AA9 ); \
B6 = m256_const1_64( 0x73B9D31473B9D314 ); \
B7 = m256_const1_64( 0x1DE85D081DE85D08 ); \
B8 = m256_const1_64( 0x48910A5A48910A5A ); \
B9 = m256_const1_64( 0x893B22DB893B22DB ); \
BA = m256_const1_64( 0xC5A0DF44C5A0DF44 ); \
BB = m256_const1_64( 0xBBC4324EBBC4324E ); \
BC = m256_const1_64( 0x72D2F24072D2F240 ); \
BD = m256_const1_64( 0x75941D9975941D99 ); \
BE = m256_const1_64( 0x6D8BDE826D8BDE82 ); \
BF = m256_const1_64( 0xA1A7502BA1A7502B ); \
C0 = m256_const1_64( 0xD9BF68D1D9BF68D1 ); \
C1 = m256_const1_64( 0x58BAD75058BAD750 ); \
C2 = m256_const1_64( 0x56028CB256028CB2 ); \
C3 = m256_const1_64( 0x8134F3598134F359 ); \
C4 = m256_const1_64( 0xB5D469D8B5D469D8 ); \
C5 = m256_const1_64( 0x941A8CC2941A8CC2 ); \
C6 = m256_const1_64( 0x418B2A6E418B2A6E ); \
C7 = m256_const1_64( 0x0405278004052780 ); \
C8 = m256_const1_64( 0x7F07D7877F07D787 ); \
C9 = m256_const1_64( 0x5194358F5194358F ); \
CA = m256_const1_64( 0x3C60D6653C60D665 ); \
CB = m256_const1_64( 0xBE97D79ABE97D79A ); \
CC = m256_const1_64( 0x950C3434950C3434 ); \
CD = m256_const1_64( 0xAED9A06DAED9A06D ); \
CE = m256_const1_64( 0x2537DC8D2537DC8D ); \
CF = m256_const1_64( 0x7CDB59697CDB5969 ); \
} \
Wlow = (state)->Wlow; \
Whigh = (state)->Whigh; \
} while (0)
#define WRITE_STATE8(state) do { \
(state)->A[0] = A00; \
(state)->A[1] = A01; \
(state)->A[2] = A02; \
(state)->A[3] = A03; \
(state)->A[4] = A04; \
(state)->A[5] = A05; \
(state)->A[6] = A06; \
(state)->A[7] = A07; \
(state)->A[8] = A08; \
(state)->A[9] = A09; \
(state)->A[10] = A0A; \
(state)->A[11] = A0B; \
(state)->B[0] = B0; \
(state)->B[1] = B1; \
(state)->B[2] = B2; \
(state)->B[3] = B3; \
(state)->B[4] = B4; \
(state)->B[5] = B5; \
(state)->B[6] = B6; \
(state)->B[7] = B7; \
(state)->B[8] = B8; \
(state)->B[9] = B9; \
(state)->B[10] = BA; \
(state)->B[11] = BB; \
(state)->B[12] = BC; \
(state)->B[13] = BD; \
(state)->B[14] = BE; \
(state)->B[15] = BF; \
(state)->C[0] = C0; \
(state)->C[1] = C1; \
(state)->C[2] = C2; \
(state)->C[3] = C3; \
(state)->C[4] = C4; \
(state)->C[5] = C5; \
(state)->C[6] = C6; \
(state)->C[7] = C7; \
(state)->C[8] = C8; \
(state)->C[9] = C9; \
(state)->C[10] = CA; \
(state)->C[11] = CB; \
(state)->C[12] = CC; \
(state)->C[13] = CD; \
(state)->C[14] = CE; \
(state)->C[15] = CF; \
(state)->Wlow = Wlow; \
(state)->Whigh = Whigh; \
} while (0)
#define DECODE_BLOCK8 \
do { \
M0 = buf[ 0]; \
M1 = buf[ 1]; \
M2 = buf[ 2]; \
M3 = buf[ 3]; \
M4 = buf[ 4]; \
M5 = buf[ 5]; \
M6 = buf[ 6]; \
M7 = buf[ 7]; \
M8 = buf[ 8]; \
M9 = buf[ 9]; \
MA = buf[10]; \
MB = buf[11]; \
MC = buf[12]; \
MD = buf[13]; \
ME = buf[14]; \
MF = buf[15]; \
} while (0)
#define INPUT_BLOCK_ADD8 \
do { \
B0 = _mm256_add_epi32( B0, M0 );\
B1 = _mm256_add_epi32( B1, M1 );\
B2 = _mm256_add_epi32( B2, M2 );\
B3 = _mm256_add_epi32( B3, M3 );\
B4 = _mm256_add_epi32( B4, M4 );\
B5 = _mm256_add_epi32( B5, M5 );\
B6 = _mm256_add_epi32( B6, M6 );\
B7 = _mm256_add_epi32( B7, M7 );\
B8 = _mm256_add_epi32( B8, M8 );\
B9 = _mm256_add_epi32( B9, M9 );\
BA = _mm256_add_epi32( BA, MA );\
BB = _mm256_add_epi32( BB, MB );\
BC = _mm256_add_epi32( BC, MC );\
BD = _mm256_add_epi32( BD, MD );\
BE = _mm256_add_epi32( BE, ME );\
BF = _mm256_add_epi32( BF, MF );\
} while (0)
#define INPUT_BLOCK_SUB8 \
do { \
C0 = _mm256_sub_epi32( C0, M0 ); \
C1 = _mm256_sub_epi32( C1, M1 ); \
C2 = _mm256_sub_epi32( C2, M2 ); \
C3 = _mm256_sub_epi32( C3, M3 ); \
C4 = _mm256_sub_epi32( C4, M4 ); \
C5 = _mm256_sub_epi32( C5, M5 ); \
C6 = _mm256_sub_epi32( C6, M6 ); \
C7 = _mm256_sub_epi32( C7, M7 ); \
C8 = _mm256_sub_epi32( C8, M8 ); \
C9 = _mm256_sub_epi32( C9, M9 ); \
CA = _mm256_sub_epi32( CA, MA ); \
CB = _mm256_sub_epi32( CB, MB ); \
CC = _mm256_sub_epi32( CC, MC ); \
CD = _mm256_sub_epi32( CD, MD ); \
CE = _mm256_sub_epi32( CE, ME ); \
CF = _mm256_sub_epi32( CF, MF ); \
} while (0)
#define XOR_W8 \
do { \
A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \
A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \
} while (0)
#define SWAP_BC8 \
do { \
mm256_swap512_256( B0, C0 ); \
mm256_swap512_256( B1, C1 ); \
mm256_swap512_256( B2, C2 ); \
mm256_swap512_256( B3, C3 ); \
mm256_swap512_256( B4, C4 ); \
mm256_swap512_256( B5, C5 ); \
mm256_swap512_256( B6, C6 ); \
mm256_swap512_256( B7, C7 ); \
mm256_swap512_256( B8, C8 ); \
mm256_swap512_256( B9, C9 ); \
mm256_swap512_256( BA, CA ); \
mm256_swap512_256( BB, CB ); \
mm256_swap512_256( BC, CC ); \
mm256_swap512_256( BD, CD ); \
mm256_swap512_256( BE, CE ); \
mm256_swap512_256( BF, CF ); \
} while (0)
#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256( \
_mm256_andnot_si256( xb3, xb2 ), \
_mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \
) ), _mm256_set1_epi32(3UL) ) ) ) ); \
xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \
} while (0)
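// Scalar reference for one permutation element (per 32-bit lane), shown only
// as a readability aid for the vector macro above:
//   xa0 = xm ^ xb1 ^ (~xb3 & xb2) ^ 3 * ( xa0 ^ xc ^ 5 * rotl32( xa1, 15 ) );
//   xb0 = ~( xa0 ^ rotl32( xb0, 1 ) );   // uses the updated xa0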
#define PERM_STEP_0_8 do { \
PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_1_8 do { \
PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_2_8 do { \
PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \
} while (0)
#define APPLY_P8 \
do { \
B0 = mm256_ror_32( B0, 15 ); \
B1 = mm256_ror_32( B1, 15 ); \
B2 = mm256_ror_32( B2, 15 ); \
B3 = mm256_ror_32( B3, 15 ); \
B4 = mm256_ror_32( B4, 15 ); \
B5 = mm256_ror_32( B5, 15 ); \
B6 = mm256_ror_32( B6, 15 ); \
B7 = mm256_ror_32( B7, 15 ); \
B8 = mm256_ror_32( B8, 15 ); \
B9 = mm256_ror_32( B9, 15 ); \
BA = mm256_ror_32( BA, 15 ); \
BB = mm256_ror_32( BB, 15 ); \
BC = mm256_ror_32( BC, 15 ); \
BD = mm256_ror_32( BD, 15 ); \
BE = mm256_ror_32( BE, 15 ); \
BF = mm256_ror_32( BF, 15 ); \
PERM_STEP_0_8; \
PERM_STEP_1_8; \
PERM_STEP_2_8; \
A0B = _mm256_add_epi32( A0B, C6 ); \
A0A = _mm256_add_epi32( A0A, C5 ); \
A09 = _mm256_add_epi32( A09, C4 ); \
A08 = _mm256_add_epi32( A08, C3 ); \
A07 = _mm256_add_epi32( A07, C2 ); \
A06 = _mm256_add_epi32( A06, C1 ); \
A05 = _mm256_add_epi32( A05, C0 ); \
A04 = _mm256_add_epi32( A04, CF ); \
A03 = _mm256_add_epi32( A03, CE ); \
A02 = _mm256_add_epi32( A02, CD ); \
A01 = _mm256_add_epi32( A01, CC ); \
A00 = _mm256_add_epi32( A00, CB ); \
A0B = _mm256_add_epi32( A0B, CA ); \
A0A = _mm256_add_epi32( A0A, C9 ); \
A09 = _mm256_add_epi32( A09, C8 ); \
A08 = _mm256_add_epi32( A08, C7 ); \
A07 = _mm256_add_epi32( A07, C6 ); \
A06 = _mm256_add_epi32( A06, C5 ); \
A05 = _mm256_add_epi32( A05, C4 ); \
A04 = _mm256_add_epi32( A04, C3 ); \
A03 = _mm256_add_epi32( A03, C2 ); \
A02 = _mm256_add_epi32( A02, C1 ); \
A01 = _mm256_add_epi32( A01, C0 ); \
A00 = _mm256_add_epi32( A00, CF ); \
A0B = _mm256_add_epi32( A0B, CE ); \
A0A = _mm256_add_epi32( A0A, CD ); \
A09 = _mm256_add_epi32( A09, CC ); \
A08 = _mm256_add_epi32( A08, CB ); \
A07 = _mm256_add_epi32( A07, CA ); \
A06 = _mm256_add_epi32( A06, C9 ); \
A05 = _mm256_add_epi32( A05, C8 ); \
A04 = _mm256_add_epi32( A04, C7 ); \
A03 = _mm256_add_epi32( A03, C6 ); \
A02 = _mm256_add_epi32( A02, C5 ); \
A01 = _mm256_add_epi32( A01, C4 ); \
A00 = _mm256_add_epi32( A00, C3 ); \
} while (0)
#define INCR_W8 do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
} while (0)
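// W is Shabal's 64-bit block counter kept as two 32-bit halves; INCR_W8
// advances it once per 64-byte block and XOR_W8 folds it into A[0]/A[1].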
static void
shabal_8way_init( void *cc, unsigned size )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
if ( size == 512 )
{ // copy immediate constants directly to working registers later.
sc->state_loaded = false;
}
else
{ // No users
sc->state_loaded = true;
sc->A[ 0] = m256_const1_64( 0x52F8455252F84552 );
sc->A[ 1] = m256_const1_64( 0xE54B7999E54B7999 );
sc->A[ 2] = m256_const1_64( 0x2D8EE3EC2D8EE3EC );
sc->A[ 3] = m256_const1_64( 0xB9645191B9645191 );
sc->A[ 4] = m256_const1_64( 0xE0078B86E0078B86 );
sc->A[ 5] = m256_const1_64( 0xBB7C44C9BB7C44C9 );
sc->A[ 6] = m256_const1_64( 0xD2B5C1CAD2B5C1CA );
sc->A[ 7] = m256_const1_64( 0xB0D2EB8CB0D2EB8C );
sc->A[ 8] = m256_const1_64( 0x14CE5A4514CE5A45 );
sc->A[ 9] = m256_const1_64( 0x22AF50DC22AF50DC );
sc->A[10] = m256_const1_64( 0xEFFDBC6BEFFDBC6B );
sc->A[11] = m256_const1_64( 0xEB21B74AEB21B74A );
sc->B[ 0] = m256_const1_64( 0xB555C6EEB555C6EE );
sc->B[ 1] = m256_const1_64( 0x3E7105963E710596 );
sc->B[ 2] = m256_const1_64( 0xA72A652FA72A652F );
sc->B[ 3] = m256_const1_64( 0x9301515F9301515F );
sc->B[ 4] = m256_const1_64( 0xDA28C1FADA28C1FA );
sc->B[ 5] = m256_const1_64( 0x696FD868696FD868 );
sc->B[ 6] = m256_const1_64( 0x9CB6BF729CB6BF72 );
sc->B[ 7] = m256_const1_64( 0x0AFE40020AFE4002 );
sc->B[ 8] = m256_const1_64( 0xA6E03615A6E03615 );
sc->B[ 9] = m256_const1_64( 0x5138C1D45138C1D4 );
sc->B[10] = m256_const1_64( 0xBE216306BE216306 );
sc->B[11] = m256_const1_64( 0xB38B8890B38B8890 );
sc->B[12] = m256_const1_64( 0x3EA8B96B3EA8B96B );
sc->B[13] = m256_const1_64( 0x3299ACE43299ACE4 );
sc->B[14] = m256_const1_64( 0x30924DD430924DD4 );
sc->B[15] = m256_const1_64( 0x55CB34A555CB34A5 );
sc->C[ 0] = m256_const1_64( 0xB405F031B405F031 );
sc->C[ 1] = m256_const1_64( 0xC4233EBAC4233EBA );
sc->C[ 2] = m256_const1_64( 0xB3733979B3733979 );
sc->C[ 3] = m256_const1_64( 0xC0DD9D55C0DD9D55 );
sc->C[ 4] = m256_const1_64( 0xC51C28AEC51C28AE );
sc->C[ 5] = m256_const1_64( 0xA327B8E1A327B8E1 );
sc->C[ 6] = m256_const1_64( 0x56C5616756C56167 );
sc->C[ 7] = m256_const1_64( 0xED614433ED614433 );
sc->C[ 8] = m256_const1_64( 0x88B59D6088B59D60 );
sc->C[ 9] = m256_const1_64( 0x60E2CEBA60E2CEBA );
sc->C[10] = m256_const1_64( 0x758B4B8B758B4B8B );
sc->C[11] = m256_const1_64( 0x83E82A7F83E82A7F );
sc->C[12] = m256_const1_64( 0xBC968828BC968828 );
sc->C[13] = m256_const1_64( 0xE6E00BF7E6E00BF7 );
sc->C[14] = m256_const1_64( 0xBA839E55BA839E55 );
sc->C[15] = m256_const1_64( 0x9B491C609B491C60 );
}
sc->Wlow = 1;
sc->Whigh = 0;
sc->ptr = 0;
}
static void
shabal_8way_core( void *cc, const unsigned char *data, size_t len )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
__m256i *buf;
__m256i *vdata = (__m256i*)data;
const int buf_size = 64;
size_t ptr;
DECL_STATE8
buf = sc->buf;
ptr = sc->ptr;
if ( len < (buf_size - ptr ) )
{
memcpy_256( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE8( sc );
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += clen>>2;
len -= clen;
if ( ptr == buf_size )
{
DECODE_BLOCK8;
INPUT_BLOCK_ADD8;
XOR_W8;
APPLY_P8;
INPUT_BLOCK_SUB8;
SWAP_BC8;
INCR_W8;
ptr = 0;
}
}
WRITE_STATE8(sc);
sc->ptr = ptr;
}
static void
shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
__m256i *buf;
const int buf_size = 64;
size_t ptr;
int i;
unsigned z, zz;
DECL_STATE8
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
buf[ptr>>2] = _mm256_set1_epi32( zz );
memset_zero_256( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
READ_STATE8(sc);
DECODE_BLOCK8;
INPUT_BLOCK_ADD8;
XOR_W8;
APPLY_P8;
for ( i = 0; i < 3; i ++ )
{
SWAP_BC8;
XOR_W8;
APPLY_P8;
}
__m256i *d = (__m256i*)dst;
if ( size_words == 16 ) // 512
{
d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;
d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7;
d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB;
d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF;
}
else // 256
{
d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB;
d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF;
}
}
void
shabal256_8way_init( void *cc )
{
shabal_8way_init(cc, 256);
}
void
shabal256_8way_update( void *cc, const void *data, size_t len )
{
shabal_8way_core( cc, data, len );
}
void
shabal256_8way_close( void *cc, void *dst )
{
shabal_8way_close(cc, 0, 0, dst, 8);
}
void
shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst )
{
shabal_8way_close(cc, ub, n, dst, 8);
}
void
shabal512_8way_init(void *cc)
{
shabal_8way_init(cc, 512);
}
void
shabal512_8way_update(void *cc, const void *data, size_t len)
{
shabal_8way_core(cc, data, len);
}
void
shabal512_8way_close(void *cc, void *dst)
{
shabal_8way_close(cc, 0, 0, dst, 16);
}
void
shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_8way_close(cc, ub, n, dst, 16);
}
#endif // AVX2
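// Illustrative usage sketch (not part of the library): Shabal-512 over eight
// lanes. Input and output are in 8x32-bit interleaved form.
//
//   shabal512_8way_context c;
//   shabal512_8way_init( &c );
//   shabal512_8way_update( &c, vdata, 64 );
//   shabal512_8way_close( &c, vhash );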
/*
* We copy the state into local variables, so that the compiler knows
* that it can optimize them at will.
@@ -290,6 +883,8 @@ do { \
A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
} while (0)
/*
#define SWAP(v1, v2) do { \
sph_u32 tmp = (v1); \
@@ -297,26 +892,39 @@ do { \
(v2) = tmp; \
} while (0)
*/
#define SWAP_BC \
do { \
mm128_swap128_256( B0, C0 ); \
mm128_swap128_256( B1, C1 ); \
mm128_swap128_256( B2, C2 ); \
mm128_swap128_256( B3, C3 ); \
mm128_swap128_256( B4, C4 ); \
mm128_swap128_256( B5, C5 ); \
mm128_swap128_256( B6, C6 ); \
mm128_swap128_256( B7, C7 ); \
mm128_swap128_256( B8, C8 ); \
mm128_swap128_256( B9, C9 ); \
mm128_swap128_256( BA, CA ); \
mm128_swap128_256( BB, CB ); \
mm128_swap128_256( BC, CC ); \
mm128_swap128_256( BD, CD ); \
mm128_swap128_256( BE, CE ); \
mm128_swap128_256( BF, CF ); \
mm128_swap256_128( B0, C0 ); \
mm128_swap256_128( B1, C1 ); \
mm128_swap256_128( B2, C2 ); \
mm128_swap256_128( B3, C3 ); \
mm128_swap256_128( B4, C4 ); \
mm128_swap256_128( B5, C5 ); \
mm128_swap256_128( B6, C6 ); \
mm128_swap256_128( B7, C7 ); \
mm128_swap256_128( B8, C8 ); \
mm128_swap256_128( B9, C9 ); \
mm128_swap256_128( BA, CA ); \
mm128_swap256_128( BB, CB ); \
mm128_swap256_128( BC, CC ); \
mm128_swap256_128( BD, CD ); \
mm128_swap256_128( BE, CE ); \
mm128_swap256_128( BF, CF ); \
} while (0)
/*
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
__m128i t1 = _mm_mullo_epi32( mm_rol_32( xa1, 15 ),\
_mm_set1_epi32(5UL) ) \
__m128i t2 = _mm_xor_si128( xa0, xc ); \
xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \
xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \
_mm_xor_si128( t2, \
_mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \
*/
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
@@ -706,7 +1314,7 @@ shabal256_4way_init( void *cc )
}
void
shabal256_4way( void *cc, const void *data, size_t len )
shabal256_4way_update( void *cc, const void *data, size_t len )
{
shabal_4way_core( cc, data, len );
}
@@ -731,7 +1339,7 @@ shabal512_4way_init(void *cc)
}
void
shabal512_4way(void *cc, const void *data, size_t len)
shabal512_4way_update(void *cc, const void *data, size_t len)
{
shabal_4way_core(cc, data, len);
}


@@ -36,7 +36,7 @@
#ifndef SHABAL_HASH_4WAY_H__
#define SHABAL_HASH_4WAY_H__ 1
#ifdef __AVX2__
#ifdef __SSE4_1__
#include <stddef.h>
#include "algo/sha/sph_types.h"
@@ -50,6 +50,34 @@ extern "C"{
#define SPH_SIZE_shabal512 512
#if defined(__AVX2__)
typedef struct {
__m256i buf[16];
__m256i A[12], B[16], C[16];
sph_u32 Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_8way_context __attribute__ ((aligned (64)));
typedef shabal_8way_context shabal256_8way_context;
typedef shabal_8way_context shabal512_8way_context;
void shabal256_8way_init( void *cc );
void shabal256_8way_update( void *cc, const void *data, size_t len );
void shabal256_8way_close( void *cc, void *dst );
void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_8way_init( void *cc );
void shabal512_8way_update( void *cc, const void *data, size_t len );
void shabal512_8way_close( void *cc, void *dst );
void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#endif
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i A[12], B[16], C[16];
@@ -62,13 +90,14 @@ typedef shabal_4way_context shabal256_4way_context;
typedef shabal_4way_context shabal512_4way_context;
void shabal256_4way_init( void *cc );
void shabal256_4way( void *cc, const void *data, size_t len );
void shabal256_4way_update( void *cc, const void *data, size_t len );
void shabal256_4way_close( void *cc, void *dst );
void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_4way_init( void *cc );
void shabal512_4way( void *cc, const void *data, size_t len );
void shabal512_4way_update( void *cc, const void *data, size_t len );
#define shabal512_4way shabal512_4way_update
void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );


@@ -3,6 +3,12 @@
#include <stdio.h>
// This implementation is deprecated, superseded by VAES in Ice Lake
// which provides hardware-based 4-way AES.
// It was created for AVX2 to eliminate interleaving between the
// preceding and following functions.
// This code can be removed once current users have reverted to one-way.
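// For reference, a VAES-capable CPU performs the same two-lane AES round with
// a single instruction; a minimal sketch, assuming VAES is detected at build
// time (the helper name below matches the existing 2x128 emulation):
//
//   #if defined(__VAES__)
//   #define mm256_aesenc_2x128( x, k )   _mm256_aesenc_epi128( x, k )
//   #endif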
#if defined(__AVX2__)
@@ -16,8 +22,8 @@ static const uint32_t IV512[] =
#define mm256_ror2x256hi_1x32( a, b ) \
_mm256_blend_epi32( mm256_ror1x32_128( a ), \
mm256_ror1x32_128( b ), 0x88 )
_mm256_blend_epi32( mm256_ror128_32( a ), \
mm256_ror128_32( b ), 0x88 )
static void
c512_2way( shavite512_2way_context *ctx, const void *msg )
@@ -61,7 +67,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
{
// round 1, 5, 9
k00 = _mm256_xor_si256( k13, mm256_ror1x32_128(
k00 = _mm256_xor_si256( k13, mm256_ror128_32(
mm256_aesenc_2x128( k00, zero ) ) );
if ( r == 0 )
@@ -71,7 +77,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
k01 = _mm256_xor_si256( k00,
mm256_ror1x32_128( mm256_aesenc_2x128( k01, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) );
if ( r == 1 )
k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
@@ -80,25 +86,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k01,
mm256_ror1x32_128( mm256_aesenc_2x128( k02, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( k02,
mm256_ror1x32_128( mm256_aesenc_2x128( k03, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( k03,
mm256_ror1x32_128( mm256_aesenc_2x128( k10, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
k11 = _mm256_xor_si256( k10,
mm256_ror1x32_128( mm256_aesenc_2x128( k11, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( k11,
mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( k12,
mm256_ror1x32_128( mm256_aesenc_2x128( k13, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) );
if ( r == 2 )
k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
@@ -134,31 +140,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round 3, 7, 11
k00 = _mm256_xor_si256( mm256_ror1x32_128(
k00 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k00, zero ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
k01 = _mm256_xor_si256( mm256_ror1x32_128(
k01 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k01, zero ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( mm256_ror1x32_128(
k02 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k02, zero ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( mm256_ror1x32_128(
k03 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k03, zero ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p1 = _mm256_xor_si256( p1, x );
k10 = _mm256_xor_si256( mm256_ror1x32_128(
k10 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k10, zero ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero );
k11 = _mm256_xor_si256( mm256_ror1x32_128(
k11 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k11, zero ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( mm256_ror1x32_128(
k12 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k12, zero ) ), k11 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( mm256_ror1x32_128(
k13 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k13, zero ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
@@ -192,35 +198,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round 13
k00 = _mm256_xor_si256( mm256_ror1x32_128(
k00 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k00, zero ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
k01 = _mm256_xor_si256( mm256_ror1x32_128(
k01 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k01, zero ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( mm256_ror1x32_128(
k02 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k02, zero ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( mm256_ror1x32_128(
k03 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k03, zero ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( mm256_ror1x32_128(
k10 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k10, zero ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
k11 = _mm256_xor_si256( mm256_ror1x32_128(
k11 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k11, zero ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) );
k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) );
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( mm256_ror1x32_128(
k13 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k13, zero ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );


@@ -51,6 +51,8 @@ void init_c11_8way_ctx()
void c11_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -107,21 +109,18 @@ void c11_8way_hash( void *state, const void *input )
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
// Serial
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
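// Re-interleave the 8x64-bit lanes directly into two 4x128-bit groups
// (lanes 0-3 in vhash0, lanes 4-7 in vhash1) so Luffa and Cube can run 4-way
// on each half without a full deinterleave/reinterleave round trip.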
// 7 Luffa + 8 cube
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );


@@ -51,6 +51,8 @@ void init_x11_8way_ctx()
void x11_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -108,20 +110,18 @@ void x11_8way_hash( void *state, const void *input )
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
// Luffa + Cube
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );


@@ -1,7 +1,4 @@
#include "x12-gate.h"
#if defined(X12_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -14,11 +11,223 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
//#include "algo/fugue/sph_fugue.h"
#if defined(X12_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
} x12_8way_ctx_holder;
x12_8way_ctx_holder x12_8way_ctx __attribute__ ((aligned (64)));
void init_x12_8way_ctx()
{
blake512_8way_init( &x12_8way_ctx.blake );
bmw512_8way_init( &x12_8way_ctx.bmw );
init_groestl( &x12_8way_ctx.groestl, 64 );
skein512_8way_init( &x12_8way_ctx.skein );
jh512_8way_init( &x12_8way_ctx.jh );
keccak512_8way_init( &x12_8way_ctx.keccak );
luffa_4way_init( &x12_8way_ctx.luffa, 512 );
cube_4way_init( &x12_8way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x12_8way_ctx.shavite );
simd_4way_init( &x12_8way_ctx.simd, 512 );
init_echo( &x12_8way_ctx.echo, 512 );
hamsi512_8way_init( &x12_8way_ctx.hamsi );
};
void x12_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
x12_8way_ctx_holder ctx;
memcpy( &ctx, &x12_8way_ctx, sizeof(x12_8way_ctx) );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, state );
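// Output is left in 8x64-bit interleaved form; scanhash extracts individual
// lanes with extr_lane_8x64.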
}
int scanhash_x12_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]);
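// In the 8x64-bit interleaved layout, 32-bit hash word 7 of lane 0 is the
// high half of 64-bit word 3 and sits at index 49; lane i's copy is then
// hash7[ i<<1 ].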
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
__m512i *noncev = (__m512i*)vdata + 9; // aligned
int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
x12_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] < Htarg )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X12_4WAY)
typedef struct {
blake512_4way_context blake;
@@ -63,45 +272,13 @@ void x12_4way_hash( void *state, const void *input )
x12_4way_ctx_holder ctx;
memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way 64 bit
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
intrlv_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
@@ -110,7 +287,6 @@ void x12_4way_hash( void *state, const void *input )
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
@@ -119,7 +295,6 @@ void x12_4way_hash( void *state, const void *input )
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
@@ -135,7 +310,6 @@ void x12_4way_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
intrlv_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
@@ -144,21 +318,25 @@ void x12_4way_hash( void *state, const void *input )
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 12 Hamsi parallel 4way 32 bit
// Parallel 4way 64 bit
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );


@@ -2,7 +2,11 @@
bool register_x12_algo( algo_gate_t* gate )
{
#if defined (X12_4WAY)
#if defined (X12_8WAY)
init_x12_8way_ctx();
gate->scanhash = (void*)&scanhash_x12_8way;
gate->hash = (void*)&x12_8way_hash;
#elif defined (X12_4WAY)
init_x12_4way_ctx();
gate->scanhash = (void*)&scanhash_x12_4way;
gate->hash = (void*)&x12_4way_hash;
@@ -11,7 +15,7 @@ bool register_x12_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x12;
gate->hash = (void*)&x12hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};


@@ -4,29 +4,36 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define X12_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X12_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X12_4WAY 1
#endif
bool register_x12_algo( algo_gate_t* gate );
#if defined(X12_4WAY)
#if defined(X12_8WAY)
void x12_8way_hash( void *state, const void *input );
int scanhash_x12_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x12_8way_ctx();
#elif defined(X12_4WAY)
void x12_4way_hash( void *state, const void *input );
int scanhash_x12_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x12_4way_ctx();
#endif
#else
void x12hash( void *state, const void *input );
int scanhash_x12( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x12_ctx();
#endif
#endif


@@ -20,17 +20,17 @@
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c"
#include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#endif
typedef struct {
sph_blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
@@ -49,6 +49,11 @@ x12_ctx_holder x12_ctx;
void init_x12_ctx()
{
sph_blake512_init( &x12_ctx.blake );
sph_bmw512_init( &x12_ctx.bmw );
sph_skein512_init( &x12_ctx.skein);
sph_jh512_init( &x12_ctx.jh);
sph_keccak512_init( &x12_ctx.keccak);
#if defined(__AES__)
init_echo( &x12_ctx.echo, 512 );
init_groestl (&x12_ctx.groestl, 64 );
@@ -65,89 +70,31 @@ void init_x12_ctx()
void x12hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
#define hashB hash+64
x12_ctx_holder ctx;
memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) );
// X11 algos
sph_blake512(&ctx.blake, input, 80);
sph_blake512_close(&ctx.blake, hash);
unsigned char hashbuf[128];
size_t hashptr;
sph_u64 hashctA;
sph_u64 hashctB;
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
//---blake1---
DECL_BLK;
BLK_I;
BLK_W;
BLK_C;
//---bmw2---
DECL_BMW;
BMW_I;
BMW_U;
#define M(x) sph_dec64le_aligned(data + 8 * (x))
#define H(x) (h[x])
#define dH(x) (dh[x])
BMW_C;
#undef M
#undef H
#undef dH
//---groetl----
#if defined(__AES__)
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512 (&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
//---skein4---
DECL_SKN;
SKN_I;
SKN_U;
SKN_C;
//---jh5------
DECL_JH;
JH_H;
//---keccak6---
DECL_KEC;
KEC_I;
KEC_U;
KEC_C;
//--- luffa7
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
(const BitSequence*)hash, 64 );
// 8 Cube
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hashB, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hashB);
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hashB, 512 );
//11---echo---
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hashB,
(const BitSequence *)hash, 512 );
@@ -156,11 +103,26 @@ void x12hash(void *output, const void *input)
sph_echo512_close(&ctx.echo, hashB);
#endif
// 12 Hamsi
#if defined(__AES__)
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512 (&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
sph_hamsi512(&ctx.hamsi, hashB, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
asm volatile ("emms");
memcpy(output, hashB, 32);
}


@@ -1,7 +1,4 @@
#include "x13-gate.h"
#if defined(X13_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -14,12 +11,270 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
#if defined(X13_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
} x13_8way_ctx_holder;
x13_8way_ctx_holder x13_8way_ctx;
void init_x13_8way_ctx()
{
blake512_8way_init( &x13_8way_ctx.blake );
bmw512_8way_init( &x13_8way_ctx.bmw );
init_groestl( &x13_8way_ctx.groestl, 64 );
skein512_8way_init( &x13_8way_ctx.skein );
jh512_8way_init( &x13_8way_ctx.jh );
keccak512_8way_init( &x13_8way_ctx.keccak );
luffa_4way_init( &x13_8way_ctx.luffa, 512 );
cube_4way_init( &x13_8way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13_8way_ctx.shavite );
simd_4way_init( &x13_8way_ctx.simd, 512 );
init_echo( &x13_8way_ctx.echo, 512 );
hamsi512_8way_init( &x13_8way_ctx.hamsi );
sph_fugue512_init( &x13_8way_ctx.fugue );
}
void x13_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
x13_8way_ctx_holder ctx;
memcpy( &ctx, &x13_8way_ctx, sizeof(x13_8way_ctx) );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, 512 );
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, 512 );
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, 512 );
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 13 Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash4, 64 );
sph_fugue512_close( &ctx.fugue, hash4 );
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash5, 64 );
sph_fugue512_close( &ctx.fugue, hash5 );
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash6, 64 );
sph_fugue512_close( &ctx.fugue, hash6 );
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash7, 64 );
sph_fugue512_close( &ctx.fugue, hash7 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
}
int scanhash_x13_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const uint32_t last_nonce = max_nonce -8;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x13_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( ( hash+(i<<3) )[7] < Htarg
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X13_4WAY)
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;


@@ -2,7 +2,11 @@
bool register_x13_algo( algo_gate_t* gate )
{
#if defined (X13_4WAY)
#if defined (X13_8WAY)
init_x13_8way_ctx();
gate->scanhash = (void*)&scanhash_x13_8way;
gate->hash = (void*)&x13_8way_hash;
#elif defined (X13_4WAY)
init_x13_4way_ctx();
gate->scanhash = (void*)&scanhash_x13_4way;
gate->hash = (void*)&x13_4way_hash;
@@ -11,7 +15,7 @@ bool register_x13_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x13;
gate->hash = (void*)&x13hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -4,29 +4,35 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define X13_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X13_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X13_4WAY 1
#endif
bool register_x13_algo( algo_gate_t* gate );
#if defined(X13_4WAY)
#if defined(X13_8WAY)
void x13_8way_hash( void *state, const void *input );
int scanhash_x13_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x13_8way_ctx();
#elif defined(X13_4WAY)
void x13_4way_hash( void *state, const void *input );
int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x13_4way_ctx();
#endif
#else
void x13hash( void *state, const void *input );
int scanhash_x13( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x13_ctx();
#endif
#endif

View File

@@ -1,7 +1,4 @@
#include "x14-gate.h"
#if defined(X14_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -13,6 +10,7 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
@@ -22,6 +20,263 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h"
#if defined(X14_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
} x14_8way_ctx_holder;
x14_8way_ctx_holder x14_8way_ctx __attribute__ ((aligned (64)));
void init_x14_8way_ctx()
{
blake512_8way_init( &x14_8way_ctx.blake );
bmw512_8way_init( &x14_8way_ctx.bmw );
init_groestl( &x14_8way_ctx.groestl, 64 );
skein512_8way_init( &x14_8way_ctx.skein );
jh512_8way_init( &x14_8way_ctx.jh );
keccak512_8way_init( &x14_8way_ctx.keccak );
luffa_4way_init( &x14_8way_ctx.luffa, 512 );
cube_4way_init( &x14_8way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x14_8way_ctx.shavite );
simd_4way_init( &x14_8way_ctx.simd, 512 );
init_echo( &x14_8way_ctx.echo, 512 );
hamsi512_8way_init( &x14_8way_ctx.hamsi );
sph_fugue512_init( &x14_8way_ctx.fugue );
shabal512_8way_init( &x14_8way_ctx.shabal );
};
void x14_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
x14_8way_ctx_holder ctx;
memcpy( &ctx, &x14_8way_ctx, sizeof(x14_8way_ctx) );
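// 1 Blake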
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
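// 2 Bmw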
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
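// 3 Groestl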
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
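// 4 Skein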
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
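// 5 JH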
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
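// 6 Keccak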
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
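// 7 Luffa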
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
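// 8 Cubehash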
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
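// 9 Shavite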
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
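// 10 Simd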
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
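// 11 Echo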
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, 512 );
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, 512 );
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, 512 );
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
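// 12 Hamsi parallel 8 way 64 bit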
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 13 Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash4, 64 );
sph_fugue512_close( &ctx.fugue, hash4 );
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash5, 64 );
sph_fugue512_close( &ctx.fugue, hash5 );
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash6, 64 );
sph_fugue512_close( &ctx.fugue, hash6 );
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash7, 64 );
sph_fugue512_close( &ctx.fugue, hash7 );
// 14 Shabal, parallel 32 bit
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, state );
}
int scanhash_x14_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x14_8way_hash( hash, vdata );
pdata[19] = n;
uint32_t *hash7 = &(hash[7<<3]);
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane ] < Htarg )
{
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X14_4WAY)
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
@@ -61,11 +316,11 @@ void init_x14_4way_ctx()
void x14_4way_hash( void *state, const void *input )
{
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x14_4way_ctx_holder ctx;
memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) );
@@ -184,33 +439,25 @@ void x14_4way_hash( void *state, const void *input )
// 14 Shabal, parallel 32 bit
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, state );
}
int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t hash[4*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
@@ -220,11 +467,9 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
pdata[19] = n;
uint32_t *hash7 = &(hash[7<<2]);
for ( int lane = 0; lane < 4; lane++ )
if ( ( hash7[ lane ] & mask ) == 0 )
if ( hash7[ lane ] < Htarg )
{
// deinterleave hash for lane
uint32_t lane_hash[8];
extr_lane_4x32( lane_hash, hash, lane, 256 );
@@ -235,10 +480,8 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
}
}
n += 4;
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}

View File

@@ -2,7 +2,11 @@
bool register_x14_algo( algo_gate_t* gate )
{
#if defined (X14_4WAY)
#if defined (X14_8WAY)
init_x14_8way_ctx();
gate->scanhash = (void*)&scanhash_x14_8way;
gate->hash = (void*)&x14_8way_hash;
#elif defined (X14_4WAY)
init_x14_4way_ctx();
gate->scanhash = (void*)&scanhash_x14_4way;
gate->hash = (void*)&x14_4way_hash;
@@ -11,7 +15,7 @@ bool register_x14_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x14;
gate->hash = (void*)&x14hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -4,20 +4,29 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define X14_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X14_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X14_4WAY 1
#endif
bool register_x14_algo( algo_gate_t* gate );
#if defined(X14_4WAY)
#if defined(X14_8WAY)
void x14_8way_hash( void *state, const void *input );
int scanhash_x14_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x14_8way_ctx();
#elif defined(X14_4WAY)
void x14_4way_hash( void *state, const void *input );
int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x14_4way_ctx();
#endif
#else
void x14hash( void *state, const void *input );
int scanhash_x14( struct work *work, uint32_t max_nonce,
@@ -26,3 +35,4 @@ void init_x14_ctx();
#endif
#endif

View File

@@ -1,7 +1,4 @@
#include "x15-gate.h"
#if defined(X15_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -14,6 +11,7 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -23,6 +21,309 @@
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#if defined(X15_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
} x15_8way_ctx_holder;
x15_8way_ctx_holder x15_8way_ctx __attribute__ ((aligned (64)));
void init_x15_8way_ctx()
{
blake512_8way_init( &x15_8way_ctx.blake );
bmw512_8way_init( &x15_8way_ctx.bmw );
init_groestl( &x15_8way_ctx.groestl, 64 );
skein512_8way_init( &x15_8way_ctx.skein );
jh512_8way_init( &x15_8way_ctx.jh );
keccak512_8way_init( &x15_8way_ctx.keccak );
luffa_4way_init( &x15_8way_ctx.luffa, 512 );
cube_4way_init( &x15_8way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x15_8way_ctx.shavite );
simd_4way_init( &x15_8way_ctx.simd, 512 );
init_echo( &x15_8way_ctx.echo, 512 );
hamsi512_8way_init( &x15_8way_ctx.hamsi );
sph_fugue512_init( &x15_8way_ctx.fugue );
shabal512_8way_init( &x15_8way_ctx.shabal );
sph_whirlpool_init( &x15_8way_ctx.whirlpool );
};
void x15_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
x15_8way_ctx_holder ctx;
memcpy( &ctx, &x15_8way_ctx, sizeof(x15_8way_ctx) );
// 1 Blake
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
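// 4 Skein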
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
// 5 JH
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
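// 7 Luffa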
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
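// 8 Cubehash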
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
// 10 Simd
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, 512 );
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, 512 );
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, 512 );
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
// 12 Hamsi parallel 8 way 64 bit
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash4, 64 );
sph_fugue512_close( &ctx.fugue, hash4 );
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash5, 64 );
sph_fugue512_close( &ctx.fugue, hash5 );
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash6, 64 );
sph_fugue512_close( &ctx.fugue, hash6 );
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash7, 64 );
sph_fugue512_close( &ctx.fugue, hash7 );
// 14 Shabal, parallel 32 bit
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 15 Whirlpool
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash4, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash5, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash6, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash7, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
}
int scanhash_x15_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x15_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( ( hash+(i<<3) )[7] < Htarg )
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X15_4WAY)
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
@@ -64,11 +365,11 @@ void init_x15_4way_ctx()
void x15_4way_hash( void *state, const void *input )
{
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x15_4way_ctx_holder ctx;
memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) );
@@ -187,7 +488,7 @@ void x15_4way_hash( void *state, const void *input )
// 14 Shabal, parallel 32 bit
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -216,27 +517,18 @@ void x15_4way_hash( void *state, const void *input )
int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[4*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
@@ -246,18 +538,16 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( ( (hash+(i<<3))[7] & mask ) == 0 )
if ( ( hash+(i<<3) )[7] < Htarg )
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
break;
}
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
*hashes_done = n - first_nonce;
return 0;
}

View File

@@ -2,7 +2,11 @@
bool register_x15_algo( algo_gate_t* gate )
{
#if defined (X15_4WAY)
#if defined (X15_8WAY)
init_x15_8way_ctx();
gate->scanhash = (void*)&scanhash_x15_8way;
gate->hash = (void*)&x15_8way_hash;
#elif defined (X15_4WAY)
init_x15_4way_ctx();
gate->scanhash = (void*)&scanhash_x15_4way;
gate->hash = (void*)&x15_4way_hash;
@@ -11,7 +15,7 @@ bool register_x15_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x15;
gate->hash = (void*)&x15hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -4,20 +4,30 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define X15_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X15_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X15_4WAY 1
#endif
bool register_x15_algo( algo_gate_t* gate );
#if defined(X15_4WAY)
#if defined(X15_8WAY)
void x15_8way_hash( void *state, const void *input );
int scanhash_x15_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x15_8way_ctx();
#elif defined(X15_4WAY)
void x15_4way_hash( void *state, const void *input );
int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x15_4way_ctx();
#endif
#else
void x15hash( void *state, const void *input );
int scanhash_x15( struct work *work, uint32_t max_nonce,
@@ -26,3 +36,5 @@ void init_x15_ctx();
#endif
#endif

View File

@@ -5,9 +5,6 @@
* Optimized by JayDDee@github Jan 2018
*/
#include "x16r-gate.h"
#if defined (X16R_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -20,6 +17,7 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -32,6 +30,392 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
#if defined (X16R_8WAY)
union _x16r_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
} __attribute__ ((aligned (64)));
typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
void x16r_8way_hash( void* output, const void* input )
{
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t hash4[24] __attribute__ ((aligned (64)));
uint32_t hash5[24] __attribute__ ((aligned (64)));
uint32_t hash6[24] __attribute__ ((aligned (64)));
uint32_t hash7[24] __attribute__ ((aligned (64)));
x16r_8way_context_overlay ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
void *in4 = (void*) hash4;
void *in5 = (void*) hash5;
void *in6 = (void*) hash6;
void *in7 = (void*) hash7;
int size = 80;
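// Only the first function in the chain hashes the 80 byte block header;
// every subsequent function hashes the 64 byte digest of the previous stage.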
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
input, 640 );
for ( int i = 0; i < 16; i++ )
{
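// Each hex digit of hashOrder selects one of the 16 hash functions.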
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
blake512_8way_init( &ctx.blake );
if ( i == 0 )
blake512_8way_update( &ctx.blake, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
blake512_8way_update( &ctx.blake, vhash, size );
}
blake512_8way_close( &ctx.blake, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case BMW:
bmw512_8way_init( &ctx.bmw );
if ( i == 0 )
bmw512_8way_update( &ctx.bmw, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
bmw512_8way_update( &ctx.bmw, vhash, size );
}
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(const char*)in0, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(const char*)in1, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(const char*)in2, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(const char*)in3, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4,
(const char*)in4, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5,
(const char*)in5, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6,
(const char*)in6, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7,
(const char*)in7, size<<3 );
break;
case SKEIN:
skein512_8way_init( &ctx.skein );
if ( i == 0 )
skein512_8way_update( &ctx.skein, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
skein512_8way_update( &ctx.skein, vhash, size );
}
skein512_8way_close( &ctx.skein, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case JH:
jh512_8way_init( &ctx.jh );
if ( i == 0 )
jh512_8way_update( &ctx.jh, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
jh512_8way_update( &ctx.jh, vhash, size );
}
jh512_8way_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case KECCAK:
keccak512_8way_init( &ctx.keccak );
if ( i == 0 )
keccak512_8way_update( &ctx.keccak, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
keccak512_8way_update( &ctx.keccak, vhash, size );
}
keccak512_8way_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case LUFFA:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case CUBEHASH:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in0, size );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in1, size );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in2, size );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in3, size );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in4, size );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in5, size );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in6, size );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in7, size );
sph_shavite512_close( &ctx.shavite, hash7 );
break;
case SIMD:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence*)in0, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash1,
(const BitSequence*)in1, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash2,
(const BitSequence*)in2, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash3,
(const BitSequence*)in3, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash4,
(const BitSequence*)in4, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash5,
(const BitSequence*)in5, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash6,
(const BitSequence*)in6, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash7,
(const BitSequence*)in7, size<<3 );
break;
case HAMSI:
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, size );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in0, size );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in1, size );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in2, size );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in3, size );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in4, size );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in5, size );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in6, size );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in7, size );
sph_fugue512_close( &ctx.fugue, hash7 );
break;
case SHABAL:
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, size );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in0, size );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in1, size );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in2, size );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in3, size );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in4, size );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in5, size );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in6, size );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in7, size );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
break;
case SHA_512:
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, size );
sha512_8way_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
}
size = 64;
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
memcpy( output+128, hash4, 32 );
memcpy( output+160, hash5, 32 );
memcpy( output+192, hash6, 32 );
memcpy( output+224, hash7, 32 );
}
int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
ptarget[7] = 0x0cff;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
const uint32_t ntime = bswap_32( pdata[17] );
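// The hash order depends on the previous block hash, so it only needs to be
// recalculated when the work changes, detected here by a change in ntime.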
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x16r_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (X16R_4WAY)
union _x16r_4way_context_overlay
{
blake512_4way_context blake;
@@ -50,16 +434,16 @@ union _x16r_4way_context_overlay
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
};
} __attribute__ ((aligned (64)));
typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
void x16r_4way_hash( void* output, const void* input )
{
uint32_t vhash[24*4] __attribute__ ((aligned (128)));
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t vhash[24*4] __attribute__ ((aligned (64)));
x16r_4way_context_overlay ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
@@ -86,7 +470,7 @@ void x16r_4way_hash( void* output, const void* input )
blake512_4way( &ctx.blake, vhash, size );
}
blake512_4way_close( &ctx.blake, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case BMW:
bmw512_4way_init( &ctx.bmw );
@@ -98,7 +482,7 @@ void x16r_4way_hash( void* output, const void* input )
bmw512_4way( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
@@ -124,7 +508,7 @@ void x16r_4way_hash( void* output, const void* input )
skein512_4way( &ctx.skein, vhash, size );
}
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case JH:
jh512_4way_init( &ctx.jh );
@@ -136,7 +520,7 @@ void x16r_4way_hash( void* output, const void* input )
jh512_4way( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case KECCAK:
keccak512_4way_init( &ctx.keccak );
@@ -148,17 +532,17 @@ void x16r_4way_hash( void* output, const void* input )
keccak512_4way( &ctx.keccak, vhash, size );
}
keccak512_4way_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case LUFFA:
intrlv_2x128( vhash, in0, in1, size<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
dintrlv_2x128( hash0, hash1, vhash, 512 );
dintrlv_2x128_512( hash0, hash1, vhash );
intrlv_2x128( vhash, in2, in3, size<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
dintrlv_2x128( hash2, hash3, vhash, 512 );
dintrlv_2x128_512( hash2, hash3, vhash );
break;
case CUBEHASH:
cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -192,11 +576,11 @@ void x16r_4way_hash( void* output, const void* input )
intrlv_2x128( vhash, in0, in1, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
dintrlv_2x128_512( hash0, hash1, vhash );
intrlv_2x128( vhash, in2, in3, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
dintrlv_2x128_512( hash2, hash3, vhash );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
@@ -217,7 +601,7 @@ void x16r_4way_hash( void* output, const void* input )
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, size );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
@@ -238,7 +622,7 @@ void x16r_4way_hash( void* output, const void* input )
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
@@ -259,7 +643,7 @@ void x16r_4way_hash( void* output, const void* input )
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, size );
sha512_4way_close( &ctx.sha512, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
}
size = 64;
@@ -280,6 +664,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id;
@@ -317,9 +702,9 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( likely( ( n < max_nonce ) && !(*restart) ) );
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
*hashes_done = n - first_nonce + 1;
*hashes_done = n - first_nonce;
return 0;
}

View File

@@ -34,14 +34,17 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )
bool register_x16r_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -49,14 +52,17 @@ bool register_x16r_algo( algo_gate_t* gate )
bool register_x16rv2_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16rv2_8way;
gate->hash = (void*)&x16rv2_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16rv2_4way;
gate->hash = (void*)&x16rv2_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16rv2;
gate->hash = (void*)&x16rv2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -64,14 +70,17 @@ bool register_x16rv2_algo( algo_gate_t* gate )
bool register_x16s_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -196,28 +205,34 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_x16rt_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16rt_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16rt_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16rt_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 256.0;
return true;
};
bool register_x16rt_veil_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16rt_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16rt_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16rt_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->build_extraheader = (void*)&veil_build_extraheader;
opt_target_factor = 256.0;
return true;
@@ -231,7 +246,7 @@ bool register_hex_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_hex;
gate->hash = (void*)&hex_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
opt_target_factor = 128.0;
return true;

View File

@@ -6,8 +6,10 @@
#include <stdint.h>
#include <unistd.h>
#if defined(__AVX2__) && defined(__AES__)
#define X16R_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16R_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16R_4WAY 1
#endif
enum x16r_Algo {
@@ -44,7 +46,20 @@ bool register_x16rt_algo( algo_gate_t* gate );
bool register_hex_algo( algo_gate_t* gate );
bool register_x21s_algo( algo_gate_t* gate );
#if defined(X16R_4WAY)
#if defined(X16R_8WAY)
void x16r_8way_hash( void *state, const void *input );
int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x16rv2_8way_hash( void *state, const void *input );
int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x16rt_8way_hash( void *state, const void *input );
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16R_4WAY)
void x16r_4way_hash( void *state, const void *input );
int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
@@ -58,12 +73,7 @@ void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x21s_4way_hash( void *state, const void *input );
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_4way_thread_init();
#endif
#else
void x16r_hash( void *state, const void *input );
int scanhash_x16r( struct work *work, uint32_t max_nonce,
@@ -77,9 +87,16 @@ void x16rt_hash( void *state, const void *input );
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void hex_hash( void *state, const void *input );
int scanhash_hex( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(X16R_4WAY)
void x21s_4way_hash( void *state, const void *input );
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_4way_thread_init();
#else
void x21s_hash( void *state, const void *input );
int scanhash_x21s( struct work *work, uint32_t max_nonce,
@@ -88,3 +105,9 @@ bool x21s_thread_init();
#endif
void hex_hash( void *state, const void *input );
int scanhash_hex( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -1,7 +1,4 @@
#include "x16r-gate.h"
#if defined (X16R_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -15,6 +12,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -26,6 +24,391 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
#if defined (X16R_8WAY)
union _x16rt_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
} __attribute__ ((aligned (64)));
typedef union _x16rt_8way_context_overlay x16rt_8way_context_overlay;
void x16rt_8way_hash( void* output, const void* input )
{
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t hash4[24] __attribute__ ((aligned (64)));
uint32_t hash5[24] __attribute__ ((aligned (64)));
uint32_t hash6[24] __attribute__ ((aligned (64)));
uint32_t hash7[24] __attribute__ ((aligned (64)));
x16rt_8way_context_overlay ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
void *in4 = (void*) hash4;
void *in5 = (void*) hash5;
void *in6 = (void*) hash6;
void *in7 = (void*) hash7;
int size = 80;
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
input, 640 );
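// Run the 16 functions in the order selected by hashOrder. Round 0 hashes
// the 80 byte header, later rounds hash the previous 64 byte digests.
// 64 bit wide functions use the 8 lane interleaved data, 128 bit lane
// functions are split into two 4 lane groups, the rest run serially.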
for ( int i = 0; i < 16; i++ )
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
blake512_8way_init( &ctx.blake );
if ( i == 0 )
blake512_8way_update( &ctx.blake, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
blake512_8way_update( &ctx.blake, vhash, size );
}
blake512_8way_close( &ctx.blake, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case BMW:
bmw512_8way_init( &ctx.bmw );
if ( i == 0 )
bmw512_8way_update( &ctx.bmw, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
bmw512_8way_update( &ctx.bmw, vhash, size );
}
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(const char*)in0, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(const char*)in1, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(const char*)in2, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(const char*)in3, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4,
(const char*)in4, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5,
(const char*)in5, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6,
(const char*)in6, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7,
(const char*)in7, size<<3 );
break;
case SKEIN:
skein512_8way_init( &ctx.skein );
if ( i == 0 )
skein512_8way_update( &ctx.skein, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
skein512_8way_update( &ctx.skein, vhash, size );
}
skein512_8way_close( &ctx.skein, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case JH:
jh512_8way_init( &ctx.jh );
if ( i == 0 )
jh512_8way_update( &ctx.jh, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
jh512_8way_update( &ctx.jh, vhash, size );
}
jh512_8way_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case KECCAK:
keccak512_8way_init( &ctx.keccak );
if ( i == 0 )
keccak512_8way_update( &ctx.keccak, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
keccak512_8way_update( &ctx.keccak, vhash, size );
}
keccak512_8way_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case LUFFA:
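// Luffa works on 128 bit lanes, so the 8 lanes are processed as two
// 4 way groups. Cubehash and SIMD below are handled the same way.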
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case CUBEHASH:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in0, size );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in1, size );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in2, size );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in3, size );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in4, size );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in5, size );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in6, size );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in7, size );
sph_shavite512_close( &ctx.shavite, hash7 );
break;
case SIMD:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence*)in0, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash1,
(const BitSequence*)in1, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash2,
(const BitSequence*)in2, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash3,
(const BitSequence*)in3, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash4,
(const BitSequence*)in4, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash5,
(const BitSequence*)in5, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash6,
(const BitSequence*)in6, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash7,
(const BitSequence*)in7, size<<3 );
break;
case HAMSI:
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, size );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in0, size );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in1, size );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in2, size );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in3, size );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in4, size );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in5, size );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in6, size );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in7, size );
sph_fugue512_close( &ctx.fugue, hash7 );
break;
case SHABAL:
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, size );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in0, size );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in1, size );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in2, size );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in3, size );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in4, size );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in5, size );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in6, size );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in7, size );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
break;
case SHA_512:
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, size );
sha512_8way_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
}
size = 64;
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
memcpy( output+128, hash4, 32 );
memcpy( output+160, hash5, 32 );
memcpy( output+192, hash6, 32 );
memcpy( output+224, hash7, 32 );
}
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) timeHash[8*8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
ptarget[7] = 0x0cff;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16rt_getTimeHash( ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
hashOrder, ntime, timeHash );
}
do
{
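// Insert the next 8 nonces, byte swapped, into the nonce word of each
// interleaved lane, then hash all 8 candidates in one pass.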
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x16rt_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (X16R_4WAY)
union _x16rt_4way_context_overlay
{
blake512_4way_context blake;

View File

@@ -5,9 +5,6 @@
* Optimized by JayDDee@github Jan 2018
*/
#include "x16r-gate.h"
#if defined (X16R_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -21,6 +18,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -33,6 +31,477 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
#if defined (X16R_8WAY)
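// x16rv2 is x16r with Tiger inserted ahead of Keccak, Luffa and SHA512,
// hence the extra tiger context in the overlay.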
union _x16rv2_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
sph_tiger_context tiger;
} __attribute__ ((aligned (64)));
typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay;
void x16rv2_8way_hash( void* output, const void* input )
{
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t hash4[24] __attribute__ ((aligned (64)));
uint32_t hash5[24] __attribute__ ((aligned (64)));
uint32_t hash6[24] __attribute__ ((aligned (64)));
uint32_t hash7[24] __attribute__ ((aligned (64)));
x16rv2_8way_context_overlay ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
void *in4 = (void*) hash4;
void *in5 = (void*) hash5;
void *in6 = (void*) hash6;
void *in7 = (void*) hash7;
int size = 80;
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
input, 640 );
for ( int i = 0; i < 16; i++ )
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
blake512_8way_init( &ctx.blake );
if ( i == 0 )
blake512_8way_update( &ctx.blake, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
blake512_8way_update( &ctx.blake, vhash, size );
}
blake512_8way_close( &ctx.blake, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case BMW:
bmw512_8way_init( &ctx.bmw );
if ( i == 0 )
bmw512_8way_update( &ctx.bmw, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
bmw512_8way_update( &ctx.bmw, vhash, size );
}
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(const char*)in0, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(const char*)in1, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(const char*)in2, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(const char*)in3, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4,
(const char*)in4, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5,
(const char*)in5, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6,
(const char*)in6, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7,
(const char*)in7, size<<3 );
break;
case SKEIN:
skein512_8way_init( &ctx.skein );
if ( i == 0 )
skein512_8way_update( &ctx.skein, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
skein512_8way_update( &ctx.skein, vhash, size );
}
skein512_8way_close( &ctx.skein, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case JH:
jh512_8way_init( &ctx.jh );
if ( i == 0 )
jh512_8way_update( &ctx.jh, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
jh512_8way_update( &ctx.jh, vhash, size );
}
jh512_8way_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case KECCAK:
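// Tiger pre-hash: the 24 byte Tiger digest is zero padded to 64 bytes
// before being fed to Keccak.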
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in2, size );
sph_tiger_close( &ctx.tiger, hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in3, size );
sph_tiger_close( &ctx.tiger, hash3 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in4, size );
sph_tiger_close( &ctx.tiger, hash4 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in5, size );
sph_tiger_close( &ctx.tiger, hash5 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in6, size );
sph_tiger_close( &ctx.tiger, hash6 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in7, size );
sph_tiger_close( &ctx.tiger, hash7 );
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = hash2[i] = hash3[i] =
hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case LUFFA:
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in2, size );
sph_tiger_close( &ctx.tiger, hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in3, size );
sph_tiger_close( &ctx.tiger, hash3 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in4, size );
sph_tiger_close( &ctx.tiger, hash4 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in5, size );
sph_tiger_close( &ctx.tiger, hash5 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in6, size );
sph_tiger_close( &ctx.tiger, hash6 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in7, size );
sph_tiger_close( &ctx.tiger, hash7 );
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = hash2[i] = hash3[i] =
hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3);
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7);
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case CUBEHASH:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in0, size );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in1, size );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in2, size );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in3, size );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in4, size );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in5, size );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in6, size );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in7, size );
sph_shavite512_close( &ctx.shavite, hash7 );
break;
case SIMD:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence*)in0, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash1,
(const BitSequence*)in1, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash2,
(const BitSequence*)in2, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash3,
(const BitSequence*)in3, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash4,
(const BitSequence*)in4, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash5,
(const BitSequence*)in5, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash6,
(const BitSequence*)in6, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash7,
(const BitSequence*)in7, size<<3 );
break;
case HAMSI:
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, size );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in0, size );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in1, size );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in2, size );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in3, size );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in4, size );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in5, size );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in6, size );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in7, size );
sph_fugue512_close( &ctx.fugue, hash7 );
break;
case SHABAL:
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, size );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in0, size );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in1, size );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in2, size );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in3, size );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in4, size );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in5, size );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in6, size );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in7, size );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
break;
case SHA_512:
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in2, size );
sph_tiger_close( &ctx.tiger, hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in3, size );
sph_tiger_close( &ctx.tiger, hash3 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in4, size );
sph_tiger_close( &ctx.tiger, hash4 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in5, size );
sph_tiger_close( &ctx.tiger, hash5 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in6, size );
sph_tiger_close( &ctx.tiger, hash6 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in7, size );
sph_tiger_close( &ctx.tiger, hash7 );
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = hash2[i] = hash3[i] =
hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
}
size = 64;
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
memcpy( output+128, hash4, 32 );
memcpy( output+160, hash5, 32 );
memcpy( output+192, hash6, 32 );
memcpy( output+224, hash7, 32 );
}
int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
ptarget[7] = 0x0cff;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
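// The hash order is derived from the first 8 bytes of the previous block
// hash (header words 1 and 2, big endian).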
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x16rv2_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (X16R_4WAY)
union _x16rv2_4way_context_overlay
{
blake512_4way_context blake;

File diff suppressed because it is too large

View File

@@ -2,8 +2,10 @@
bool register_sonoa_algo( algo_gate_t* gate )
{
#if defined (SONOA_4WAY)
// init_sonoa_4way_ctx();
#if defined (SONOA_8WAY)
gate->scanhash = (void*)&scanhash_sonoa_8way;
gate->hash = (void*)&sonoa_8way_hash;
#elif defined (SONOA_4WAY)
gate->scanhash = (void*)&scanhash_sonoa_4way;
gate->hash = (void*)&sonoa_4way_hash;
#else
@@ -11,7 +13,7 @@ bool register_sonoa_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_sonoa;
gate->hash = (void*)&sonoa_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -4,29 +4,33 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define SONOA_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SONOA_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define SONOA_4WAY 1
#endif
bool register_sonoa_algo( algo_gate_t* gate );
#if defined(SONOA_4WAY)
#if defined(SONOA_8WAY)
void sonoa_8way_hash( void *state, const void *input );
int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SONOA_4WAY)
void sonoa_4way_hash( void *state, const void *input );
int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
//void init_sonoa_4way_ctx();
#endif
#else
void sonoa_hash( void *state, const void *input );
int scanhash_sonoa( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_sonoa_ctx();
#endif
#endif

View File

@@ -1,7 +1,4 @@
#include "x17-gate.h"
#if defined(X17_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -14,6 +11,7 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -24,6 +22,309 @@
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#if defined(X17_8WAY)
union _x17_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
} __attribute__ ((aligned (64)));
typedef union _x17_8way_context_overlay x17_8way_context_overlay;
void x17_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhash0[8*8] __attribute__ ((aligned (64)));
uint64_t vhash1[8*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
x17_8way_context_overlay ctx;
// 1 Blake parallel 8 way 64 bit
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
// Serialize
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 3 Groestl
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
// Parallelize
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
// 4 Skein parallel 8 way 64 bit
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
// 5 JH
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
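// Re-interleave from 8 lane 64 bit to two 4 lane 128 bit groups for the
// 128 bit lane functions Luffa and Cubehash.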
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
// 7 Luffa
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
// 8 Cubehash
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
// 9 Shavite
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
// 10 Simd
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
// 11 Echo serial
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
// 12 Hamsi parallel 8 way 64 bit
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 13 Fugue serial
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash4, 64 );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash5, 64 );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash6, 64 );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash7, 64 );
sph_fugue512_close( &ctx.fugue, hash7 );
// 14 Shabal parallel 8 way 32 bit
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 15 Whirlpool serial
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash4, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash5, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash6, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash7, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
// 16 SHA512 parallel 64 bit
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
// 17 Haval parallel 32 bit
rintrlv_8x64_8x32( vhash0, vhash, 512 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhash0, 64 );
haval256_5_8way_close( &ctx.haval, state );
}
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
mm512_bswap32_intrlv80_8x64( vdata, pdata );
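// Byte swap and interleave the 80 byte header once; only the nonce word
// is updated inside the scan loop.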
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x17_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if unlikely( ( hash7[ lane ] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X17_4WAY)
union _x17_4way_context_overlay
{
blake512_4way_context blake;

View File

@@ -2,14 +2,17 @@
bool register_x17_algo( algo_gate_t* gate )
{
#if defined (X17_4WAY)
#if defined (X17_8WAY)
gate->scanhash = (void*)&scanhash_x17_8way;
gate->hash = (void*)&x17_8way_hash;
#elif defined (X17_4WAY)
gate->scanhash = (void*)&scanhash_x17_4way;
gate->hash = (void*)&x17_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x17;
gate->hash = (void*)&x17_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -4,13 +4,20 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define X17_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X17_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X17_4WAY 1
#endif
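// X17_8WAY needs full AVX512 (F, VL, DQ, BW); otherwise X17_4WAY is
// selected on CPUs with AVX2 and AES-NI.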
bool register_x17_algo( algo_gate_t* gate );
#if defined(X17_4WAY)
#if defined(X17_8WAY)
void x17_8way_hash( void *state, const void *input );
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X17_4WAY)
void x17_4way_hash( void *state, const void *input );
int scanhash_x17_4way( struct work *work, uint32_t max_nonce,

View File

@@ -1,7 +1,4 @@
#include "xevan-gate.h"
#if defined(XEVAN_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -15,6 +12,7 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -25,6 +23,515 @@
#include "algo/sha/sha-hash-4way.h"
#include "algo/haval/haval-hash-4way.h"
#if defined(XEVAN_8WAY)
union _xevan_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
} __attribute__ ((aligned (64)));
typedef union _xevan_8way_context_overlay xevan_8way_context_overlay;
void xevan_8way_hash( void *output, const void *input )
{
uint64_t vhash[16<<3] __attribute__ ((aligned (128)));
uint64_t vhashA[16<<3] __attribute__ ((aligned (64)));
uint64_t vhashB[16<<3] __attribute__ ((aligned (64)));
uint64_t hash0[16] __attribute__ ((aligned (64)));
uint64_t hash1[16] __attribute__ ((aligned (64)));
uint64_t hash2[16] __attribute__ ((aligned (64)));
uint64_t hash3[16] __attribute__ ((aligned (64)));
uint64_t hash4[16] __attribute__ ((aligned (64)));
uint64_t hash5[16] __attribute__ ((aligned (64)));
uint64_t hash6[16] __attribute__ ((aligned (64)));
uint64_t hash7[16] __attribute__ ((aligned (64)));
const int dataLen = 128;
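// Xevan hashes 128 byte blocks: each digest is zero extended to 128 bytes
// and the 17 algorithm chain is run twice.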
xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
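// Zero extend the 64 byte Blake digest to 128 bytes in every lane.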
memset( &vhash[8<<3], 0, 64<<3 );
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, dataLen );
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7,
dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, dataLen );
skein512_8way_close( &ctx.skein, vhash );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, dataLen );
jh512_8way_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, dataLen );
keccak512_8way_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash4, dataLen );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash5, dataLen );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash6, dataLen );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash7, dataLen );
sph_shavite512_close( &ctx.shavite, hash7 );
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 );
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash4, dataLen );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash5, dataLen );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash6, dataLen );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash7, dataLen );
sph_fugue512_close( &ctx.fugue, hash7 );
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, dataLen );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash4, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash5, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash6, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash7, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, dataLen );
sha512_8way_close( &ctx.sha512, vhash );
rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
haval256_5_8way_close( &ctx.haval, vhashA );
rintrlv_8x32_8x64( vhash, vhashA, dataLen<<3 );
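// Second pass: zero extend the 32 byte Haval digest to 128 bytes and run
// the whole chain again.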
memset( &vhash[ 4<<3 ], 0, (dataLen-32) << 3 );
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, vhash, dataLen );
blake512_8way_close(&ctx.blake, vhash);
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, dataLen );
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7,
dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, dataLen );
skein512_8way_close( &ctx.skein, vhash );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, dataLen );
jh512_8way_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, dataLen );
keccak512_8way_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash4, dataLen );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash5, dataLen );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash6, dataLen );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash7, dataLen );
sph_shavite512_close( &ctx.shavite, hash7 );
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 );
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash4, dataLen );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash5, dataLen );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash6, dataLen );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash7, dataLen );
sph_fugue512_close( &ctx.fugue, hash7 );
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, dataLen );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash4, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash5, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash6, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash7, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, dataLen );
sha512_8way_close( &ctx.sha512, vhash );
rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
haval256_5_8way_close( &ctx.haval, output );
}
int scanhash_xevan_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
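// Note: hash7 points at the row of the 8x32 interleaved output that holds
// word 7 (the most significant word) of every lane, so hash7[ lane ] <= Htarg
// is a cheap pre-filter before the full target test below.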
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
xevan_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if unlikely( ( hash7[ lane ] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(XEVAN_4WAY)
union _xevan_4way_context_overlay
{
blake512_4way_context blake;

View File

@@ -2,8 +2,10 @@
bool register_xevan_algo( algo_gate_t* gate )
{
#if defined (XEVAN_4WAY)
// init_xevan_4way_ctx();
#if defined (XEVAN_8WAY)
gate->scanhash = (void*)&scanhash_xevan_8way;
gate->hash = (void*)&xevan_8way_hash;
#elif defined (XEVAN_4WAY)
gate->scanhash = (void*)&scanhash_xevan_4way;
gate->hash = (void*)&xevan_4way_hash;
#else

View File

@@ -4,13 +4,21 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define XEVAN_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define XEVAN_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define XEVAN_4WAY 1
#endif
bool register_xevan_algo( algo_gate_t* gate );
#if defined(XEVAN_4WAY)
#if defined(XEVAN_8WAY)
void xevan_8way_hash( void *state, const void *input );
int scanhash_xevan_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(XEVAN_4WAY)
void xevan_4way_hash( void *state, const void *input );
@@ -19,7 +27,7 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
//void init_xevan_4way_ctx();
#endif
#else
void xevan_hash( void *state, const void *input );
@@ -30,3 +38,4 @@ void init_xevan_ctx();
#endif
#endif

View File

@@ -4,6 +4,8 @@
# during development. However, the information contained may provide compilation
# tips to users.
rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.2.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.5.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.10.2'
PACKAGE_STRING='cpuminer-opt 3.10.2'
PACKAGE_VERSION='3.10.5'
PACKAGE_STRING='cpuminer-opt 3.10.5'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.10.2 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.10.5 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.10.2:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.10.5:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.10.2
cpuminer-opt configure 3.10.5
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.10.2, which was
It was created by cpuminer-opt $as_me 3.10.5, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.10.2'
VERSION='3.10.5'
cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.10.2, which was
This file was extended by cpuminer-opt $as_me 3.10.5, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.10.2
cpuminer-opt config.status 3.10.5
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.10.2])
AC_INIT([cpuminer-opt], [3.10.5])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -3410,39 +3410,39 @@ bool check_cpu_capability ()
printf(".\n");
#endif
printf("CPU features:");
printf("CPU features: ");
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha ) printf( " SHA" );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2" );
else if ( cpu_has_avx ) printf( " AVX" );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse2 ) printf( " SSE2" );
printf(".\nSW features:");
if ( sw_has_vaes ) printf( " VAES" );
else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha ) printf( " SHA" );
printf("\nSW features: ");
if ( sw_has_avx512 ) printf( " AVX512" );
else if ( sw_has_avx2 ) printf( " AVX2" );
else if ( sw_has_avx ) printf( " AVX" );
else if ( sw_has_avx2 ) printf( " AVX2 " );
else if ( sw_has_avx ) printf( " AVX " );
else if ( sw_has_sse42 ) printf( " SSE4.2" );
else if ( sw_has_sse2 ) printf( " SSE2" );
else if ( sw_has_sse2 ) printf( " SSE2 " );
if ( sw_has_vaes ) printf( " VAES" );
else if ( sw_has_aes ) printf( " AES " );
if ( sw_has_sha ) printf( " SHA" );
printf(".\nAlgo features:");
printf("\nAlgo features:");
if ( algo_features == EMPTY_SET ) printf( " None" );
else
{
if ( algo_has_vaes ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" );
if ( algo_has_avx512 ) printf( " AVX512" );
else if ( algo_has_avx2 ) printf( " AVX2" );
else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2" );
else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_vaes ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES " );
if ( algo_has_sha ) printf( " SHA" );
}
printf(".\n");
printf("\n");
// Check for CPU and build incompatibilities
if ( !cpu_has_sse2 )
@@ -3483,19 +3483,19 @@ bool check_cpu_capability ()
use_sha || use_vaes );
// Display best options
printf( "Start mining with" );
printf( "\nStarting miner with" );
if ( use_none ) printf( " no optimizations" );
else
{
if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" );
if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" );
else if ( use_sse42 ) printf( " SSE4.2" );
else if ( use_sse2 ) printf( " SSE2" );
if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" );
if ( use_sha ) printf( " SHA" );
}
printf( ".\n\n" );
printf( "...\n\n" );
return true;
}

View File

@@ -897,7 +897,7 @@ static inline void intrlv_16x32_512( void *dst, const void *s00,
*( (uint32_t*)(d06) +(i) ) = s[ 6]; \
*( (uint32_t*)(d07) +(i) ) = s[ 7]; \
*( (uint32_t*)(d08) +(i) ) = s[ 8]; \
*( (uint32_t*)(d09) +(i) ) = s[ 0]; \
*( (uint32_t*)(d09) +(i) ) = s[ 9]; \
*( (uint32_t*)(d10) +(i) ) = s[10]; \
*( (uint32_t*)(d11) +(i) ) = s[11]; \
*( (uint32_t*)(d12) +(i) ) = s[12]; \
@@ -2075,9 +2075,6 @@ static inline void dintrlv_2x256( void *dst0, void *dst1,
d0[3] = s[6]; d1[3] = s[7];
}
#endif // AVX
///////////////////////////
@@ -2165,7 +2162,9 @@ static inline void rintrlv_4x32_4x64( void *dst,
d[ 5] = _mm_unpackhi_epi32( s[ 4], s[ 5] );
d[ 6] = _mm_unpacklo_epi32( s[ 6], s[ 7] );
d[ 7] = _mm_unpackhi_epi32( s[ 6], s[ 7] );
if ( bit_len <= 256 ) return;
d[ 8] = _mm_unpacklo_epi32( s[ 8], s[ 9] );
d[ 9] = _mm_unpackhi_epi32( s[ 8], s[ 9] );
d[10] = _mm_unpacklo_epi32( s[10], s[11] );
@@ -2174,16 +2173,21 @@ static inline void rintrlv_4x32_4x64( void *dst,
d[13] = _mm_unpackhi_epi32( s[12], s[13] );
d[14] = _mm_unpacklo_epi32( s[14], s[15] );
d[15] = _mm_unpackhi_epi32( s[14], s[15] );
if ( bit_len <= 512 ) return;
d[16] = _mm_unpacklo_epi32( s[16], s[17] );
d[17] = _mm_unpackhi_epi32( s[16], s[17] );
d[18] = _mm_unpacklo_epi32( s[18], s[19] );
d[19] = _mm_unpackhi_epi32( s[18], s[19] );
if ( bit_len <= 640 ) return;
d[20] = _mm_unpacklo_epi32( s[20], s[21] );
d[21] = _mm_unpackhi_epi32( s[20], s[21] );
d[22] = _mm_unpacklo_epi32( s[22], s[23] );
d[23] = _mm_unpackhi_epi32( s[22], s[23] );
d[24] = _mm_unpacklo_epi32( s[24], s[25] );
d[25] = _mm_unpackhi_epi32( s[24], s[25] );
d[26] = _mm_unpacklo_epi32( s[26], s[27] );
@@ -2194,6 +2198,93 @@ static inline void rintrlv_4x32_4x64( void *dst,
d[31] = _mm_unpackhi_epi32( s[30], s[31] );
}
// 8x32 -> 8x64
static inline void rintrlv_8x32_8x64( void *dst,
const void *src, const int bit_len )
{
__m128i *d = (__m128i*)dst;
const __m128i *s = (const __m128i*)src;
d[ 0] = _mm_unpacklo_epi32( s[ 0], s[ 2] );
d[ 1] = _mm_unpackhi_epi32( s[ 0], s[ 2] );
d[ 2] = _mm_unpacklo_epi32( s[ 1], s[ 3] );
d[ 3] = _mm_unpackhi_epi32( s[ 1], s[ 3] );
d[ 4] = _mm_unpacklo_epi32( s[ 4], s[ 6] );
d[ 5] = _mm_unpackhi_epi32( s[ 4], s[ 6] );
d[ 6] = _mm_unpacklo_epi32( s[ 5], s[ 7] );
d[ 7] = _mm_unpackhi_epi32( s[ 5], s[ 7] );
d[ 8] = _mm_unpacklo_epi32( s[ 8], s[10] );
d[ 9] = _mm_unpackhi_epi32( s[ 8], s[10] );
d[10] = _mm_unpacklo_epi32( s[ 9], s[11] );
d[11] = _mm_unpackhi_epi32( s[ 9], s[11] );
d[12] = _mm_unpacklo_epi32( s[12], s[14] );
d[13] = _mm_unpackhi_epi32( s[12], s[14] );
d[14] = _mm_unpacklo_epi32( s[13], s[15] );
d[15] = _mm_unpackhi_epi32( s[13], s[15] );
if ( bit_len <= 256 ) return;
d[16] = _mm_unpacklo_epi32( s[16], s[18] );
d[17] = _mm_unpackhi_epi32( s[16], s[18] );
d[18] = _mm_unpacklo_epi32( s[17], s[19] );
d[19] = _mm_unpackhi_epi32( s[17], s[19] );
d[20] = _mm_unpacklo_epi32( s[20], s[22] );
d[21] = _mm_unpackhi_epi32( s[20], s[22] );
d[22] = _mm_unpacklo_epi32( s[21], s[23] );
d[23] = _mm_unpackhi_epi32( s[21], s[23] );
d[24] = _mm_unpacklo_epi32( s[24], s[26] );
d[25] = _mm_unpackhi_epi32( s[24], s[26] );
d[26] = _mm_unpacklo_epi32( s[25], s[27] );
d[27] = _mm_unpackhi_epi32( s[25], s[27] );
d[28] = _mm_unpacklo_epi32( s[28], s[30] );
d[29] = _mm_unpackhi_epi32( s[28], s[30] );
d[30] = _mm_unpacklo_epi32( s[29], s[31] );
d[31] = _mm_unpackhi_epi32( s[29], s[31] );
if ( bit_len <= 512 ) return;
d[32] = _mm_unpacklo_epi32( s[32], s[34] );
d[33] = _mm_unpackhi_epi32( s[32], s[34] );
d[34] = _mm_unpacklo_epi32( s[33], s[35] );
d[35] = _mm_unpackhi_epi32( s[33], s[35] );
d[36] = _mm_unpacklo_epi32( s[36], s[38] );
d[37] = _mm_unpackhi_epi32( s[36], s[38] );
d[38] = _mm_unpacklo_epi32( s[37], s[39] );
d[39] = _mm_unpackhi_epi32( s[37], s[39] );
d[40] = _mm_unpacklo_epi32( s[40], s[42] );
d[41] = _mm_unpackhi_epi32( s[40], s[42] );
d[42] = _mm_unpacklo_epi32( s[41], s[43] );
d[43] = _mm_unpackhi_epi32( s[41], s[43] );
d[44] = _mm_unpacklo_epi32( s[44], s[46] );
d[45] = _mm_unpackhi_epi32( s[44], s[46] );
d[46] = _mm_unpacklo_epi32( s[45], s[47] );
d[47] = _mm_unpackhi_epi32( s[45], s[47] );
d[48] = _mm_unpacklo_epi32( s[48], s[50] );
d[49] = _mm_unpackhi_epi32( s[48], s[50] );
d[50] = _mm_unpacklo_epi32( s[49], s[51] );
d[51] = _mm_unpackhi_epi32( s[49], s[51] );
d[52] = _mm_unpacklo_epi32( s[52], s[54] );
d[53] = _mm_unpackhi_epi32( s[52], s[54] );
d[54] = _mm_unpacklo_epi32( s[53], s[55] );
d[55] = _mm_unpackhi_epi32( s[53], s[55] );
d[56] = _mm_unpacklo_epi32( s[56], s[58] );
d[57] = _mm_unpackhi_epi32( s[56], s[58] );
d[58] = _mm_unpacklo_epi32( s[57], s[59] );
d[59] = _mm_unpackhi_epi32( s[57], s[59] );
d[60] = _mm_unpacklo_epi32( s[60], s[62] );
d[61] = _mm_unpackhi_epi32( s[60], s[62] );
d[62] = _mm_unpacklo_epi32( s[61], s[63] );
d[63] = _mm_unpackhi_epi32( s[61], s[63] );
}
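// Typical use (illustrative sketch, buffer names are hypothetical): convert an
// 8 lane 32 bit interleaved 512 bit state to 8 lane 64 bit interleaving before
// handing it to a 64 bit algo in a chain:
//
//    rintrlv_8x32_8x64( vhash64, vhash32, 512 );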
/*
#define RLEAVE_4x32_4x64(i) do \
{ \
@@ -2225,7 +2316,6 @@ static inline void rintrlv_4x32_4x64( void *dst,
// 2x128 -> 4x64
static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
@@ -2268,7 +2358,6 @@ static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
d[31] = _mm_unpackhi_epi64( s1[14], s1[15] );
}
/*
#define RLEAVE_2x128_4x64( i ) do \
{ \
@@ -2339,7 +2428,6 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
d1[15] = _mm_unpackhi_epi64( s[29], s[31] );
}
/*
#define RLEAVE_4x64_2x128( i ) do \
{ \
@@ -2364,6 +2452,354 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
}
*/
// 4x128 -> 8x64
static inline void rintrlv_4x128_8x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
__m128i *d = (__m128i*)dst;
const __m128i *s0 = (const __m128i*)src0;
const __m128i *s1 = (const __m128i*)src1;
d[ 0] = _mm_unpacklo_epi64( s0[ 0], s0[ 1] );
d[ 1] = _mm_unpacklo_epi64( s0[ 2], s0[ 3] );
d[ 2] = _mm_unpacklo_epi64( s1[ 0], s1[ 1] );
d[ 3] = _mm_unpacklo_epi64( s1[ 2], s1[ 3] );
d[ 4] = _mm_unpackhi_epi64( s0[ 0], s0[ 1] );
d[ 5] = _mm_unpackhi_epi64( s0[ 2], s0[ 3] );
d[ 6] = _mm_unpackhi_epi64( s1[ 0], s1[ 1] );
d[ 7] = _mm_unpackhi_epi64( s1[ 2], s1[ 3] );
d[ 8] = _mm_unpacklo_epi64( s0[ 4], s0[ 5] );
d[ 9] = _mm_unpacklo_epi64( s0[ 6], s0[ 7] );
d[10] = _mm_unpacklo_epi64( s1[ 4], s1[ 5] );
d[11] = _mm_unpacklo_epi64( s1[ 6], s1[ 7] );
d[12] = _mm_unpackhi_epi64( s0[ 4], s0[ 5] );
d[13] = _mm_unpackhi_epi64( s0[ 6], s0[ 7] );
d[14] = _mm_unpackhi_epi64( s1[ 4], s1[ 5] );
d[15] = _mm_unpackhi_epi64( s1[ 6], s1[ 7] );
if ( bit_len <= 256 ) return;
d[16] = _mm_unpacklo_epi64( s0[ 8], s0[ 9] );
d[17] = _mm_unpacklo_epi64( s0[10], s0[11] );
d[18] = _mm_unpacklo_epi64( s1[ 8], s1[ 9] );
d[19] = _mm_unpacklo_epi64( s1[10], s1[11] );
d[20] = _mm_unpackhi_epi64( s0[ 8], s0[ 9] );
d[21] = _mm_unpackhi_epi64( s0[10], s0[11] );
d[22] = _mm_unpackhi_epi64( s1[ 8], s1[ 9] );
d[23] = _mm_unpackhi_epi64( s1[10], s1[11] );
d[24] = _mm_unpacklo_epi64( s0[12], s0[13] );
d[25] = _mm_unpacklo_epi64( s0[14], s0[15] );
d[26] = _mm_unpacklo_epi64( s1[12], s1[13] );
d[27] = _mm_unpacklo_epi64( s1[14], s1[15] );
d[28] = _mm_unpackhi_epi64( s0[12], s0[13] );
d[29] = _mm_unpackhi_epi64( s0[14], s0[15] );
d[30] = _mm_unpackhi_epi64( s1[12], s1[13] );
d[31] = _mm_unpackhi_epi64( s1[14], s1[15] );
if ( bit_len <= 512 ) return;
d[32] = _mm_unpacklo_epi64( s0[16], s0[17] );
d[33] = _mm_unpacklo_epi64( s0[18], s0[19] );
d[34] = _mm_unpacklo_epi64( s1[16], s1[17] );
d[35] = _mm_unpacklo_epi64( s1[18], s1[19] );
d[36] = _mm_unpackhi_epi64( s0[16], s0[17] );
d[37] = _mm_unpackhi_epi64( s0[18], s0[19] );
d[38] = _mm_unpackhi_epi64( s1[16], s1[17] );
d[39] = _mm_unpackhi_epi64( s1[18], s1[19] );
d[40] = _mm_unpacklo_epi64( s0[20], s0[21] );
d[41] = _mm_unpacklo_epi64( s0[22], s0[23] );
d[42] = _mm_unpacklo_epi64( s1[20], s1[21] );
d[43] = _mm_unpacklo_epi64( s1[22], s1[23] );
d[44] = _mm_unpackhi_epi64( s0[20], s0[21] );
d[45] = _mm_unpackhi_epi64( s0[22], s0[23] );
d[46] = _mm_unpackhi_epi64( s1[20], s1[21] );
d[47] = _mm_unpackhi_epi64( s1[22], s1[23] );
d[48] = _mm_unpacklo_epi64( s0[24], s0[25] );
d[49] = _mm_unpacklo_epi64( s0[26], s0[27] );
d[50] = _mm_unpacklo_epi64( s1[24], s1[25] );
d[51] = _mm_unpacklo_epi64( s1[26], s1[27] );
d[52] = _mm_unpackhi_epi64( s0[24], s0[25] );
d[53] = _mm_unpackhi_epi64( s0[26], s0[27] );
d[54] = _mm_unpackhi_epi64( s1[24], s1[25] );
d[55] = _mm_unpackhi_epi64( s1[26], s1[27] );
d[56] = _mm_unpacklo_epi64( s0[28], s0[29] );
d[57] = _mm_unpacklo_epi64( s0[30], s0[31] );
d[58] = _mm_unpacklo_epi64( s1[28], s1[29] );
d[59] = _mm_unpacklo_epi64( s1[30], s1[31] );
d[60] = _mm_unpackhi_epi64( s0[28], s0[29] );
d[61] = _mm_unpackhi_epi64( s0[30], s0[31] );
d[62] = _mm_unpackhi_epi64( s1[28], s1[29] );
d[63] = _mm_unpackhi_epi64( s1[30], s1[31] );
}
// 8x64 -> 4x128
static inline void rintrlv_8x64_4x128( void *dst0, void *dst1,
const void *src, const int bit_len )
{
__m128i *d0 = (__m128i*)dst0;
__m128i *d1 = (__m128i*)dst1;
const __m128i* s = (const __m128i*)src;
d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] );
d0[ 1] = _mm_unpackhi_epi64( s[ 0], s[ 4] );
d1[ 0] = _mm_unpacklo_epi64( s[ 2], s[ 6] );
d1[ 1] = _mm_unpackhi_epi64( s[ 2], s[ 6] );
d0[ 2] = _mm_unpacklo_epi64( s[ 1], s[ 5] );
d0[ 3] = _mm_unpackhi_epi64( s[ 1], s[ 5] );
d1[ 2] = _mm_unpacklo_epi64( s[ 3], s[ 7] );
d1[ 3] = _mm_unpackhi_epi64( s[ 3], s[ 7] );
d0[ 4] = _mm_unpacklo_epi64( s[ 8], s[12] );
d0[ 5] = _mm_unpackhi_epi64( s[ 8], s[12] );
d1[ 4] = _mm_unpacklo_epi64( s[10], s[14] );
d1[ 5] = _mm_unpackhi_epi64( s[10], s[14] );
d0[ 6] = _mm_unpacklo_epi64( s[ 9], s[13] );
d0[ 7] = _mm_unpackhi_epi64( s[ 9], s[13] );
d1[ 6] = _mm_unpacklo_epi64( s[11], s[15] );
d1[ 7] = _mm_unpackhi_epi64( s[11], s[15] );
if ( bit_len <= 256 ) return;
d0[ 8] = _mm_unpacklo_epi64( s[16], s[20] );
d0[ 9] = _mm_unpackhi_epi64( s[16], s[20] );
d1[ 8] = _mm_unpacklo_epi64( s[18], s[22] );
d1[ 9] = _mm_unpackhi_epi64( s[18], s[22] );
d0[10] = _mm_unpacklo_epi64( s[17], s[21] );
d0[11] = _mm_unpackhi_epi64( s[17], s[21] );
d1[10] = _mm_unpacklo_epi64( s[19], s[23] );
d1[11] = _mm_unpackhi_epi64( s[19], s[23] );
d0[12] = _mm_unpacklo_epi64( s[24], s[28] );
d0[13] = _mm_unpackhi_epi64( s[24], s[28] );
d1[12] = _mm_unpacklo_epi64( s[26], s[30] );
d1[13] = _mm_unpackhi_epi64( s[26], s[30] );
d0[14] = _mm_unpacklo_epi64( s[25], s[29] );
d0[15] = _mm_unpackhi_epi64( s[25], s[29] );
d1[14] = _mm_unpacklo_epi64( s[27], s[31] );
d1[15] = _mm_unpackhi_epi64( s[27], s[31] );
if ( bit_len <= 512 ) return;
d0[16] = _mm_unpacklo_epi64( s[32], s[36] );
d0[17] = _mm_unpackhi_epi64( s[32], s[36] );
d1[16] = _mm_unpacklo_epi64( s[34], s[38] );
d1[17] = _mm_unpackhi_epi64( s[34], s[38] );
d0[18] = _mm_unpacklo_epi64( s[33], s[37] );
d0[19] = _mm_unpackhi_epi64( s[33], s[37] );
d1[18] = _mm_unpacklo_epi64( s[35], s[39] );
d1[19] = _mm_unpackhi_epi64( s[35], s[39] );
d0[20] = _mm_unpacklo_epi64( s[40], s[44] );
d0[21] = _mm_unpackhi_epi64( s[40], s[44] );
d1[20] = _mm_unpacklo_epi64( s[42], s[46] );
d1[21] = _mm_unpackhi_epi64( s[42], s[46] );
d0[22] = _mm_unpacklo_epi64( s[41], s[45] );
d0[23] = _mm_unpackhi_epi64( s[41], s[45] );
d1[22] = _mm_unpacklo_epi64( s[43], s[47] );
d1[23] = _mm_unpackhi_epi64( s[43], s[47] );
d0[24] = _mm_unpacklo_epi64( s[48], s[52] );
d0[25] = _mm_unpackhi_epi64( s[48], s[52] );
d1[24] = _mm_unpacklo_epi64( s[50], s[54] );
d1[25] = _mm_unpackhi_epi64( s[50], s[54] );
d0[26] = _mm_unpacklo_epi64( s[49], s[53] );
d0[27] = _mm_unpackhi_epi64( s[49], s[53] );
d1[26] = _mm_unpacklo_epi64( s[51], s[55] );
d1[27] = _mm_unpackhi_epi64( s[51], s[55] );
d0[28] = _mm_unpacklo_epi64( s[56], s[60] );
d0[29] = _mm_unpackhi_epi64( s[56], s[60] );
d1[28] = _mm_unpacklo_epi64( s[58], s[62] );
d1[29] = _mm_unpackhi_epi64( s[58], s[62] );
d0[30] = _mm_unpacklo_epi64( s[57], s[61] );
d0[31] = _mm_unpackhi_epi64( s[57], s[61] );
d1[30] = _mm_unpacklo_epi64( s[59], s[63] );
d1[31] = _mm_unpackhi_epi64( s[59], s[63] );
}
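// Typical use (illustrative sketch, buffer names are hypothetical): split an
// 8 lane 64 bit interleaved state into two 4 lane 128 bit interleaved buffers,
// e.g. before 4 way Luffa, Cube or SIMD:
//
//    rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );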
// 8x64 -> 2x256
static inline void rintrlv_8x64_2x256( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, const int bit_len )
{
__m128i *d0 = (__m128i*)dst0;
__m128i *d1 = (__m128i*)dst1;
__m128i *d2 = (__m128i*)dst2;
__m128i *d3 = (__m128i*)dst3;
const __m128i* s = (const __m128i*)src;
d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] );
d1[ 0] = _mm_unpackhi_epi64( s[ 0], s[ 4] );
d2[ 0] = _mm_unpacklo_epi64( s[ 1], s[ 5] );
d3[ 0] = _mm_unpackhi_epi64( s[ 1], s[ 5] );
d0[ 1] = _mm_unpacklo_epi64( s[ 2], s[ 6] );
d1[ 1] = _mm_unpackhi_epi64( s[ 2], s[ 6] );
d2[ 1] = _mm_unpacklo_epi64( s[ 3], s[ 7] );
d3[ 1] = _mm_unpackhi_epi64( s[ 3], s[ 7] );
d0[ 2] = _mm_unpacklo_epi64( s[ 8], s[12] );
d1[ 2] = _mm_unpackhi_epi64( s[ 8], s[12] );
d2[ 2] = _mm_unpacklo_epi64( s[ 9], s[13] );
d3[ 2] = _mm_unpackhi_epi64( s[ 9], s[13] );
d0[ 3] = _mm_unpacklo_epi64( s[10], s[14] );
d1[ 3] = _mm_unpackhi_epi64( s[10], s[14] );
d2[ 3] = _mm_unpacklo_epi64( s[11], s[15] );
d3[ 3] = _mm_unpackhi_epi64( s[11], s[15] );
if ( bit_len <= 256 ) return;
d0[ 4] = _mm_unpacklo_epi64( s[16], s[20] );
d1[ 4] = _mm_unpackhi_epi64( s[16], s[20] );
d2[ 4] = _mm_unpacklo_epi64( s[17], s[21] );
d3[ 4] = _mm_unpackhi_epi64( s[17], s[21] );
d0[ 5] = _mm_unpacklo_epi64( s[18], s[22] );
d1[ 5] = _mm_unpackhi_epi64( s[18], s[22] );
d2[ 5] = _mm_unpacklo_epi64( s[19], s[23] );
d3[ 5] = _mm_unpackhi_epi64( s[19], s[23] );
d0[ 6] = _mm_unpacklo_epi64( s[24], s[28] );
d1[ 6] = _mm_unpackhi_epi64( s[24], s[28] );
d2[ 6] = _mm_unpacklo_epi64( s[25], s[29] );
d3[ 6] = _mm_unpackhi_epi64( s[25], s[29] );
d0[ 7] = _mm_unpacklo_epi64( s[26], s[30] );
d1[ 7] = _mm_unpackhi_epi64( s[26], s[30] );
d2[ 7] = _mm_unpacklo_epi64( s[27], s[31] );
d3[ 7] = _mm_unpackhi_epi64( s[27], s[31] );
if ( bit_len <= 512 ) return;
d0[ 8] = _mm_unpacklo_epi64( s[32], s[36] );
d1[ 8] = _mm_unpackhi_epi64( s[32], s[36] );
d2[ 8] = _mm_unpacklo_epi64( s[33], s[37] );
d3[ 8] = _mm_unpackhi_epi64( s[33], s[37] );
d0[ 9] = _mm_unpacklo_epi64( s[34], s[38] );
d1[ 9] = _mm_unpackhi_epi64( s[34], s[38] );
d2[ 9] = _mm_unpacklo_epi64( s[35], s[39] );
d3[ 9] = _mm_unpackhi_epi64( s[35], s[39] );
d0[10] = _mm_unpacklo_epi64( s[40], s[44] );
d1[10] = _mm_unpackhi_epi64( s[40], s[44] );
d2[10] = _mm_unpacklo_epi64( s[41], s[45] );
d3[10] = _mm_unpackhi_epi64( s[41], s[45] );
d0[11] = _mm_unpacklo_epi64( s[42], s[46] );
d1[11] = _mm_unpackhi_epi64( s[42], s[46] );
d2[11] = _mm_unpacklo_epi64( s[43], s[47] );
d3[11] = _mm_unpackhi_epi64( s[43], s[47] );
d0[12] = _mm_unpacklo_epi64( s[48], s[52] );
d1[12] = _mm_unpackhi_epi64( s[48], s[52] );
d2[12] = _mm_unpacklo_epi64( s[49], s[53] );
d3[12] = _mm_unpackhi_epi64( s[49], s[53] );
d0[13] = _mm_unpacklo_epi64( s[50], s[54] );
d1[13] = _mm_unpackhi_epi64( s[50], s[54] );
d2[13] = _mm_unpacklo_epi64( s[51], s[55] );
d3[13] = _mm_unpackhi_epi64( s[51], s[55] );
d0[14] = _mm_unpacklo_epi64( s[56], s[60] );
d1[14] = _mm_unpackhi_epi64( s[56], s[60] );
d2[14] = _mm_unpacklo_epi64( s[57], s[61] );
d3[14] = _mm_unpackhi_epi64( s[57], s[61] );
d0[15] = _mm_unpacklo_epi64( s[58], s[62] );
d1[15] = _mm_unpackhi_epi64( s[58], s[62] );
d2[15] = _mm_unpacklo_epi64( s[59], s[63] );
d3[15] = _mm_unpackhi_epi64( s[59], s[63] );
}
// 2x256 -> 8x64
static inline void rintrlv_2x256_8x64( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3, const int bit_len )
{
__m128i *d = (__m128i*)dst;
__m128i *s0 = (__m128i*)src0;
__m128i *s1 = (__m128i*)src1;
__m128i *s2 = (__m128i*)src2;
__m128i *s3 = (__m128i*)src3;
d[ 0] = _mm_unpacklo_epi64( s0[0], s0[2] );
d[ 1] = _mm_unpacklo_epi64( s1[0], s1[2] );
d[ 2] = _mm_unpacklo_epi64( s2[0], s2[2] );
d[ 3] = _mm_unpacklo_epi64( s3[0], s3[2] );
d[ 4] = _mm_unpackhi_epi64( s0[0], s0[2] );
d[ 5] = _mm_unpackhi_epi64( s1[0], s1[2] );
d[ 6] = _mm_unpackhi_epi64( s2[0], s2[2] );
d[ 7] = _mm_unpackhi_epi64( s3[0], s3[2] );
d[ 8] = _mm_unpacklo_epi64( s0[1], s0[3] );
d[ 9] = _mm_unpacklo_epi64( s1[1], s1[3] );
d[10] = _mm_unpacklo_epi64( s2[1], s2[3] );
d[11] = _mm_unpacklo_epi64( s3[1], s3[3] );
d[12] = _mm_unpackhi_epi64( s0[1], s0[3] );
d[13] = _mm_unpackhi_epi64( s1[1], s1[3] );
d[14] = _mm_unpackhi_epi64( s2[1], s2[3] );
d[15] = _mm_unpackhi_epi64( s3[1], s3[3] );
if ( bit_len <= 256 ) return;
d[16] = _mm_unpacklo_epi64( s0[4], s0[6] );
d[17] = _mm_unpacklo_epi64( s1[4], s1[6] );
d[18] = _mm_unpacklo_epi64( s2[4], s2[6] );
d[19] = _mm_unpacklo_epi64( s3[4], s3[6] );
d[20] = _mm_unpackhi_epi64( s0[4], s0[6] );
d[21] = _mm_unpackhi_epi64( s1[4], s1[6] );
d[22] = _mm_unpackhi_epi64( s2[4], s2[6] );
d[23] = _mm_unpackhi_epi64( s3[4], s3[6] );
d[24] = _mm_unpacklo_epi64( s0[5], s0[7] );
d[25] = _mm_unpacklo_epi64( s1[5], s1[7] );
d[26] = _mm_unpacklo_epi64( s2[5], s2[7] );
d[27] = _mm_unpacklo_epi64( s3[5], s3[7] );
d[28] = _mm_unpackhi_epi64( s0[5], s0[7] );
d[29] = _mm_unpackhi_epi64( s1[5], s1[7] );
d[30] = _mm_unpackhi_epi64( s2[5], s2[7] );
d[31] = _mm_unpackhi_epi64( s3[5], s3[7] );
if ( bit_len <= 512 ) return;
d[32] = _mm_unpacklo_epi64( s0[8], s0[10] );
d[33] = _mm_unpacklo_epi64( s1[8], s1[10] );
d[34] = _mm_unpacklo_epi64( s2[8], s2[10] );
d[35] = _mm_unpacklo_epi64( s3[8], s3[10] );
d[36] = _mm_unpackhi_epi64( s0[8], s0[10] );
d[37] = _mm_unpackhi_epi64( s1[8], s1[10] );
d[38] = _mm_unpackhi_epi64( s2[8], s2[10] );
d[39] = _mm_unpackhi_epi64( s3[8], s3[10] );
d[40] = _mm_unpacklo_epi64( s0[9], s0[11] );
d[41] = _mm_unpacklo_epi64( s1[9], s1[11] );
d[42] = _mm_unpacklo_epi64( s2[9], s2[11] );
d[43] = _mm_unpacklo_epi64( s3[9], s3[11] );
d[44] = _mm_unpackhi_epi64( s0[9], s0[11] );
d[45] = _mm_unpackhi_epi64( s1[9], s1[11] );
d[46] = _mm_unpackhi_epi64( s2[9], s2[11] );
d[47] = _mm_unpackhi_epi64( s3[9], s3[11] );
d[48] = _mm_unpacklo_epi64( s0[12], s0[14] );
d[49] = _mm_unpacklo_epi64( s1[12], s1[14] );
d[50] = _mm_unpacklo_epi64( s2[12], s2[14] );
d[51] = _mm_unpacklo_epi64( s3[12], s3[14] );
d[52] = _mm_unpackhi_epi64( s0[12], s0[14] );
d[53] = _mm_unpackhi_epi64( s1[12], s1[14] );
d[54] = _mm_unpackhi_epi64( s2[12], s2[14] );
d[55] = _mm_unpackhi_epi64( s3[12], s3[14] );
d[56] = _mm_unpacklo_epi64( s0[13], s0[15] );
d[57] = _mm_unpacklo_epi64( s1[13], s1[15] );
d[58] = _mm_unpacklo_epi64( s2[13], s2[15] );
d[59] = _mm_unpacklo_epi64( s3[13], s3[15] );
d[60] = _mm_unpackhi_epi64( s0[13], s0[15] );
d[61] = _mm_unpackhi_epi64( s1[13], s1[15] );
d[62] = _mm_unpackhi_epi64( s2[13], s2[15] );
d[63] = _mm_unpackhi_epi64( s3[13], s3[15] );
}
//
// Some functions customized for mining.

View File

@@ -252,7 +252,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#else
#define mm128_ror_64 mm128_ror_var_64
#define mm128_rol_64 mm128_rol_var_64
#define mm128_ror_32 mm128_ror_var_32
@@ -274,6 +273,15 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
// Rotate 16 byte (128 bit) vector by c bytes.
// Less efficient using shift but more versatile. Use only for odd number
// byte rotations. Use shuffle above whenever possible.
#define mm128_ror_x8( v, c ) \
_mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
#define mm128_rol_x8( v, c ) \
_mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
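// Example (illustrative): rotate right by 3 bytes with the shift form; when
// the count is a multiple of 4 bytes the epi32 shuffle above is cheaper:
//
//    __m128i r = mm128_ror_x8( v, 3 );      // any byte count, 3 instructions
//    __m128i s = mm128_ror_1x32( v );       // 4 byte rotate, 1 instruction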
#if defined (__SSE3__)
// no SSE2 implementation, no current users
@@ -289,17 +297,21 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_1x8( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \
0x060504030201000f ) )
#endif // SSE3
#else // SSE2
// Rotate 16 byte (128 bit) vector by c bytes.
// Less efficient using shift but more versatile. Use only for odd number
// byte rotations. Use shuffle above whenever possible.
#define mm128_bror( v, c ) \
_mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
#define mm128_ror_1x16( v ) \
_mm_or_si128( _mm_srli_si128( v, 2 ), _mm_slli_si128( v, 14 ) )
#define mm128_brol( v, c ) \
_mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
#define mm128_rol_1x16( v ) \
_mm_or_si128( _mm_slli_si128( v, 2 ), _mm_srli_si128( v, 14 ) )
#define mm128_ror_1x8( v ) \
_mm_or_si128( _mm_srli_si128( v, 1 ), _mm_slli_si128( v, 15 ) )
#define mm128_rol_1x8( v ) \
_mm_or_si128( _mm_slli_si128( v, 1 ), _mm_srli_si128( v, 15 ) )
#endif // SSE3 else SSE2
// Invert vector: {3,2,1,0} -> {0,1,2,3}
#define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b )
@@ -319,19 +331,24 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
//
// Rotate elements within lanes.
#define mm128_swap32_64( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_swap_64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_ror16_64( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x09080f0e0d0c0b0a, \
0x0100070605040302 ) )
#define mm128_rol64_8( v, c ) \
_mm_or_si128( _mm_slli_epi64( v, ( (c)<<3 ) ), \
_mm_srli_epi64( v, 64 - ( (c)<<3 ) ) )
#define mm128_rol16_64( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080f0e, \
0x0504030201000706 ) )
#define mm128_ror64_8( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, ( (c)<<3 ) ), \
_mm_slli_epi64( v, 64 - ( (c)<<3 ) ) )
#define mm128_rol32_8( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, ( (c)<<3 ) ), \
_mm_srli_epi32( v, 32 - ( (c)<<3 ) ) )
#define mm128_ror32_8( v, c ) \
_mm_or_si128( _mm_srli_epi32( v, ( (c)<<3 ) ), \
_mm_slli_epi32( v, 32 - ( (c)<<3 ) ) )
#define mm128_swap16_32( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0d0c0f0e09080b0a, \
0x0504070601000302 ) )
//
// Endian byte swap.
@@ -431,64 +448,65 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
// Swap 128 bit vectors.
#define mm128_swap128_256( v1, v2 ) \
#define mm128_swap256_128( v1, v2 ) \
v1 = _mm_xor_si128( v1, v2 ); \
v2 = _mm_xor_si128( v1, v2 ); \
v1 = _mm_xor_si128( v1, v2 );
// Concatenate v1 & v2 and rotate as one 256 bit vector.
#if defined(__SSE4_1__)
#define mm128_ror1x64_256( v1, v2 ) \
#define mm128_ror256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
#define mm128_rol1x64_256( v1, v2 ) \
#define mm128_rol256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
v1 = t; \
} while(0)
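// Worked example (illustrative): viewing v2:v1 as one 256 bit value with v2
// holding the high 128 bits, mm128_ror256_64 rotates the four qwords right by
// one position:
//
//    before: v2 = { B1, B0 }  v1 = { A1, A0 }   ( B1 B0 A1 A0 )
//    after : v2 = { A0, B1 }  v1 = { B0, A1 }   ( A0 B1 B0 A1 )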
#define mm128_ror1x32_256( v1, v2 ) \
#define mm128_ror256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
v1 = _mm_alignr_epi8( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm128_rol1x32_256( v1, v2 ) \
#define mm128_rol256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 12 ); \
v2 = _mm_alignr_epi8( v2, v1, 12 ); \
v1 = t; \
} while(0)
#define mm128_ror1x16_256( v1, v2 ) \
#define mm128_ror256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
v1 = _mm_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm128_rol1x16_256( v1, v2 ) \
#define mm128_rol256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 14 ); \
v2 = _mm_alignr_epi8( v2, v1, 14 ); \
v1 = t; \
} while(0)
#define mm128_ror1x8_256( v1, v2 ) \
#define mm128_ror256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 1 ); \
v1 = _mm_alignr_epi8( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm128_rol1x8_256( v1, v2 ) \
#define mm128_rol256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 15 ); \
v2 = _mm_alignr_epi8( v2, v1, 15 ); \
@@ -497,7 +515,7 @@ do { \
#else // SSE2
#define mm128_ror1x64_256( v1, v2 ) \
#define mm128_ror256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
_mm_slli_si128( v2, 8 ) ); \
@@ -506,7 +524,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol1x64_256( v1, v2 ) \
#define mm128_rol256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) ); \
@@ -515,7 +533,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_ror1x32_256( v1, v2 ) \
#define mm128_ror256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
_mm_slli_si128( v2, 12 ) ); \
@@ -524,7 +542,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol1x32_256( v1, v2 ) \
#define mm128_rol256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 12 ) ); \
@@ -533,7 +551,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_ror1x16_256( v1, v2 ) \
#define mm128_ror256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
_mm_slli_si128( v2, 14 ) ); \
@@ -542,7 +560,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol1x16_256( v1, v2 ) \
#define mm128_rol256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
_mm_srli_si128( v2, 14 ) ); \
@@ -551,7 +569,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_ror1x8_256( v1, v2 ) \
#define mm128_ror256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
_mm_slli_si128( v2, 15 ) ); \
@@ -560,7 +578,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol1x8_256( v1, v2 ) \
#define mm128_rol256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
_mm_srli_si128( v2, 15 ) ); \

View File

@@ -414,99 +414,71 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Rotate elements within lanes of 256 bit vector.
// Rotate elements within each 128 bit lane of 256 bit vector.
// Swap 64 bit elements in each 128 bit lane.
#define mm256_swap64_128( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
// Rotate each 128 bit lane by one 32 bit element.
#define mm256_ror1x32_128( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_rol1x32_128( v ) _mm256_shuffle_epi32( v, 0x93 )
#define mm256_ror128_32( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_ror1x16_128( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x11101f1e1d1c1b1a, 0x1918171615141312, \
0x01000f0e0d0c0b0a, 0x0908070605040302 ) )
#define mm256_rol128_1x32( v ) _mm256_shuffle_epi32( v, 0x93 )
#define mm256_rol1x16_128( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1d1c1b1a19181716, 0x1514131211101f1e, \
0x0d0c0b0a09080706, 0x0504030201000f0e ) )
#define mm256_ror1x8_128( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x101f1e1d1c1b1a19, 0x1817161514131211, \
0x000f0e0d0c0b0a09, 0x0807060504030201 ) )
#define mm256_rol1x8_128( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
0x0d0c0b0a09080f0e, 0x0504030201000706 ) )
// Rotate each 128 bit lane by c bytes.
#define mm256_bror_128( v, c ) \
// Rotate each 128 bit lane by c bytes.
#define mm256_ror128_8( v, c ) \
_mm256_or_si256( _mm256_bsrli_epi128( v, c ), \
_mm256_bslli_epi128( v, 16-(c) ) )
#define mm256_brol_128( v, c ) \
#define mm256_rol128_8( v, c ) \
_mm256_or_si256( _mm256_bslli_epi128( v, c ), \
_mm256_bsrli_epi128( v, 16-(c) ) )
// Swap 32 bit elements in each 64 bit lane
#define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 )
// Rotate elements in each 64 bit lane
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define mm256_rol1x16_64( v ) _mm256_rol_epi64( v, 16 )
#define mm256_ror1x16_64( v ) _mm256_ror_epi64( v, 16 )
#define mm256_rol64_8( v, c ) _mm256_rol_epi64( v, ((c)<<3) )
#define mm256_ror64_8( v, c ) _mm256_ror_epi64( v, ((c)<<3) )
#else
#define mm256_ror1x16_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x19181f1e1d1c1b1a, 0x1110171615141312, \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#define mm256_rol64_8( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, ( (c)<<3 ) ), \
_mm256_srli_epi64( v, 64 - ( (c)<<3 ) ) )
#define mm256_ror64_8( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, ( (c)<<3 ) ), \
_mm256_slli_epi64( v, 64 - ( (c)<<3 ) ) )
#define mm256_rol1x16_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
0x0d0c0b0a09080f0e, 0x0504030201000706 ) )
#endif
#define mm256_ror1x8_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x181f1e1d1c1b1a19, 0x1017161514131211, \
0x080f0e0d0c0b0a09, 0x0007060504030201 ) )
#define mm256_rol1x8_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1e1d1c1b1a19181f, 0x1615141312111017, \
0x0e0d0c0b0a09080f, 0x0605040302010007 ) )
#define mm256_ror3x8_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1a19181f1e1d1c1b, 0x1211101716151413, \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#define mm256_rol3x8_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1c1b1a19181f1e1d, 0x1413121110171615, \
0x0c0b0a09080f0e0d, 0x0403020100070605 ) )
// Swap 16 bit elements in each 32 bit lane
// Rotate elements in each 32 bit lane
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define mm256_swap16_32( v ) _mm256_rol_epi32( v, 16 )
#define mm256_swap32_16( v ) _mm256_rol_epi32( v, 16 )
#define mm256_rol32_8( v ) _mm256_rol_epi32( v, 8 )
#define mm256_ror32_8( v ) _mm256_ror_epi32( v, 8 )
#else
#define mm256_swap16_32( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1b1a19181f1e1d1c, 0x1312111017161514, \
0x0b0a09080f0e0d0c, 0x0302010007060504 ) )
#define mm256_swap32_16( v ) \
_mm256_or_si256( _mm256_slli_epi32( v, 16 ), \
_mm256_srli_epi32( v, 16 ) )
#define mm256_rol32_8( v ) \
_mm256_or_si256( _mm256_slli_epi32( v, 8 ), \
_mm256_srli_epi32( v, 8 ) )
#define mm256_ror32_8( v ) \
_mm256_or_si256( _mm256_srli_epi32( v, 8 ), \
_mm256_slli_epi32( v, 8 ) )
#endif
//
// Swap bytes in vector elements, endian bswap.
#define mm256_bswap_64( v ) \
@@ -565,19 +537,19 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// _mm256_alignr_epi64/32 are only available with AVX512 but AVX512 also
// makes these macros unnecessary.
#define mm256_swap256_512 (v1, v2) \
v1 = _mm256_xor_si256(v1, v2); \
v2 = _mm256_xor_si256(v1, v2); \
v1 = _mm256_xor_si256(v1, v2);
#define mm256_swap512_256( v1, v2 ) \
v1 = _mm256_xor_si256( v1, v2 ); \
v2 = _mm256_xor_si256( v1, v2 ); \
v1 = _mm256_xor_si256( v1, v2 );
#define mm256_ror1x128_512( v1, v2 ) \
#define mm256_ror512_128( v1, v2 ) \
do { \
__m256i t = _mm256_permute2x128_si256( v1, v2, 0x03 ); \
v1 = _mm256_permute2x128_si256( v2, v1, 0x21 ); \
v2 = t; \
} while(0)
#define mm256_rol1x128_512( v1, v2 ) \
#define mm256_rol512_128( v1, v2 ) \
do { \
__m256i t = _mm256_permute2x128_si256( v1, v2, 0x03 ); \
v2 = _mm256_permute2x128_si256( v2, v1, 0x21 ); \

View File

@@ -15,13 +15,13 @@
// AVX512 intrinsics have a few changes from previous conventions.
//
// Some instructions like cmp and blend use the mask registers now instead
// of a vector mask.
// The cmp instruction now returns a bitmask instead of a vector mask.
// This eliminates the need for the blendv instruction.
//
// The new rotate instructions require the count to be only an 8 bit
// immediate value. The documentation is the same as for shift and
// says variables are allowed; this is suspected to be a compiler issue
// but it still happens in GCC 9.
// The new rotate instructions require the count to be an 8 bit
// immediate value only. Compilation fails if a variable is used.
// The documentation reads the same as for shift, which does accept
// variables.
//
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
// usually shuffles across all lanes.
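// Illustrative sketch (assumed example, not from the original source): the
// compare returns a bitmask in a mask register which feeds the masked blend,
// so no blendv is needed. Requires AVX512F; the function name is hypothetical.
// Note that _mm512_rol_epi64( v, 8 ) compiles while a variable count does not.

static inline __m512i m512_select_ge_example( __m512i a, __m512i b )
{
   __mmask16 m = _mm512_cmpge_epu32_mask( a, b );  // bitmask, not a vector mask
   return _mm512_mask_blend_epi32( m, b, a );      // a where a >= b, else b
}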
@@ -109,6 +109,11 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
#define m512_const2_64( i1, i0 ) \
m512_const1_128( m128_const_64( i1, i0 ) )
#define m512_const2_32( i1, i0 ) \
m512_const1_64( ( ( ( (uint64_t)(i1) << 32 ) ) \
| ( (uint64_t)(i0) & 0xffffffff ) ) )
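// e.g. (illustrative) m512_const2_32( 0x00000000, 0x000003ff ) yields the
// 64 bit constant 0x00000000000003ff broadcast to every 64 bit lane.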
static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
{
@@ -265,7 +270,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ))
0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
#define mm512_bswap_32( v ) \
_mm512_shuffle_epi8( v, \
@@ -304,8 +309,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
{ \
__m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x0c0d0e0f08090a0b, 0x0405060700010203, \
0x1c1d1e1f18191a1b, 0x1415161710111213 ); \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -320,8 +325,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Rotate elements in 512 bit vector.
#define mm512_swap_256( v ) _mm512_alignr_epi64( v, v, 4 )
// 1x64 notation used to distinguish from bit rotation.
#define mm512_ror_1x128( v ) _mm512_alignr_epi64( v, v, 2 )
#define mm512_rol_1x128( v ) _mm512_alignr_epi64( v, v, 6 )
@@ -401,51 +408,58 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Rotate elements within 256 bit lanes of 512 bit vector.
// Rename these for consistency. Element size is always last.
// mm<vectorsize>_<op><lanesize>_<elementsize>
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap128_256( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
// Rotate 256 bit lanes by one 64 bit element
#define mm512_ror1x64_256( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_rol1x64_256( v ) _mm512_permutex_epi64( v, 0x93 )
#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 )
// Rotate 256 bit lanes by one 32 bit element
#define mm512_ror1x32_256( v ) \
#define mm512_ror256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
0x000000080000000f, 0x0000000e0000000d, \
0x0000000c0000000b, 0x0000000a00000009, \
0x0000000000000007, 0x0000000600000005, \
0x0000000400000003, 0x0000000200000001 ), v )
#define mm512_rol1x32_256( v ) \
#define mm512_rol256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
0x0000000e0000000d, 0x0000000c0000000b, \
0x0000000a00000009, 0x000000080000000f, \
0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ), v )
#define mm512_ror1x16_256( v ) \
#define mm512_ror256_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x00100001001e001d, 0x001c001b001a0019, \
0x0018001700160015, 0x0014001300120011, \
0x0000000f000e000d, 0x000c000b000a0009, \
0x0008000700060005, 0x0004000300020001 ), v )
#define mm512_rol1x16_256( v ) \
#define mm512_rol256_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001e001d001c001b, 0x001a001900180017, \
0x0016001500140013, 0x001200110010001f, \
0x000e000d000c000b, 0x000a000900080007, \
0x0006000500040003, 0x000200010000000f ), v )
#define mm512_ror1x8_256( v ) \
#define mm512_ror256_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x203f3e3d3c3b3a39, 0x3837363534333231, \
0x302f2e2d2c2b2a29, 0x2827262524232221, \
0x001f1e1d1c1b1a19, 0x1817161514131211, \
0x100f0e0d0c0b0a09, 0x0807060504030201 ), v )
#define mm512_rol1x8_256( v ) \
#define mm512_rol256_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x3e3d3c3b3a393837, 0x363534333231302f, \
0x2e2d2c2b2a292827, 0x262524232221203f, \
@@ -456,45 +470,19 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// Rotate elements within 128 bit lanes of 512 bit vector.
// Swap hi & lo 64 bits in each 128 bit lane
#define mm512_swap64_128( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
// Rotate 128 bit lanes by one 32 bit element
#define mm512_ror1x32_128( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_rol1x32_128( v ) _mm512_shuffle_epi32( v, 0x93 )
#define mm512_ror128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_rol128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
#define mm512_ror1x16_128( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x0018001f001e001d, 0x001c001b001a0019, \
0x0010001700160015, 0x0014001300120011, \
0x0008000f000e000d, 0x000c000b000a0009, \
0x0000000700060005, 0x0004000300020001 ), v )
#define mm512_rol1x16_128( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001e001d001c001b, 0x001a00190018001f, \
0x0016001500140013, 0x0012001100100017, \
0x000e000d000c000b, 0x000a00090008000f, \
0x0006000500040003, 0x0002000100000007 ), v )
#define mm512_ror1x8_128( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x303f3e3d3c3b3a39, 0x3837363534333231, \
0x202f2e2d2c2b2a29, 0x2827262524232221, \
0x101f1e1d1c1b1a19, 0x1817161514131211, \
0x000f0e0d0c0b0a09, 0x0807060504030201 ) )
#define mm512_rol1x8_128( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x3e3d3c3b3a393837, 0x363534333231303f, \
0x2e2d2c2b2a292827, 0x262524232221202f, \
0x1e1d1c1b1a191817, 0x161514131211101f, \
0x0e0d0c0b0a090807, 0x060504030201000f ) )
// Rotate 128 bit lanes by c bytes.
#define mm512_bror_128( v, c ) \
// Rotate 128 bit lanes by c bytes, faster than building that monstrous
// constant above.
#define mm512_ror128_8( v, c ) \
_mm512_or_si512( _mm512_bsrli_epi128( v, c ), \
_mm512_bslli_epi128( v, 16-(c) ) )
#define mm512_brol_128( v, c ) \
#define mm512_rol128_8( v, c ) \
_mm512_or_si512( _mm512_bslli_epi128( v, c ), \
_mm512_bsrli_epi128( v, 16-(c) ) )
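// Example (illustrative): rotate every 128 bit lane right by 2 bytes; for
// multiples of 4 bytes prefer the mm512_ror128_32 shuffle above:
//
//    w = mm512_ror128_8( w, 2 );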
@@ -502,75 +490,23 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Rotate elements within 64 bit lanes.
#define mm512_rol64_x8( v, c ) _mm512_rol_epi64( v, ((c)<<3) )
#define mm512_ror64_x8( v, c ) _mm512_ror_epi64( v, ((c)<<3) )
// Swap 32 bit elements in each 64 bit lane
#define mm512_swap32_64( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
// Rotate each 64 bit lane by one 16 bit element.
#define mm512_ror1x16_64( v ) _mm512_ror_epi64( v, 16 )
#define mm512_rol1x16_64( v ) _mm512_rol_epi64( v, 16 )
#define mm512_ror1x8_64( v ) _mm512_ror_epi64( v, 8 )
#define mm512_rol1x8_64( v ) _mm512_rol_epi64( v, 8 )
/*
#define mm512_ror1x16_64( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001c001f001e001d, 0x0018001b001a0019, \
0x0014001700160015, 0x0010001300120011, \
0x000c000f000e000d, 0x0008000b000a0009, \
0x0004000700060005, 0x0000000300020001, v )
#define mm512_rol1x16_64( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001e001d001c001f, 0x001a00190018001b, \
0x0016001500140017, 0x0012001100100013, \
0x000e000d000c000f, 0x000a00090008000b, \
0x0006000500040007, 0x0002000100000003, v )
// Rotate each 64 bit lane by one byte.
#define mm512_ror1x8_64( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x383F3E3D3C3B3A39, 0x3037363534333231, \
0x282F2E2D2C2B2A29, 0x2027262524232221, \
0x181F1E1D1C1B1A19, 0x1017161514131211, \
0x080F0E0D0C0B0A09, 0x0007060504030201 ) )
#define mm512_rol1x8_64( v ) \
_mm512_shuffle( v, m512_const_64( \
0x3E3D3C3B3A39383F, 0x3635343332313037, \
0x2E2D2C2B2A29282F, 0x2625242322212027, \
0x1E1D1C1B1A19181F, 0x1615141312111017, \
0x0E0D0C0B0A09080F, 0x0605040302010007 ) )
*/
#define mm512_ror64_16( v ) _mm512_ror_epi64( v, 16 )
#define mm512_rol64_16( v ) _mm512_rol_epi64( v, 16 )
#define mm512_ror64_8( v ) _mm512_ror_epi64( v, 8 )
#define mm512_rol64_8( v ) _mm512_rol_epi64( v, 8 )
//
// Rotate elements within 32 bit lanes.
#define mm512_swap16_32( v ) _mm512_ror_epi32( v, 16 )
#define mm512_ror1x8_32( v ) _mm512_ror_epi32( v, 8 )
#define mm512_rol1x8_32( v ) _mm512_rol_epi32( v, 8 )
/*
#define mm512_swap16_32( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001e001f001c001d, 0x001a001b00180019, \
0x0016001700140015, 0x0012001300100011, \
0x000e000f000c000d, 0x000a000b00080009, \
0x0006000700040005, 0x0002000300000001 ), v )
#define mm512_ror1x8_32( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x3C3F3E3D383B3A39, 0x3437363530333231, \
0x2C2F2E2D282B2A29, 0x2427262520232221, \
0x1C1F1E1D181B1A19, 0x1417161510131211, \
0x0C0F0E0D080B0A09, 0x0407060500030201 ))
#define mm512_rol1x8_32( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x3E3D3C3F3A39383B, 0x3635343732313033, \
0x2E2D2C2F2A29282B, 0x2625242722212023, \
0x1E1D1C1F1A19181B, 0x1615141712111013, \
0x0E0D0C0F0A09080B, 0x0605040702010003 ) )
*/
#define mm512_rol32_x8( v, c )   _mm512_rol_epi32( v, ((c)<<3) )
#define mm512_ror32_x8( v, c )   _mm512_ror_epi32( v, ((c)<<3) )
//
@@ -579,61 +515,61 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// These can all be done with 2 permutex2var instructions but they are
// slower than either xor or alignr and require AVX512VBMI.
#define mm512_swap512_1024(v1, v2) \
#define mm512_swap1024_512(v1, v2) \
v1 = _mm512_xor_si512(v1, v2); \
v2 = _mm512_xor_si512(v1, v2); \
v1 = _mm512_xor_si512(v1, v2);
#define mm512_ror1x256_1024( v1, v2 ) \
#define mm512_ror1024_256( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
v1 = _mm512_alignr_epi64( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm512_rol1x256_1024( v1, v2 ) \
#define mm512_rol1024_256( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
v2 = _mm512_alignr_epi64( v2, v1, 4 ); \
v1 = t; \
} while(0)
#define mm512_ror1x128_1024( v1, v2 ) \
#define mm512_ror1024_128( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \
v1 = _mm512_alignr_epi64( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm512_rol1x128_1024( v1, v2 ) \
#define mm512_rol1024_128( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \
v2 = _mm512_alignr_epi64( v2, v1, 6 ); \
v1 = t; \
} while(0)
#define mm512_ror1x64_1024( v1, v2 ) \
#define mm512_ror1024_64( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \
v1 = _mm512_alignr_epi64( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm512_rol1x64_1024( v1, v2 ) \
#define mm512_rol1024_64( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \
v2 = _mm512_alignr_epi64( v2, v1, 7 ); \
v1 = t; \
} while(0)
#define mm512_ror1x32_1024( v1, v2 ) \
#define mm512_ror1024_32( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \
v1 = _mm512_alignr_epi32( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm512_rol1x32_1024( v1, v2 ) \
#define mm512_rol1024_32( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \
v2 = _mm512_alignr_epi32( v2, v1, 15 ); \