mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
1 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
d741f1c9a9 |
@@ -1,12 +1,14 @@
|
|||||||
|
|
||||||
|
|
||||||
Requirements:
|
1. Requirements:
|
||||||
|
---------------
|
||||||
|
|
||||||
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
|
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
|
||||||
supported.
|
supported.
|
||||||
64 bit Linux operating system. Apple is not supported.
|
64 bit Linux operating system. Apple is not supported.
|
||||||
|
|
||||||
Building on linux prerequisites:
|
2. Building on linux prerequisites:
|
||||||
|
-----------------------------------
|
||||||
|
|
||||||
It is assumed users know how to install packages on their system and
|
It is assumed users know how to install packages on their system and
|
||||||
be able to compile standard source packages. This is basic Linux and
|
be able to compile standard source packages. This is basic Linux and
|
||||||
@@ -20,41 +22,74 @@ http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu
|
|||||||
|
|
||||||
Install any additional dependencies needed by cpuminer-opt. The list below
|
Install any additional dependencies needed by cpuminer-opt. The list below
|
||||||
are some of the ones that may not be in the default install and need to
|
are some of the ones that may not be in the default install and need to
|
||||||
be installed manually. There may be others, read the error messages they
|
be installed manually. There may be others, read the compiler error messages,
|
||||||
will give a clue as to the missing package.
|
they will give a clue as to the missing package.
|
||||||
|
|
||||||
The following command should install everything you need on Debian based
|
The following command should install everything you need on Debian based
|
||||||
distributions such as Ubuntu. Fedora and other distributions may have similar
|
distributions such as Ubuntu. Fedora and other distributions may have similar
|
||||||
but different package names.
|
but different package names.
|
||||||
|
|
||||||
sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev
|
$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
|
||||||
|
|
||||||
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
|
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
|
||||||
openssl 1.1.0e or higher. Add one of the following, depending on the
|
openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
|
||||||
compiler version, to CFLAGS:
|
support depending on your CPU and compiler version:
|
||||||
"-march=native" or "-march=znver1" or "-msha".
|
|
||||||
|
"-march=native" is always the best choice
|
||||||
|
|
||||||
|
"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
|
||||||
|
|
||||||
|
"-msha" Add SHA to other tuning options
|
||||||
|
|
||||||
Additional instructions for static compilalation can be found here:
|
Additional instructions for static compilation can be found here:
|
||||||
https://lxadm.com/Static_compilation_of_cpuminer
|
https://lxadm.com/Static_compilation_of_cpuminer
|
||||||
Static builds should only considered in a homogeneous HW and SW environment.
|
Static builds should only be considered in a homogeneous HW and SW environment.
|
||||||
Local builds will always have the best performance and compatibility.
|
Local builds will always have the best performance and compatibility.
|
||||||
|
|
||||||
Extract cpuminer source.
|
3. Download cpuminer-opt
|
||||||
|
------------------------
|
||||||
|
|
||||||
tar xvzf cpuminer-opt-x.y.z.tar.gz
|
Download the source code for the latest release from the official repository.
|
||||||
cd cpuminer-opt-x.y.z
|
|
||||||
|
|
||||||
Run ./build.sh to build on Linux or execute the following commands.
|
https://github.com/JayDDee/cpuminer-opt/releases
|
||||||
|
|
||||||
./autogen.sh
|
Extract the source code.
|
||||||
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
|
||||||
make
|
|
||||||
|
|
||||||
Start mining.
|
$ tar xvzf cpuminer-opt-x.y.z.tar.gz
|
||||||
|
|
||||||
|
|
||||||
|
Alternatively it can be cloned from git.
|
||||||
|
|
||||||
|
$ git clone https://github.com/JayDDee/cpuminer-opt.git
|
||||||
|
|
||||||
|
4. Build cpuminer-opt
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
It is recommended to build with default options, this will usually
|
||||||
|
produce the best results.
|
||||||
|
|
||||||
|
$ ./build.sh to build on Linux or execute the following commands.
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
$ ./autogen.sh
|
||||||
|
$ CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
||||||
|
$ make -j n
|
||||||
|
|
||||||
|
n is the number of threads.
|
||||||
|
|
||||||
|
5. Start mining.
|
||||||
|
----------------
|
||||||
|
|
||||||
|
$ ./cpuminer -a algo -o url -u username -p password
|
||||||
|
|
||||||
./cpuminer -a algo -o url -u username -p password
|
|
||||||
|
|
||||||
Windows
|
Windows
|
||||||
|
-------
|
||||||
|
|
||||||
|
See also INSTALL_WINDOWS
|
||||||
|
|
||||||
|
The following procedure is obsolete and uses an old compiler.
|
||||||
|
|
||||||
Precompiled Windows binaries are built on a Linux host using Mingw
|
Precompiled Windows binaries are built on a Linux host using Mingw
|
||||||
with a more recent compiler than the following Windows hosted procedure.
|
with a more recent compiler than the following Windows hosted procedure.
|
||||||
|
@@ -124,6 +124,8 @@ cpuminer_SOURCES = \
|
|||||||
algo/luffa/luffa-hash-2way.c \
|
algo/luffa/luffa-hash-2way.c \
|
||||||
algo/lyra2/lyra2.c \
|
algo/lyra2/lyra2.c \
|
||||||
algo/lyra2/sponge.c \
|
algo/lyra2/sponge.c \
|
||||||
|
algo/lyra2/sponge-2way.c \
|
||||||
|
algo/lyra2/lyra2-hash-2way.c \
|
||||||
algo/lyra2/lyra2-gate.c \
|
algo/lyra2/lyra2-gate.c \
|
||||||
algo/lyra2/lyra2rev2.c \
|
algo/lyra2/lyra2rev2.c \
|
||||||
algo/lyra2/lyra2rev2-4way.c \
|
algo/lyra2/lyra2rev2-4way.c \
|
||||||
|
@@ -1,6 +1,8 @@
|
|||||||
cpuminer-opt is a console program run from the command line using the
|
cpuminer-opt is a console program run from the command line using the
|
||||||
keyboard, not the mouse.
|
keyboard, not the mouse.
|
||||||
|
|
||||||
|
See also README.md for the list of supported algorithms.
|
||||||
|
|
||||||
Security warning
|
Security warning
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
@@ -31,7 +33,16 @@ not supported. FreeBSD YMMV.
|
|||||||
Change Log
|
Change Log
|
||||||
----------
|
----------
|
||||||
|
|
||||||
v3.10.2
|
v3.10.4
|
||||||
|
|
||||||
|
AVX512 for x16r, x16rv2, x16rt, x16s, x16rt-veil (veil).
|
||||||
|
|
||||||
|
v3.10.3
|
||||||
|
|
||||||
|
AVX512 for x12, x13, x14, x15.
|
||||||
|
Fixed x12 AVX2 invalid shares.
|
||||||
|
|
||||||
|
v3.10.2
|
||||||
|
|
||||||
AVX512 added for bmw512, c11, phi1612 (phi), qubit, skunk, x11, x11gost (sib).
|
AVX512 added for bmw512, c11, phi1612 (phi), qubit, skunk, x11, x11gost (sib).
|
||||||
Fixed c11 AVX2 invalid shares.
|
Fixed c11 AVX2 invalid shares.
|
||||||
|
@@ -184,10 +184,10 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
|
|||||||
|
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
|
|
||||||
#define rotr32 mm256_swap32_64
|
#define rotr32( x ) mm256_ror_64( x, 32 )
|
||||||
#define rotr24 mm256_ror3x8_64
|
#define rotr24( x ) mm256_ror_64( x, 24 )
|
||||||
#define rotr16 mm256_ror1x16_64
|
#define rotr16( x ) mm256_ror_64( x, 16 )
|
||||||
#define rotr63( x ) mm256_rol_64( x, 1 )
|
#define rotr63( x ) mm256_rol_64( x, 1 )
|
||||||
|
|
||||||
//#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
|
//#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
|
||||||
//#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
|
//#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
|
||||||
|
@@ -70,19 +70,22 @@ typedef struct {
|
|||||||
// Default 14 rounds
|
// Default 14 rounds
|
||||||
typedef blake_4way_small_context blake256_4way_context;
|
typedef blake_4way_small_context blake256_4way_context;
|
||||||
void blake256_4way_init(void *ctx);
|
void blake256_4way_init(void *ctx);
|
||||||
void blake256_4way(void *ctx, const void *data, size_t len);
|
void blake256_4way_update(void *ctx, const void *data, size_t len);
|
||||||
|
#define blake256_4way blake256_4way_update
|
||||||
void blake256_4way_close(void *ctx, void *dst);
|
void blake256_4way_close(void *ctx, void *dst);
|
||||||
|
|
||||||
// 14 rounds, blake, decred
|
// 14 rounds, blake, decred
|
||||||
typedef blake_4way_small_context blake256r14_4way_context;
|
typedef blake_4way_small_context blake256r14_4way_context;
|
||||||
void blake256r14_4way_init(void *cc);
|
void blake256r14_4way_init(void *cc);
|
||||||
void blake256r14_4way(void *cc, const void *data, size_t len);
|
void blake256r14_4way_update(void *cc, const void *data, size_t len);
|
||||||
|
#define blake256r14_4way blake256r14_4way_update
|
||||||
void blake256r14_4way_close(void *cc, void *dst);
|
void blake256r14_4way_close(void *cc, void *dst);
|
||||||
|
|
||||||
// 8 rounds, blakecoin, vanilla
|
// 8 rounds, blakecoin, vanilla
|
||||||
typedef blake_4way_small_context blake256r8_4way_context;
|
typedef blake_4way_small_context blake256r8_4way_context;
|
||||||
void blake256r8_4way_init(void *cc);
|
void blake256r8_4way_init(void *cc);
|
||||||
void blake256r8_4way(void *cc, const void *data, size_t len);
|
void blake256r8_4way_update(void *cc, const void *data, size_t len);
|
||||||
|
#define blake256r8_4way blake256r8_4way_update
|
||||||
void blake256r8_4way_close(void *cc, void *dst);
|
void blake256r8_4way_close(void *cc, void *dst);
|
||||||
|
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
@@ -100,19 +103,21 @@ typedef struct {
|
|||||||
// Default 14 rounds
|
// Default 14 rounds
|
||||||
typedef blake_8way_small_context blake256_8way_context;
|
typedef blake_8way_small_context blake256_8way_context;
|
||||||
void blake256_8way_init(void *cc);
|
void blake256_8way_init(void *cc);
|
||||||
void blake256_8way(void *cc, const void *data, size_t len);
|
void blake256_8way_update(void *cc, const void *data, size_t len);
|
||||||
|
#define blake256_8way blake256_8way_update
|
||||||
void blake256_8way_close(void *cc, void *dst);
|
void blake256_8way_close(void *cc, void *dst);
|
||||||
|
|
||||||
// 14 rounds, blake, decred
|
// 14 rounds, blake, decred
|
||||||
typedef blake_8way_small_context blake256r14_8way_context;
|
typedef blake_8way_small_context blake256r14_8way_context;
|
||||||
void blake256r14_8way_init(void *cc);
|
void blake256r14_8way_init(void *cc);
|
||||||
void blake256r14_8way(void *cc, const void *data, size_t len);
|
void blake256r14_8way_update(void *cc, const void *data, size_t len);
// Keep the pre-rename name usable, as is done for every other blake256
// 4-way / 8-way update function in this header.
#define blake256r14_8way blake256r14_8way_update
|
||||||
void blake256r14_8way_close(void *cc, void *dst);
|
void blake256r14_8way_close(void *cc, void *dst);
|
||||||
|
|
||||||
// 8 rounds, blakecoin, vanilla
|
// 8 rounds, blakecoin, vanilla
|
||||||
typedef blake_8way_small_context blake256r8_8way_context;
|
typedef blake_8way_small_context blake256r8_8way_context;
|
||||||
void blake256r8_8way_init(void *cc);
|
void blake256r8_8way_init(void *cc);
|
||||||
void blake256r8_8way(void *cc, const void *data, size_t len);
|
void blake256r8_8way_update(void *cc, const void *data, size_t len);
|
||||||
|
#define blake256r8_8way blake256r8_8way_update
|
||||||
void blake256r8_8way_close(void *cc, void *dst);
|
void blake256r8_8way_close(void *cc, void *dst);
|
||||||
|
|
||||||
// Blake-512 4 way
|
// Blake-512 4 way
|
||||||
|
@@ -634,7 +634,7 @@ do { \
|
|||||||
m256_const1_64( 0x082EFA98082EFA98 ) ); \
|
m256_const1_64( 0x082EFA98082EFA98 ) ); \
|
||||||
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
|
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
|
||||||
m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
|
m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
|
||||||
shuf_bswap32 = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
|
shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
|
||||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||||
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
|
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
|
||||||
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
|
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
|
||||||
@@ -1184,7 +1184,7 @@ blake256_16way_update(void *cc, const void *data, size_t len)
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256_16way_close_update(void *cc, void *dst)
|
blake256_16way_close(void *cc, void *dst)
|
||||||
{
|
{
|
||||||
blake32_16way_close(cc, 0, 0, dst, 8);
|
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||||
}
|
}
|
||||||
@@ -1259,7 +1259,7 @@ blake256_8way_init(void *cc)
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256_8way(void *cc, const void *data, size_t len)
|
blake256_8way_update(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
blake32_8way(cc, data, len);
|
blake32_8way(cc, data, len);
|
||||||
}
|
}
|
||||||
@@ -1279,7 +1279,7 @@ void blake256r14_4way_init(void *cc)
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256r14_4way(void *cc, const void *data, size_t len)
|
blake256r14_4way_update(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
blake32_4way(cc, data, len);
|
blake32_4way(cc, data, len);
|
||||||
}
|
}
|
||||||
@@ -1298,7 +1298,7 @@ void blake256r14_8way_init(void *cc)
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256r14_8way(void *cc, const void *data, size_t len)
|
blake256r14_8way_update(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
blake32_8way(cc, data, len);
|
blake32_8way(cc, data, len);
|
||||||
}
|
}
|
||||||
@@ -1318,7 +1318,7 @@ void blake256r8_4way_init(void *cc)
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256r8_4way(void *cc, const void *data, size_t len)
|
blake256r8_4way_update(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
blake32_4way(cc, data, len);
|
blake32_4way(cc, data, len);
|
||||||
}
|
}
|
||||||
@@ -1337,7 +1337,7 @@ void blake256r8_8way_init(void *cc)
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
blake256r8_8way(void *cc, const void *data, size_t len)
|
blake256r8_8way_update(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
blake32_8way(cc, data, len);
|
blake32_8way(cc, data, len);
|
||||||
}
|
}
|
||||||
|
@@ -64,10 +64,10 @@ static void transform_4way( cube_4way_context *sp )
|
|||||||
x1 = _mm512_xor_si512( x1, x5 );
|
x1 = _mm512_xor_si512( x1, x5 );
|
||||||
x2 = _mm512_xor_si512( x2, x6 );
|
x2 = _mm512_xor_si512( x2, x6 );
|
||||||
x3 = _mm512_xor_si512( x3, x7 );
|
x3 = _mm512_xor_si512( x3, x7 );
|
||||||
x4 = mm512_swap64_128( x4 );
|
x4 = mm512_swap128_64( x4 );
|
||||||
x5 = mm512_swap64_128( x5 );
|
x5 = mm512_swap128_64( x5 );
|
||||||
x6 = mm512_swap64_128( x6 );
|
x6 = mm512_swap128_64( x6 );
|
||||||
x7 = mm512_swap64_128( x7 );
|
x7 = mm512_swap128_64( x7 );
|
||||||
x4 = _mm512_add_epi32( x0, x4 );
|
x4 = _mm512_add_epi32( x0, x4 );
|
||||||
x5 = _mm512_add_epi32( x1, x5 );
|
x5 = _mm512_add_epi32( x1, x5 );
|
||||||
x6 = _mm512_add_epi32( x2, x6 );
|
x6 = _mm512_add_epi32( x2, x6 );
|
||||||
@@ -82,10 +82,10 @@ static void transform_4way( cube_4way_context *sp )
|
|||||||
x1 = _mm512_xor_si512( x1, x5 );
|
x1 = _mm512_xor_si512( x1, x5 );
|
||||||
x2 = _mm512_xor_si512( x2, x6 );
|
x2 = _mm512_xor_si512( x2, x6 );
|
||||||
x3 = _mm512_xor_si512( x3, x7 );
|
x3 = _mm512_xor_si512( x3, x7 );
|
||||||
x4 = mm512_swap32_64( x4 );
|
x4 = mm512_swap64_32( x4 );
|
||||||
x5 = mm512_swap32_64( x5 );
|
x5 = mm512_swap64_32( x5 );
|
||||||
x6 = mm512_swap32_64( x6 );
|
x6 = mm512_swap64_32( x6 );
|
||||||
x7 = mm512_swap32_64( x7 );
|
x7 = mm512_swap64_32( x7 );
|
||||||
}
|
}
|
||||||
|
|
||||||
_mm512_store_si512( (__m512i*)sp->h, x0 );
|
_mm512_store_si512( (__m512i*)sp->h, x0 );
|
||||||
@@ -239,10 +239,10 @@ static void transform_2way( cube_2way_context *sp )
|
|||||||
x1 = _mm256_xor_si256( x1, x5 );
|
x1 = _mm256_xor_si256( x1, x5 );
|
||||||
x2 = _mm256_xor_si256( x2, x6 );
|
x2 = _mm256_xor_si256( x2, x6 );
|
||||||
x3 = _mm256_xor_si256( x3, x7 );
|
x3 = _mm256_xor_si256( x3, x7 );
|
||||||
x4 = mm256_swap64_128( x4 );
|
x4 = mm256_swap128_64( x4 );
|
||||||
x5 = mm256_swap64_128( x5 );
|
x5 = mm256_swap128_64( x5 );
|
||||||
x6 = mm256_swap64_128( x6 );
|
x6 = mm256_swap128_64( x6 );
|
||||||
x7 = mm256_swap64_128( x7 );
|
x7 = mm256_swap128_64( x7 );
|
||||||
x4 = _mm256_add_epi32( x0, x4 );
|
x4 = _mm256_add_epi32( x0, x4 );
|
||||||
x5 = _mm256_add_epi32( x1, x5 );
|
x5 = _mm256_add_epi32( x1, x5 );
|
||||||
x6 = _mm256_add_epi32( x2, x6 );
|
x6 = _mm256_add_epi32( x2, x6 );
|
||||||
@@ -257,10 +257,10 @@ static void transform_2way( cube_2way_context *sp )
|
|||||||
x1 = _mm256_xor_si256( x1, x5 );
|
x1 = _mm256_xor_si256( x1, x5 );
|
||||||
x2 = _mm256_xor_si256( x2, x6 );
|
x2 = _mm256_xor_si256( x2, x6 );
|
||||||
x3 = _mm256_xor_si256( x3, x7 );
|
x3 = _mm256_xor_si256( x3, x7 );
|
||||||
x4 = mm256_swap32_64( x4 );
|
x4 = mm256_swap64_32( x4 );
|
||||||
x5 = mm256_swap32_64( x5 );
|
x5 = mm256_swap64_32( x5 );
|
||||||
x6 = mm256_swap32_64( x6 );
|
x6 = mm256_swap64_32( x6 );
|
||||||
x7 = mm256_swap32_64( x7 );
|
x7 = mm256_swap64_32( x7 );
|
||||||
}
|
}
|
||||||
|
|
||||||
_mm256_store_si256( (__m256i*)sp->h, x0 );
|
_mm256_store_si256( (__m256i*)sp->h, x0 );
|
||||||
|
@@ -39,8 +39,8 @@ static void transform( cubehashParam *sp )
|
|||||||
x1 = mm256_rol_32( y0, 7 );
|
x1 = mm256_rol_32( y0, 7 );
|
||||||
x0 = _mm256_xor_si256( x0, x2 );
|
x0 = _mm256_xor_si256( x0, x2 );
|
||||||
x1 = _mm256_xor_si256( x1, x3 );
|
x1 = _mm256_xor_si256( x1, x3 );
|
||||||
x2 = mm256_swap64_128( x2 );
|
x2 = mm256_swap128_64( x2 );
|
||||||
x3 = mm256_swap64_128( x3 );
|
x3 = mm256_swap128_64( x3 );
|
||||||
x2 = _mm256_add_epi32( x0, x2 );
|
x2 = _mm256_add_epi32( x0, x2 );
|
||||||
x3 = _mm256_add_epi32( x1, x3 );
|
x3 = _mm256_add_epi32( x1, x3 );
|
||||||
y0 = mm256_swap_128( x0 );
|
y0 = mm256_swap_128( x0 );
|
||||||
@@ -49,8 +49,8 @@ static void transform( cubehashParam *sp )
|
|||||||
x1 = mm256_rol_32( y1, 11 );
|
x1 = mm256_rol_32( y1, 11 );
|
||||||
x0 = _mm256_xor_si256( x0, x2 );
|
x0 = _mm256_xor_si256( x0, x2 );
|
||||||
x1 = _mm256_xor_si256( x1, x3 );
|
x1 = _mm256_xor_si256( x1, x3 );
|
||||||
x2 = mm256_swap32_64( x2 );
|
x2 = mm256_swap64_32( x2 );
|
||||||
x3 = mm256_swap32_64( x3 );
|
x3 = mm256_swap64_32( x3 );
|
||||||
}
|
}
|
||||||
|
|
||||||
_mm256_store_si256( (__m256i*)sp->x, x0 );
|
_mm256_store_si256( (__m256i*)sp->x, x0 );
|
||||||
|
@@ -528,6 +528,346 @@ static const sph_u32 T512[64][16] = {
|
|||||||
SPH_C32(0xe7e00a94) }
|
SPH_C32(0xe7e00a94) }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Hamsi-512 working state s0..sF.
// The sixteen state words are not separate variables: each one is an alias
// for either an m register (m0..m7, filled by the INPUT_BIG* macros) or a
// c register (c0..c7, loaded from the context's h[] array).
#define s0   m0
#define s1   c0
#define s2   m1
#define s3   c1

#define s4   c2
#define s5   m2
#define s6   c3
#define s7   m3

#define s8   m4
#define s9   c4
#define sA   m5
#define sB   c5

#define sC   c6
#define sD   m6
#define sE   c7
#define sF   m7
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
// Hamsi 8 way
|
||||||
|
|
||||||
|
// INPUT_BIG8 -- expand one 8-way input vector into m0..m7.
//
// Requires in the enclosing scope: `buf` (pointer to the current __m512i
// input vector), the T512 table, and __m512i variables m0..m7, which are
// overwritten.  The 64 bits of every 64-bit lane are consumed starting with
// the least significant bit; for bit number u, the eight 64-bit words of
// T512 row u are XORed into m0..m7 in the lanes where that bit is set.
#define INPUT_BIG8 \
do { \
   __m512i in_bits = *buf; \
   const uint64_t *row = (const uint64_t*)&T512[0][0]; \
   m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
   for ( int bit = 0; bit < 64; bit++, row += 8 ) \
   { \
      /* Isolate the current low bit of each 64-bit lane, copy it into   */ \
      /* both 32-bit halves, then negate per 32-bit element (presumably  */ \
      /* yielding an all-ones / all-zero select mask -- see              */ \
      /* mm512_negate_32).                                               */ \
      __m512i sel = _mm512_and_si512( in_bits, m512_one_64 ); \
      sel = mm512_negate_32( _mm512_or_si512( sel, \
                                       _mm512_slli_epi64( sel, 32 ) ) ); \
      m0 = _mm512_xor_si512( m0, _mm512_and_si512( sel, \
                                       m512_const1_64( row[0] ) ) ); \
      m1 = _mm512_xor_si512( m1, _mm512_and_si512( sel, \
                                       m512_const1_64( row[1] ) ) ); \
      m2 = _mm512_xor_si512( m2, _mm512_and_si512( sel, \
                                       m512_const1_64( row[2] ) ) ); \
      m3 = _mm512_xor_si512( m3, _mm512_and_si512( sel, \
                                       m512_const1_64( row[3] ) ) ); \
      m4 = _mm512_xor_si512( m4, _mm512_and_si512( sel, \
                                       m512_const1_64( row[4] ) ) ); \
      m5 = _mm512_xor_si512( m5, _mm512_and_si512( sel, \
                                       m512_const1_64( row[5] ) ) ); \
      m6 = _mm512_xor_si512( m6, _mm512_and_si512( sel, \
                                       m512_const1_64( row[6] ) ) ); \
      m7 = _mm512_xor_si512( m7, _mm512_and_si512( sel, \
                                       m512_const1_64( row[7] ) ) ); \
      /* expose the next input bit */ \
      in_bits = _mm512_srli_epi64( in_bits, 1 ); \
   } \
} while (0)
|
||||||
|
|
||||||
|
// SBOX8 -- substitution step on four __m512i state words, in place.
//
// Uses only AND / OR / XOR / NOT, so every bit position of every lane is
// processed independently.  All four arguments must be modifiable lvalues;
// each is read and written several times.
#define SBOX8( a, b, c, d ) \
do { \
   __m512i sbox_t = a; \
   a = _mm512_xor_si512( _mm512_and_si512( a, c ), d ); \
   c = _mm512_xor_si512( _mm512_xor_si512( c, b ), a ); \
   d = _mm512_xor_si512( _mm512_or_si512( d, sbox_t ), b ); \
   sbox_t = _mm512_xor_si512( sbox_t, c ); \
   b = d; \
   d = _mm512_xor_si512( _mm512_or_si512( d, sbox_t ), a ); \
   a = _mm512_and_si512( a, b ); \
   sbox_t = _mm512_xor_si512( sbox_t, a ); \
   b = _mm512_xor_si512( _mm512_xor_si512( b, d ), sbox_t ); \
   /* final output permutation */ \
   a = c; \
   c = b; \
   b = d; \
   d = mm512_not( sbox_t ); \
} while (0)
|
||||||
|
|
||||||
|
// L8 -- linear mixing step on four __m512i words, in place.
//
// Built only from 32-bit rotates (mm512_rol_32), 32-bit left shifts and
// XORs.  All four arguments must be modifiable lvalues.
#define L8( a, b, c, d ) \
do { \
   a = mm512_rol_32( a, 13 ); \
   c = mm512_rol_32( c,  3 ); \
   b = _mm512_xor_si512( _mm512_xor_si512( b, a ), c ); \
   d = _mm512_xor_si512( _mm512_xor_si512( d, c ), \
                         _mm512_slli_epi32( a, 3 ) ); \
   b = mm512_rol_32( b, 1 ); \
   d = mm512_rol_32( d, 7 ); \
   a = _mm512_xor_si512( _mm512_xor_si512( a, b ), d ); \
   c = _mm512_xor_si512( _mm512_xor_si512( c, d ), \
                         _mm512_slli_epi32( b, 7 ) ); \
   a = mm512_rol_32( a,  5 ); \
   c = mm512_rol_32( c, 22 ); \
} while (0)
|
||||||
|
|
||||||
|
// Declares the eight registers c0..c7 used by the other *_BIG8 macros.
// The terminating semicolon is part of the macro, so it is invoked bare.
#define DECL_STATE_BIG8 \
   __m512i c0, c1, c2, c3; \
   __m512i c4, c5, c6, c7;
|
||||||
|
|
||||||
|
// Load the context's h[0..7] into the local registers c0..c7.
#define READ_STATE_BIG8(sc) \
do { \
   c0 = (sc)->h[0]; \
   c1 = (sc)->h[1]; \
   c2 = (sc)->h[2]; \
   c3 = (sc)->h[3]; \
   c4 = (sc)->h[4]; \
   c5 = (sc)->h[5]; \
   c6 = (sc)->h[6]; \
   c7 = (sc)->h[7]; \
} while (0)
|
||||||
|
|
||||||
|
// Store the local registers c0..c7 back into the context's h[0..7].
#define WRITE_STATE_BIG8(sc) \
do { \
   (sc)->h[0] = c0; \
   (sc)->h[1] = c1; \
   (sc)->h[2] = c2; \
   (sc)->h[3] = c3; \
   (sc)->h[4] = c4; \
   (sc)->h[5] = c5; \
   (sc)->h[6] = c6; \
   (sc)->h[7] = c7; \
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
// ROUND_BIG8 -- one 8-way round on the state s0..sF.
//
//   rc     round counter; shifted into the upper 32 bits of the first
//          64-bit round constant
//   alpha  round-constant table, read as sixteen 64-bit words
//
// Steps: XOR the constants into s0..sF, run SBOX8 on the four groups
// (s_i, s_i+4, s_i+8, s_i+C), then run L8 six times.  Each L8 call works on
// operands gathered with 4-byte shifts and 32-bit blends (masks 0xaaaa and
// 0x5555), and the results are blended back into the state afterwards.
// The statement order below must not be changed.
#define ROUND_BIG8(rc, alpha) \
do { \
   __m512i t0, t1, t2, t3; \
   const uint64_t *rk = (const uint64_t*)(alpha); \
\
   /* add round constants; rc goes into the high half of word 0 */ \
   s0 = _mm512_xor_si512( s0, m512_const1_64( \
                              ( (uint64_t)(rc) << 32 ) ^ rk[ 0] ) ); \
   s1 = _mm512_xor_si512( s1, m512_const1_64( rk[ 1] ) ); \
   s2 = _mm512_xor_si512( s2, m512_const1_64( rk[ 2] ) ); \
   s3 = _mm512_xor_si512( s3, m512_const1_64( rk[ 3] ) ); \
   s4 = _mm512_xor_si512( s4, m512_const1_64( rk[ 4] ) ); \
   s5 = _mm512_xor_si512( s5, m512_const1_64( rk[ 5] ) ); \
   s6 = _mm512_xor_si512( s6, m512_const1_64( rk[ 6] ) ); \
   s7 = _mm512_xor_si512( s7, m512_const1_64( rk[ 7] ) ); \
   s8 = _mm512_xor_si512( s8, m512_const1_64( rk[ 8] ) ); \
   s9 = _mm512_xor_si512( s9, m512_const1_64( rk[ 9] ) ); \
   sA = _mm512_xor_si512( sA, m512_const1_64( rk[10] ) ); \
   sB = _mm512_xor_si512( sB, m512_const1_64( rk[11] ) ); \
   sC = _mm512_xor_si512( sC, m512_const1_64( rk[12] ) ); \
   sD = _mm512_xor_si512( sD, m512_const1_64( rk[13] ) ); \
   sE = _mm512_xor_si512( sE, m512_const1_64( rk[14] ) ); \
   sF = _mm512_xor_si512( sF, m512_const1_64( rk[15] ) ); \
\
   /* substitution layer */ \
   SBOX8( s0, s4, s8, sC ); \
   SBOX8( s1, s5, s9, sD ); \
   SBOX8( s2, s6, sA, sE ); \
   SBOX8( s3, s7, sB, sF ); \
\
   /* L8 #1: s0, (s4|s5), s9, (sD|sE) */ \
   t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \
                                         _mm512_bslli_epi128( s5, 4 ) ); \
   t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \
                                         _mm512_bslli_epi128( sE, 4 ) ); \
   L8( s0, t1, s9, t3 ); \
   s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \
   s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \
   sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \
   sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \
\
   /* L8 #2: s1, (s5|s6), sA, (sE|sF) */ \
   t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
                                         _mm512_bslli_epi128( s6, 4 ) ); \
   t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \
                                         _mm512_bslli_epi128( sF, 4 ) ); \
   L8( s1, t1, sA, t3 ); \
   s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
   s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \
   sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \
   sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \
\
   /* L8 #3: s2, (s6|s7), sB, (sF|sC) */ \
   t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \
                                         _mm512_bslli_epi128( s7, 4 ) ); \
   t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \
                                         _mm512_bslli_epi128( sC, 4 ) ); \
   L8( s2, t1, sB, t3 ); \
   s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \
   s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \
   sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \
   sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \
\
   /* L8 #4: s3, (s7|s4), s8, (sC|sD) */ \
   t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \
                                         _mm512_bslli_epi128( s4, 4 ) ); \
   t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \
                                         _mm512_bslli_epi128( sD, 4 ) ); \
   L8( s3, t1, s8, t3 ); \
   s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \
   s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \
   sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \
   sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \
\
   /* L8 #5: operands gathered from (s0|s8), (s1|s9), (s2|sA), (s3|sB) */ \
   t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \
   t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
   t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \
   t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \
                                         _mm512_bslli_epi128( sB, 4 ) ); \
   L8( t0, t1, t2, t3 ); \
   s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
   s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \
   s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
   s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
   s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \
   sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
   s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \
   sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \
\
   /* L8 #6: operands gathered from (s4|sC), (s5|sD), (s6|sE), (s7|sF) */ \
   t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \
   t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
                                         _mm512_bslli_epi128( sD, 4 ) ); \
   t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \
   t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
   L8( t0, t1, t2, t3 ); \
   s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \
   sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
   s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
   sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \
   s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
   sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \
   s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
   sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
} while (0)
|
||||||
|
|
||||||
|
// P_BIG8 -- the per-block permutation: six rounds (counters 0..5) using the
// alpha_n constants.  Kept fully unrolled.
#define P_BIG8 \
do { \
   ROUND_BIG8( 0, alpha_n ); \
   ROUND_BIG8( 1, alpha_n ); \
   ROUND_BIG8( 2, alpha_n ); \
   ROUND_BIG8( 3, alpha_n ); \
   ROUND_BIG8( 4, alpha_n ); \
   ROUND_BIG8( 5, alpha_n ); \
} while (0)
|
||||||
|
|
||||||
|
// PF_BIG8 -- the final permutation: twelve rounds (counters 0..11) using the
// alpha_f constants.  Kept fully unrolled.
#define PF_BIG8 \
do { \
   ROUND_BIG8(  0, alpha_f ); \
   ROUND_BIG8(  1, alpha_f ); \
   ROUND_BIG8(  2, alpha_f ); \
   ROUND_BIG8(  3, alpha_f ); \
   ROUND_BIG8(  4, alpha_f ); \
   ROUND_BIG8(  5, alpha_f ); \
   ROUND_BIG8(  6, alpha_f ); \
   ROUND_BIG8(  7, alpha_f ); \
   ROUND_BIG8(  8, alpha_f ); \
   ROUND_BIG8(  9, alpha_f ); \
   ROUND_BIG8( 10, alpha_f ); \
   ROUND_BIG8( 11, alpha_f ); \
} while (0)
|
||||||
|
|
||||||
|
// T_BIG8 -- fold the permuted state back into the chaining value.
//
// XORs s0..s3 and s8..sB into sc->h[0..7] and mirrors the result into
// c0..c7.  Several of the s words read here are aliases of c registers
// (s1=c0, s3=c1, s9=c4, sB=c5), so the updates run from index 7 down to 0:
// each aliased c register is read before it is overwritten.
#define T_BIG8 \
do { \
   sc->h[7] = _mm512_xor_si512( sc->h[7], sB );   c7 = sc->h[7]; \
   sc->h[6] = _mm512_xor_si512( sc->h[6], sA );   c6 = sc->h[6]; \
   sc->h[5] = _mm512_xor_si512( sc->h[5], s9 );   c5 = sc->h[5]; \
   sc->h[4] = _mm512_xor_si512( sc->h[4], s8 );   c4 = sc->h[4]; \
   sc->h[3] = _mm512_xor_si512( sc->h[3], s3 );   c3 = sc->h[3]; \
   sc->h[2] = _mm512_xor_si512( sc->h[2], s2 );   c2 = sc->h[2]; \
   sc->h[1] = _mm512_xor_si512( sc->h[1], s1 );   c1 = sc->h[1]; \
   sc->h[0] = _mm512_xor_si512( sc->h[0], s0 );   c0 = sc->h[0]; \
} while (0)
|
||||||
|
|
||||||
|
// Absorb `num` consecutive input vectors from `buf` into the 8-way context.
//
//   sc   context; count_low / count_high and h[] are updated
//   buf  first input vector; one __m512i is consumed per block
//   num  number of blocks to process (may be 0)
//
// The running count is advanced by 64 per block (num << 6), with the carry
// out of the low word propagated into count_high.
void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
{
   DECL_STATE_BIG8
   const uint32_t added = num << 6;

   sc->count_low = SPH_T32( sc->count_low + added );
   sc->count_high += (sph_u32)( (num >> 13) >> 13 );
   if ( sc->count_low < added )     // low word wrapped around
      sc->count_high++;

   READ_STATE_BIG8( sc );
   for ( ; num > 0; num--, buf++ )
   {
      __m512i m0, m1, m2, m3, m4, m5, m6, m7;

      INPUT_BIG8;     // expand *buf into m0..m7
      P_BIG8;         // 6-round permutation
      T_BIG8;         // fold into sc->h[] and c0..c7
   }
   WRITE_STATE_BIG8( sc );
}
|
||||||
|
|
||||||
|
void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
|
||||||
|
{
|
||||||
|
__m512i m0, m1, m2, m3, m4, m5, m6, m7;
|
||||||
|
DECL_STATE_BIG8
|
||||||
|
READ_STATE_BIG8( sc );
|
||||||
|
INPUT_BIG8;
|
||||||
|
PF_BIG8;
|
||||||
|
T_BIG8;
|
||||||
|
WRITE_STATE_BIG8( sc );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void hamsi512_8way_init( hamsi_8way_big_context *sc )
|
||||||
|
{
|
||||||
|
sc->partial_len = 0;
|
||||||
|
sc->count_high = sc->count_low = 0;
|
||||||
|
|
||||||
|
sc->h[0] = m512_const1_64( 0x6c70617273746565 );
|
||||||
|
sc->h[1] = m512_const1_64( 0x656e62656b204172 );
|
||||||
|
sc->h[2] = m512_const1_64( 0x302c206272672031 );
|
||||||
|
sc->h[3] = m512_const1_64( 0x3434362c75732032 );
|
||||||
|
sc->h[4] = m512_const1_64( 0x3030312020422d33 );
|
||||||
|
sc->h[5] = m512_const1_64( 0x656e2d484c657576 );
|
||||||
|
sc->h[6] = m512_const1_64( 0x6c65652c65766572 );
|
||||||
|
sc->h[7] = m512_const1_64( 0x6769756d2042656c );
|
||||||
|
}
|
||||||
|
|
||||||
|
void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
|
||||||
|
size_t len )
|
||||||
|
{
|
||||||
|
__m512i *vdata = (__m512i*)data;
|
||||||
|
|
||||||
|
hamsi_8way_big( sc, vdata, len>>3 );
|
||||||
|
vdata += ( (len& ~(size_t)7) >> 3 );
|
||||||
|
len &= (size_t)7;
|
||||||
|
memcpy_512( sc->buf, vdata, len>>3 );
|
||||||
|
sc->partial_len = len;
|
||||||
|
}
|
||||||
|
|
||||||
|
void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
|
||||||
|
{
|
||||||
|
__m512i pad[1];
|
||||||
|
int ch, cl;
|
||||||
|
|
||||||
|
sph_enc32be( &ch, sc->count_high );
|
||||||
|
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
|
||||||
|
pad[0] = _mm512_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch,
|
||||||
|
cl, ch, cl, ch, cl, ch, cl, ch );
|
||||||
|
// pad[0] = m512_const2_32( cl, ch );
|
||||||
|
sc->buf[0] = m512_const1_64( 0x80 );
|
||||||
|
hamsi_8way_big( sc, sc->buf, 1 );
|
||||||
|
hamsi_8way_big_final( sc, pad );
|
||||||
|
|
||||||
|
mm512_block_bswap_32( (__m512i*)dst, sc->h );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#endif // AVX512
|
||||||
|
|
||||||
|
|
||||||
|
// Hamsi 4 way
|
||||||
|
|
||||||
#define INPUT_BIG \
|
#define INPUT_BIG \
|
||||||
do { \
|
do { \
|
||||||
@@ -627,6 +967,7 @@ do { \
|
|||||||
sc->h[0x7] = c7; \
|
sc->h[0x7] = c7; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
/*
|
||||||
#define s0 m0
|
#define s0 m0
|
||||||
#define s1 c0
|
#define s1 c0
|
||||||
#define s2 m1
|
#define s2 m1
|
||||||
@@ -643,42 +984,28 @@ do { \
|
|||||||
#define sD m6
|
#define sD m6
|
||||||
#define sE c7
|
#define sE c7
|
||||||
#define sF m7
|
#define sF m7
|
||||||
|
*/
|
||||||
|
|
||||||
#define ROUND_BIG(rc, alpha) \
|
#define ROUND_BIG(rc, alpha) \
|
||||||
do { \
|
do { \
|
||||||
__m256i t0, t1, t2, t3; \
|
__m256i t0, t1, t2, t3; \
|
||||||
s0 = _mm256_xor_si256( s0, m256_const1_64( \
|
s0 = _mm256_xor_si256( s0, m256_const1_64( \
|
||||||
( ( (uint64_t)( (rc) ^ alpha[1] ) << 32 ) ) | (uint64_t)alpha[0] ) ); \
|
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
|
||||||
s1 = _mm256_xor_si256( s1, m256_const1_64( \
|
s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
|
||||||
( (uint64_t)alpha[ 3] << 32 ) | (uint64_t)alpha[ 2] ) ); \
|
s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
|
||||||
s2 = _mm256_xor_si256( s2, m256_const1_64( \
|
s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
|
||||||
( (uint64_t)alpha[ 5] << 32 ) | (uint64_t)alpha[ 4] ) ); \
|
s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
|
||||||
s3 = _mm256_xor_si256( s3, m256_const1_64( \
|
s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
|
||||||
( (uint64_t)alpha[ 7] << 32 ) | (uint64_t)alpha[ 6] ) ); \
|
s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
|
||||||
s4 = _mm256_xor_si256( s4, m256_const1_64( \
|
s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
|
||||||
( (uint64_t)alpha[ 9] << 32 ) | (uint64_t)alpha[ 8] ) ); \
|
s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
|
||||||
s5 = _mm256_xor_si256( s5, m256_const1_64( \
|
s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
|
||||||
( (uint64_t)alpha[11] << 32 ) | (uint64_t)alpha[10] ) ); \
|
sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
|
||||||
s6 = _mm256_xor_si256( s6, m256_const1_64( \
|
sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
|
||||||
( (uint64_t)alpha[13] << 32 ) | (uint64_t)alpha[12] ) ); \
|
sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
|
||||||
s7 = _mm256_xor_si256( s7, m256_const1_64( \
|
sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
|
||||||
( (uint64_t)alpha[15] << 32 ) | (uint64_t)alpha[14] ) ); \
|
sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
|
||||||
s8 = _mm256_xor_si256( s8, m256_const1_64( \
|
sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
|
||||||
( (uint64_t)alpha[17] << 32 ) | (uint64_t)alpha[16] ) ); \
|
|
||||||
s9 = _mm256_xor_si256( s9, m256_const1_64( \
|
|
||||||
( (uint64_t)alpha[19] << 32 ) | (uint64_t)alpha[18] ) ); \
|
|
||||||
sA = _mm256_xor_si256( sA, m256_const1_64( \
|
|
||||||
( (uint64_t)alpha[21] << 32 ) | (uint64_t)alpha[20] ) ); \
|
|
||||||
sB = _mm256_xor_si256( sB, m256_const1_64( \
|
|
||||||
( (uint64_t)alpha[23] << 32 ) | (uint64_t)alpha[22] ) ); \
|
|
||||||
sC = _mm256_xor_si256( sC, m256_const1_64( \
|
|
||||||
( (uint64_t)alpha[25] << 32 ) | (uint64_t)alpha[24] ) ); \
|
|
||||||
sD = _mm256_xor_si256( sD, m256_const1_64( \
|
|
||||||
( (uint64_t)alpha[27] << 32 ) | (uint64_t)alpha[26] ) ); \
|
|
||||||
sE = _mm256_xor_si256( sE, m256_const1_64( \
|
|
||||||
( (uint64_t)alpha[29] << 32 ) | (uint64_t)alpha[28] ) ); \
|
|
||||||
sF = _mm256_xor_si256( sF, m256_const1_64( \
|
|
||||||
( (uint64_t)alpha[31] << 32 ) | (uint64_t)alpha[30] ) ); \
|
|
||||||
\
|
\
|
||||||
SBOX( s0, s4, s8, sC ); \
|
SBOX( s0, s4, s8, sC ); \
|
||||||
SBOX( s1, s5, s9, sD ); \
|
SBOX( s1, s5, s9, sD ); \
|
||||||
|
@@ -60,9 +60,32 @@ typedef struct {
|
|||||||
typedef hamsi_4way_big_context hamsi512_4way_context;
|
typedef hamsi_4way_big_context hamsi512_4way_context;
|
||||||
|
|
||||||
void hamsi512_4way_init( hamsi512_4way_context *sc );
|
void hamsi512_4way_init( hamsi512_4way_context *sc );
|
||||||
void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
|
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
|
||||||
|
size_t len );
|
||||||
|
#define hamsi512_4way hamsi512_4way_update
|
||||||
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
|
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
__m512i h[8];
|
||||||
|
__m512i buf[1];
|
||||||
|
size_t partial_len;
|
||||||
|
sph_u32 count_high, count_low;
|
||||||
|
} hamsi_8way_big_context;
|
||||||
|
|
||||||
|
typedef hamsi_8way_big_context hamsi512_8way_context;
|
||||||
|
|
||||||
|
void hamsi512_8way_init( hamsi512_8way_context *sc );
|
||||||
|
void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
|
||||||
|
size_t len );
|
||||||
|
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@@ -44,8 +44,13 @@ bool lyra2rev3_thread_init()
|
|||||||
{
|
{
|
||||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
|
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
|
||||||
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||||
|
int size = ROW_LEN_BYTES * 4; // nRows;
|
||||||
|
|
||||||
int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
|
#if defined(LYRA2REV3_16WAY)
|
||||||
|
// l2v3_wholeMatrix = _mm_malloc( 2*size, 128 );
|
||||||
|
l2v3_wholeMatrix = _mm_malloc( 2*size, 64 );
|
||||||
|
init_lyra2rev3_16way_ctx();;
|
||||||
|
#else
|
||||||
l2v3_wholeMatrix = _mm_malloc( size, 64 );
|
l2v3_wholeMatrix = _mm_malloc( size, 64 );
|
||||||
#if defined (LYRA2REV3_8WAY)
|
#if defined (LYRA2REV3_8WAY)
|
||||||
init_lyra2rev3_8way_ctx();;
|
init_lyra2rev3_8way_ctx();;
|
||||||
@@ -53,13 +58,17 @@ bool lyra2rev3_thread_init()
|
|||||||
init_lyra2rev3_4way_ctx();;
|
init_lyra2rev3_4way_ctx();;
|
||||||
#else
|
#else
|
||||||
init_lyra2rev3_ctx();
|
init_lyra2rev3_ctx();
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
return l2v3_wholeMatrix;
|
return l2v3_wholeMatrix;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool register_lyra2rev3_algo( algo_gate_t* gate )
|
bool register_lyra2rev3_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined (LYRA2REV3_8WAY)
|
#if defined(LYRA2REV3_16WAY)
|
||||||
|
gate->scanhash = (void*)&scanhash_lyra2rev3_16way;
|
||||||
|
gate->hash = (void*)&lyra2rev3_16way_hash;
|
||||||
|
#elif defined (LYRA2REV3_8WAY)
|
||||||
gate->scanhash = (void*)&scanhash_lyra2rev3_8way;
|
gate->scanhash = (void*)&scanhash_lyra2rev3_8way;
|
||||||
gate->hash = (void*)&lyra2rev3_8way_hash;
|
gate->hash = (void*)&lyra2rev3_8way_hash;
|
||||||
#elif defined (LYRA2REV3_4WAY)
|
#elif defined (LYRA2REV3_4WAY)
|
||||||
@@ -69,6 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
|
|||||||
gate->scanhash = (void*)&scanhash_lyra2rev3;
|
gate->scanhash = (void*)&scanhash_lyra2rev3;
|
||||||
gate->hash = (void*)&lyra2rev3_hash;
|
gate->hash = (void*)&lyra2rev3_hash;
|
||||||
#endif
|
#endif
|
||||||
|
// gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
|
||||||
gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
|
gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
|
||||||
opt_target_factor = 256.0;
|
opt_target_factor = 256.0;
|
||||||
|
@@ -5,18 +5,29 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "lyra2.h"
|
#include "lyra2.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
#define LYRA2REV3_16WAY 1
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
*/
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
#define LYRA2REV3_8WAY
|
#define LYRA2REV3_8WAY 1
|
||||||
#endif
|
#elif defined(__SSE2__)
|
||||||
|
#define LYRA2REV3_4WAY 1
|
||||||
#if defined(__SSE2__)
|
|
||||||
#define LYRA2REV3_4WAY
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
extern __thread uint64_t* l2v3_wholeMatrix;
|
extern __thread uint64_t* l2v3_wholeMatrix;
|
||||||
|
|
||||||
bool register_lyra2rev3_algo( algo_gate_t* gate );
|
bool register_lyra2rev3_algo( algo_gate_t* gate );
|
||||||
#if defined(LYRA2REV3_8WAY)
|
|
||||||
|
#if defined(LYRA2REV3_16WAY)
|
||||||
|
|
||||||
|
void lyra2rev3_16way_hash( void *state, const void *input );
|
||||||
|
int scanhash_lyra2rev3_16way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
bool init_lyra2rev3_16way_ctx();
|
||||||
|
|
||||||
|
#elif defined(LYRA2REV3_8WAY)
|
||||||
|
|
||||||
void lyra2rev3_8way_hash( void *state, const void *input );
|
void lyra2rev3_8way_hash( void *state, const void *input );
|
||||||
int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
|
int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
@@ -46,6 +46,7 @@
|
|||||||
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
|
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#if 0
|
||||||
int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
||||||
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
|
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
|
||||||
const uint64_t timeCost, const uint64_t nRows,
|
const uint64_t timeCost, const uint64_t nRows,
|
||||||
@@ -216,29 +217,55 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
// This version is currently only used by REv3 and has some hard coding
|
||||||
|
// specific to v3 such as input data size of 32 bytes.
|
||||||
|
//
|
||||||
|
// Similarly with REv2. The difference with REv3 isn't clear and maybe
|
||||||
|
// they can be merged.
|
||||||
|
//
|
||||||
|
// RE is used by RE, allium. The main difference between RE and REv2
|
||||||
|
// is in the matrix size.
|
||||||
|
//
|
||||||
|
// Z also needs to support 80 byte input as well as 32 byte, and odd
|
||||||
|
// matrix sizes like 330 rows. It is used by lyra2z330, lyra2z, lyra2h.
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////
|
/////////////////////////////////////////////////
|
||||||
|
|
||||||
// 2 way 256
|
// 2 way 256
|
||||||
// drop salt, salt len arguments, hard code some others.
|
// drop salt, salt len arguments, hard code some others.
|
||||||
// Data is interleaved 2x256.
|
// Data is interleaved 2x256.
|
||||||
|
|
||||||
|
//int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
|
||||||
|
// const void *pwd, uint64_t pwdlen, uint64_t timeCost,
|
||||||
|
// uint64_t nRows, uint64_t nCols )
|
||||||
|
|
||||||
|
// hard coded for 32 byte input as well as matrix size.
|
||||||
|
// Other required versions include 80 byte input and different block
|
||||||
|
// sizes
|
||||||
|
|
||||||
int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
|
int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
|
||||||
const void *pwd, const uint64_t pwdlen, const void *salt,
|
const void *pwd, const uint64_t pwdlen, const void *salt,
|
||||||
const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
|
const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
|
||||||
const uint64_t nCols )
|
const uint64_t nCols )
|
||||||
{
|
{
|
||||||
//====================== Basic variables ============================//
|
//====================== Basic variables ============================//
|
||||||
uint64_t _ALIGN(256) state[16];
|
uint64_t _ALIGN(256) state[32];
|
||||||
int64_t row = 2; //index of row to be processed
|
int64_t row = 2;
|
||||||
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
int64_t prev = 1;
|
||||||
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
|
int64_t rowa0 = 0;
|
||||||
int64_t tau; //Time Loop iterator
|
int64_t rowa1 = 0;
|
||||||
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
int64_t tau;
|
||||||
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
int64_t step = 1;
|
||||||
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
int64_t window = 2;
|
||||||
|
int64_t gap = 1;
|
||||||
// int64_t i; //auxiliary iteration counter
|
// int64_t i; //auxiliary iteration counter
|
||||||
int64_t v64; // 64bit var for memcpy
|
// int64_t v64; // 64bit var for memcpy
|
||||||
uint64_t instance0 = 0; // Seperate instance for each lane
|
uint64_t instance0 = 0;
|
||||||
uint64_t instance1 = 0;
|
uint64_t instance1 = 0;
|
||||||
//====================================================================/
|
//====================================================================/
|
||||||
|
|
||||||
@@ -248,7 +275,9 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
|
|||||||
uint64_t *ptrWord = wholeMatrix;
|
uint64_t *ptrWord = wholeMatrix;
|
||||||
|
|
||||||
// 2 way 256 rewrite. Salt always == password, and data is interleaved,
|
// 2 way 256 rewrite. Salt always == password, and data is interleaved,
|
||||||
// need to build in parallel:
|
// need to build in parallel as pw is already interleaved.
|
||||||
|
|
||||||
|
|
||||||
// { password, (64 or 80 bytes)
|
// { password, (64 or 80 bytes)
|
||||||
// salt, (64 or 80 bytes) = same as password
|
// salt, (64 or 80 bytes) = same as password
|
||||||
// Klen, (u64) = 32 bytes
|
// Klen, (u64) = 32 bytes
|
||||||
@@ -262,16 +291,45 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
|
|||||||
// 1 (byte)
|
// 1 (byte)
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
|
// It's all u64 so don't use byte
|
||||||
|
|
||||||
|
|
||||||
|
// input is usually 32 maybe 64, both are aligned to 256 bit vector.
|
||||||
|
// 80 byte input is not aligned, complicating matters for lyra2z.
|
||||||
|
|
||||||
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
|
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
|
||||||
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
||||||
|
|
||||||
|
uint64_t *ptr = wholeMatrix;
|
||||||
|
uint64_t *pw = (uint64_t*)pwd;
|
||||||
|
|
||||||
byte *ptrByte = (byte*) wholeMatrix;
|
memcpy( ptr, pw, 2*pwdlen ); // password
|
||||||
|
ptr += pwdlen>>2;
|
||||||
|
memcpy( ptr, pw, 2*pwdlen ); // password lane 1
|
||||||
|
ptr += pwdlen>>2;
|
||||||
|
|
||||||
|
// now build the rest interleaving on the fly.
|
||||||
|
|
||||||
//Prepends the password
|
ptr[0] = ptr[ 4] = kLen;
|
||||||
memcpy(ptrByte, pwd, pwdlen);
|
ptr[1] = ptr[ 5] = pwdlen;
|
||||||
ptrByte += pwdlen;
|
ptr[2] = ptr[ 6] = pwdlen; // saltlen
|
||||||
|
ptr[3] = ptr[ 7] = timeCost;
|
||||||
|
ptr[8] = ptr[12] = nRows;
|
||||||
|
ptr[9] = ptr[13] = nCols;
|
||||||
|
ptr[10] = ptr[14] = 0x80;
|
||||||
|
ptr[11] = ptr[15] = 0x0100000000000000;
|
||||||
|
|
||||||
|
ptr = wholeMatrix;
|
||||||
|
|
||||||
|
/*
|
||||||
|
// do it the old way to compare.
|
||||||
|
|
||||||
|
uint64_t pb[512];
|
||||||
|
byte* ptrByte = (byte*)pb;
|
||||||
|
|
||||||
|
//Prepends the password (use salt for testing)
|
||||||
|
memcpy( ptrByte, salt, saltlen );
|
||||||
|
ptrByte += saltlen;
|
||||||
|
|
||||||
//Concatenates the salt
|
//Concatenates the salt
|
||||||
memcpy(ptrByte, salt, saltlen);
|
memcpy(ptrByte, salt, saltlen);
|
||||||
@@ -280,55 +338,259 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
|
|||||||
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
|
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
|
||||||
- (saltlen + pwdlen) );
|
- (saltlen + pwdlen) );
|
||||||
|
|
||||||
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
|
memcpy(ptrByte, &kLen, 8);
|
||||||
memcpy(ptrByte, &kLen, sizeof(int64_t));
|
ptrByte += 8;
|
||||||
ptrByte += sizeof(uint64_t);
|
memcpy(ptrByte, &pwdlen, 8);
|
||||||
v64 = pwdlen;
|
ptrByte += 8;
|
||||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
memcpy(ptrByte, &saltlen, 8);
|
||||||
ptrByte += sizeof(uint64_t);
|
ptrByte += 8;
|
||||||
v64 = saltlen;
|
memcpy(ptrByte, &timeCost, 8);
|
||||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
ptrByte += 8;
|
||||||
ptrByte += sizeof(uint64_t);
|
memcpy(ptrByte, &nRows, 8);
|
||||||
v64 = timeCost;
|
ptrByte += 8;
|
||||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
memcpy(ptrByte, &nCols, 8);
|
||||||
ptrByte += sizeof(uint64_t);
|
ptrByte += 8;
|
||||||
v64 = nRows;
|
|
||||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
|
||||||
ptrByte += sizeof(uint64_t);
|
|
||||||
v64 = nCols;
|
|
||||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
|
||||||
ptrByte += sizeof(uint64_t);
|
|
||||||
|
|
||||||
//Now comes the padding
|
//Now comes the padding
|
||||||
*ptrByte = 0x80; //first byte of padding: right after the password
|
*ptrByte = 0x80; //first byte of padding: right after the password
|
||||||
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
|
ptrByte = (byte*) pb; //resets the pointer to the start of the memory matrix
|
||||||
|
|
||||||
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||||
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
// display the data
|
||||||
|
printf("LYRA2REV3 data, blocks= %d\n", nBlocksInput);
|
||||||
|
/*
|
||||||
|
uint64_t* m = (uint64_t*)wholeMatrix;
|
||||||
|
|
||||||
|
printf("Lyra2v3 1: blocklensafe %d\n", BLOCK_LEN_BLAKE2_SAFE_BYTES);
|
||||||
|
printf("pb: %016lx %016lx %016lx %016lx\n",pb[0],pb[1],pb[2],pb[3]);
|
||||||
|
printf("pb: %016lx %016lx %016lx %016lx\n",pb[4],pb[5],pb[6],pb[7]);
|
||||||
|
printf("pb: %016lx %016lx %016lx %016lx\n",pb[8],pb[8],pb[10],pb[11]);
|
||||||
|
printf("pb: %016lx %016lx %016lx %016lx\n",pb[12],pb[13],pb[14],pb[15]);
|
||||||
|
|
||||||
|
printf("data V: %016lx %016lx %016lx %016lx\n",m[0],m[1],m[2],m[3]);
|
||||||
|
printf("data V: %016lx %016lx %016lx %016lx\n",m[4],m[5],m[6],m[7]);
|
||||||
|
printf("data V: %016lx %016lx %016lx %016lx\n",m[8],m[8],m[10],m[11]);
|
||||||
|
printf("data V: %016lx %016lx %016lx %016lx\n",m[12],m[13],m[14],m[15]);
|
||||||
|
printf("data V: %016lx %016lx %016lx %016lx\n",m[16],m[17],m[18],m[19]);
|
||||||
|
printf("data V: %016lx %016lx %016lx %016lx\n",m[20],m[21],m[22],m[23]);
|
||||||
|
printf("data V: %016lx %016lx %016lx %016lx\n",m[24],m[25],m[26],m[27]);
|
||||||
|
printf("data V: %016lx %016lx %016lx %016lx\n",m[28],m[29],m[30],m[31]);
|
||||||
|
*/
|
||||||
|
|
||||||
// from here on it's all simd acces to state and matrix
|
// from here on it's all simd acces to state and matrix
|
||||||
// define vector pointers and adjust sizes and pointer offsets
|
// define vector pointers and adjust sizes and pointer offsets
|
||||||
|
|
||||||
|
uint64_t _ALIGN(256) st[16];
|
||||||
|
|
||||||
|
|
||||||
ptrWord = wholeMatrix;
|
ptrWord = wholeMatrix;
|
||||||
|
|
||||||
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
|
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
|
||||||
reducedSqueezeRow0( state, &wholeMatrix[0], nCols );
|
|
||||||
|
|
||||||
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
|
uint64_t *p = wholeMatrix;
|
||||||
|
printf("wholematrix[0]\n");
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
p = &wholeMatrix[2*ROW_LEN_INT64];
|
||||||
|
printf("wholematrix[1]\n");
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
p = &wholeMatrix[4*ROW_LEN_INT64];
|
||||||
|
printf("wholematrix[2]\n");
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
p = &wholeMatrix[6*ROW_LEN_INT64];
|
||||||
|
printf("wholematrix[3]\n");
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV1 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
|
||||||
|
//printf("SV1: %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
|
||||||
|
|
||||||
|
/*
|
||||||
|
absorbBlockBlake2Safe( st, pb, nBlocksInput, BLOCK_LEN );
|
||||||
|
|
||||||
|
|
||||||
|
printf("SV: %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
|
||||||
|
printf("SS: %016lx %016lx %016lx %016lx\n",st[0],st[1],st[2],st[3]);
|
||||||
|
*/
|
||||||
|
|
||||||
|
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
|
||||||
|
|
||||||
|
// At this point the entire matrix should be filled but only col 0 is.
|
||||||
|
// The others are unchanged or the display offsets are wrong.
|
||||||
|
|
||||||
|
p = wholeMatrix;
|
||||||
|
printf("wholematrix[0] %x\n",wholeMatrix);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[32],p[33],p[34],p[35]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[36],p[37],p[38],p[39]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[40],p[41],p[42],p[43]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[44],p[45],p[46],p[47]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[48],p[49],p[50],p[51]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[52],p[53],p[54],p[55]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[56],p[57],p[58],p[59]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[60],p[61],p[62],p[63]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[64],p[65],p[66],p[67]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[68],p[69],p[70],p[71]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[72],p[73],p[74],p[75]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[76],p[77],p[78],p[79]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[80],p[81],p[82],p[83]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[84],p[85],p[86],p[87]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[88],p[89],p[90],p[91]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[92],p[93],p[94],p[95]);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
p = &wholeMatrix[2*ROW_LEN_INT64];
|
||||||
|
printf("wholematrix[1] %x\n", &wholeMatrix[2*ROW_LEN_INT64]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
p = &wholeMatrix[4*ROW_LEN_INT64];
|
||||||
|
printf("wholematrix[2] %x\n",&wholeMatrix[4*ROW_LEN_INT64]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
p = &wholeMatrix[6*ROW_LEN_INT64];
|
||||||
|
printf("wholematrix[3] %x\n",&wholeMatrix[6*ROW_LEN_INT64]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV2 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
|
||||||
|
//printf("SV2 %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
|
||||||
|
/*
|
||||||
|
printf("SV2 %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
|
||||||
|
printf("SV2 %016lx %016lx %016lx %016lx\n",state[4],state[5],state[6],state[7]);
|
||||||
|
printf("SV2 %016lx %016lx %016lx %016lx\n",state[8],state[9],state[10],state[11]);
|
||||||
|
printf("SV2 %016lx %016lx %016lx %016lx\n",state[12],state[13],state[14],state[15]);
|
||||||
|
printf("SV2 %016lx %016lx %016lx %016lx\n",state[16],state[17],state[18],state[19]);
|
||||||
|
printf("SV2 %016lx %016lx %016lx %016lx\n",state[20],state[21],state[22],state[23]);
|
||||||
|
printf("SV2 %016lx %016lx %016lx %016lx\n",state[24],state[25],state[26],state[27]);
|
||||||
|
printf("SV2 %016lx %016lx %016lx %016lx\n",state[28],state[29],state[30],state[31]);
|
||||||
|
*/
|
||||||
|
|
||||||
|
reducedDuplexRow1_2way( state, &wholeMatrix[0], &wholeMatrix[2*ROW_LEN_INT64],
|
||||||
nCols);
|
nCols);
|
||||||
|
|
||||||
|
|
||||||
|
//printf("SV3 %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
|
||||||
|
/*
|
||||||
|
printf("SV3 %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
|
||||||
|
printf("SV3 %016lx %016lx %016lx %016lx\n",state[4],state[5],state[6],state[7]);
|
||||||
|
printf("SV3 %016lx %016lx %016lx %016lx\n",state[8],state[9],state[10],state[11]);
|
||||||
|
printf("SV3 %016lx %016lx %016lx %016lx\n",state[12],state[13],state[14],state[15]);
|
||||||
|
printf("SV3 %016lx %016lx %016lx %016lx\n",state[16],state[17],state[18],state[19]);
|
||||||
|
printf("SV3 %016lx %016lx %016lx %016lx\n",state[20],state[21],state[22],state[23]);
|
||||||
|
printf("SV3 %016lx %016lx %016lx %016lx\n",state[24],state[25],state[26],state[27]);
|
||||||
|
printf("SV3 %016lx %016lx %016lx %016lx\n",state[28],state[29],state[30],state[31]);
|
||||||
|
*/
|
||||||
|
p = wholeMatrix;
|
||||||
|
printf("wholematrix[0]\n");
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
p = &wholeMatrix[2*ROW_LEN_INT64];
|
||||||
|
printf("wholematrix[1]\n");
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
p = &wholeMatrix[4*ROW_LEN_INT64];
|
||||||
|
printf("wholematrix[2]\n");
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
p = &wholeMatrix[6*ROW_LEN_INT64];
|
||||||
|
printf("wholematrix[3]\n");
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV3 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
|
|
||||||
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
reducedDuplexRowSetup_2way( state, &wholeMatrix[2*prev*ROW_LEN_INT64],
|
||||||
&wholeMatrix[rowa*ROW_LEN_INT64],
|
&wholeMatrix[2*rowa0*ROW_LEN_INT64],
|
||||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
&wholeMatrix[2*row*ROW_LEN_INT64], nCols );
|
||||||
|
|
||||||
rowa = (rowa + step) & (window - 1);
|
rowa0 = (rowa0 + step) & (window - 1);
|
||||||
|
|
||||||
prev = row;
|
prev = row;
|
||||||
row++;
|
row++;
|
||||||
|
|
||||||
if (rowa == 0)
|
if (rowa0 == 0)
|
||||||
{
|
{
|
||||||
step = window + gap; //changes the step: approximately doubles its value
|
step = window + gap; //changes the step: approximately doubles its value
|
||||||
window *= 2; //doubles the size of the re-visitation window
|
window *= 2; //doubles the size of the re-visitation window
|
||||||
@@ -337,6 +599,80 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
|
|||||||
|
|
||||||
} while (row < nRows);
|
} while (row < nRows);
|
||||||
|
|
||||||
|
|
||||||
|
p = wholeMatrix;
|
||||||
|
printf("wholematrix[0]\n");
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
p = &wholeMatrix[2*ROW_LEN_INT64];
|
||||||
|
printf("wholematrix[1]\n");
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
p = &wholeMatrix[4*ROW_LEN_INT64];
|
||||||
|
printf("wholematrix[2]\n");
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
p = &wholeMatrix[6*ROW_LEN_INT64];
|
||||||
|
printf("wholematrix[3]\n");
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//printf("SV5 prev= %d\n",prev);
|
||||||
|
/*
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV4 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
|
||||||
|
|
||||||
|
printf("SV4 S %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
|
||||||
|
printf("SV4 S %016lx %016lx %016lx %016lx\n",state[4],state[5],state[6],state[7]);
|
||||||
|
printf("SV4 S %016lx %016lx %016lx %016lx\n",state[8],state[9],state[10],state[11]);
|
||||||
|
printf("SV4 S %016lx %016lx %016lx %016lx\n",state[12],state[13],state[14],state[15]);
|
||||||
|
printf("SV4 S %016lx %016lx %016lx %016lx\n",state[16],state[17],state[18],state[19]);
|
||||||
|
printf("SV4 S %016lx %016lx %016lx %016lx\n",state[20],state[21],state[22],state[23]);
|
||||||
|
printf("SV4 S %016lx %016lx %016lx %016lx\n",state[24],state[25],state[26],state[27]);
|
||||||
|
printf("SV4 S %016lx %016lx %016lx %016lx\n",state[28],state[29],state[30],state[31]);
|
||||||
|
*/
|
||||||
|
|
||||||
|
//printf("Lyra2v3 4\n");
|
||||||
|
|
||||||
|
uint64_t *ptr0 = wholeMatrix; // base address for each lane
|
||||||
|
uint64_t *ptr1 = wholeMatrix + 4;
|
||||||
|
|
||||||
|
// convert a simple offset to an index into interleaved data.
|
||||||
|
// good for state and 4 row matrix.
|
||||||
|
// index = ( int( ( off mod 16 ) / 4 ) * 8 ) + ( off mod 4 )
|
||||||
|
|
||||||
|
#define offset_to_index( o ) \
|
||||||
|
( ( ( (uint64_t)( (o) & 0xf) / 4 ) * 8 ) + ( (o) % 4 ) )
|
||||||
|
|
||||||
row = 0;
|
row = 0;
|
||||||
for (tau = 1; tau <= timeCost; tau++)
|
for (tau = 1; tau <= timeCost; tau++)
|
||||||
{
|
{
|
||||||
@@ -344,24 +680,79 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
|
|||||||
do
|
do
|
||||||
{
|
{
|
||||||
// This part is not parallel, rowa will be different for each lane.
|
// This part is not parallel, rowa will be different for each lane.
|
||||||
// state (u64[16]) is interleaved 2x256, need to extract seperately.
|
// state (u64[16]) is interleaved 2x256, need to extract separately
|
||||||
|
// and figure out where the data is when interleaved.
|
||||||
|
// &state[0] (or matrix) is the start of lane 0, while &state[4]
|
||||||
|
// is the start of lane 1. From there there are 4 consecutive elements
|
||||||
|
// followed by 4 elements from the other lane that must be skipped.
|
||||||
|
|
||||||
// index = 2 * instance / 4 * 4 + instance % 4
|
povly ptr;
|
||||||
uint64_t index0 = ( ( (instance0 & 0xf) >> 3 ) << 2 )
|
ptr.u64 = wholeMatrix;
|
||||||
+ ( instance0 & 0x3 )
|
|
||||||
uint64_t index1 = ( ( (instance1 & 0xf) >> 3 ) << 2 )
|
|
||||||
+ ( instance1 & 0x3 )
|
|
||||||
|
|
||||||
instance0 = state[ index0 ] & 0xf;
|
/*
|
||||||
instance1 = (state+4)[ index1 ] & 0xf;
|
printf("SV4a %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
|
||||||
|
printf("SV4a %016lx %016lx %016lx %016lx\n",state[4],state[5],state[6],state[7]);
|
||||||
|
printf("SV4a %016lx %016lx %016lx %016lx\n",state[8],state[9],state[10],state[11]);
|
||||||
|
printf("SV4a %016lx %016lx %016lx %016lx\n",state[12],state[13],state[14],state[15]);
|
||||||
|
printf("SV4a %016lx %016lx %016lx %016lx\n",state[16],state[17],state[18],state[19]);
|
||||||
|
printf("SV4a %016lx %016lx %016lx %016lx\n",state[20],state[21],state[22],state[23]);
|
||||||
|
printf("SV4a %016lx %016lx %016lx %016lx\n",state[24],state[25],state[26],state[27]);
|
||||||
|
printf("SV4a %016lx %016lx %016lx %016lx\n",state[28],state[29],state[30],state[31]);
|
||||||
|
*
|
||||||
|
//printf("SV4a o to i %016lx = %016lx\n", instance0, offset_to_index( instance0 ) );
|
||||||
|
*/
|
||||||
|
instance0 = state[ offset_to_index( instance0 ) ];
|
||||||
|
instance1 = (&state[4])[ offset_to_index( instance1 ) ];
|
||||||
|
|
||||||
rowa0 = state[ instance0 ];
|
printf("SV4b o to i %016lx = %016lx, state0 %016lx\n", instance0, offset_to_index( instance0 ), state[offset_to_index( instance0 )] );
|
||||||
rowa1 = (state+4)[ instance1 ];
|
printf("SV4b o to i %016lx = %016lx, state1 %016lx\n", instance1, offset_to_index( instance1 ), (state+4)[offset_to_index( instance1 )] );
|
||||||
|
|
||||||
|
//printf("SV4b lane 1 instance1 = %d, rowa1= %d\n",instance1,rowa1);
|
||||||
|
|
||||||
reducedDuplexRow_2way( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
rowa0 = state[ offset_to_index( instance0 ) ]
|
||||||
&wholeMatrix[rowa0*ROW_LEN_INT64],
|
& (unsigned int)(nRows-1);
|
||||||
&wholeMatrix[rowa1*ROW_LEN_INT64],
|
rowa1 = (state+4)[ offset_to_index( instance1 ) ]
|
||||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
& (unsigned int)(nRows-1);
|
||||||
|
|
||||||
|
// matrix[prev] ie row 0, is messed up after rdr for row 1. ok after rdr 0
|
||||||
|
|
||||||
|
//printf("SV5 lane 1 instance1= %016lx, rowa1= %d\n",instance1,rowa1);
|
||||||
|
printf("SV5 row= %d, step= %d\n",row,step);
|
||||||
|
printf("SV5 instance0 %016lx, rowa0 %d, p0 %016lx\n",instance0,rowa0,ptr0[ 2* rowa0 * ROW_LEN_INT64 ]);
|
||||||
|
printf("SV5 instance1 %016lx, rowa1 %d, p1 %016lx\n",instance1,rowa1,ptr1[ 2* rowa1 * ROW_LEN_INT64 ]);
|
||||||
|
uint64_t *p = &wholeMatrix[2*rowa1*ROW_LEN_INT64];
|
||||||
|
printf("SV5 prev= %d\n",prev);
|
||||||
|
/*
|
||||||
|
printf("SV5 M %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
|
||||||
|
printf("SV5 M %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
|
||||||
|
printf("SV5 M %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
|
||||||
|
printf("SV5 M %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
|
||||||
|
printf("SV5 M %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
|
||||||
|
printf("SV5 M %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
|
||||||
|
printf("SV5 M %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
|
||||||
|
printf("SV5 M %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
|
||||||
|
*/
|
||||||
|
|
||||||
|
reducedDuplexRow_2way( state, ptr, prev, rowa0, rowa1, row, nCols );
|
||||||
|
|
||||||
|
/*
|
||||||
|
reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
|
||||||
|
&ptr0[ 2* rowa0 * ROW_LEN_INT64 ],
|
||||||
|
&ptr1[ 2* rowa1 * ROW_LEN_INT64 ],
|
||||||
|
&wholeMatrix[ 2* row*ROW_LEN_INT64], nCols );
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
printf("SV6 %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
|
||||||
|
printf("SV6 %016lx %016lx %016lx %016lx\n",state[4],state[5],state[6],state[7]);
|
||||||
|
printf("SV6 %016lx %016lx %016lx %016lx\n",state[8],state[9],state[10],state[11]);
|
||||||
|
printf("SV6 %016lx %016lx %016lx %016lx\n",state[12],state[13],state[14],state[15]);
|
||||||
|
printf("SV6 %016lx %016lx %016lx %016lx\n",state[16],state[17],state[18],state[19]);
|
||||||
|
printf("SV6 %016lx %016lx %016lx %016lx\n",state[20],state[21],state[22],state[23]);
|
||||||
|
printf("SV6 %016lx %016lx %016lx %016lx\n",state[24],state[25],state[26],state[27]);
|
||||||
|
printf("SV6 %016lx %016lx %016lx %016lx\n",state[28],state[29],state[30],state[31]);
|
||||||
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
instance = state[instance & 0xF];
|
instance = state[instance & 0xF];
|
||||||
rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
|
rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
|
||||||
@@ -378,13 +769,22 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
|
|||||||
} while ( row != 0 );
|
} while ( row != 0 );
|
||||||
}
|
}
|
||||||
|
|
||||||
absorbBlock( state, &wholeMatrix[rowa*ROW_LEN_INT64] );
|
printf("SV7 %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
|
||||||
squeeze( state, K, (unsigned int) kLen );
|
|
||||||
|
|
||||||
|
// rowa mismatches here so need to do a split read
|
||||||
|
absorbBlock_2way( state, &wholeMatrix[2*rowa0*ROW_LEN_INT64] );
|
||||||
|
|
||||||
|
squeeze_2way( state, K, (unsigned int) kLen );
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#undef offset_to_index
|
||||||
|
|
||||||
|
#endif // AVX512
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
||||||
@@ -713,3 +1113,4 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
@@ -60,4 +60,15 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
|
|||||||
|
|
||||||
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
|
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
|
||||||
|
uint64_t pwdlen, const void *salt, uint64_t saltlen,
|
||||||
|
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
|
||||||
|
|
||||||
|
//int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
|
||||||
|
// uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif /* LYRA2_H_ */
|
#endif /* LYRA2_H_ */
|
||||||
|
@@ -4,8 +4,212 @@
|
|||||||
#include "algo/blake/blake-hash-4way.h"
|
#include "algo/blake/blake-hash-4way.h"
|
||||||
#include "algo/bmw/bmw-hash-4way.h"
|
#include "algo/bmw/bmw-hash-4way.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
|
|
||||||
#if defined (LYRA2REV3_8WAY)
|
#if defined (LYRA2REV3_16WAY)
|
||||||
|
|
||||||
|
// Bundle of the hash contexts used by the 16-way Lyra2REv3 hash; a template
// instance is prepared once and copied by lyra2rev3_16way_hash() per call.
typedef struct {
|
||||||
|
blake256_16way_context blake;   // set up with blake256_16way_init()
|
||||||
|
cube_4way_context cube;   // set up with cube_4way_init( ..., 256, 16, 32 )
|
||||||
|
bmw256_16way_context bmw;   // set up with bmw256_16way_init()
|
||||||
|
} lyra2v3_16way_ctx_holder;
|
||||||
|
|
||||||
|
static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;
|
||||||
|
|
||||||
|
bool init_lyra2rev3_16way_ctx()
|
||||||
|
{
|
||||||
|
blake256_16way_init( &l2v3_16way_ctx.blake );
|
||||||
|
cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
|
||||||
|
bmw256_16way_init( &l2v3_16way_ctx.bmw );
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void lyra2rev3_16way_hash( void *state, const void *input )
|
||||||
|
{
|
||||||
|
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t hash0[8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash2[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash3[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash4[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash5[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash6[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash7[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash8[8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash9[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash10[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash11[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash12[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash13[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash14[8] __attribute__ ((aligned (32)));
|
||||||
|
uint32_t hash15[8] __attribute__ ((aligned (32)));
|
||||||
|
lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||||
|
memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );
|
||||||
|
|
||||||
|
blake256_16way_update( &ctx.blake, input + (64*16), 16 );
|
||||||
|
blake256_16way_close( &ctx.blake, vhash );
|
||||||
|
|
||||||
|
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
|
||||||
|
vhash, 256 );
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//printf("Lyra1 lane 0\n");
|
||||||
|
|
||||||
|
|
||||||
|
intrlv_2x256( vhash, hash0, hash1, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash0, 32, 1, 4, 4 );
|
||||||
|
|
||||||
|
|
||||||
|
uint32_t h[8];
|
||||||
|
|
||||||
|
LYRA2REV3( l2v3_wholeMatrix, h, 32, hash1, 32, hash1, 32, 1, 4, 4 );
|
||||||
|
|
||||||
|
|
||||||
|
printf("S: %08x %08x %08x %08x %08x %08x %08x %08x\n",hash0[0],hash0[1],hash0[2],hash0[3],hash0[4],hash0[5],hash0[6],hash0[7]);
|
||||||
|
printf("V: %08x %08x %08x %08x %08x %08x %08x %08x\n",h[0],h[1],h[2],h[3],h[4],h[5],h[6],h[7]);
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
//printf("Lyra1 lane 2\n");
|
||||||
|
|
||||||
|
dintrlv_2x256( hash0, hash1, vhash, 256 );
|
||||||
|
|
||||||
|
/*
|
||||||
|
|
||||||
|
intrlv_2x256( vhash, hash2, hash3, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash2, 32, 1, 4, 4 );
|
||||||
|
dintrlv_2x256( hash2, hash3, vhash, 256 );
|
||||||
|
intrlv_2x256( vhash, hash4, hash5, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash4, 32, 1, 4, 4 );
|
||||||
|
dintrlv_2x256( hash4, hash5, vhash, 256 );
|
||||||
|
intrlv_2x256( vhash, hash6, hash7, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash6, 32, 1, 4, 4 );
|
||||||
|
dintrlv_2x256( hash6, hash7, vhash, 256 );
|
||||||
|
intrlv_2x256( vhash, hash8, hash9, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash8, 32, 1, 4, 4 );
|
||||||
|
dintrlv_2x256( hash8, hash9, vhash, 256 );
|
||||||
|
intrlv_2x256( vhash, hash10, hash11, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash10, 32, 1, 4, 4 );
|
||||||
|
dintrlv_2x256( hash10, hash11, vhash, 256 );
|
||||||
|
intrlv_2x256( vhash, hash12, hash13, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash12, 32, 1, 4, 4 );
|
||||||
|
dintrlv_2x256( hash12, hash13, vhash, 256 );
|
||||||
|
intrlv_2x256( vhash, hash14, hash15, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash14, 32, 1, 4, 4 );
|
||||||
|
dintrlv_2x256( hash14, hash15, vhash, 256 );
|
||||||
|
*/
|
||||||
|
|
||||||
|
//printf("cube\n");
|
||||||
|
|
||||||
|
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
|
||||||
|
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||||
|
intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
|
||||||
|
cube_4way_init( &ctx.cube, 256, 16, 32 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
|
||||||
|
dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 256 );
|
||||||
|
intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
|
||||||
|
cube_4way_init( &ctx.cube, 256, 16, 32 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
|
||||||
|
dintrlv_4x128( hash8, hash9, hash10, hash11, vhash, 256 );
|
||||||
|
intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
|
||||||
|
cube_4way_init( &ctx.cube, 256, 16, 32 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
|
||||||
|
dintrlv_4x128( hash12, hash13, hash14, hash15, vhash, 256 );
|
||||||
|
|
||||||
|
//printf("Lyra2...\n");
|
||||||
|
/*
|
||||||
|
intrlv_2x256( vhash, hash0, hash1, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash0, 32, 1, 4, 4 );
|
||||||
|
dintrlv_2x256( hash0, hash1, vhash, 256 );
|
||||||
|
intrlv_2x256( vhash, hash2, hash3, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash2, 32, 1, 4, 4 );
|
||||||
|
dintrlv_2x256( hash2, hash3, vhash, 256 );
|
||||||
|
intrlv_2x256( vhash, hash4, hash5, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash4, 32, 1, 4, 4 );
|
||||||
|
dintrlv_2x256( hash4, hash5, vhash, 256 );
|
||||||
|
intrlv_2x256( vhash, hash6, hash7, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash6, 32, 1, 4, 4 );
|
||||||
|
dintrlv_2x256( hash6, hash7, vhash, 256 );
|
||||||
|
intrlv_2x256( vhash, hash8, hash9, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash8, 32, 1, 4, 4 );
|
||||||
|
dintrlv_2x256( hash8, hash9, vhash, 256 );
|
||||||
|
intrlv_2x256( vhash, hash10, hash11, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash10, 32, 1, 4, 4 );
|
||||||
|
dintrlv_2x256( hash10, hash11, vhash, 256 );
|
||||||
|
intrlv_2x256( vhash, hash12, hash13, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash12, 32, 1, 4, 4 );
|
||||||
|
dintrlv_2x256( hash12, hash13, vhash, 256 );
|
||||||
|
intrlv_2x256( vhash, hash14, hash15, 256 );
|
||||||
|
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash14, 32, 1, 4, 4 );
|
||||||
|
dintrlv_2x256( hash14, hash15, vhash, 256 );
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
intrlv_16x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
||||||
|
hash15, 256 );
|
||||||
|
|
||||||
|
//printf("bmw\n");
|
||||||
|
|
||||||
|
bmw256_16way_update( &ctx.bmw, vhash, 32 );
|
||||||
|
bmw256_16way_close( &ctx.bmw, state );
|
||||||
|
|
||||||
|
//printf("done\n");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
uint32_t hash[8*16] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t *hash7 = &hash[7<<3];
|
||||||
|
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
const uint32_t *ptarget = work->target;
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
uint32_t n = first_nonce;
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
||||||
|
const int thr_id = mythr->id;
|
||||||
|
|
||||||
|
if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||||
|
|
||||||
|
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||||
|
|
||||||
|
blake256_16way_init( &l2v3_16way_ctx.blake );
|
||||||
|
// blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
*noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
|
||||||
|
n+11, n+10, n+ 9, n+ 8,
|
||||||
|
n+ 7, n+ 6, n+ 5, n+ 4,
|
||||||
|
n+ 3, n+ 2, n+ 1, n ) );
|
||||||
|
|
||||||
|
lyra2rev3_16way_hash( hash, vdata );
|
||||||
|
pdata[19] = n;
|
||||||
|
|
||||||
|
for ( int lane = 0; lane < 16; lane++ )
|
||||||
|
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||||
|
{
|
||||||
|
extr_lane_16x32( lane_hash, hash, lane, 256 );
|
||||||
|
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||||
|
{
|
||||||
|
pdata[19] = n + lane;
|
||||||
|
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
n += 16;
|
||||||
|
} while ( likely( (n < max_nonce-16) && !work_restart[thr_id].restart ) );
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined (LYRA2REV3_8WAY)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
blake256_8way_context blake;
|
blake256_8way_context blake;
|
||||||
|
@@ -19,7 +19,7 @@
|
|||||||
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "algo-gate.h"
|
//#include "algo-gate.h"
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
@@ -31,21 +31,31 @@
|
|||||||
|
|
||||||
inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
|
inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
|
||||||
{
|
{
|
||||||
const int len_m256i = len / 32;
|
const int fullBlocks = len / 32;
|
||||||
const int fullBlocks = len_m256i / BLOCK_LEN_M256I;
|
|
||||||
__m512i* state = (__m512i*)State;
|
__m512i* state = (__m512i*)State;
|
||||||
__m512i* out = (__m512i*)Out;
|
__m512i* out = (__m512i*)Out;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
//printf("squeeze 1, len= %d, full %d\n", len,fullBlocks);
|
||||||
|
|
||||||
//Squeezes full blocks
|
//Squeezes full blocks
|
||||||
for ( i = 0; i < fullBlocks; i++ )
|
for ( i = 0; i < fullBlocks; i++ )
|
||||||
{
|
{
|
||||||
|
|
||||||
|
//printf("squeeze 1, %d\n",i);
|
||||||
|
|
||||||
memcpy_512( out, state, BLOCK_LEN_M256I*2 );
|
memcpy_512( out, state, BLOCK_LEN_M256I*2 );
|
||||||
LYRA_ROUND_2WAY_AVX2( state[0], state[1], state[2], state[3] );
|
|
||||||
out += BLOCK_LEN_M256I*2;
|
//printf("squeeze 2\n");
|
||||||
|
|
||||||
|
LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] );
|
||||||
|
|
||||||
|
//printf("squeeze 2\n");
|
||||||
|
|
||||||
|
out += BLOCK_LEN_M256I;
|
||||||
}
|
}
|
||||||
//Squeezes remaining bytes
|
//Squeezes remaining bytes
|
||||||
memcpy_512( out, state, ( (len_m256i % BLOCK_LEN_M256I) * 2 ) );
|
// memcpy_512( out, state, ( (len * 2 ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void absorbBlock_2way( uint64_t *State, const uint64_t *In )
|
inline void absorbBlock_2way( uint64_t *State, const uint64_t *In )
|
||||||
@@ -90,7 +100,7 @@ inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
|
|||||||
state1 = _mm512_xor_si512( state1, in[1] );
|
state1 = _mm512_xor_si512( state1, in[1] );
|
||||||
|
|
||||||
LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
|
LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
|
||||||
In += block_len * 2;
|
In += block_len*2;
|
||||||
}
|
}
|
||||||
|
|
||||||
_mm512_store_si512( (__m512i*)State, state0 );
|
_mm512_store_si512( (__m512i*)State, state0 );
|
||||||
@@ -109,7 +119,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
|
|||||||
|
|
||||||
|
|
||||||
register __m512i state0, state1, state2, state3;
|
register __m512i state0, state1, state2, state3;
|
||||||
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
|
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
|
||||||
|
|
||||||
state0 = _mm512_load_si512( (__m512i*)State );
|
state0 = _mm512_load_si512( (__m512i*)State );
|
||||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||||
@@ -132,7 +142,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
|
|||||||
out[2] = state2;
|
out[2] = state2;
|
||||||
|
|
||||||
//Goes to next block (column) that will receive the squeezed data
|
//Goes to next block (column) that will receive the squeezed data
|
||||||
out -= BLOCK_LEN_M256I * 2;
|
out -= BLOCK_LEN_M256I;
|
||||||
|
|
||||||
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||||
}
|
}
|
||||||
@@ -143,15 +153,14 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
|
|||||||
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||||
}
|
}
|
||||||
|
|
||||||
// This function has to deal with gathering 2 256 bit rowin vectors from
|
|
||||||
// non-contiguous memory. Extra work and performance penalty.
|
|
||||||
|
|
||||||
inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
|
inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
|
||||||
uint64_t *rowOut, uint64_t nCols )
|
uint64_t *rowOut, uint64_t nCols )
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
register __m512i state0, state1, state2, state3;
|
register __m512i state0, state1, state2, state3;
|
||||||
__m512i *in = (__m256i*)rowIn;
|
__m512i *in = (__m512i*)rowIn;
|
||||||
|
__m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
|
||||||
|
|
||||||
state0 = _mm512_load_si512( (__m512i*)State );
|
state0 = _mm512_load_si512( (__m512i*)State );
|
||||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||||
@@ -171,17 +180,15 @@ inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
|
|||||||
out[2] = _mm512_xor_si512( state2, in[2] );
|
out[2] = _mm512_xor_si512( state2, in[2] );
|
||||||
|
|
||||||
//Input: next column (i.e., next block in sequence)
|
//Input: next column (i.e., next block in sequence)
|
||||||
in0 += BLOCK_LEN_M256I;
|
in += BLOCK_LEN_M256I;
|
||||||
in1 += BLOCK_LEN_M256I;
|
|
||||||
//Output: goes to previous column
|
//Output: goes to previous column
|
||||||
out -= BLOCK_LEN_M256I * 2;
|
out -= BLOCK_LEN_M256I;
|
||||||
}
|
}
|
||||||
|
|
||||||
_mm512_store_si256( (__m512i*)State, state0 );
|
_mm512_store_si512( (__m512i*)State, state0 );
|
||||||
_mm512_store_si256( (__m512i*)State + 1, state1 );
|
_mm512_store_si512( (__m512i*)State + 1, state1 );
|
||||||
_mm512_store_si256( (__m512i*)State + 2, state2 );
|
_mm512_store_si512( (__m512i*)State + 2, state2 );
|
||||||
_mm512_store_si256( (__m512i*)State + 3, state3 );
|
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
|
inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
|
||||||
@@ -192,7 +199,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
|
|||||||
register __m512i state0, state1, state2, state3;
|
register __m512i state0, state1, state2, state3;
|
||||||
__m512i* in = (__m512i*)rowIn;
|
__m512i* in = (__m512i*)rowIn;
|
||||||
__m512i* inout = (__m512i*)rowInOut;
|
__m512i* inout = (__m512i*)rowInOut;
|
||||||
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
|
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
|
||||||
__m512i t0, t1, t2;
|
__m512i t0, t1, t2;
|
||||||
|
|
||||||
state0 = _mm512_load_si512( (__m512i*)State );
|
state0 = _mm512_load_si512( (__m512i*)State );
|
||||||
@@ -209,7 +216,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
|
|||||||
state2 = _mm512_xor_si512( state2,
|
state2 = _mm512_xor_si512( state2,
|
||||||
_mm512_add_epi64( in[2], inout[2] ) );
|
_mm512_add_epi64( in[2], inout[2] ) );
|
||||||
|
|
||||||
LYRA_ROUND_2WAY AVX512( state0, state1, state2, state3 );
|
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||||
|
|
||||||
out[0] = _mm512_xor_si512( state0, in[0] );
|
out[0] = _mm512_xor_si512( state0, in[0] );
|
||||||
out[1] = _mm512_xor_si512( state1, in[1] );
|
out[1] = _mm512_xor_si512( state1, in[1] );
|
||||||
@@ -221,17 +228,17 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
|
|||||||
t2 = _mm512_permutex_epi64( state2, 0x93 );
|
t2 = _mm512_permutex_epi64( state2, 0x93 );
|
||||||
|
|
||||||
inout[0] = _mm512_xor_si512( inout[0],
|
inout[0] = _mm512_xor_si512( inout[0],
|
||||||
_mm512_mask_blend_epi32( t0, t2, 0x03 ) );
|
_mm512_mask_blend_epi32( 0x03, t0, t2 ) );
|
||||||
inout[1] = _mm512_xor_si512( inout[1],
|
inout[1] = _mm512_xor_si512( inout[1],
|
||||||
_mm512_mask_blend_epi32( t1, t0, 0x03 ) );
|
_mm512_mask_blend_epi32( 0x03, t1, t0 ) );
|
||||||
inout[2] = _mm512_xor_si512( inout[2],
|
inout[2] = _mm512_xor_si512( inout[2],
|
||||||
_mm512_mask_blend_epi32( t2, t1, 0x03 ) );
|
_mm512_mask_blend_epi32( 0x03, t2, t1 ) );
|
||||||
|
|
||||||
//Inputs: next column (i.e., next block in sequence)
|
//Inputs: next column (i.e., next block in sequence)
|
||||||
in += BLOCK_LEN_M256I * 2;
|
in += BLOCK_LEN_M256I;
|
||||||
inout += BLOCK_LEN_M256I * 2;
|
inout += BLOCK_LEN_M256I;
|
||||||
//Output: goes to previous column
|
//Output: goes to previous column
|
||||||
out -= BLOCK_LEN_M256I * 2;
|
out -= BLOCK_LEN_M256I;
|
||||||
}
|
}
|
||||||
|
|
||||||
_mm512_store_si512( (__m512i*)State, state0 );
|
_mm512_store_si512( (__m512i*)State, state0 );
|
||||||
@@ -240,53 +247,99 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
|
|||||||
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
|
// big ugly workaound for pointer aliasing, use a union of pointers.
|
||||||
uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut,
|
// Access matrix using m512i for in and out, m256i for inout
|
||||||
uint64_t nCols )
|
inline void reducedDuplexRow_2way( uint64_t *State, povly matrix,
|
||||||
|
uint64_t rowIn,
|
||||||
|
uint64_t rowInOut0, uint64_t rowInOut1,
|
||||||
|
uint64_t rowOut, uint64_t nCols )
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
register __m512i state0, state1, state2, state3;
|
const uint64_t ROW_LEN_M256I = BLOCK_LEN_INT64 * nCols / 4;
|
||||||
__m256i *in0 = (__m256i*)rowIn0;
|
__m512i state0, state1, state2, state3;
|
||||||
__m256i *in0 = (__m256i*)rowIn0;
|
// register __m512i state0, state1, state2, state3;
|
||||||
__m2512* in = (__m512i*)rowIn;
|
__m512i *in = &matrix.v512[ rowIn * ROW_LEN_M256I ];
|
||||||
__m2512* inout = (__m512i*)rowInOut;
|
__m256i *inout0 = &matrix.v256[ 2 * rowInOut0 * ROW_LEN_M256I ];
|
||||||
__m512i* out = (__m512i*)rowOut;
|
__m256i *inout1 = &matrix.v256[ 2 * rowInOut1 * ROW_LEN_M256I ];
|
||||||
__m512i t0, t1, t2;
|
__m512i *out = &matrix.v512[ rowOut * ROW_LEN_M256I ];
|
||||||
|
__m512i io[3];
|
||||||
|
povly inout;
|
||||||
|
inout.v512 = &io[0];
|
||||||
|
__m512i t0, t1, t2;
|
||||||
|
|
||||||
_mm_prefetch( in0, _MM_HINT_T0 );
|
|
||||||
_mm_prefetch( in1, _MM_HINT_T0 );
|
|
||||||
_mm_prefetch( in0 + 2, _MM_HINT_T0 );
|
|
||||||
_mm_prefetch( in1 + 2, _MM_HINT_T0 );
|
|
||||||
_mm_prefetch( in0 + 4, _MM_HINT_T0 );
|
|
||||||
_mm_prefetch( in1 + 4, _MM_HINT_T0 );
|
|
||||||
_mm_prefetch( in0 + 6, _MM_HINT_T0 );
|
|
||||||
_mm_prefetch( in1 + 6, _MM_HINT_T0 );
|
|
||||||
|
|
||||||
state0 = _mm512_load_si512( (__m512i*)State );
|
state0 = _mm512_load_si512( (__m512i*)State );
|
||||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||||
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||||
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||||
|
|
||||||
|
_mm_prefetch( in, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( inout0, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( inout1, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( in + 2, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( inout0 + 2, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( inout1 + 2, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( in + 4, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( inout0 + 4, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( inout1 + 4, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( in + 6, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( inout0 + 6, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( inout1 + 6, _MM_HINT_T0 );
|
||||||
|
|
||||||
|
//uint64_t *ii = (uint64_t*)in0;
|
||||||
|
//printf("RDRV0 IO %016lx %016lx %016lx %016lx\n",ii[0],ii[1],ii[2],ii[3]);
|
||||||
|
|
||||||
|
for ( i = 0; i < nCols; i++ )
|
||||||
|
{
|
||||||
|
|
||||||
|
/*
|
||||||
|
//printf("RDR: loop %d\n",i);
|
||||||
|
uint64_t *io1 = (uint64_t*)inout1;
|
||||||
|
printf("RDRV0 col= %d\n", i);
|
||||||
|
printf("RDRV0 IO1 %016lx %016lx %016lx %016lx\n",io1[0],io1[1],io1[2],io1[3]);
|
||||||
|
printf("RDRV0 IO1 %016lx %016lx %016lx %016lx\n",io1[4],io1[5],io1[6],io1[7]);
|
||||||
|
printf("RDRV0 IO1 %016lx %016lx %016lx %016lx\n",io1[8],io1[9],io1[10],io1[11]);
|
||||||
|
printf("RDRV0 IO1 %016lx %016lx %016lx %016lx\n",io1[12],io1[13],io1[14],io1[153]);
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
//Absorbing "M[prev] [+] M[row*]"
|
//Absorbing "M[prev] [+] M[row*]"
|
||||||
|
inout.v256[0] = inout0[0];
|
||||||
|
inout.v256[1] = inout1[1];
|
||||||
|
inout.v256[2] = inout0[2];
|
||||||
|
inout.v256[3] = inout1[3];
|
||||||
|
inout.v256[4] = inout0[4];
|
||||||
|
inout.v256[5] = inout1[5];
|
||||||
|
|
||||||
|
/*
|
||||||
|
uint64_t *io = (uint64_t*)inout.u64;
|
||||||
|
uint64_t *ii = (uint64_t*)in;
|
||||||
|
|
||||||
|
printf("RDRV1 col= %d\n", i);
|
||||||
|
printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]);
|
||||||
|
printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]);
|
||||||
|
printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]);
|
||||||
|
printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[15]);
|
||||||
|
printf("RDRV1 IN %016lx %016lx %016lx %016lx\n",ii[0],ii[1],ii[2],ii[3]);
|
||||||
|
printf("RDRV1 IN %016lx %016lx %016lx %016lx\n",ii[4],ii[5],ii[6],ii[7]);
|
||||||
|
printf("RDRV1 IN %016lx %016lx %016lx %016lx\n",ii[8],ii[9],ii[10],ii[11]);
|
||||||
|
printf("RDRV1 IN %016lx %016lx %016lx %016lx\n",ii[12],ii[13],ii[14],ii[15]);
|
||||||
|
*/
|
||||||
|
|
||||||
// state0 = _mm512_xor_si512( state0, mm512_concat_256( in1[0], in0[0] );
|
|
||||||
// state1 = _mm512_xor_si512( state1, mm512_concat_256( in1[1], in0[1] );
|
|
||||||
// state2 = _mm512_xor_si512( state2, mm512_concat_256( in1[2], in0[2] );
|
|
||||||
t0 = mm512_concat_256( in1[0], in0[0] );
|
|
||||||
t1 = mm512_concat_256( in1[1], in0[1] );
|
|
||||||
t2 = mm512_concat_256( in1[2], in0[2] );
|
|
||||||
|
|
||||||
state0 = _mm512_xor_si512( state0,
|
state0 = _mm512_xor_si512( state0,
|
||||||
_mm512_add_epi64( t0, inout[0] ) );
|
_mm512_add_epi64( in[0], inout.v512[0] ) );
|
||||||
state1 = _mm512_xor_si512( state1,
|
state1 = _mm512_xor_si512( state1,
|
||||||
_mm512_add_epi64( t1, inout[1] ) );
|
_mm512_add_epi64( in[1], inout.v512[1] ) );
|
||||||
state2 = _mm512_xor_si512( state2,
|
state2 = _mm512_xor_si512( state2,
|
||||||
_mm512_add_epi64( t2, inout[2] ) );
|
_mm512_add_epi64( in[2], inout.v512[2] ) );
|
||||||
|
|
||||||
|
//printf("RDR: round\n");
|
||||||
|
|
||||||
//Applies the reduced-round transformation f to the sponge's state
|
//Applies the reduced-round transformation f to the sponge's state
|
||||||
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||||
|
|
||||||
|
//printf("RDR 3\n");
|
||||||
|
|
||||||
//M[rowOut][col] = M[rowOut][col] XOR rand
|
//M[rowOut][col] = M[rowOut][col] XOR rand
|
||||||
out[0] = _mm512_xor_si512( out[0], state0 );
|
out[0] = _mm512_xor_si512( out[0], state0 );
|
||||||
out[1] = _mm512_xor_si512( out[1], state1 );
|
out[1] = _mm512_xor_si512( out[1], state1 );
|
||||||
@@ -296,18 +349,76 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
|
|||||||
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
||||||
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
||||||
t2 = _mm512_permutex_epi64( state2, 0x93 );
|
t2 = _mm512_permutex_epi64( state2, 0x93 );
|
||||||
|
/*
|
||||||
|
uint64_t *st = (uint64_t*)&state0;
|
||||||
|
printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[0],st[1],st[2],st[3]);
|
||||||
|
printf("RDRv2 %016lx %016lx %016lx %016lx\n",st[4],st[5],st[6],st[7]);
|
||||||
|
st = (uint64_t*)&state1;
|
||||||
|
printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[0],st[1],st[2],st[3]);
|
||||||
|
printf("RDRv2 %016lx %016lx %016lx %016lx\n",st[4],st[5],st[6],st[7]);
|
||||||
|
st = (uint64_t*)&state2;
|
||||||
|
printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[0],st[1],st[2],st[3]);
|
||||||
|
printf("RDRv2 %016lx %016lx %016lx %016lx\n",st[4],st[5],st[6],st[7]);
|
||||||
|
|
||||||
inout[0] = _mm512_xor_si512( inout[0],
|
st = (uint64_t*)&t0;
|
||||||
_mm512_mask_blend_epi32( t0, t2, 0x03 ) );
|
printf("RDRV2 t0 %016lx %016lx %016lx %016lx\n",st[0],st[1],st[2],st[3]);
|
||||||
inout[1] = _mm512_xor_si512( inout[1],
|
printf("RDRv2 t0 %016lx %016lx %016lx %016lx\n",st[4],st[5],st[6],st[7]);
|
||||||
_mm512_mask_blend_epi32( t1, t0, 0x03 ) );
|
st = (uint64_t*)&t1;
|
||||||
inout[2] = _mm512_xor_si512( inout[2],
|
printf("RDRV2 t1 %016lx %016lx %016lx %016lx\n",st[0],st[1],st[2],st[3]);
|
||||||
_mm512_mask_blend_epi32( t2, t1, 0x03 ) );
|
printf("RDRv2 t1 %016lx %016lx %016lx %016lx\n",st[4],st[5],st[6],st[7]);
|
||||||
|
st = (uint64_t*)&t2;
|
||||||
|
printf("RDRV2 t2 %016lx %016lx %016lx %016lx\n",st[0],st[1],st[2],st[3]);
|
||||||
|
printf("RDRv2 t2 %016lx %016lx %016lx %016lx\n",st[4],st[5],st[6],st[7]);
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[8],st[9],st[10],st[11]);
|
||||||
|
printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[12],st[13],st[14],st[15]);
|
||||||
|
printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[16],st[17],st[18],st[19]);
|
||||||
|
printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[20],st[21],st[22],st[23]);
|
||||||
|
printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[24],st[25],st[26],st[271]);
|
||||||
|
printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[28],st[29],st[30],st[31]);
|
||||||
|
*/
|
||||||
|
|
||||||
|
//printf("RDR 4\n");
|
||||||
|
/*
|
||||||
|
//uint64_t *io = (uint64_t*)&inout;
|
||||||
|
printf("RDRV1 col= %d\n", i);
|
||||||
|
printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]);
|
||||||
|
printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]);
|
||||||
|
printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]);
|
||||||
|
printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[15]);
|
||||||
|
*/
|
||||||
|
|
||||||
|
// need to split inout for write
|
||||||
|
|
||||||
|
inout.v512[0] = _mm512_xor_si512( inout.v512[0],
|
||||||
|
_mm512_mask_blend_epi32( 0x03, t0, t2 ) );
|
||||||
|
inout.v512[1] = _mm512_xor_si512( inout.v512[1],
|
||||||
|
_mm512_mask_blend_epi32( 0x03, t1, t0 ) );
|
||||||
|
inout.v512[2] = _mm512_xor_si512( inout.v512[2],
|
||||||
|
_mm512_mask_blend_epi32( 0x03, t2, t1 ) );
|
||||||
|
/*
|
||||||
|
printf("RDRV3 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]);
|
||||||
|
printf("RDRV3 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]);
|
||||||
|
printf("RDRV3 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]);
|
||||||
|
printf("RDRV3 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[153]);
|
||||||
|
*/
|
||||||
|
|
||||||
|
inout0[0] = inout.v256[0];
|
||||||
|
inout1[1] = inout.v256[1];
|
||||||
|
inout0[2] = inout.v256[2];
|
||||||
|
inout1[3] = inout.v256[3];
|
||||||
|
inout0[4] = inout.v256[4];
|
||||||
|
inout1[5] = inout.v256[5];
|
||||||
|
|
||||||
|
|
||||||
|
//printf("RDR 5\n");
|
||||||
|
|
||||||
//Goes to next block
|
//Goes to next block
|
||||||
in += BLOCK_LEN_M256I * 2;
|
in += BLOCK_LEN_M256I;
|
||||||
out += BLOCK_LEN_M256I * 2;
|
inout0 += BLOCK_LEN_M256I * 2;
|
||||||
inout += BLOCK_LEN_M256I * 2;
|
inout1 += BLOCK_LEN_M256I * 2;
|
||||||
|
out += BLOCK_LEN_M256I;
|
||||||
}
|
}
|
||||||
|
|
||||||
_mm512_store_si512( (__m512i*)State, state0 );
|
_mm512_store_si512( (__m512i*)State, state0 );
|
||||||
|
@@ -65,14 +65,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
|||||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 );
|
b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 );
|
||||||
|
|
||||||
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||||
G_4X64( s0, s1, s2, s3 ); \
|
G2W_4X64( s0, s1, s2, s3 ); \
|
||||||
s1 = mm512_ror_1x64( s1); \
|
s1 = mm512_ror256_64( s1); \
|
||||||
s2 = mm512_swap128_256( s2 ); \
|
s2 = mm512_swap256_128( s2 ); \
|
||||||
s3 = mm512_rol1x64_256( s3 ); \
|
s3 = mm512_rol256_64( s3 ); \
|
||||||
G_4X64( s0, s1, s2, s3 ); \
|
G2W_4X64( s0, s1, s2, s3 ); \
|
||||||
s1 = mm512_rol1x64_256( s1 ); \
|
s1 = mm512_rol256_64( s1 ); \
|
||||||
s2 = mm512_swap128_256( s2 ); \
|
s2 = mm512_swap256_128( s2 ); \
|
||||||
s3 = mm512_ror1x64_256( s3 );
|
s3 = mm512_ror256_64( s3 );
|
||||||
|
|
||||||
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
|
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||||
@@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
|||||||
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||||
G_2X64( s0, s2, s4, s6 ); \
|
G_2X64( s0, s2, s4, s6 ); \
|
||||||
G_2X64( s1, s3, s5, s7 ); \
|
G_2X64( s1, s3, s5, s7 ); \
|
||||||
mm128_ror1x64_256( s2, s3 ); \
|
mm128_ror256_64( s2, s3 ); \
|
||||||
mm128_swap128_256( s4, s5 ); \
|
mm128_swap256_128( s4, s5 ); \
|
||||||
mm128_rol1x64_256( s6, s7 ); \
|
mm128_rol256_64( s6, s7 ); \
|
||||||
G_2X64( s0, s2, s4, s6 ); \
|
G_2X64( s0, s2, s4, s6 ); \
|
||||||
G_2X64( s1, s3, s5, s7 ); \
|
G_2X64( s1, s3, s5, s7 ); \
|
||||||
mm128_rol1x64_256( s2, s3 ); \
|
mm128_rol256_64( s2, s3 ); \
|
||||||
mm128_swap128_256( s4, s5 ); \
|
mm128_swap256_128( s4, s5 ); \
|
||||||
mm128_ror1x64_256( s6, s7 );
|
mm128_ror256_64( s6, s7 );
|
||||||
|
|
||||||
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||||
@@ -220,7 +220,23 @@ void reducedDuplexRow1_2way( uint64_t *state, uint64_t *rowIn,
|
|||||||
uint64_t *rowOut, uint64_t nCols);
|
uint64_t *rowOut, uint64_t nCols);
|
||||||
void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn,
|
void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn,
|
||||||
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
|
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
|
||||||
void reducedDuplexRow_2way(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
|
/*
|
||||||
|
void reducedDuplexRow_2way( uint64_t *state, uint64_t *rowIn,
|
||||||
|
uint64_t *rowInOut0, uint64_t *rowInOut1,
|
||||||
|
uint64_t *rowOut, uint64_t nCols);
|
||||||
|
*/
|
||||||
|
|
||||||
|
union _povly
|
||||||
|
{
|
||||||
|
__m512i *v512;
|
||||||
|
__m256i *v256;
|
||||||
|
uint64_t *u64;
|
||||||
|
};
|
||||||
|
typedef union _povly povly;
|
||||||
|
|
||||||
|
void reducedDuplexRow_2way( uint64_t *state, povly matrix, uint64_t rowIn,
|
||||||
|
uint64_t rowInOut0, uint64_t rowInOut1,
|
||||||
|
uint64_t rowOut, uint64_t nCols);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@@ -92,7 +92,6 @@ int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
|
|||||||
{
|
{
|
||||||
uint32_t hash[4*8] __attribute__ ((aligned (128)));
|
uint32_t hash[4*8] __attribute__ ((aligned (128)));
|
||||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
uint32_t n = pdata[19];
|
uint32_t n = pdata[19];
|
||||||
|
@@ -56,7 +56,7 @@ typedef struct {
|
|||||||
__m128i val[8];
|
__m128i val[8];
|
||||||
uint32_t count_high, count_low;
|
uint32_t count_high, count_low;
|
||||||
bool initialized;
|
bool initialized;
|
||||||
} sha256_4way_context;
|
} sha256_4way_context __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
void sha256_4way_init( sha256_4way_context *sc );
|
void sha256_4way_init( sha256_4way_context *sc );
|
||||||
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
|
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
|
||||||
@@ -71,7 +71,7 @@ typedef struct {
|
|||||||
__m256i val[8];
|
__m256i val[8];
|
||||||
uint32_t count_high, count_low;
|
uint32_t count_high, count_low;
|
||||||
bool initialized;
|
bool initialized;
|
||||||
} sha256_8way_context;
|
} sha256_8way_context __attribute__ ((aligned (128)));
|
||||||
|
|
||||||
void sha256_8way_init( sha256_8way_context *sc );
|
void sha256_8way_init( sha256_8way_context *sc );
|
||||||
void sha256_8way( sha256_8way_context *sc, const void *data, size_t len );
|
void sha256_8way( sha256_8way_context *sc, const void *data, size_t len );
|
||||||
@@ -86,30 +86,32 @@ typedef struct {
|
|||||||
__m256i val[8];
|
__m256i val[8];
|
||||||
uint64_t count;
|
uint64_t count;
|
||||||
bool initialized;
|
bool initialized;
|
||||||
} sha512_4way_context;
|
} sha512_4way_context __attribute__ ((aligned (128)));
|
||||||
|
|
||||||
void sha512_4way_init( sha512_4way_context *sc);
|
void sha512_4way_init( sha512_4way_context *sc);
|
||||||
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
|
void sha512_4way_update( sha512_4way_context *sc, const void *data,
|
||||||
|
size_t len );
|
||||||
|
#define sha512_4way sha512_4way_update
|
||||||
void sha512_4way_close( sha512_4way_context *sc, void *dst );
|
void sha512_4way_close( sha512_4way_context *sc, void *dst );
|
||||||
|
|
||||||
// SHA-256 11 way hybrid
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel.
|
|
||||||
|
// SHA-512 8 way
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
__m256i bufx[64>>2];
|
__m512i buf[128>>3];
|
||||||
__m256i valx[8];
|
__m512i val[8];
|
||||||
__m64 bufy[64>>2];
|
uint64_t count;
|
||||||
__m64 valy[8];
|
bool initialized;
|
||||||
uint32_t bufz[64>>2];
|
} sha512_8way_context __attribute__ ((aligned (128)));
|
||||||
uint32_t valz[8];
|
|
||||||
uint32_t count_high, count_low;
|
|
||||||
} sha256_11way_context;
|
|
||||||
|
|
||||||
void sha256_11way_init( sha256_11way_context *ctx );
|
void sha512_8way_init( sha512_8way_context *sc);
|
||||||
void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
|
void sha512_8way_update( sha512_8way_context *sc, const void *data,
|
||||||
const void *datay, const void *dataz, size_t len );
|
size_t len );
|
||||||
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
|
void sha512_8way_close( sha512_8way_context *sc, void *dst );
|
||||||
void *dstz );
|
|
||||||
|
|
||||||
|
|
||||||
|
#endif // AVX512
|
||||||
#endif // __AVX2__
|
#endif // __AVX2__
|
||||||
#endif // __SSE2__
|
#endif // __SSE2__
|
||||||
#endif // SHA256_4WAY_H__
|
#endif // SHA256_4WAY_H__
|
||||||
|
@@ -36,8 +36,6 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "sha-hash-4way.h"
|
#include "sha-hash-4way.h"
|
||||||
|
|
||||||
// SHA-512 4 way 64 bit
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
static const sph_u64 H512[8] = {
|
static const sph_u64 H512[8] = {
|
||||||
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
|
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
|
||||||
@@ -90,6 +88,236 @@ static const sph_u64 K512[80] = {
|
|||||||
SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
|
SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
// SHA-512 8 way 64 bit
|
||||||
|
|
||||||
|
#define CH8W(X, Y, Z) \
|
||||||
|
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z )
|
||||||
|
|
||||||
|
#define MAJ8W(X, Y, Z) \
|
||||||
|
_mm512_or_si512( _mm512_and_si512( X, Y ), \
|
||||||
|
_mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
|
||||||
|
|
||||||
|
#define BSG8W_5_0(x) \
|
||||||
|
_mm512_xor_si512( _mm512_xor_si512( \
|
||||||
|
mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) )
|
||||||
|
|
||||||
|
#define BSG8W_5_1(x) \
|
||||||
|
_mm512_xor_si512( _mm512_xor_si512( \
|
||||||
|
mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) )
|
||||||
|
|
||||||
|
#define SSG8W_5_0(x) \
|
||||||
|
_mm512_xor_si512( _mm512_xor_si512( \
|
||||||
|
mm512_ror_64(x, 1), mm512_ror_64(x, 8) ), _mm512_srli_epi64(x, 7) )
|
||||||
|
|
||||||
|
#define SSG8W_5_1(x) \
|
||||||
|
_mm512_xor_si512( _mm512_xor_si512( \
|
||||||
|
mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) )
|
||||||
|
|
||||||
|
static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 )
|
||||||
|
{
|
||||||
|
__m512i w0a, w1a, w0b, w1b;
|
||||||
|
w0a = mm512_ror_64( w0, 1 );
|
||||||
|
w1a = mm512_ror_64( w1,19 );
|
||||||
|
w0b = mm512_ror_64( w0, 8 );
|
||||||
|
w1b = mm512_ror_64( w1,61 );
|
||||||
|
w0a = _mm512_xor_si512( w0a, w0b );
|
||||||
|
w1a = _mm512_xor_si512( w1a, w1b );
|
||||||
|
w0b = _mm512_srli_epi64( w0, 7 );
|
||||||
|
w1b = _mm512_srli_epi64( w1, 6 );
|
||||||
|
w0a = _mm512_xor_si512( w0a, w0b );
|
||||||
|
w1a = _mm512_xor_si512( w1a, w1b );
|
||||||
|
return _mm512_add_epi64( w0a, w1a );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define SSG8W_512x2_0( w0, w1, i ) do \
|
||||||
|
{ \
|
||||||
|
__m512i X0a, X1a, X0b, X1b; \
|
||||||
|
X0a = mm512_ror_64( W[i-15], 1 ); \
|
||||||
|
X1a = mm512_ror_64( W[i-14], 1 ); \
|
||||||
|
X0b = mm512_ror_64( W[i-15], 8 ); \
|
||||||
|
X1b = mm512_ror_64( W[i-14], 8 ); \
|
||||||
|
X0a = _mm512_xor_si512( X0a, X0b ); \
|
||||||
|
X1a = _mm512_xor_si512( X1a, X1b ); \
|
||||||
|
X0b = _mm512_srli_epi64( W[i-15], 7 ); \
|
||||||
|
X1b = _mm512_srli_epi64( W[i-14], 7 ); \
|
||||||
|
w0 = _mm512_xor_si512( X0a, X0b ); \
|
||||||
|
w1 = _mm512_xor_si512( X1a, X1b ); \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
#define SSG8W_512x2_1( w0, w1, i ) do \
|
||||||
|
{ \
|
||||||
|
__m512i X0a, X1a, X0b, X1b; \
|
||||||
|
X0a = mm512_ror_64( W[i-2],19 ); \
|
||||||
|
X1a = mm512_ror_64( W[i-1],19 ); \
|
||||||
|
X0b = mm512_ror_64( W[i-2],61 ); \
|
||||||
|
X1b = mm512_ror_64( W[i-1],61 ); \
|
||||||
|
X0a = _mm512_xor_si512( X0a, X0b ); \
|
||||||
|
X1a = _mm512_xor_si512( X1a, X1b ); \
|
||||||
|
X0b = _mm512_srli_epi64( W[i-2], 6 ); \
|
||||||
|
X1b = _mm512_srli_epi64( W[i-1], 6 ); \
|
||||||
|
w0 = _mm512_xor_si512( X0a, X0b ); \
|
||||||
|
w1 = _mm512_xor_si512( X1a, X1b ); \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
|
||||||
|
do { \
|
||||||
|
__m512i T1, T2; \
|
||||||
|
__m512i K = _mm512_set1_epi64( K512[ i ] ); \
|
||||||
|
T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \
|
||||||
|
K, W[i] ) ); \
|
||||||
|
T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \
|
||||||
|
D = _mm512_add_epi64( D, T1 ); \
|
||||||
|
H = _mm512_add_epi64( T1, T2 ); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
static void
|
||||||
|
sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
register __m512i A, B, C, D, E, F, G, H;
|
||||||
|
__m512i W[80];
|
||||||
|
|
||||||
|
mm512_block_bswap_64( W , in );
|
||||||
|
mm512_block_bswap_64( W+8, in+8 );
|
||||||
|
|
||||||
|
for ( i = 16; i < 80; i++ )
|
||||||
|
W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ),
|
||||||
|
_mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
|
||||||
|
|
||||||
|
if ( ctx->initialized )
|
||||||
|
{
|
||||||
|
A = r[0];
|
||||||
|
B = r[1];
|
||||||
|
C = r[2];
|
||||||
|
D = r[3];
|
||||||
|
E = r[4];
|
||||||
|
F = r[5];
|
||||||
|
G = r[6];
|
||||||
|
H = r[7];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
A = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||||
|
B = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||||
|
C = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||||
|
D = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||||
|
E = m512_const1_64( 0x510E527FADE682D1 );
|
||||||
|
F = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||||
|
G = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||||
|
H = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( i = 0; i < 80; i += 8 )
|
||||||
|
{
|
||||||
|
SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
|
||||||
|
SHA3_8WAY_STEP( H, A, B, C, D, E, F, G, i + 1 );
|
||||||
|
SHA3_8WAY_STEP( G, H, A, B, C, D, E, F, i + 2 );
|
||||||
|
SHA3_8WAY_STEP( F, G, H, A, B, C, D, E, i + 3 );
|
||||||
|
SHA3_8WAY_STEP( E, F, G, H, A, B, C, D, i + 4 );
|
||||||
|
SHA3_8WAY_STEP( D, E, F, G, H, A, B, C, i + 5 );
|
||||||
|
SHA3_8WAY_STEP( C, D, E, F, G, H, A, B, i + 6 );
|
||||||
|
SHA3_8WAY_STEP( B, C, D, E, F, G, H, A, i + 7 );
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( ctx->initialized )
|
||||||
|
{
|
||||||
|
r[0] = _mm512_add_epi64( r[0], A );
|
||||||
|
r[1] = _mm512_add_epi64( r[1], B );
|
||||||
|
r[2] = _mm512_add_epi64( r[2], C );
|
||||||
|
r[3] = _mm512_add_epi64( r[3], D );
|
||||||
|
r[4] = _mm512_add_epi64( r[4], E );
|
||||||
|
r[5] = _mm512_add_epi64( r[5], F );
|
||||||
|
r[6] = _mm512_add_epi64( r[6], G );
|
||||||
|
r[7] = _mm512_add_epi64( r[7], H );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ctx->initialized = true;
|
||||||
|
r[0] = _mm512_add_epi64( A, m512_const1_64( 0x6A09E667F3BCC908 ) );
|
||||||
|
r[1] = _mm512_add_epi64( B, m512_const1_64( 0xBB67AE8584CAA73B ) );
|
||||||
|
r[2] = _mm512_add_epi64( C, m512_const1_64( 0x3C6EF372FE94F82B ) );
|
||||||
|
r[3] = _mm512_add_epi64( D, m512_const1_64( 0xA54FF53A5F1D36F1 ) );
|
||||||
|
r[4] = _mm512_add_epi64( E, m512_const1_64( 0x510E527FADE682D1 ) );
|
||||||
|
r[5] = _mm512_add_epi64( F, m512_const1_64( 0x9B05688C2B3E6C1F ) );
|
||||||
|
r[6] = _mm512_add_epi64( G, m512_const1_64( 0x1F83D9ABFB41BD6B ) );
|
||||||
|
r[7] = _mm512_add_epi64( H, m512_const1_64( 0x5BE0CD19137E2179 ) );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void sha512_8way_init( sha512_8way_context *sc )
|
||||||
|
{
|
||||||
|
sc->initialized = false;
|
||||||
|
sc->count = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void sha512_8way_update( sha512_8way_context *sc, const void *data, size_t len )
|
||||||
|
{
|
||||||
|
__m512i *vdata = (__m512i*)data;
|
||||||
|
size_t ptr;
|
||||||
|
const int buf_size = 128;
|
||||||
|
|
||||||
|
ptr = (unsigned)sc->count & (buf_size - 1U);
|
||||||
|
while ( len > 0 )
|
||||||
|
{
|
||||||
|
size_t clen;
|
||||||
|
clen = buf_size - ptr;
|
||||||
|
if ( clen > len )
|
||||||
|
clen = len;
|
||||||
|
memcpy_512( sc->buf + (ptr>>3), vdata, clen>>3 );
|
||||||
|
vdata = vdata + (clen>>3);
|
||||||
|
ptr += clen;
|
||||||
|
len -= clen;
|
||||||
|
if ( ptr == buf_size )
|
||||||
|
{
|
||||||
|
sha512_8way_round( sc, sc->buf, sc->val );
|
||||||
|
ptr = 0;
|
||||||
|
}
|
||||||
|
sc->count += clen;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void sha512_8way_close( sha512_8way_context *sc, void *dst )
|
||||||
|
{
|
||||||
|
unsigned ptr;
|
||||||
|
const int buf_size = 128;
|
||||||
|
const int pad = buf_size - 16;
|
||||||
|
const __m512i shuff_bswap64 = m512_const_64(
|
||||||
|
0x38393a3b3c3d3e3f, 0x3031323334353637,
|
||||||
|
0x28292a2b2c2d2e2f, 0x2021222324252627,
|
||||||
|
0x18191a1b1c1d1e1f, 0x1011121314151617,
|
||||||
|
0x08090a0b0c0d0e0f, 0x0001020304050607 );
|
||||||
|
|
||||||
|
ptr = (unsigned)sc->count & (buf_size - 1U);
|
||||||
|
sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 );
|
||||||
|
ptr += 8;
|
||||||
|
if ( ptr > pad )
|
||||||
|
{
|
||||||
|
memset_zero_512( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );
|
||||||
|
sha512_8way_round( sc, sc->buf, sc->val );
|
||||||
|
memset_zero_512( sc->buf, pad >> 3 );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
|
||||||
|
|
||||||
|
sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
|
||||||
|
_mm512_set1_epi64( sc->count >> 61 ), shuff_bswap64 );
|
||||||
|
sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
|
||||||
|
_mm512_set1_epi64( sc->count << 3 ), shuff_bswap64 );
|
||||||
|
sha512_8way_round( sc, sc->buf, sc->val );
|
||||||
|
|
||||||
|
mm512_block_bswap_64( dst, sc->val );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#endif // AVX512
|
||||||
|
|
||||||
|
// SHA-512 4 way 64 bit
|
||||||
|
|
||||||
|
|
||||||
#define CH(X, Y, Z) \
|
#define CH(X, Y, Z) \
|
||||||
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
|
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
|
||||||
|
|
||||||
@@ -254,7 +482,7 @@ void sha512_4way_init( sha512_4way_context *sc )
|
|||||||
sc->count = 0;
|
sc->count = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
|
void sha512_4way_update( sha512_4way_context *sc, const void *data, size_t len )
|
||||||
{
|
{
|
||||||
__m256i *vdata = (__m256i*)data;
|
__m256i *vdata = (__m256i*)data;
|
||||||
size_t ptr;
|
size_t ptr;
|
||||||
|
@@ -33,7 +33,7 @@
|
|||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#ifdef __AVX2__
|
#ifdef __SSE4_1__
|
||||||
|
|
||||||
#include "shabal-hash-4way.h"
|
#include "shabal-hash-4way.h"
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
@@ -58,6 +58,599 @@ extern "C"{
|
|||||||
#define O2 9
|
#define O2 9
|
||||||
#define O3 6
|
#define O3 6
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
|
// Declares the local working variables used by the 8-way Shabal macros:
// twelve A vectors, sixteen each of B, C and M (message) vectors, and the
// two 32 bit halves of the block counter W.
// Used without a trailing semicolon; the last declaration supplies its own.
#define DECL_STATE8 \
   __m256i A00, A01, A02, A03, A04, A05, A06, A07, A08, A09, A0A, A0B; \
   __m256i B0, B1, B2, B3, B4, B5, B6, B7; \
   __m256i B8, B9, BA, BB, BC, BD, BE, BF; \
   __m256i C0, C1, C2, C3, C4, C5, C6, C7; \
   __m256i C8, C9, CA, CB, CC, CD, CE, CF; \
   __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
   __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
   sph_u32 Wlow, Whigh;
|
||||||
|
|
||||||
|
// Loads the working variables declared by DECL_STATE8 from a context.
// If the context has not been loaded yet (state_loaded is false, which is how
// shabal_8way_init leaves it for size 512) the initial values are taken from
// immediate constants, each 32 bit value repeated in both halves of the
// 64 bit constant, and the context is marked as loaded. Otherwise A, B and C
// are copied from the context. The counter W is always read from the context.
#define READ_STATE8(state) do \
{ \
   if ( !(state)->state_loaded ) \
   { \
      (state)->state_loaded = true; \
      A00 = m256_const1_64( 0x20728DFD20728DFD ); \
      A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
      A02 = m256_const1_64( 0xE782B699E782B699 ); \
      A03 = m256_const1_64( 0x5530463255304632 ); \
      A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
      A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
      A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
      A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
      A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
      A09 = m256_const1_64( 0x8BD144108BD14410 ); \
      A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
      A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
      B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
      B1 = m256_const1_64( 0x07B385F307B385F3 ); \
      B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
      B3 = m256_const1_64( 0xCC8AD640CC8AD640 ); \
      B4 = m256_const1_64( 0xEB6F56C7EB6F56C7 ); \
      B5 = m256_const1_64( 0x1EA81AA91EA81AA9 ); \
      B6 = m256_const1_64( 0x73B9D31473B9D314 ); \
      B7 = m256_const1_64( 0x1DE85D081DE85D08 ); \
      B8 = m256_const1_64( 0x48910A5A48910A5A ); \
      B9 = m256_const1_64( 0x893B22DB893B22DB ); \
      BA = m256_const1_64( 0xC5A0DF44C5A0DF44 ); \
      BB = m256_const1_64( 0xBBC4324EBBC4324E ); \
      BC = m256_const1_64( 0x72D2F24072D2F240 ); \
      BD = m256_const1_64( 0x75941D9975941D99 ); \
      BE = m256_const1_64( 0x6D8BDE826D8BDE82 ); \
      BF = m256_const1_64( 0xA1A7502BA1A7502B ); \
      C0 = m256_const1_64( 0xD9BF68D1D9BF68D1 ); \
      C1 = m256_const1_64( 0x58BAD75058BAD750 ); \
      C2 = m256_const1_64( 0x56028CB256028CB2 ); \
      C3 = m256_const1_64( 0x8134F3598134F359 ); \
      C4 = m256_const1_64( 0xB5D469D8B5D469D8 ); \
      C5 = m256_const1_64( 0x941A8CC2941A8CC2 ); \
      C6 = m256_const1_64( 0x418B2A6E418B2A6E ); \
      C7 = m256_const1_64( 0x0405278004052780 ); \
      C8 = m256_const1_64( 0x7F07D7877F07D787 ); \
      C9 = m256_const1_64( 0x5194358F5194358F ); \
      CA = m256_const1_64( 0x3C60D6653C60D665 ); \
      CB = m256_const1_64( 0xBE97D79ABE97D79A ); \
      CC = m256_const1_64( 0x950C3434950C3434 ); \
      CD = m256_const1_64( 0xAED9A06DAED9A06D ); \
      CE = m256_const1_64( 0x2537DC8D2537DC8D ); \
      CF = m256_const1_64( 0x7CDB59697CDB5969 ); \
   } \
   else \
   { \
      A00 = (state)->A[ 0];   A01 = (state)->A[ 1]; \
      A02 = (state)->A[ 2];   A03 = (state)->A[ 3]; \
      A04 = (state)->A[ 4];   A05 = (state)->A[ 5]; \
      A06 = (state)->A[ 6];   A07 = (state)->A[ 7]; \
      A08 = (state)->A[ 8];   A09 = (state)->A[ 9]; \
      A0A = (state)->A[10];   A0B = (state)->A[11]; \
      B0 = (state)->B[ 0];    B1 = (state)->B[ 1]; \
      B2 = (state)->B[ 2];    B3 = (state)->B[ 3]; \
      B4 = (state)->B[ 4];    B5 = (state)->B[ 5]; \
      B6 = (state)->B[ 6];    B7 = (state)->B[ 7]; \
      B8 = (state)->B[ 8];    B9 = (state)->B[ 9]; \
      BA = (state)->B[10];    BB = (state)->B[11]; \
      BC = (state)->B[12];    BD = (state)->B[13]; \
      BE = (state)->B[14];    BF = (state)->B[15]; \
      C0 = (state)->C[ 0];    C1 = (state)->C[ 1]; \
      C2 = (state)->C[ 2];    C3 = (state)->C[ 3]; \
      C4 = (state)->C[ 4];    C5 = (state)->C[ 5]; \
      C6 = (state)->C[ 6];    C7 = (state)->C[ 7]; \
      C8 = (state)->C[ 8];    C9 = (state)->C[ 9]; \
      CA = (state)->C[10];    CB = (state)->C[11]; \
      CC = (state)->C[12];    CD = (state)->C[13]; \
      CE = (state)->C[14];    CF = (state)->C[15]; \
   } \
   Wlow  = (state)->Wlow; \
   Whigh = (state)->Whigh; \
} while (0)
|
||||||
|
|
||||||
|
// Stores the working variables (A, B, C vectors and the counter W) back into
// the context. Counterpart of READ_STATE8; does not touch state_loaded.
#define WRITE_STATE8(state) \
do { \
   (state)->A[ 0] = A00;   (state)->A[ 1] = A01; \
   (state)->A[ 2] = A02;   (state)->A[ 3] = A03; \
   (state)->A[ 4] = A04;   (state)->A[ 5] = A05; \
   (state)->A[ 6] = A06;   (state)->A[ 7] = A07; \
   (state)->A[ 8] = A08;   (state)->A[ 9] = A09; \
   (state)->A[10] = A0A;   (state)->A[11] = A0B; \
   (state)->B[ 0] = B0;    (state)->B[ 1] = B1; \
   (state)->B[ 2] = B2;    (state)->B[ 3] = B3; \
   (state)->B[ 4] = B4;    (state)->B[ 5] = B5; \
   (state)->B[ 6] = B6;    (state)->B[ 7] = B7; \
   (state)->B[ 8] = B8;    (state)->B[ 9] = B9; \
   (state)->B[10] = BA;    (state)->B[11] = BB; \
   (state)->B[12] = BC;    (state)->B[13] = BD; \
   (state)->B[14] = BE;    (state)->B[15] = BF; \
   (state)->C[ 0] = C0;    (state)->C[ 1] = C1; \
   (state)->C[ 2] = C2;    (state)->C[ 3] = C3; \
   (state)->C[ 4] = C4;    (state)->C[ 5] = C5; \
   (state)->C[ 6] = C6;    (state)->C[ 7] = C7; \
   (state)->C[ 8] = C8;    (state)->C[ 9] = C9; \
   (state)->C[10] = CA;    (state)->C[11] = CB; \
   (state)->C[12] = CC;    (state)->C[13] = CD; \
   (state)->C[14] = CE;    (state)->C[15] = CF; \
   (state)->Wlow  = Wlow; \
   (state)->Whigh = Whigh; \
} while (0)
|
||||||
|
|
||||||
|
// Copies the sixteen vectors of the current input block into the message
// variables M0..MF. Expects a local pointer named buf in the enclosing scope.
#define DECODE_BLOCK8 \
do { \
   M0 = buf[ 0];   M1 = buf[ 1];   M2 = buf[ 2];   M3 = buf[ 3]; \
   M4 = buf[ 4];   M5 = buf[ 5];   M6 = buf[ 6];   M7 = buf[ 7]; \
   M8 = buf[ 8];   M9 = buf[ 9];   MA = buf[10];   MB = buf[11]; \
   MC = buf[12];   MD = buf[13];   ME = buf[14];   MF = buf[15]; \
} while (0)
|
||||||
|
|
||||||
|
// Adds the message words to the B words: Bi += Mi for i = 0..F, as packed
// 32 bit additions in every lane.
#define INPUT_BLOCK_ADD8 \
do { \
   B0 = _mm256_add_epi32( B0, M0 );   B1 = _mm256_add_epi32( B1, M1 ); \
   B2 = _mm256_add_epi32( B2, M2 );   B3 = _mm256_add_epi32( B3, M3 ); \
   B4 = _mm256_add_epi32( B4, M4 );   B5 = _mm256_add_epi32( B5, M5 ); \
   B6 = _mm256_add_epi32( B6, M6 );   B7 = _mm256_add_epi32( B7, M7 ); \
   B8 = _mm256_add_epi32( B8, M8 );   B9 = _mm256_add_epi32( B9, M9 ); \
   BA = _mm256_add_epi32( BA, MA );   BB = _mm256_add_epi32( BB, MB ); \
   BC = _mm256_add_epi32( BC, MC );   BD = _mm256_add_epi32( BD, MD ); \
   BE = _mm256_add_epi32( BE, ME );   BF = _mm256_add_epi32( BF, MF ); \
} while (0)
|
||||||
|
|
||||||
|
// Subtracts the message words from the C words: Ci -= Mi for i = 0..F, as
// packed 32 bit subtractions in every lane.
#define INPUT_BLOCK_SUB8 \
do { \
   C0 = _mm256_sub_epi32( C0, M0 );   C1 = _mm256_sub_epi32( C1, M1 ); \
   C2 = _mm256_sub_epi32( C2, M2 );   C3 = _mm256_sub_epi32( C3, M3 ); \
   C4 = _mm256_sub_epi32( C4, M4 );   C5 = _mm256_sub_epi32( C5, M5 ); \
   C6 = _mm256_sub_epi32( C6, M6 );   C7 = _mm256_sub_epi32( C7, M7 ); \
   C8 = _mm256_sub_epi32( C8, M8 );   C9 = _mm256_sub_epi32( C9, M9 ); \
   CA = _mm256_sub_epi32( CA, MA );   CB = _mm256_sub_epi32( CB, MB ); \
   CC = _mm256_sub_epi32( CC, MC );   CD = _mm256_sub_epi32( CD, MD ); \
   CE = _mm256_sub_epi32( CE, ME );   CF = _mm256_sub_epi32( CF, MF ); \
} while (0)
|
||||||
|
|
||||||
|
// XORs the block counter into the state: Wlow, broadcast to every 32 bit
// element, goes into A00 and Whigh, likewise broadcast, into A01.
#define XOR_W8 \
do { \
   A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \
   A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \
} while (0)
|
||||||
|
|
||||||
|
// Applies mm256_swap512_256 to each pair (Bi, Ci), i = 0..F.
// NOTE(review): the helper is defined elsewhere; presumably it exchanges the
// two vectors, as the macro name suggests. Confirm against its definition.
#define SWAP_BC8 \
do { \
   mm256_swap512_256( B0, C0 );   mm256_swap512_256( B1, C1 ); \
   mm256_swap512_256( B2, C2 );   mm256_swap512_256( B3, C3 ); \
   mm256_swap512_256( B4, C4 );   mm256_swap512_256( B5, C5 ); \
   mm256_swap512_256( B6, C6 );   mm256_swap512_256( B7, C7 ); \
   mm256_swap512_256( B8, C8 );   mm256_swap512_256( B9, C9 ); \
   mm256_swap512_256( BA, CA );   mm256_swap512_256( BB, CB ); \
   mm256_swap512_256( BC, CC );   mm256_swap512_256( BD, CD ); \
   mm256_swap512_256( BE, CE );   mm256_swap512_256( BF, CF ); \
} while (0)
|
||||||
|
|
||||||
|
// One element of the permutation, on packed 32 bit elements:
//    xa0 = xm ^ xb1 ^ (~xb3 & xb2) ^ ( ( xa0 ^ xc ^ (rol(xa1,15) * 5) ) * 3 )
//    xb0 = not( xa0 ^ rol(xb0,1) )          (using the new xa0)
// All inputs are read into temporaries before xa0 is overwritten.
#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
   const __m256i perm_u_ = _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
                                               _mm256_set1_epi32(5UL) ); \
   const __m256i perm_v_ = _mm256_mullo_epi32( \
                 _mm256_xor_si256( xa0, _mm256_xor_si256( xc, perm_u_ ) ), \
                 _mm256_set1_epi32(3UL) ); \
   xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, \
                 _mm256_xor_si256( _mm256_andnot_si256( xb3, xb2 ), \
                                   perm_v_ ) ) ); \
   xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \
} while (0)
|
||||||
|
|
||||||
|
// First of the three permutation steps: sixteen PERM_ELT8 applications, one
// per B/M word, with the A index starting at A00 and wrapping after A0B.
// The order matters: each element reads the A word written by the previous one.
#define PERM_STEP_0_8 \
do { \
   PERM_ELT8( A00, A0B, B0, BD, B9, B6, C8, M0 ); \
   PERM_ELT8( A01, A00, B1, BE, BA, B7, C7, M1 ); \
   PERM_ELT8( A02, A01, B2, BF, BB, B8, C6, M2 ); \
   PERM_ELT8( A03, A02, B3, B0, BC, B9, C5, M3 ); \
   PERM_ELT8( A04, A03, B4, B1, BD, BA, C4, M4 ); \
   PERM_ELT8( A05, A04, B5, B2, BE, BB, C3, M5 ); \
   PERM_ELT8( A06, A05, B6, B3, BF, BC, C2, M6 ); \
   PERM_ELT8( A07, A06, B7, B4, B0, BD, C1, M7 ); \
   PERM_ELT8( A08, A07, B8, B5, B1, BE, C0, M8 ); \
   PERM_ELT8( A09, A08, B9, B6, B2, BF, CF, M9 ); \
   PERM_ELT8( A0A, A09, BA, B7, B3, B0, CE, MA ); \
   PERM_ELT8( A0B, A0A, BB, B8, B4, B1, CD, MB ); \
   PERM_ELT8( A00, A0B, BC, B9, B5, B2, CC, MC ); \
   PERM_ELT8( A01, A00, BD, BA, B6, B3, CB, MD ); \
   PERM_ELT8( A02, A01, BE, BB, B7, B4, CA, ME ); \
   PERM_ELT8( A03, A02, BF, BC, B8, B5, C9, MF ); \
} while (0)
|
||||||
|
|
||||||
|
// Second permutation step: same B, C and M arguments as the first step, with
// the A index starting at A04 and wrapping after A0B.
// The order matters: each element reads the A word written by the previous one.
#define PERM_STEP_1_8 \
do { \
   PERM_ELT8( A04, A03, B0, BD, B9, B6, C8, M0 ); \
   PERM_ELT8( A05, A04, B1, BE, BA, B7, C7, M1 ); \
   PERM_ELT8( A06, A05, B2, BF, BB, B8, C6, M2 ); \
   PERM_ELT8( A07, A06, B3, B0, BC, B9, C5, M3 ); \
   PERM_ELT8( A08, A07, B4, B1, BD, BA, C4, M4 ); \
   PERM_ELT8( A09, A08, B5, B2, BE, BB, C3, M5 ); \
   PERM_ELT8( A0A, A09, B6, B3, BF, BC, C2, M6 ); \
   PERM_ELT8( A0B, A0A, B7, B4, B0, BD, C1, M7 ); \
   PERM_ELT8( A00, A0B, B8, B5, B1, BE, C0, M8 ); \
   PERM_ELT8( A01, A00, B9, B6, B2, BF, CF, M9 ); \
   PERM_ELT8( A02, A01, BA, B7, B3, B0, CE, MA ); \
   PERM_ELT8( A03, A02, BB, B8, B4, B1, CD, MB ); \
   PERM_ELT8( A04, A03, BC, B9, B5, B2, CC, MC ); \
   PERM_ELT8( A05, A04, BD, BA, B6, B3, CB, MD ); \
   PERM_ELT8( A06, A05, BE, BB, B7, B4, CA, ME ); \
   PERM_ELT8( A07, A06, BF, BC, B8, B5, C9, MF ); \
} while (0)
|
||||||
|
|
||||||
|
// Third permutation step: same B, C and M arguments as the first step, with
// the A index starting at A08 and wrapping after A0B.
// The order matters: each element reads the A word written by the previous one.
#define PERM_STEP_2_8 \
do { \
   PERM_ELT8( A08, A07, B0, BD, B9, B6, C8, M0 ); \
   PERM_ELT8( A09, A08, B1, BE, BA, B7, C7, M1 ); \
   PERM_ELT8( A0A, A09, B2, BF, BB, B8, C6, M2 ); \
   PERM_ELT8( A0B, A0A, B3, B0, BC, B9, C5, M3 ); \
   PERM_ELT8( A00, A0B, B4, B1, BD, BA, C4, M4 ); \
   PERM_ELT8( A01, A00, B5, B2, BE, BB, C3, M5 ); \
   PERM_ELT8( A02, A01, B6, B3, BF, BC, C2, M6 ); \
   PERM_ELT8( A03, A02, B7, B4, B0, BD, C1, M7 ); \
   PERM_ELT8( A04, A03, B8, B5, B1, BE, C0, M8 ); \
   PERM_ELT8( A05, A04, B9, B6, B2, BF, CF, M9 ); \
   PERM_ELT8( A06, A05, BA, B7, B3, B0, CE, MA ); \
   PERM_ELT8( A07, A06, BB, B8, B4, B1, CD, MB ); \
   PERM_ELT8( A08, A07, BC, B9, B5, B2, CC, MC ); \
   PERM_ELT8( A09, A08, BD, BA, B6, B3, CB, MD ); \
   PERM_ELT8( A0A, A09, BE, BB, B7, B4, CA, ME ); \
   PERM_ELT8( A0B, A0A, BF, BC, B8, B5, C9, MF ); \
} while (0)
|
||||||
|
|
||||||
|
// Applies the permutation to the working state:
//  1. every B word goes through mm256_ror_32( Bi, 15 ),
//  2. the three permutation steps run in order,
//  3. three C words are added (packed 32 bit) to each A word.
// The additions are grouped per A word here; each A word still receives its
// three C words in the same order, and no C word is modified in between.
#define APPLY_P8 \
do { \
   B0 = mm256_ror_32( B0, 15 );   B1 = mm256_ror_32( B1, 15 ); \
   B2 = mm256_ror_32( B2, 15 );   B3 = mm256_ror_32( B3, 15 ); \
   B4 = mm256_ror_32( B4, 15 );   B5 = mm256_ror_32( B5, 15 ); \
   B6 = mm256_ror_32( B6, 15 );   B7 = mm256_ror_32( B7, 15 ); \
   B8 = mm256_ror_32( B8, 15 );   B9 = mm256_ror_32( B9, 15 ); \
   BA = mm256_ror_32( BA, 15 );   BB = mm256_ror_32( BB, 15 ); \
   BC = mm256_ror_32( BC, 15 );   BD = mm256_ror_32( BD, 15 ); \
   BE = mm256_ror_32( BE, 15 );   BF = mm256_ror_32( BF, 15 ); \
   PERM_STEP_0_8; \
   PERM_STEP_1_8; \
   PERM_STEP_2_8; \
   A0B = _mm256_add_epi32( _mm256_add_epi32( \
                           _mm256_add_epi32( A0B, C6 ), CA ), CE ); \
   A0A = _mm256_add_epi32( _mm256_add_epi32( \
                           _mm256_add_epi32( A0A, C5 ), C9 ), CD ); \
   A09 = _mm256_add_epi32( _mm256_add_epi32( \
                           _mm256_add_epi32( A09, C4 ), C8 ), CC ); \
   A08 = _mm256_add_epi32( _mm256_add_epi32( \
                           _mm256_add_epi32( A08, C3 ), C7 ), CB ); \
   A07 = _mm256_add_epi32( _mm256_add_epi32( \
                           _mm256_add_epi32( A07, C2 ), C6 ), CA ); \
   A06 = _mm256_add_epi32( _mm256_add_epi32( \
                           _mm256_add_epi32( A06, C1 ), C5 ), C9 ); \
   A05 = _mm256_add_epi32( _mm256_add_epi32( \
                           _mm256_add_epi32( A05, C0 ), C4 ), C8 ); \
   A04 = _mm256_add_epi32( _mm256_add_epi32( \
                           _mm256_add_epi32( A04, CF ), C3 ), C7 ); \
   A03 = _mm256_add_epi32( _mm256_add_epi32( \
                           _mm256_add_epi32( A03, CE ), C2 ), C6 ); \
   A02 = _mm256_add_epi32( _mm256_add_epi32( \
                           _mm256_add_epi32( A02, CD ), C1 ), C5 ); \
   A01 = _mm256_add_epi32( _mm256_add_epi32( \
                           _mm256_add_epi32( A01, CC ), C0 ), C4 ); \
   A00 = _mm256_add_epi32( _mm256_add_epi32( \
                           _mm256_add_epi32( A00, CB ), CF ), C3 ); \
} while (0)
|
||||||
|
|
||||||
|
// Increments the block counter held in (Whigh, Wlow): Wlow is incremented
// through T32 and, when it becomes zero, the carry goes into Whigh.
#define INCR_W8 \
do { \
   Wlow = T32( Wlow + 1 ); \
   if ( Wlow == 0 ) \
      Whigh = T32( Whigh + 1 ); \
} while (0)
|
||||||
|
|
||||||
|
static void
|
||||||
|
shabal_8way_init( void *cc, unsigned size )
|
||||||
|
{
|
||||||
|
shabal_8way_context *sc = (shabal_8way_context*)cc;
|
||||||
|
|
||||||
|
if ( size == 512 )
|
||||||
|
{ // copy immediate constants directly to working registers later.
|
||||||
|
sc->state_loaded = false;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{ // No users
|
||||||
|
sc->state_loaded = true;
|
||||||
|
sc->A[ 0] = m256_const1_64( 0x52F8455252F84552 );
|
||||||
|
sc->A[ 1] = m256_const1_64( 0xE54B7999E54B7999 );
|
||||||
|
sc->A[ 2] = m256_const1_64( 0x2D8EE3EC2D8EE3EC );
|
||||||
|
sc->A[ 3] = m256_const1_64( 0xB9645191B9645191 );
|
||||||
|
sc->A[ 4] = m256_const1_64( 0xE0078B86E0078B86 );
|
||||||
|
sc->A[ 5] = m256_const1_64( 0xBB7C44C9BB7C44C9 );
|
||||||
|
sc->A[ 6] = m256_const1_64( 0xD2B5C1CAD2B5C1CA );
|
||||||
|
sc->A[ 7] = m256_const1_64( 0xB0D2EB8CB0D2EB8C );
|
||||||
|
sc->A[ 8] = m256_const1_64( 0x14CE5A4514CE5A45 );
|
||||||
|
sc->A[ 9] = m256_const1_64( 0x22AF50DC22AF50DC );
|
||||||
|
sc->A[10] = m256_const1_64( 0xEFFDBC6BEFFDBC6B );
|
||||||
|
sc->A[11] = m256_const1_64( 0xEB21B74AEB21B74A );
|
||||||
|
|
||||||
|
sc->B[ 0] = m256_const1_64( 0xB555C6EEB555C6EE );
|
||||||
|
sc->B[ 1] = m256_const1_64( 0x3E7105963E710596 );
|
||||||
|
sc->B[ 2] = m256_const1_64( 0xA72A652FA72A652F );
|
||||||
|
sc->B[ 3] = m256_const1_64( 0x9301515F9301515F );
|
||||||
|
sc->B[ 4] = m256_const1_64( 0xDA28C1FADA28C1FA );
|
||||||
|
sc->B[ 5] = m256_const1_64( 0x696FD868696FD868 );
|
||||||
|
sc->B[ 6] = m256_const1_64( 0x9CB6BF729CB6BF72 );
|
||||||
|
sc->B[ 7] = m256_const1_64( 0x0AFE40020AFE4002 );
|
||||||
|
sc->B[ 8] = m256_const1_64( 0xA6E03615A6E03615 );
|
||||||
|
sc->B[ 9] = m256_const1_64( 0x5138C1D45138C1D4 );
|
||||||
|
sc->B[10] = m256_const1_64( 0xBE216306BE216306 );
|
||||||
|
sc->B[11] = m256_const1_64( 0xB38B8890B38B8890 );
|
||||||
|
sc->B[12] = m256_const1_64( 0x3EA8B96B3EA8B96B );
|
||||||
|
sc->B[13] = m256_const1_64( 0x3299ACE43299ACE4 );
|
||||||
|
sc->B[14] = m256_const1_64( 0x30924DD430924DD4 );
|
||||||
|
sc->B[15] = m256_const1_64( 0x55CB34A555CB34A5 );
|
||||||
|
|
||||||
|
sc->C[ 0] = m256_const1_64( 0xB405F031B405F031 );
|
||||||
|
sc->C[ 1] = m256_const1_64( 0xC4233EBAC4233EBA );
|
||||||
|
sc->C[ 2] = m256_const1_64( 0xB3733979B3733979 );
|
||||||
|
sc->C[ 3] = m256_const1_64( 0xC0DD9D55C0DD9D55 );
|
||||||
|
sc->C[ 4] = m256_const1_64( 0xC51C28AEC51C28AE );
|
||||||
|
sc->C[ 5] = m256_const1_64( 0xA327B8E1A327B8E1 );
|
||||||
|
sc->C[ 6] = m256_const1_64( 0x56C5616756C56167 );
|
||||||
|
sc->C[ 7] = m256_const1_64( 0xED614433ED614433 );
|
||||||
|
sc->C[ 8] = m256_const1_64( 0x88B59D6088B59D60 );
|
||||||
|
sc->C[ 9] = m256_const1_64( 0x60E2CEBA60E2CEBA );
|
||||||
|
sc->C[10] = m256_const1_64( 0x758B4B8B758B4B8B );
|
||||||
|
sc->C[11] = m256_const1_64( 0x83E82A7F83E82A7F );
|
||||||
|
sc->C[12] = m256_const1_64( 0xBC968828BC968828 );
|
||||||
|
sc->C[13] = m256_const1_64( 0xE6E00BF7E6E00BF7 );
|
||||||
|
sc->C[14] = m256_const1_64( 0xBA839E55BA839E55 );
|
||||||
|
sc->C[15] = m256_const1_64( 0x9B491C609B491C60 );
|
||||||
|
}
|
||||||
|
sc->Wlow = 1;
|
||||||
|
sc->Whigh = 0;
|
||||||
|
sc->ptr = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
shabal_8way_core( void *cc, const unsigned char *data, size_t len )
|
||||||
|
{
|
||||||
|
shabal_8way_context *sc = (shabal_8way_context*)cc;
|
||||||
|
__m256i *buf;
|
||||||
|
__m256i *vdata = (__m256i*)data;
|
||||||
|
const int buf_size = 64;
|
||||||
|
size_t ptr;
|
||||||
|
DECL_STATE8
|
||||||
|
|
||||||
|
buf = sc->buf;
|
||||||
|
ptr = sc->ptr;
|
||||||
|
|
||||||
|
if ( len < (buf_size - ptr ) )
|
||||||
|
{
|
||||||
|
memcpy_256( buf + (ptr>>2), vdata, len>>2 );
|
||||||
|
ptr += len;
|
||||||
|
sc->ptr = ptr;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
READ_STATE8( sc );
|
||||||
|
|
||||||
|
while ( len > 0 )
|
||||||
|
{
|
||||||
|
size_t clen;
|
||||||
|
clen = buf_size - ptr;
|
||||||
|
if ( clen > len )
|
||||||
|
clen = len;
|
||||||
|
memcpy_256( buf + (ptr>>2), vdata, clen>>2 );
|
||||||
|
|
||||||
|
ptr += clen;
|
||||||
|
vdata += clen>>2;
|
||||||
|
len -= clen;
|
||||||
|
if ( ptr == buf_size )
|
||||||
|
{
|
||||||
|
DECODE_BLOCK8;
|
||||||
|
INPUT_BLOCK_ADD8;
|
||||||
|
XOR_W8;
|
||||||
|
APPLY_P8;
|
||||||
|
INPUT_BLOCK_SUB8;
|
||||||
|
SWAP_BC8;
|
||||||
|
INCR_W8;
|
||||||
|
ptr = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
WRITE_STATE8(sc);
|
||||||
|
sc->ptr = ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst,
|
||||||
|
unsigned size_words )
|
||||||
|
{
|
||||||
|
shabal_8way_context *sc = (shabal_8way_context*)cc;
|
||||||
|
__m256i *buf;
|
||||||
|
const int buf_size = 64;
|
||||||
|
size_t ptr;
|
||||||
|
int i;
|
||||||
|
unsigned z, zz;
|
||||||
|
DECL_STATE8
|
||||||
|
|
||||||
|
buf = sc->buf;
|
||||||
|
ptr = sc->ptr;
|
||||||
|
z = 0x80 >> n;
|
||||||
|
zz = ((ub & -z) | z) & 0xFF;
|
||||||
|
buf[ptr>>2] = _mm256_set1_epi32( zz );
|
||||||
|
memset_zero_256( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
|
||||||
|
READ_STATE8(sc);
|
||||||
|
DECODE_BLOCK8;
|
||||||
|
INPUT_BLOCK_ADD8;
|
||||||
|
XOR_W8;
|
||||||
|
APPLY_P8;
|
||||||
|
|
||||||
|
for ( i = 0; i < 3; i ++ )
|
||||||
|
{
|
||||||
|
SWAP_BC8;
|
||||||
|
XOR_W8;
|
||||||
|
APPLY_P8;
|
||||||
|
}
|
||||||
|
|
||||||
|
__m256i *d = (__m256i*)dst;
|
||||||
|
if ( size_words == 16 ) // 512
|
||||||
|
{
|
||||||
|
d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;
|
||||||
|
d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7;
|
||||||
|
d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB;
|
||||||
|
d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF;
|
||||||
|
}
|
||||||
|
else // 256
|
||||||
|
{
|
||||||
|
d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB;
|
||||||
|
d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialises an 8-way Shabal context for 256 bit output.
void shabal256_8way_init( void *cc )
{
   shabal_8way_init( cc, 256 );
}
|
||||||
|
|
||||||
|
// Feeds len bytes per lane of interleaved data into a 256 bit 8-way context.
void shabal256_8way_update( void *cc, const void *data, size_t len )
{
   shabal_8way_core( cc, data, len );
}
|
||||||
|
|
||||||
|
// Finalises a 256 bit 8-way hash with no extra bits; writes 8 vectors to dst.
void shabal256_8way_close( void *cc, void *dst )
{
   shabal_8way_close( cc, 0, 0, dst, 8 );
}
|
||||||
|
|
||||||
|
// Finalises a 256 bit 8-way hash after appending the top n bits of ub;
// writes 8 vectors to dst.
void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                       void *dst )
{
   shabal_8way_close( cc, ub, n, dst, 8 );
}
|
||||||
|
|
||||||
|
// Initialises an 8-way Shabal context for 512 bit output.
void shabal512_8way_init( void *cc )
{
   shabal_8way_init( cc, 512 );
}
|
||||||
|
|
||||||
|
// Feeds len bytes per lane of interleaved data into a 512 bit 8-way context.
void shabal512_8way_update( void *cc, const void *data, size_t len )
{
   shabal_8way_core( cc, data, len );
}
|
||||||
|
|
||||||
|
// Finalises a 512 bit 8-way hash with no extra bits; writes 16 vectors to dst.
void shabal512_8way_close( void *cc, void *dst )
{
   shabal_8way_close( cc, 0, 0, dst, 16 );
}
|
||||||
|
|
||||||
|
// Finalises a 512 bit 8-way hash after appending the top n bits of ub;
// writes 16 vectors to dst.
void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                       void *dst )
{
   shabal_8way_close( cc, ub, n, dst, 16 );
}
|
||||||
|
|
||||||
|
|
||||||
|
#endif // AVX2
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We copy the state into local variables, so that the compiler knows
|
* We copy the state into local variables, so that the compiler knows
|
||||||
* that it can optimize them at will.
|
* that it can optimize them at will.
|
||||||
@@ -290,6 +883,8 @@ do { \
|
|||||||
A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
|
A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
|
||||||
A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
|
A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
#define SWAP(v1, v2) do { \
|
#define SWAP(v1, v2) do { \
|
||||||
sph_u32 tmp = (v1); \
|
sph_u32 tmp = (v1); \
|
||||||
@@ -297,26 +892,39 @@ do { \
|
|||||||
(v2) = tmp; \
|
(v2) = tmp; \
|
||||||
} while (0)
|
} while (0)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define SWAP_BC \
|
#define SWAP_BC \
|
||||||
do { \
|
do { \
|
||||||
mm128_swap128_256( B0, C0 ); \
|
mm128_swap256_128( B0, C0 ); \
|
||||||
mm128_swap128_256( B1, C1 ); \
|
mm128_swap256_128( B1, C1 ); \
|
||||||
mm128_swap128_256( B2, C2 ); \
|
mm128_swap256_128( B2, C2 ); \
|
||||||
mm128_swap128_256( B3, C3 ); \
|
mm128_swap256_128( B3, C3 ); \
|
||||||
mm128_swap128_256( B4, C4 ); \
|
mm128_swap256_128( B4, C4 ); \
|
||||||
mm128_swap128_256( B5, C5 ); \
|
mm128_swap256_128( B5, C5 ); \
|
||||||
mm128_swap128_256( B6, C6 ); \
|
mm128_swap256_128( B6, C6 ); \
|
||||||
mm128_swap128_256( B7, C7 ); \
|
mm128_swap256_128( B7, C7 ); \
|
||||||
mm128_swap128_256( B8, C8 ); \
|
mm128_swap256_128( B8, C8 ); \
|
||||||
mm128_swap128_256( B9, C9 ); \
|
mm128_swap256_128( B9, C9 ); \
|
||||||
mm128_swap128_256( BA, CA ); \
|
mm128_swap256_128( BA, CA ); \
|
||||||
mm128_swap128_256( BB, CB ); \
|
mm128_swap256_128( BB, CB ); \
|
||||||
mm128_swap128_256( BC, CC ); \
|
mm128_swap256_128( BC, CC ); \
|
||||||
mm128_swap128_256( BD, CD ); \
|
mm128_swap256_128( BD, CD ); \
|
||||||
mm128_swap128_256( BE, CE ); \
|
mm128_swap256_128( BE, CE ); \
|
||||||
mm128_swap128_256( BF, CF ); \
|
mm128_swap256_128( BF, CF ); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
/*
|
||||||
|
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
|
||||||
|
do { \
|
||||||
|
__m128i t1 = _mm_mullo_epi32( mm_rol_32( xa1, 15 ),\
|
||||||
|
_mm_set1_epi32(5UL) ) \
|
||||||
|
__m128i t2 = _mm_xor_si128( xa0, xc ); \
|
||||||
|
xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \
|
||||||
|
xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \
|
||||||
|
_mm_xor_si128( t2, \
|
||||||
|
_mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \
|
||||||
|
*/
|
||||||
|
|
||||||
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
|
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
|
||||||
do { \
|
do { \
|
||||||
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
|
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
|
||||||
@@ -706,7 +1314,7 @@ shabal256_4way_init( void *cc )
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
shabal256_4way( void *cc, const void *data, size_t len )
|
shabal256_4way_update( void *cc, const void *data, size_t len )
|
||||||
{
|
{
|
||||||
shabal_4way_core( cc, data, len );
|
shabal_4way_core( cc, data, len );
|
||||||
}
|
}
|
||||||
@@ -731,7 +1339,7 @@ shabal512_4way_init(void *cc)
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
shabal512_4way(void *cc, const void *data, size_t len)
|
shabal512_4way_update(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
shabal_4way_core(cc, data, len);
|
shabal_4way_core(cc, data, len);
|
||||||
}
|
}
|
||||||
|
@@ -36,7 +36,7 @@
|
|||||||
#ifndef SHABAL_HASH_4WAY_H__
|
#ifndef SHABAL_HASH_4WAY_H__
|
||||||
#define SHABAL_HASH_4WAY_H__ 1
|
#define SHABAL_HASH_4WAY_H__ 1
|
||||||
|
|
||||||
#ifdef __AVX2__
|
#ifdef __SSE4_1__
|
||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include "algo/sha/sph_types.h"
|
#include "algo/sha/sph_types.h"
|
||||||
@@ -50,6 +50,34 @@ extern "C"{
|
|||||||
|
|
||||||
#define SPH_SIZE_shabal512 512
|
#define SPH_SIZE_shabal512 512
|
||||||
|
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
__m256i buf[16];
|
||||||
|
__m256i A[12], B[16], C[16];
|
||||||
|
sph_u32 Whigh, Wlow;
|
||||||
|
size_t ptr;
|
||||||
|
bool state_loaded;
|
||||||
|
} shabal_8way_context __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
typedef shabal_8way_context shabal256_8way_context;
|
||||||
|
typedef shabal_8way_context shabal512_8way_context;
|
||||||
|
|
||||||
|
void shabal256_8way_init( void *cc );
|
||||||
|
void shabal256_8way_update( void *cc, const void *data, size_t len );
|
||||||
|
void shabal256_8way_close( void *cc, void *dst );
|
||||||
|
void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||||
|
void *dst );
|
||||||
|
|
||||||
|
void shabal512_8way_init( void *cc );
|
||||||
|
void shabal512_8way_update( void *cc, const void *data, size_t len );
|
||||||
|
void shabal512_8way_close( void *cc, void *dst );
|
||||||
|
void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||||
|
void *dst );
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
__m128i buf[16] __attribute__ ((aligned (64)));
|
__m128i buf[16] __attribute__ ((aligned (64)));
|
||||||
__m128i A[12], B[16], C[16];
|
__m128i A[12], B[16], C[16];
|
||||||
@@ -62,13 +90,14 @@ typedef shabal_4way_context shabal256_4way_context;
|
|||||||
typedef shabal_4way_context shabal512_4way_context;
|
typedef shabal_4way_context shabal512_4way_context;
|
||||||
|
|
||||||
void shabal256_4way_init( void *cc );
|
void shabal256_4way_init( void *cc );
|
||||||
void shabal256_4way( void *cc, const void *data, size_t len );
|
void shabal256_4way_update( void *cc, const void *data, size_t len );
|
||||||
void shabal256_4way_close( void *cc, void *dst );
|
void shabal256_4way_close( void *cc, void *dst );
|
||||||
void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||||
void *dst );
|
void *dst );
|
||||||
|
|
||||||
void shabal512_4way_init( void *cc );
|
void shabal512_4way_init( void *cc );
|
||||||
void shabal512_4way( void *cc, const void *data, size_t len );
|
void shabal512_4way_update( void *cc, const void *data, size_t len );
|
||||||
|
#define shabal512_4way shabal512_4way_update
|
||||||
void shabal512_4way_close( void *cc, void *dst );
|
void shabal512_4way_close( void *cc, void *dst );
|
||||||
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||||
void *dst );
|
void *dst );
|
||||||
|
@@ -3,6 +3,12 @@
|
|||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
|
// This implementation is deprecated, superseded by VAES in Icelake
|
||||||
|
// which provides HW based 4 way aes.
|
||||||
|
// It was created for AVX2 to eliminate interleaving between the
|
||||||
|
// preceding and following function.
|
||||||
|
// This code can be removed when current users have reverted to one way.
|
||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
|
|
||||||
@@ -16,8 +22,8 @@ static const uint32_t IV512[] =
|
|||||||
|
|
||||||
|
|
||||||
#define mm256_ror2x256hi_1x32( a, b ) \
|
#define mm256_ror2x256hi_1x32( a, b ) \
|
||||||
_mm256_blend_epi32( mm256_ror1x32_128( a ), \
|
_mm256_blend_epi32( mm256_ror128_32( a ), \
|
||||||
mm256_ror1x32_128( b ), 0x88 )
|
mm256_ror128_32( b ), 0x88 )
|
||||||
|
|
||||||
static void
|
static void
|
||||||
c512_2way( shavite512_2way_context *ctx, const void *msg )
|
c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||||
@@ -61,7 +67,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
|||||||
{
|
{
|
||||||
// round 1, 5, 9
|
// round 1, 5, 9
|
||||||
|
|
||||||
k00 = _mm256_xor_si256( k13, mm256_ror1x32_128(
|
k00 = _mm256_xor_si256( k13, mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k00, zero ) ) );
|
mm256_aesenc_2x128( k00, zero ) ) );
|
||||||
|
|
||||||
if ( r == 0 )
|
if ( r == 0 )
|
||||||
@@ -71,7 +77,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
|||||||
|
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
|
||||||
k01 = _mm256_xor_si256( k00,
|
k01 = _mm256_xor_si256( k00,
|
||||||
mm256_ror1x32_128( mm256_aesenc_2x128( k01, zero ) ) );
|
mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) );
|
||||||
|
|
||||||
if ( r == 1 )
|
if ( r == 1 )
|
||||||
k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
|
k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
|
||||||
@@ -80,25 +86,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
|||||||
|
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
|
||||||
k02 = _mm256_xor_si256( k01,
|
k02 = _mm256_xor_si256( k01,
|
||||||
mm256_ror1x32_128( mm256_aesenc_2x128( k02, zero ) ) );
|
mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
|
||||||
k03 = _mm256_xor_si256( k02,
|
k03 = _mm256_xor_si256( k02,
|
||||||
mm256_ror1x32_128( mm256_aesenc_2x128( k03, zero ) ) );
|
mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
|
||||||
|
|
||||||
p3 = _mm256_xor_si256( p3, x );
|
p3 = _mm256_xor_si256( p3, x );
|
||||||
|
|
||||||
k10 = _mm256_xor_si256( k03,
|
k10 = _mm256_xor_si256( k03,
|
||||||
mm256_ror1x32_128( mm256_aesenc_2x128( k10, zero ) ) );
|
mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
|
||||||
k11 = _mm256_xor_si256( k10,
|
k11 = _mm256_xor_si256( k10,
|
||||||
mm256_ror1x32_128( mm256_aesenc_2x128( k11, zero ) ) );
|
mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
|
||||||
k12 = _mm256_xor_si256( k11,
|
k12 = _mm256_xor_si256( k11,
|
||||||
mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) ) );
|
mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
|
||||||
k13 = _mm256_xor_si256( k12,
|
k13 = _mm256_xor_si256( k12,
|
||||||
mm256_ror1x32_128( mm256_aesenc_2x128( k13, zero ) ) );
|
mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) );
|
||||||
|
|
||||||
if ( r == 2 )
|
if ( r == 2 )
|
||||||
k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
|
k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
|
||||||
@@ -134,31 +140,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
|||||||
|
|
||||||
// round 3, 7, 11
|
// round 3, 7, 11
|
||||||
|
|
||||||
k00 = _mm256_xor_si256( mm256_ror1x32_128(
|
k00 = _mm256_xor_si256( mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k00, zero ) ), k13 );
|
mm256_aesenc_2x128( k00, zero ) ), k13 );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
|
||||||
k01 = _mm256_xor_si256( mm256_ror1x32_128(
|
k01 = _mm256_xor_si256( mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k01, zero ) ), k00 );
|
mm256_aesenc_2x128( k01, zero ) ), k00 );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
|
||||||
k02 = _mm256_xor_si256( mm256_ror1x32_128(
|
k02 = _mm256_xor_si256( mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k02, zero ) ), k01 );
|
mm256_aesenc_2x128( k02, zero ) ), k01 );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
|
||||||
k03 = _mm256_xor_si256( mm256_ror1x32_128(
|
k03 = _mm256_xor_si256( mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k03, zero ) ), k02 );
|
mm256_aesenc_2x128( k03, zero ) ), k02 );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
|
||||||
|
|
||||||
p1 = _mm256_xor_si256( p1, x );
|
p1 = _mm256_xor_si256( p1, x );
|
||||||
|
|
||||||
k10 = _mm256_xor_si256( mm256_ror1x32_128(
|
k10 = _mm256_xor_si256( mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k10, zero ) ), k03 );
|
mm256_aesenc_2x128( k10, zero ) ), k03 );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero );
|
||||||
k11 = _mm256_xor_si256( mm256_ror1x32_128(
|
k11 = _mm256_xor_si256( mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k11, zero ) ), k10 );
|
mm256_aesenc_2x128( k11, zero ) ), k10 );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
|
||||||
k12 = _mm256_xor_si256( mm256_ror1x32_128(
|
k12 = _mm256_xor_si256( mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k12, zero ) ), k11 );
|
mm256_aesenc_2x128( k12, zero ) ), k11 );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
|
||||||
k13 = _mm256_xor_si256( mm256_ror1x32_128(
|
k13 = _mm256_xor_si256( mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k13, zero ) ), k12 );
|
mm256_aesenc_2x128( k13, zero ) ), k12 );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
|
||||||
|
|
||||||
@@ -192,35 +198,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
|||||||
|
|
||||||
// round 13
|
// round 13
|
||||||
|
|
||||||
k00 = _mm256_xor_si256( mm256_ror1x32_128(
|
k00 = _mm256_xor_si256( mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k00, zero ) ), k13 );
|
mm256_aesenc_2x128( k00, zero ) ), k13 );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
|
||||||
k01 = _mm256_xor_si256( mm256_ror1x32_128(
|
k01 = _mm256_xor_si256( mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k01, zero ) ), k00 );
|
mm256_aesenc_2x128( k01, zero ) ), k00 );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
|
||||||
k02 = _mm256_xor_si256( mm256_ror1x32_128(
|
k02 = _mm256_xor_si256( mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k02, zero ) ), k01 );
|
mm256_aesenc_2x128( k02, zero ) ), k01 );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
|
||||||
k03 = _mm256_xor_si256( mm256_ror1x32_128(
|
k03 = _mm256_xor_si256( mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k03, zero ) ), k02 );
|
mm256_aesenc_2x128( k03, zero ) ), k02 );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
|
||||||
|
|
||||||
p3 = _mm256_xor_si256( p3, x );
|
p3 = _mm256_xor_si256( p3, x );
|
||||||
|
|
||||||
k10 = _mm256_xor_si256( mm256_ror1x32_128(
|
k10 = _mm256_xor_si256( mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k10, zero ) ), k03 );
|
mm256_aesenc_2x128( k10, zero ) ), k03 );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
|
||||||
k11 = _mm256_xor_si256( mm256_ror1x32_128(
|
k11 = _mm256_xor_si256( mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k11, zero ) ), k10 );
|
mm256_aesenc_2x128( k11, zero ) ), k10 );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
|
||||||
|
|
||||||
k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) );
|
k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) );
|
||||||
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
|
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
|
||||||
~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
|
~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
|
||||||
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
|
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
|
||||||
|
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
|
||||||
k13 = _mm256_xor_si256( mm256_ror1x32_128(
|
k13 = _mm256_xor_si256( mm256_ror128_32(
|
||||||
mm256_aesenc_2x128( k13, zero ) ), k12 );
|
mm256_aesenc_2x128( k13, zero ) ), k12 );
|
||||||
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
|
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
|
||||||
|
|
||||||
|
@@ -51,6 +51,8 @@ void init_c11_8way_ctx()
|
|||||||
void c11_8way_hash( void *state, const void *input )
|
void c11_8way_hash( void *state, const void *input )
|
||||||
{
|
{
|
||||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
@@ -107,21 +109,18 @@ void c11_8way_hash( void *state, const void *input )
|
|||||||
skein512_8way_update( &ctx.skein, vhash, 64 );
|
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||||
skein512_8way_close( &ctx.skein, vhash );
|
skein512_8way_close( &ctx.skein, vhash );
|
||||||
|
|
||||||
// Serial
|
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
|
||||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
|
||||||
vhash );
|
|
||||||
|
|
||||||
// 7 Luffa + 8 cube
|
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
|
||||||
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
|
||||||
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
|
||||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
|
||||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
|
||||||
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
|
||||||
luffa_4way_init( &ctx.luffa, 512 );
|
luffa_4way_init( &ctx.luffa, 512 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
|
||||||
|
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
|
||||||
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
|
||||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
|
||||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
|
||||||
|
|
||||||
// 9 Shavite
|
// 9 Shavite
|
||||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||||
|
@@ -51,6 +51,8 @@ void init_x11_8way_ctx()
|
|||||||
void x11_8way_hash( void *state, const void *input )
|
void x11_8way_hash( void *state, const void *input )
|
||||||
{
|
{
|
||||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
@@ -108,20 +110,18 @@ void x11_8way_hash( void *state, const void *input )
|
|||||||
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||||
keccak512_8way_close( &ctx.keccak, vhash );
|
keccak512_8way_close( &ctx.keccak, vhash );
|
||||||
|
|
||||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
|
||||||
vhash );
|
|
||||||
|
|
||||||
// Luffa + Cube
|
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
|
||||||
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
|
||||||
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
|
||||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
|
||||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
|
||||||
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
|
||||||
luffa_4way_init( &ctx.luffa, 512 );
|
luffa_4way_init( &ctx.luffa, 512 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
|
||||||
|
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
|
||||||
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
|
||||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
|
||||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
|
||||||
|
|
||||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||||
|
@@ -1,7 +1,4 @@
|
|||||||
#include "x12-gate.h"
|
#include "x12-gate.h"
|
||||||
|
|
||||||
#if defined(X12_4WAY)
|
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
@@ -14,11 +11,223 @@
|
|||||||
#include "algo/keccak/keccak-hash-4way.h"
|
#include "algo/keccak/keccak-hash-4way.h"
|
||||||
#include "algo/luffa/luffa-hash-2way.h"
|
#include "algo/luffa/luffa-hash-2way.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
#include "algo/shavite/sph_shavite.h"
|
#include "algo/shavite/sph_shavite.h"
|
||||||
#include "algo/simd/simd-hash-2way.h"
|
#include "algo/simd/simd-hash-2way.h"
|
||||||
#include "algo/echo/aes_ni/hash_api.h"
|
#include "algo/echo/aes_ni/hash_api.h"
|
||||||
#include "algo/hamsi/hamsi-hash-4way.h"
|
#include "algo/hamsi/hamsi-hash-4way.h"
|
||||||
//#include "algo/fugue/sph_fugue.h"
|
|
||||||
|
#if defined(X12_8WAY)
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
blake512_8way_context blake;
|
||||||
|
bmw512_8way_context bmw;
|
||||||
|
hashState_groestl groestl;
|
||||||
|
skein512_8way_context skein;
|
||||||
|
jh512_8way_context jh;
|
||||||
|
keccak512_8way_context keccak;
|
||||||
|
luffa_4way_context luffa;
|
||||||
|
cube_4way_context cube;
|
||||||
|
sph_shavite512_context shavite;
|
||||||
|
simd_4way_context simd;
|
||||||
|
hashState_echo echo;
|
||||||
|
hamsi512_8way_context hamsi;
|
||||||
|
} x12_8way_ctx_holder;
|
||||||
|
|
||||||
|
x12_8way_ctx_holder x12_8way_ctx __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
void init_x12_8way_ctx()
|
||||||
|
{
|
||||||
|
blake512_8way_init( &x12_8way_ctx.blake );
|
||||||
|
bmw512_8way_init( &x12_8way_ctx.bmw );
|
||||||
|
init_groestl( &x12_8way_ctx.groestl, 64 );
|
||||||
|
skein512_8way_init( &x12_8way_ctx.skein );
|
||||||
|
jh512_8way_init( &x12_8way_ctx.jh );
|
||||||
|
keccak512_8way_init( &x12_8way_ctx.keccak );
|
||||||
|
luffa_4way_init( &x12_8way_ctx.luffa, 512 );
|
||||||
|
cube_4way_init( &x12_8way_ctx.cube, 512, 16, 32 );
|
||||||
|
sph_shavite512_init( &x12_8way_ctx.shavite );
|
||||||
|
simd_4way_init( &x12_8way_ctx.simd, 512 );
|
||||||
|
init_echo( &x12_8way_ctx.echo, 512 );
|
||||||
|
hamsi512_8way_init( &x12_8way_ctx.hamsi );
|
||||||
|
};
|
||||||
|
|
||||||
|
void x12_8way_hash( void *state, const void *input )
|
||||||
|
{
|
||||||
|
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
x12_8way_ctx_holder ctx;
|
||||||
|
memcpy( &ctx, &x12_8way_ctx, sizeof(x12_8way_ctx) );
|
||||||
|
blake512_8way_update( &ctx.blake, input, 80 );
|
||||||
|
blake512_8way_close( &ctx.blake, vhash );
|
||||||
|
|
||||||
|
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||||
|
bmw512_8way_close( &ctx.bmw, vhash );
|
||||||
|
|
||||||
|
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
|
||||||
|
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
|
||||||
|
luffa_4way_init( &ctx.luffa, 512 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
|
||||||
|
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
|
||||||
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
|
||||||
|
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
|
||||||
|
|
||||||
|
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||||
|
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||||
|
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||||
|
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||||
|
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash4, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash4 );
|
||||||
|
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash5, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash5 );
|
||||||
|
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash6, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash6 );
|
||||||
|
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash7, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash7 );
|
||||||
|
|
||||||
|
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||||
|
simd_4way_init( &ctx.simd, 512 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||||
|
(const BitSequence *) hash1, 512 );
|
||||||
|
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||||
|
(const BitSequence *) hash2, 512 );
|
||||||
|
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||||
|
(const BitSequence *) hash3, 512 );
|
||||||
|
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash4,
|
||||||
|
(const BitSequence *) hash4, 512 );
|
||||||
|
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash5,
|
||||||
|
(const BitSequence *) hash5, 512 );
|
||||||
|
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash6,
|
||||||
|
(const BitSequence *) hash6, 512 );
|
||||||
|
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash7,
|
||||||
|
(const BitSequence *) hash7, 512 );
|
||||||
|
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||||
|
|
||||||
|
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7 );
|
||||||
|
|
||||||
|
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||||
|
skein512_8way_close( &ctx.skein, vhash );
|
||||||
|
|
||||||
|
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||||
|
jh512_8way_close( &ctx.jh, vhash );
|
||||||
|
|
||||||
|
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||||
|
keccak512_8way_close( &ctx.keccak, vhash );
|
||||||
|
|
||||||
|
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
|
||||||
|
hamsi512_8way_close( &ctx.hamsi, state );
|
||||||
|
}
|
||||||
|
|
||||||
|
int scanhash_x12_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
uint32_t hash[16*8] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t *hash7 = &(hash[49]);
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
uint32_t n = pdata[19];
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||||
|
int thr_id = mythr->id;
|
||||||
|
|
||||||
|
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||||
|
|
||||||
|
do {
|
||||||
|
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||||
|
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||||
|
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||||
|
|
||||||
|
x12_8way_hash( hash, vdata );
|
||||||
|
|
||||||
|
for ( int lane = 0; lane < 8; lane++ )
|
||||||
|
if ( hash7[ lane<<1 ] < Htarg )
|
||||||
|
{
|
||||||
|
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||||
|
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||||
|
{
|
||||||
|
pdata[19] = n + lane;
|
||||||
|
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
n += 8;
|
||||||
|
} while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined(X12_4WAY)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
blake512_4way_context blake;
|
blake512_4way_context blake;
|
||||||
@@ -63,45 +272,13 @@ void x12_4way_hash( void *state, const void *input )
|
|||||||
x12_4way_ctx_holder ctx;
|
x12_4way_ctx_holder ctx;
|
||||||
memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) );
|
memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) );
|
||||||
|
|
||||||
// 1 Blake
|
|
||||||
blake512_4way( &ctx.blake, input, 80 );
|
blake512_4way( &ctx.blake, input, 80 );
|
||||||
blake512_4way_close( &ctx.blake, vhash );
|
blake512_4way_close( &ctx.blake, vhash );
|
||||||
|
|
||||||
// 2 Bmw
|
|
||||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||||
bmw512_4way_close( &ctx.bmw, vhash );
|
bmw512_4way_close( &ctx.bmw, vhash );
|
||||||
|
|
||||||
// Serial
|
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||||
|
|
||||||
// 3 Groestl
|
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
|
||||||
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
|
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
|
||||||
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
|
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
|
||||||
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
|
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
|
||||||
|
|
||||||
// Parallel 4way 64 bit
|
|
||||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
|
||||||
|
|
||||||
// 4 Skein
|
|
||||||
skein512_4way( &ctx.skein, vhash, 64 );
|
|
||||||
skein512_4way_close( &ctx.skein, vhash );
|
|
||||||
|
|
||||||
// 5 JH
|
|
||||||
jh512_4way( &ctx.jh, vhash, 64 );
|
|
||||||
jh512_4way_close( &ctx.jh, vhash );
|
|
||||||
|
|
||||||
// 6 Keccak
|
|
||||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
|
||||||
keccak512_4way_close( &ctx.keccak, vhash );
|
|
||||||
|
|
||||||
// Serial
|
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
|
||||||
|
|
||||||
// 7 Luffa
|
|
||||||
intrlv_2x128( vhash, hash0, hash1, 512 );
|
intrlv_2x128( vhash, hash0, hash1, 512 );
|
||||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||||
dintrlv_2x128( hash0, hash1, vhash, 512 );
|
dintrlv_2x128( hash0, hash1, vhash, 512 );
|
||||||
@@ -110,7 +287,6 @@ void x12_4way_hash( void *state, const void *input )
|
|||||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||||
dintrlv_2x128( hash2, hash3, vhash, 512 );
|
dintrlv_2x128( hash2, hash3, vhash, 512 );
|
||||||
|
|
||||||
// 8 Cubehash
|
|
||||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||||
@@ -119,7 +295,6 @@ void x12_4way_hash( void *state, const void *input )
|
|||||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||||
|
|
||||||
// 9 Shavite
|
|
||||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||||
memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
|
memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
|
||||||
@@ -135,7 +310,6 @@ void x12_4way_hash( void *state, const void *input )
|
|||||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||||
|
|
||||||
// 10 Simd
|
|
||||||
intrlv_2x128( vhash, hash0, hash1, 512 );
|
intrlv_2x128( vhash, hash0, hash1, 512 );
|
||||||
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
|
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
dintrlv_2x128( hash0, hash1, vhash, 512 );
|
dintrlv_2x128( hash0, hash1, vhash, 512 );
|
||||||
@@ -144,21 +318,25 @@ void x12_4way_hash( void *state, const void *input )
|
|||||||
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
|
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
dintrlv_2x128( hash2, hash3, vhash, 512 );
|
dintrlv_2x128( hash2, hash3, vhash, 512 );
|
||||||
|
|
||||||
// 11 Echo
|
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
(const BitSequence *) hash0, 512 );
|
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||||
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
|
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||||
(const BitSequence *) hash1, 512 );
|
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
|
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
|
||||||
(const BitSequence *) hash2, 512 );
|
|
||||||
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
|
|
||||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
|
||||||
(const BitSequence *) hash3, 512 );
|
|
||||||
|
|
||||||
// 12 Hamsi parallel 4way 32 bit
|
// Parallel 4way 64 bit
|
||||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||||
|
skein512_4way( &ctx.skein, vhash, 64 );
|
||||||
|
skein512_4way_close( &ctx.skein, vhash );
|
||||||
|
|
||||||
|
jh512_4way( &ctx.jh, vhash, 64 );
|
||||||
|
jh512_4way_close( &ctx.jh, vhash );
|
||||||
|
|
||||||
|
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||||
|
keccak512_4way_close( &ctx.keccak, vhash );
|
||||||
|
|
||||||
hamsi512_4way( &ctx.hamsi, vhash, 64 );
|
hamsi512_4way( &ctx.hamsi, vhash, 64 );
|
||||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||||
|
|
||||||
|
@@ -2,7 +2,11 @@
|
|||||||
|
|
||||||
bool register_x12_algo( algo_gate_t* gate )
|
bool register_x12_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined (X12_4WAY)
|
#if defined (X12_8WAY)
|
||||||
|
init_x12_8way_ctx();
|
||||||
|
gate->scanhash = (void*)&scanhash_x12_8way;
|
||||||
|
gate->hash = (void*)&x12_8way_hash;
|
||||||
|
#elif defined (X12_4WAY)
|
||||||
init_x12_4way_ctx();
|
init_x12_4way_ctx();
|
||||||
gate->scanhash = (void*)&scanhash_x12_4way;
|
gate->scanhash = (void*)&scanhash_x12_4way;
|
||||||
gate->hash = (void*)&x12_4way_hash;
|
gate->hash = (void*)&x12_4way_hash;
|
||||||
@@ -11,7 +15,7 @@ bool register_x12_algo( algo_gate_t* gate )
|
|||||||
gate->scanhash = (void*)&scanhash_x12;
|
gate->scanhash = (void*)&scanhash_x12;
|
||||||
gate->hash = (void*)&x12hash;
|
gate->hash = (void*)&x12hash;
|
||||||
#endif
|
#endif
|
||||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -4,29 +4,36 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX2__) && defined(__AES__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
#define X12_4WAY
|
#define X12_8WAY 1
|
||||||
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
|
#define X12_4WAY 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool register_x12_algo( algo_gate_t* gate );
|
bool register_x12_algo( algo_gate_t* gate );
|
||||||
|
|
||||||
#if defined(X12_4WAY)
|
#if defined(X12_8WAY)
|
||||||
|
|
||||||
|
void x12_8way_hash( void *state, const void *input );
|
||||||
|
int scanhash_x12_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
void init_x12_8way_ctx();
|
||||||
|
|
||||||
|
#elif defined(X12_4WAY)
|
||||||
|
|
||||||
void x12_4way_hash( void *state, const void *input );
|
void x12_4way_hash( void *state, const void *input );
|
||||||
|
|
||||||
int scanhash_x12_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_x12_4way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void init_x12_4way_ctx();
|
void init_x12_4way_ctx();
|
||||||
|
|
||||||
#endif
|
#else
|
||||||
|
|
||||||
void x12hash( void *state, const void *input );
|
void x12hash( void *state, const void *input );
|
||||||
|
|
||||||
int scanhash_x12( struct work *work, uint32_t max_nonce,
|
int scanhash_x12( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void init_x12_ctx();
|
void init_x12_ctx();
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
146
algo/x12/x12.c
146
algo/x12/x12.c
@@ -20,35 +20,40 @@
|
|||||||
#include "algo/luffa/luffa_for_sse2.h"
|
#include "algo/luffa/luffa_for_sse2.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
#include "algo/simd/nist.h"
|
#include "algo/simd/nist.h"
|
||||||
#include "algo/blake/sse2/blake.c"
|
|
||||||
#include "algo/bmw/sse2/bmw.c"
|
|
||||||
#include "algo/keccak/sse2/keccak.c"
|
|
||||||
#include "algo/skein/sse2/skein.c"
|
|
||||||
#include "algo/jh/sse2/jh_sse2_opt64.h"
|
|
||||||
#if defined(__AES__)
|
#if defined(__AES__)
|
||||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||||
#include "algo/echo/aes_ni/hash_api.h"
|
#include "algo/echo/aes_ni/hash_api.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
sph_blake512_context blake;
|
||||||
|
sph_bmw512_context bmw;
|
||||||
|
sph_skein512_context skein;
|
||||||
|
sph_jh512_context jh;
|
||||||
|
sph_keccak512_context keccak;
|
||||||
#if defined(__AES__)
|
#if defined(__AES__)
|
||||||
hashState_groestl groestl;
|
hashState_groestl groestl;
|
||||||
hashState_echo echo;
|
hashState_echo echo;
|
||||||
#else
|
#else
|
||||||
sph_groestl512_context groestl;
|
sph_groestl512_context groestl;
|
||||||
sph_echo512_context echo;
|
sph_echo512_context echo;
|
||||||
#endif
|
#endif
|
||||||
hashState_luffa luffa;
|
hashState_luffa luffa;
|
||||||
cubehashParam cubehash;
|
cubehashParam cubehash;
|
||||||
sph_shavite512_context shavite;
|
sph_shavite512_context shavite;
|
||||||
hashState_sd simd;
|
hashState_sd simd;
|
||||||
sph_hamsi512_context hamsi;
|
sph_hamsi512_context hamsi;
|
||||||
} x12_ctx_holder;
|
} x12_ctx_holder;
|
||||||
|
|
||||||
x12_ctx_holder x12_ctx;
|
x12_ctx_holder x12_ctx;
|
||||||
|
|
||||||
void init_x12_ctx()
|
void init_x12_ctx()
|
||||||
{
|
{
|
||||||
|
sph_blake512_init( &x12_ctx.blake );
|
||||||
|
sph_bmw512_init( &x12_ctx.bmw );
|
||||||
|
sph_skein512_init( &x12_ctx.skein);
|
||||||
|
sph_jh512_init( &x12_ctx.jh);
|
||||||
|
sph_keccak512_init( &x12_ctx.keccak);
|
||||||
#if defined(__AES__)
|
#if defined(__AES__)
|
||||||
init_echo( &x12_ctx.echo, 512 );
|
init_echo( &x12_ctx.echo, 512 );
|
||||||
init_groestl (&x12_ctx.groestl, 64 );
|
init_groestl (&x12_ctx.groestl, 64 );
|
||||||
@@ -65,102 +70,59 @@ void init_x12_ctx()
|
|||||||
|
|
||||||
void x12hash(void *output, const void *input)
|
void x12hash(void *output, const void *input)
|
||||||
{
|
{
|
||||||
|
|
||||||
unsigned char hash[128] __attribute__ ((aligned (32)));
|
unsigned char hash[128] __attribute__ ((aligned (32)));
|
||||||
#define hashB hash+64
|
#define hashB hash+64
|
||||||
|
|
||||||
x12_ctx_holder ctx;
|
x12_ctx_holder ctx;
|
||||||
memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) );
|
memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) );
|
||||||
|
|
||||||
// X11 algos
|
sph_blake512(&ctx.blake, input, 80);
|
||||||
|
sph_blake512_close(&ctx.blake, hash);
|
||||||
|
|
||||||
unsigned char hashbuf[128];
|
sph_bmw512(&ctx.bmw, hash, 64);
|
||||||
size_t hashptr;
|
sph_bmw512_close(&ctx.bmw, hash);
|
||||||
sph_u64 hashctA;
|
|
||||||
sph_u64 hashctB;
|
|
||||||
|
|
||||||
//---blake1---
|
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
|
||||||
|
(const BitSequence*)hash, 64 );
|
||||||
|
|
||||||
DECL_BLK;
|
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
|
||||||
BLK_I;
|
(const byte*)hashB, 64 );
|
||||||
BLK_W;
|
|
||||||
BLK_C;
|
|
||||||
|
|
||||||
//---bmw2---
|
sph_shavite512( &ctx.shavite, hash, 64);
|
||||||
|
sph_shavite512_close( &ctx.shavite, hashB);
|
||||||
|
|
||||||
DECL_BMW;
|
update_final_sd( &ctx.simd, (BitSequence *)hash,
|
||||||
BMW_I;
|
(const BitSequence *)hashB, 512 );
|
||||||
BMW_U;
|
|
||||||
|
|
||||||
#define M(x) sph_dec64le_aligned(data + 8 * (x))
|
|
||||||
#define H(x) (h[x])
|
|
||||||
#define dH(x) (dh[x])
|
|
||||||
|
|
||||||
BMW_C;
|
|
||||||
|
|
||||||
#undef M
|
|
||||||
#undef H
|
|
||||||
#undef dH
|
|
||||||
|
|
||||||
//---groetl----
|
|
||||||
|
|
||||||
#if defined(__AES__)
|
#if defined(__AES__)
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
update_final_echo ( &ctx.echo, (BitSequence *)hashB,
|
||||||
(const char*)hash, 512 );
|
|
||||||
#else
|
|
||||||
sph_groestl512 (&ctx.groestl, hash, 64);
|
|
||||||
sph_groestl512_close(&ctx.groestl, hash);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//---skein4---
|
|
||||||
|
|
||||||
DECL_SKN;
|
|
||||||
SKN_I;
|
|
||||||
SKN_U;
|
|
||||||
SKN_C;
|
|
||||||
|
|
||||||
//---jh5------
|
|
||||||
|
|
||||||
DECL_JH;
|
|
||||||
JH_H;
|
|
||||||
|
|
||||||
//---keccak6---
|
|
||||||
|
|
||||||
DECL_KEC;
|
|
||||||
KEC_I;
|
|
||||||
KEC_U;
|
|
||||||
KEC_C;
|
|
||||||
|
|
||||||
//--- luffa7
|
|
||||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
|
|
||||||
(const BitSequence*)hash, 64 );
|
|
||||||
|
|
||||||
// 8 Cube
|
|
||||||
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
|
|
||||||
(const byte*)hashB, 64 );
|
|
||||||
|
|
||||||
// 9 Shavite
|
|
||||||
sph_shavite512( &ctx.shavite, hash, 64);
|
|
||||||
sph_shavite512_close( &ctx.shavite, hashB);
|
|
||||||
|
|
||||||
// 10 Simd
|
|
||||||
update_final_sd( &ctx.simd, (BitSequence *)hash,
|
|
||||||
(const BitSequence *)hashB, 512 );
|
|
||||||
|
|
||||||
//11---echo---
|
|
||||||
|
|
||||||
#if defined(__AES__)
|
|
||||||
update_final_echo ( &ctx.echo, (BitSequence *)hashB,
|
|
||||||
(const BitSequence *)hash, 512 );
|
(const BitSequence *)hash, 512 );
|
||||||
#else
|
#else
|
||||||
sph_echo512(&ctx.echo, hash, 64);
|
sph_echo512(&ctx.echo, hash, 64);
|
||||||
sph_echo512_close(&ctx.echo, hashB);
|
sph_echo512_close(&ctx.echo, hashB);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// 12 Hamsi
|
#if defined(__AES__)
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||||
|
(const char*)hash, 512 );
|
||||||
|
#else
|
||||||
|
sph_groestl512 (&ctx.groestl, hash, 64);
|
||||||
|
sph_groestl512_close(&ctx.groestl, hash);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
sph_skein512(&ctx.skein, hash, 64);
|
||||||
|
sph_skein512_close(&ctx.skein, hash);
|
||||||
|
|
||||||
|
sph_jh512(&ctx.jh, hash, 64);
|
||||||
|
sph_jh512_close(&ctx.jh, hash);
|
||||||
|
|
||||||
|
sph_keccak512(&ctx.keccak, hash, 64);
|
||||||
|
sph_keccak512_close(&ctx.keccak, hash);
|
||||||
|
|
||||||
sph_hamsi512(&ctx.hamsi, hashB, 64);
|
sph_hamsi512(&ctx.hamsi, hashB, 64);
|
||||||
sph_hamsi512_close(&ctx.hamsi, hash);
|
sph_hamsi512_close(&ctx.hamsi, hash);
|
||||||
|
|
||||||
asm volatile ("emms");
|
|
||||||
memcpy(output, hashB, 32);
|
memcpy(output, hashB, 32);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1,7 +1,4 @@
|
|||||||
#include "x13-gate.h"
|
#include "x13-gate.h"
|
||||||
|
|
||||||
#if defined(X13_4WAY)
|
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
@@ -14,12 +11,270 @@
|
|||||||
#include "algo/keccak/keccak-hash-4way.h"
|
#include "algo/keccak/keccak-hash-4way.h"
|
||||||
#include "algo/luffa/luffa-hash-2way.h"
|
#include "algo/luffa/luffa-hash-2way.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
#include "algo/shavite/sph_shavite.h"
|
#include "algo/shavite/sph_shavite.h"
|
||||||
#include "algo/simd/simd-hash-2way.h"
|
#include "algo/simd/simd-hash-2way.h"
|
||||||
#include "algo/echo/aes_ni/hash_api.h"
|
#include "algo/echo/aes_ni/hash_api.h"
|
||||||
#include "algo/hamsi/hamsi-hash-4way.h"
|
#include "algo/hamsi/hamsi-hash-4way.h"
|
||||||
#include "algo/fugue/sph_fugue.h"
|
#include "algo/fugue/sph_fugue.h"
|
||||||
|
|
||||||
|
#if defined(X13_8WAY)
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
blake512_8way_context blake;
|
||||||
|
bmw512_8way_context bmw;
|
||||||
|
hashState_groestl groestl;
|
||||||
|
skein512_8way_context skein;
|
||||||
|
jh512_8way_context jh;
|
||||||
|
keccak512_8way_context keccak;
|
||||||
|
luffa_4way_context luffa;
|
||||||
|
cube_4way_context cube;
|
||||||
|
sph_shavite512_context shavite;
|
||||||
|
simd_4way_context simd;
|
||||||
|
hashState_echo echo;
|
||||||
|
hamsi512_8way_context hamsi;
|
||||||
|
sph_fugue512_context fugue;
|
||||||
|
} x13_8way_ctx_holder;
|
||||||
|
|
||||||
|
x13_8way_ctx_holder x13_8way_ctx;
|
||||||
|
|
||||||
|
void init_x13_8way_ctx()
|
||||||
|
{
|
||||||
|
blake512_8way_init( &x13_8way_ctx.blake );
|
||||||
|
bmw512_8way_init( &x13_8way_ctx.bmw );
|
||||||
|
init_groestl( &x13_8way_ctx.groestl, 64 );
|
||||||
|
skein512_8way_init( &x13_8way_ctx.skein );
|
||||||
|
jh512_8way_init( &x13_8way_ctx.jh );
|
||||||
|
keccak512_8way_init( &x13_8way_ctx.keccak );
|
||||||
|
luffa_4way_init( &x13_8way_ctx.luffa, 512 );
|
||||||
|
cube_4way_init( &x13_8way_ctx.cube, 512, 16, 32 );
|
||||||
|
sph_shavite512_init( &x13_8way_ctx.shavite );
|
||||||
|
simd_4way_init( &x13_8way_ctx.simd, 512 );
|
||||||
|
init_echo( &x13_8way_ctx.echo, 512 );
|
||||||
|
hamsi512_8way_init( &x13_8way_ctx.hamsi );
|
||||||
|
sph_fugue512_init( &x13_8way_ctx.fugue );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x13_8way_hash( void *state, const void *input )
|
||||||
|
{
|
||||||
|
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
x13_8way_ctx_holder ctx;
|
||||||
|
memcpy( &ctx, &x13_8way_ctx, sizeof(x13_8way_ctx) );
|
||||||
|
blake512_8way_update( &ctx.blake, input, 80 );
|
||||||
|
blake512_8way_close( &ctx.blake, vhash );
|
||||||
|
|
||||||
|
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||||
|
bmw512_8way_close( &ctx.bmw, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||||
|
|
||||||
|
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7 );
|
||||||
|
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||||
|
skein512_8way_close( &ctx.skein, vhash );
|
||||||
|
|
||||||
|
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||||
|
jh512_8way_close( &ctx.jh, vhash );
|
||||||
|
|
||||||
|
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||||
|
keccak512_8way_close( &ctx.keccak, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
|
||||||
|
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
|
||||||
|
luffa_4way_init( &ctx.luffa, 512 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
|
||||||
|
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
|
||||||
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
|
||||||
|
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
|
||||||
|
|
||||||
|
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||||
|
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||||
|
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||||
|
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||||
|
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash4, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash4 );
|
||||||
|
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash5, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash5 );
|
||||||
|
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash6, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash6 );
|
||||||
|
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash7, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash7 );
|
||||||
|
|
||||||
|
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||||
|
simd_4way_init( &ctx.simd, 512 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||||
|
(const BitSequence *) hash0, 512 );
|
||||||
|
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||||
|
(const BitSequence *) hash1, 512 );
|
||||||
|
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||||
|
(const BitSequence *) hash2, 512 );
|
||||||
|
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||||
|
(const BitSequence *) hash3, 512 );
|
||||||
|
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash4,
|
||||||
|
(const BitSequence *) hash4, 512 );
|
||||||
|
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash5,
|
||||||
|
(const BitSequence *) hash5, 512 );
|
||||||
|
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash6,
|
||||||
|
(const BitSequence *) hash6, 512 );
|
||||||
|
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash7,
|
||||||
|
(const BitSequence *) hash7, 512 );
|
||||||
|
|
||||||
|
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7 );
|
||||||
|
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
|
||||||
|
hamsi512_8way_close( &ctx.hamsi, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
// 13 Fugue serial
|
||||||
|
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||||
|
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||||
|
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||||
|
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||||
|
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash4, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash4 );
|
||||||
|
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash5, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash5 );
|
||||||
|
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash6, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash6 );
|
||||||
|
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash7, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash7 );
|
||||||
|
|
||||||
|
memcpy( state, hash0, 32 );
|
||||||
|
memcpy( state+ 32, hash1, 32 );
|
||||||
|
memcpy( state+ 64, hash2, 32 );
|
||||||
|
memcpy( state+ 96, hash3, 32 );
|
||||||
|
memcpy( state+128, hash4, 32 );
|
||||||
|
memcpy( state+160, hash5, 32 );
|
||||||
|
memcpy( state+192, hash6, 32 );
|
||||||
|
memcpy( state+224, hash7, 32 );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int scanhash_x13_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
uint32_t n = pdata[19];
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
int thr_id = mythr->id;
|
||||||
|
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
const uint32_t last_nonce = max_nonce -8;
|
||||||
|
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||||
|
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||||
|
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||||
|
|
||||||
|
x13_8way_hash( hash, vdata );
|
||||||
|
pdata[19] = n;
|
||||||
|
|
||||||
|
for ( int i = 0; i < 8; i++ )
|
||||||
|
if ( ( hash+(i<<3) )[7] < Htarg
|
||||||
|
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||||
|
{
|
||||||
|
pdata[19] = n+i;
|
||||||
|
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||||
|
}
|
||||||
|
n += 8;
|
||||||
|
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#elif defined(X13_4WAY)
|
||||||
|
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
blake512_4way_context blake;
|
blake512_4way_context blake;
|
||||||
bmw512_4way_context bmw;
|
bmw512_4way_context bmw;
|
||||||
|
@@ -2,7 +2,11 @@
|
|||||||
|
|
||||||
bool register_x13_algo( algo_gate_t* gate )
|
bool register_x13_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined (X13_4WAY)
|
#if defined (X13_8WAY)
|
||||||
|
init_x13_8way_ctx();
|
||||||
|
gate->scanhash = (void*)&scanhash_x13_8way;
|
||||||
|
gate->hash = (void*)&x13_8way_hash;
|
||||||
|
#elif defined (X13_4WAY)
|
||||||
init_x13_4way_ctx();
|
init_x13_4way_ctx();
|
||||||
gate->scanhash = (void*)&scanhash_x13_4way;
|
gate->scanhash = (void*)&scanhash_x13_4way;
|
||||||
gate->hash = (void*)&x13_4way_hash;
|
gate->hash = (void*)&x13_4way_hash;
|
||||||
@@ -11,7 +15,7 @@ bool register_x13_algo( algo_gate_t* gate )
|
|||||||
gate->scanhash = (void*)&scanhash_x13;
|
gate->scanhash = (void*)&scanhash_x13;
|
||||||
gate->hash = (void*)&x13hash;
|
gate->hash = (void*)&x13hash;
|
||||||
#endif
|
#endif
|
||||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -4,29 +4,35 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX2__) && defined(__AES__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
#define X13_4WAY
|
#define X13_8WAY 1
|
||||||
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
|
#define X13_4WAY 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool register_x13_algo( algo_gate_t* gate );
|
bool register_x13_algo( algo_gate_t* gate );
|
||||||
|
|
||||||
#if defined(X13_4WAY)
|
#if defined(X13_8WAY)
|
||||||
|
|
||||||
|
void x13_8way_hash( void *state, const void *input );
|
||||||
|
int scanhash_x13_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
void init_x13_8way_ctx();
|
||||||
|
|
||||||
|
#elif defined(X13_4WAY)
|
||||||
|
|
||||||
void x13_4way_hash( void *state, const void *input );
|
void x13_4way_hash( void *state, const void *input );
|
||||||
|
|
||||||
int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void init_x13_4way_ctx();
|
void init_x13_4way_ctx();
|
||||||
|
|
||||||
#endif
|
#else
|
||||||
|
|
||||||
void x13hash( void *state, const void *input );
|
void x13hash( void *state, const void *input );
|
||||||
|
|
||||||
int scanhash_x13( struct work *work, uint32_t max_nonce,
|
int scanhash_x13( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void init_x13_ctx();
|
void init_x13_ctx();
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
@@ -1,7 +1,4 @@
|
|||||||
#include "x14-gate.h"
|
#include "x14-gate.h"
|
||||||
|
|
||||||
#if defined(X14_4WAY)
|
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
@@ -13,6 +10,7 @@
|
|||||||
#include "algo/jh/jh-hash-4way.h"
|
#include "algo/jh/jh-hash-4way.h"
|
||||||
#include "algo/keccak/keccak-hash-4way.h"
|
#include "algo/keccak/keccak-hash-4way.h"
|
||||||
#include "algo/luffa/luffa-hash-2way.h"
|
#include "algo/luffa/luffa-hash-2way.h"
|
||||||
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
#include "algo/shavite/sph_shavite.h"
|
#include "algo/shavite/sph_shavite.h"
|
||||||
#include "algo/simd/simd-hash-2way.h"
|
#include "algo/simd/simd-hash-2way.h"
|
||||||
@@ -22,6 +20,263 @@
|
|||||||
#include "algo/fugue/sph_fugue.h"
|
#include "algo/fugue/sph_fugue.h"
|
||||||
#include "algo/shabal/shabal-hash-4way.h"
|
#include "algo/shabal/shabal-hash-4way.h"
|
||||||
|
|
||||||
|
#if defined(X14_8WAY)
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
blake512_8way_context blake;
|
||||||
|
bmw512_8way_context bmw;
|
||||||
|
hashState_groestl groestl;
|
||||||
|
skein512_8way_context skein;
|
||||||
|
jh512_8way_context jh;
|
||||||
|
keccak512_8way_context keccak;
|
||||||
|
luffa_4way_context luffa;
|
||||||
|
cube_4way_context cube;
|
||||||
|
sph_shavite512_context shavite;
|
||||||
|
simd_4way_context simd;
|
||||||
|
hashState_echo echo;
|
||||||
|
hamsi512_8way_context hamsi;
|
||||||
|
sph_fugue512_context fugue;
|
||||||
|
shabal512_8way_context shabal;
|
||||||
|
} x14_8way_ctx_holder;
|
||||||
|
|
||||||
|
x14_8way_ctx_holder x14_8way_ctx __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
void init_x14_8way_ctx()
|
||||||
|
{
|
||||||
|
blake512_8way_init( &x14_8way_ctx.blake );
|
||||||
|
bmw512_8way_init( &x14_8way_ctx.bmw );
|
||||||
|
init_groestl( &x14_8way_ctx.groestl, 64 );
|
||||||
|
skein512_8way_init( &x14_8way_ctx.skein );
|
||||||
|
jh512_8way_init( &x14_8way_ctx.jh );
|
||||||
|
keccak512_8way_init( &x14_8way_ctx.keccak );
|
||||||
|
luffa_4way_init( &x14_8way_ctx.luffa, 512 );
|
||||||
|
cube_4way_init( &x14_8way_ctx.cube, 512, 16, 32 );
|
||||||
|
sph_shavite512_init( &x14_8way_ctx.shavite );
|
||||||
|
simd_4way_init( &x14_8way_ctx.simd, 512 );
|
||||||
|
init_echo( &x14_8way_ctx.echo, 512 );
|
||||||
|
hamsi512_8way_init( &x14_8way_ctx.hamsi );
|
||||||
|
sph_fugue512_init( &x14_8way_ctx.fugue );
|
||||||
|
shabal512_8way_init( &x14_8way_ctx.shabal );
|
||||||
|
};
|
||||||
|
|
||||||
|
void x14_8way_hash( void *state, const void *input )
|
||||||
|
{
|
||||||
|
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
x14_8way_ctx_holder ctx;
|
||||||
|
memcpy( &ctx, &x14_8way_ctx, sizeof(x14_8way_ctx) );
|
||||||
|
blake512_8way_update( &ctx.blake, input, 80 );
|
||||||
|
blake512_8way_close( &ctx.blake, vhash );
|
||||||
|
|
||||||
|
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||||
|
bmw512_8way_close( &ctx.bmw, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||||
|
|
||||||
|
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7 );
|
||||||
|
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||||
|
skein512_8way_close( &ctx.skein, vhash );
|
||||||
|
|
||||||
|
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||||
|
jh512_8way_close( &ctx.jh, vhash );
|
||||||
|
|
||||||
|
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||||
|
keccak512_8way_close( &ctx.keccak, vhash );
|
||||||
|
|
||||||
|
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
|
||||||
|
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
|
||||||
|
luffa_4way_init( &ctx.luffa, 512 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
|
||||||
|
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
|
||||||
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
|
||||||
|
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
|
||||||
|
|
||||||
|
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||||
|
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||||
|
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||||
|
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||||
|
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash4, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash4 );
|
||||||
|
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash5, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash5 );
|
||||||
|
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash6, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash6 );
|
||||||
|
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash7, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash7 );
|
||||||
|
|
||||||
|
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||||
|
simd_4way_init( &ctx.simd, 512 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||||
|
(const BitSequence *) hash0, 512 );
|
||||||
|
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||||
|
(const BitSequence *) hash1, 512 );
|
||||||
|
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||||
|
(const BitSequence *) hash2, 512 );
|
||||||
|
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||||
|
(const BitSequence *) hash3, 512 );
|
||||||
|
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash4,
|
||||||
|
(const BitSequence *) hash4, 512 );
|
||||||
|
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash5,
|
||||||
|
(const BitSequence *) hash5, 512 );
|
||||||
|
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash6,
|
||||||
|
(const BitSequence *) hash6, 512 );
|
||||||
|
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash7,
|
||||||
|
(const BitSequence *) hash7, 512 );
|
||||||
|
|
||||||
|
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7 );
|
||||||
|
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
|
||||||
|
hamsi512_8way_close( &ctx.hamsi, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
// 13 Fugue serial
|
||||||
|
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||||
|
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||||
|
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||||
|
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||||
|
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash4, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash4 );
|
||||||
|
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash5, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash5 );
|
||||||
|
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash6, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash6 );
|
||||||
|
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash7, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash7 );
|
||||||
|
|
||||||
|
// 14 Shabal, parallel 32 bit
|
||||||
|
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7 );
|
||||||
|
shabal512_8way_update( &ctx.shabal, vhash, 64 );
|
||||||
|
shabal512_8way_close( &ctx.shabal, state );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int scanhash_x14_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
uint32_t hash[8*16] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
uint32_t n = pdata[19];
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const uint32_t last_nonce = max_nonce - 8;
|
||||||
|
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
int thr_id = mythr->id;
|
||||||
|
|
||||||
|
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||||
|
do
|
||||||
|
{
|
||||||
|
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||||
|
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||||
|
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||||
|
|
||||||
|
x14_8way_hash( hash, vdata );
|
||||||
|
pdata[19] = n;
|
||||||
|
|
||||||
|
uint32_t *hash7 = &(hash[7<<3]);
|
||||||
|
for ( int lane = 0; lane < 8; lane++ )
|
||||||
|
if ( hash7[ lane ] < Htarg )
|
||||||
|
{
|
||||||
|
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||||
|
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||||
|
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||||
|
{
|
||||||
|
pdata[19] = n + lane;
|
||||||
|
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
n += 8;
|
||||||
|
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined(X14_4WAY)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
blake512_4way_context blake;
|
blake512_4way_context blake;
|
||||||
bmw512_4way_context bmw;
|
bmw512_4way_context bmw;
|
||||||
@@ -61,11 +316,11 @@ void init_x14_4way_ctx()
|
|||||||
|
|
||||||
void x14_4way_hash( void *state, const void *input )
|
void x14_4way_hash( void *state, const void *input )
|
||||||
{
|
{
|
||||||
|
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
|
||||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
|
||||||
x14_4way_ctx_holder ctx;
|
x14_4way_ctx_holder ctx;
|
||||||
memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) );
|
memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) );
|
||||||
|
|
||||||
@@ -184,61 +439,49 @@ void x14_4way_hash( void *state, const void *input )
|
|||||||
|
|
||||||
// 14 Shabal, parallel 32 bit
|
// 14 Shabal, parallel 32 bit
|
||||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
shabal512_4way_update( &ctx.shabal, vhash, 64 );
|
||||||
shabal512_4way_close( &ctx.shabal, state );
|
shabal512_4way_close( &ctx.shabal, state );
|
||||||
}
|
}
|
||||||
|
|
||||||
int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr )
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
{
|
{
|
||||||
uint32_t hash[4*16] __attribute__ ((aligned (64)));
|
uint32_t hash[4*16] __attribute__ ((aligned (128)));
|
||||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
uint32_t n = pdata[19];
|
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
uint32_t n = first_nonce;
|
||||||
|
const uint32_t last_nonce = max_nonce - 4;
|
||||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||||
const uint32_t Htarg = ptarget[7];
|
const uint32_t Htarg = ptarget[7];
|
||||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
int thr_id = mythr->id;
|
||||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
|
||||||
0xFFF, 0xFFFF, 0x10000000 };
|
|
||||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
|
||||||
0xFFFFF000, 0xFFFF0000, 0 };
|
|
||||||
|
|
||||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||||
|
|
||||||
for ( int m=0; m < 6; m++ )
|
do
|
||||||
if ( Htarg <= htmax[m] )
|
{
|
||||||
|
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||||
|
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||||
|
|
||||||
|
x14_4way_hash( hash, vdata );
|
||||||
|
pdata[19] = n;
|
||||||
|
|
||||||
|
uint32_t *hash7 = &(hash[7<<2]);
|
||||||
|
for ( int lane = 0; lane < 4; lane++ )
|
||||||
|
if ( hash7[ lane ] < Htarg )
|
||||||
{
|
{
|
||||||
uint32_t mask = masks[m];
|
uint32_t lane_hash[8];
|
||||||
do
|
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||||
{
|
|
||||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
|
||||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
|
||||||
|
|
||||||
x14_4way_hash( hash, vdata );
|
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||||
pdata[19] = n;
|
{
|
||||||
|
pdata[19] = n + lane;
|
||||||
uint32_t *hash7 = &(hash[7<<2]);
|
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||||
|
}
|
||||||
for ( int lane = 0; lane < 4; lane++ )
|
}
|
||||||
if ( ( hash7[ lane ] & mask ) == 0 )
|
n += 4;
|
||||||
{
|
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
|
||||||
// deinterleave hash for lane
|
*hashes_done = n - first_nonce;
|
||||||
uint32_t lane_hash[8];
|
|
||||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
|
||||||
|
|
||||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
|
||||||
{
|
|
||||||
pdata[19] = n + lane;
|
|
||||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
n += 4;
|
|
||||||
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
*hashes_done = n - first_nonce + 1;
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -2,7 +2,11 @@
|
|||||||
|
|
||||||
bool register_x14_algo( algo_gate_t* gate )
|
bool register_x14_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined (X14_4WAY)
|
#if defined (X14_8WAY)
|
||||||
|
init_x14_8way_ctx();
|
||||||
|
gate->scanhash = (void*)&scanhash_x14_8way;
|
||||||
|
gate->hash = (void*)&x14_8way_hash;
|
||||||
|
#elif defined (X14_4WAY)
|
||||||
init_x14_4way_ctx();
|
init_x14_4way_ctx();
|
||||||
gate->scanhash = (void*)&scanhash_x14_4way;
|
gate->scanhash = (void*)&scanhash_x14_4way;
|
||||||
gate->hash = (void*)&x14_4way_hash;
|
gate->hash = (void*)&x14_4way_hash;
|
||||||
@@ -11,7 +15,7 @@ bool register_x14_algo( algo_gate_t* gate )
|
|||||||
gate->scanhash = (void*)&scanhash_x14;
|
gate->scanhash = (void*)&scanhash_x14;
|
||||||
gate->hash = (void*)&x14hash;
|
gate->hash = (void*)&x14hash;
|
||||||
#endif
|
#endif
|
||||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -4,20 +4,29 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX2__) && defined(__AES__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
#define X14_4WAY
|
#define X14_8WAY 1
|
||||||
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
|
#define X14_4WAY 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool register_x14_algo( algo_gate_t* gate );
|
bool register_x14_algo( algo_gate_t* gate );
|
||||||
|
|
||||||
#if defined(X14_4WAY)
|
#if defined(X14_8WAY)
|
||||||
|
|
||||||
|
void x14_8way_hash( void *state, const void *input );
|
||||||
|
int scanhash_x14_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
void init_x14_8way_ctx();
|
||||||
|
|
||||||
|
#elif defined(X14_4WAY)
|
||||||
|
|
||||||
void x14_4way_hash( void *state, const void *input );
|
void x14_4way_hash( void *state, const void *input );
|
||||||
int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
void init_x14_4way_ctx();
|
void init_x14_4way_ctx();
|
||||||
|
|
||||||
#endif
|
#else
|
||||||
|
|
||||||
void x14hash( void *state, const void *input );
|
void x14hash( void *state, const void *input );
|
||||||
int scanhash_x14( struct work *work, uint32_t max_nonce,
|
int scanhash_x14( struct work *work, uint32_t max_nonce,
|
||||||
@@ -26,3 +35,4 @@ void init_x14_ctx();
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
@@ -1,7 +1,4 @@
|
|||||||
#include "x15-gate.h"
|
#include "x15-gate.h"
|
||||||
|
|
||||||
#if defined(X15_4WAY)
|
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
@@ -14,6 +11,7 @@
|
|||||||
#include "algo/keccak/keccak-hash-4way.h"
|
#include "algo/keccak/keccak-hash-4way.h"
|
||||||
#include "algo/luffa/luffa-hash-2way.h"
|
#include "algo/luffa/luffa-hash-2way.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
#include "algo/shavite/sph_shavite.h"
|
#include "algo/shavite/sph_shavite.h"
|
||||||
#include "algo/simd/simd-hash-2way.h"
|
#include "algo/simd/simd-hash-2way.h"
|
||||||
#include "algo/echo/aes_ni/hash_api.h"
|
#include "algo/echo/aes_ni/hash_api.h"
|
||||||
@@ -23,6 +21,309 @@
|
|||||||
#include "algo/shabal/shabal-hash-4way.h"
|
#include "algo/shabal/shabal-hash-4way.h"
|
||||||
#include "algo/whirlpool/sph_whirlpool.h"
|
#include "algo/whirlpool/sph_whirlpool.h"
|
||||||
|
|
||||||
|
#if defined(X15_8WAY)
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
blake512_8way_context blake;
|
||||||
|
bmw512_8way_context bmw;
|
||||||
|
hashState_groestl groestl;
|
||||||
|
skein512_8way_context skein;
|
||||||
|
jh512_8way_context jh;
|
||||||
|
keccak512_8way_context keccak;
|
||||||
|
luffa_4way_context luffa;
|
||||||
|
cube_4way_context cube;
|
||||||
|
sph_shavite512_context shavite;
|
||||||
|
simd_4way_context simd;
|
||||||
|
hashState_echo echo;
|
||||||
|
hamsi512_8way_context hamsi;
|
||||||
|
sph_fugue512_context fugue;
|
||||||
|
shabal512_8way_context shabal;
|
||||||
|
sph_whirlpool_context whirlpool;
|
||||||
|
} x15_8way_ctx_holder;
|
||||||
|
|
||||||
|
x15_8way_ctx_holder x15_8way_ctx __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
void init_x15_8way_ctx()
|
||||||
|
{
|
||||||
|
blake512_8way_init( &x15_8way_ctx.blake );
|
||||||
|
bmw512_8way_init( &x15_8way_ctx.bmw );
|
||||||
|
init_groestl( &x15_8way_ctx.groestl, 64 );
|
||||||
|
skein512_8way_init( &x15_8way_ctx.skein );
|
||||||
|
jh512_8way_init( &x15_8way_ctx.jh );
|
||||||
|
keccak512_8way_init( &x15_8way_ctx.keccak );
|
||||||
|
luffa_4way_init( &x15_8way_ctx.luffa, 512 );
|
||||||
|
cube_4way_init( &x15_8way_ctx.cube, 512, 16, 32 );
|
||||||
|
sph_shavite512_init( &x15_8way_ctx.shavite );
|
||||||
|
simd_4way_init( &x15_8way_ctx.simd, 512 );
|
||||||
|
init_echo( &x15_8way_ctx.echo, 512 );
|
||||||
|
hamsi512_8way_init( &x15_8way_ctx.hamsi );
|
||||||
|
sph_fugue512_init( &x15_8way_ctx.fugue );
|
||||||
|
shabal512_8way_init( &x15_8way_ctx.shabal );
|
||||||
|
sph_whirlpool_init( &x15_8way_ctx.whirlpool );
|
||||||
|
};
|
||||||
|
|
||||||
|
void x15_8way_hash( void *state, const void *input )
|
||||||
|
{
|
||||||
|
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||||
|
x15_8way_ctx_holder ctx;
|
||||||
|
memcpy( &ctx, &x15_8way_ctx, sizeof(x15_8way_ctx) );
|
||||||
|
|
||||||
|
// 1 Blake
|
||||||
|
blake512_8way_update( &ctx.blake, input, 80 );
|
||||||
|
blake512_8way_close( &ctx.blake, vhash );
|
||||||
|
|
||||||
|
// 2 Bmw
|
||||||
|
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||||
|
bmw512_8way_close( &ctx.bmw, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
// 3 Groestl
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||||
|
|
||||||
|
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7 );
|
||||||
|
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||||
|
skein512_8way_close( &ctx.skein, vhash );
|
||||||
|
|
||||||
|
// 5 JH
|
||||||
|
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||||
|
jh512_8way_close( &ctx.jh, vhash );
|
||||||
|
|
||||||
|
// 6 Keccak
|
||||||
|
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||||
|
keccak512_8way_close( &ctx.keccak, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
|
||||||
|
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
|
||||||
|
luffa_4way_init( &ctx.luffa, 512 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
|
||||||
|
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
|
||||||
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
|
||||||
|
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
|
||||||
|
|
||||||
|
// 9 Shavite
|
||||||
|
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||||
|
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||||
|
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||||
|
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||||
|
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash4, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash4 );
|
||||||
|
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash5, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash5 );
|
||||||
|
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash6, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash6 );
|
||||||
|
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash7, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash7 );
|
||||||
|
|
||||||
|
// 10 Simd
|
||||||
|
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||||
|
simd_4way_init( &ctx.simd, 512 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
|
||||||
|
// 11 Echo
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||||
|
(const BitSequence *) hash0, 512 );
|
||||||
|
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||||
|
(const BitSequence *) hash1, 512 );
|
||||||
|
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||||
|
(const BitSequence *) hash2, 512 );
|
||||||
|
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||||
|
(const BitSequence *) hash3, 512 );
|
||||||
|
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash4,
|
||||||
|
(const BitSequence *) hash4, 512 );
|
||||||
|
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash5,
|
||||||
|
(const BitSequence *) hash5, 512 );
|
||||||
|
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash6,
|
||||||
|
(const BitSequence *) hash6, 512 );
|
||||||
|
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash7,
|
||||||
|
(const BitSequence *) hash7, 512 );
|
||||||
|
|
||||||
|
|
||||||
|
// 12 Hamsi parallel 4way 64 bit
|
||||||
|
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7 );
|
||||||
|
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
|
||||||
|
hamsi512_8way_close( &ctx.hamsi, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
// 13 Fugue
|
||||||
|
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||||
|
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||||
|
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||||
|
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||||
|
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash4, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash4 );
|
||||||
|
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash5, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash5 );
|
||||||
|
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash6, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash6 );
|
||||||
|
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||||
|
sph_fugue512( &ctx.fugue, hash7, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash7 );
|
||||||
|
|
||||||
|
|
||||||
|
// 14 Shabal, parallel 32 bit
|
||||||
|
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7 );
|
||||||
|
shabal512_8way_update( &ctx.shabal, vhash, 64 );
|
||||||
|
shabal512_8way_close( &ctx.shabal, vhash );
|
||||||
|
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
// 15 Whirlpool
|
||||||
|
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||||
|
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
|
||||||
|
sizeof(sph_whirlpool_context) );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash1 );
|
||||||
|
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
|
||||||
|
sizeof(sph_whirlpool_context) );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash2 );
|
||||||
|
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
|
||||||
|
sizeof(sph_whirlpool_context) );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash3 );
|
||||||
|
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
|
||||||
|
sizeof(sph_whirlpool_context) );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, hash4, 64 );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash4 );
|
||||||
|
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
|
||||||
|
sizeof(sph_whirlpool_context) );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, hash5, 64 );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash5 );
|
||||||
|
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
|
||||||
|
sizeof(sph_whirlpool_context) );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, hash6, 64 );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash6 );
|
||||||
|
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
|
||||||
|
sizeof(sph_whirlpool_context) );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, hash7, 64 );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash7 );
|
||||||
|
|
||||||
|
memcpy( state, hash0, 32 );
|
||||||
|
memcpy( state+ 32, hash1, 32 );
|
||||||
|
memcpy( state+ 64, hash2, 32 );
|
||||||
|
memcpy( state+ 96, hash3, 32 );
|
||||||
|
memcpy( state+128, hash4, 32 );
|
||||||
|
memcpy( state+160, hash5, 32 );
|
||||||
|
memcpy( state+192, hash6, 32 );
|
||||||
|
memcpy( state+224, hash7, 32 );
|
||||||
|
}
|
||||||
|
|
||||||
|
int scanhash_x15_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
uint32_t n = first_nonce;
|
||||||
|
const uint32_t last_nonce = max_nonce - 8;
|
||||||
|
__m512i *noncev = (__m512i*)vdata + 9;
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
int thr_id = mythr->id;
|
||||||
|
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||||
|
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||||
|
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||||
|
|
||||||
|
x15_8way_hash( hash, vdata );
|
||||||
|
pdata[19] = n;
|
||||||
|
|
||||||
|
for ( int i = 0; i < 8; i++ )
|
||||||
|
if ( ( hash+(i<<3) )[7] < Htarg )
|
||||||
|
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||||
|
{
|
||||||
|
pdata[19] = n+i;
|
||||||
|
submit_lane_solution( work, hash, mythr, i );
|
||||||
|
}
|
||||||
|
n += 8;
|
||||||
|
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined(X15_4WAY)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
blake512_4way_context blake;
|
blake512_4way_context blake;
|
||||||
bmw512_4way_context bmw;
|
bmw512_4way_context bmw;
|
||||||
@@ -64,11 +365,11 @@ void init_x15_4way_ctx()
|
|||||||
|
|
||||||
void x15_4way_hash( void *state, const void *input )
|
void x15_4way_hash( void *state, const void *input )
|
||||||
{
|
{
|
||||||
|
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
|
||||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
|
||||||
x15_4way_ctx_holder ctx;
|
x15_4way_ctx_holder ctx;
|
||||||
memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) );
|
memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) );
|
||||||
|
|
||||||
@@ -187,7 +488,7 @@ void x15_4way_hash( void *state, const void *input )
|
|||||||
|
|
||||||
// 14 Shabal, parallel 32 bit
|
// 14 Shabal, parallel 32 bit
|
||||||
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
shabal512_4way_update( &ctx.shabal, vhash, 64 );
|
||||||
shabal512_4way_close( &ctx.shabal, vhash );
|
shabal512_4way_close( &ctx.shabal, vhash );
|
||||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||||
|
|
||||||
@@ -216,48 +517,37 @@ void x15_4way_hash( void *state, const void *input )
|
|||||||
int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr )
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
{
|
{
|
||||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
uint32_t hash[4*8] __attribute__ ((aligned (128)));
|
||||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
uint32_t n = pdata[19];
|
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
uint32_t n = first_nonce;
|
||||||
|
const uint32_t last_nonce = max_nonce - 4;
|
||||||
|
__m256i *noncev = (__m256i*)vdata + 9;
|
||||||
const uint32_t Htarg = ptarget[7];
|
const uint32_t Htarg = ptarget[7];
|
||||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
int thr_id = mythr->id;
|
||||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
|
||||||
0xFFF, 0xFFFF, 0x10000000 };
|
|
||||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
|
||||||
0xFFFFF000, 0xFFFF0000, 0 };
|
|
||||||
|
|
||||||
|
|
||||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||||
|
|
||||||
for ( int m=0; m < 6; m++ )
|
do
|
||||||
if ( Htarg <= htmax[m] )
|
{
|
||||||
{
|
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||||
uint32_t mask = masks[m];
|
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||||
do
|
|
||||||
|
x15_4way_hash( hash, vdata );
|
||||||
|
pdata[19] = n;
|
||||||
|
|
||||||
|
for ( int i = 0; i < 4; i++ )
|
||||||
|
if ( ( hash+(i<<3) )[7] < Htarg )
|
||||||
|
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||||
{
|
{
|
||||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
pdata[19] = n+i;
|
||||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
submit_lane_solution( work, hash, mythr, i );
|
||||||
|
}
|
||||||
|
n += 4;
|
||||||
|
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
|
||||||
|
|
||||||
x15_4way_hash( hash, vdata );
|
*hashes_done = n - first_nonce;
|
||||||
pdata[19] = n;
|
|
||||||
|
|
||||||
for ( int i = 0; i < 4; i++ )
|
|
||||||
if ( ( (hash+(i<<3))[7] & mask ) == 0 )
|
|
||||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
|
||||||
{
|
|
||||||
pdata[19] = n+i;
|
|
||||||
submit_lane_solution( work, hash, mythr, i );
|
|
||||||
}
|
|
||||||
n += 4;
|
|
||||||
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
*hashes_done = n - first_nonce + 1;
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -2,7 +2,11 @@
|
|||||||
|
|
||||||
bool register_x15_algo( algo_gate_t* gate )
|
bool register_x15_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined (X15_4WAY)
|
#if defined (X15_8WAY)
|
||||||
|
init_x15_8way_ctx();
|
||||||
|
gate->scanhash = (void*)&scanhash_x15_8way;
|
||||||
|
gate->hash = (void*)&x15_8way_hash;
|
||||||
|
#elif defined (X15_4WAY)
|
||||||
init_x15_4way_ctx();
|
init_x15_4way_ctx();
|
||||||
gate->scanhash = (void*)&scanhash_x15_4way;
|
gate->scanhash = (void*)&scanhash_x15_4way;
|
||||||
gate->hash = (void*)&x15_4way_hash;
|
gate->hash = (void*)&x15_4way_hash;
|
||||||
@@ -11,7 +15,7 @@ bool register_x15_algo( algo_gate_t* gate )
|
|||||||
gate->scanhash = (void*)&scanhash_x15;
|
gate->scanhash = (void*)&scanhash_x15;
|
||||||
gate->hash = (void*)&x15hash;
|
gate->hash = (void*)&x15hash;
|
||||||
#endif
|
#endif
|
||||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -4,20 +4,30 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX2__) && defined(__AES__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
#define X15_4WAY
|
#define X15_8WAY 1
|
||||||
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
|
#define X15_4WAY 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
bool register_x15_algo( algo_gate_t* gate );
|
bool register_x15_algo( algo_gate_t* gate );
|
||||||
|
|
||||||
#if defined(X15_4WAY)
|
#if defined(X15_8WAY)
|
||||||
|
|
||||||
|
void x15_8way_hash( void *state, const void *input );
|
||||||
|
int scanhash_x15_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
void init_x15_8way_ctx();
|
||||||
|
|
||||||
|
#elif defined(X15_4WAY)
|
||||||
|
|
||||||
void x15_4way_hash( void *state, const void *input );
|
void x15_4way_hash( void *state, const void *input );
|
||||||
int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
void init_x15_4way_ctx();
|
void init_x15_4way_ctx();
|
||||||
|
|
||||||
#endif
|
#else
|
||||||
|
|
||||||
void x15hash( void *state, const void *input );
|
void x15hash( void *state, const void *input );
|
||||||
int scanhash_x15( struct work *work, uint32_t max_nonce,
|
int scanhash_x15( struct work *work, uint32_t max_nonce,
|
||||||
@@ -26,3 +36,5 @@ void init_x15_ctx();
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
@@ -5,9 +5,6 @@
|
|||||||
* Optimized by JayDDee@github Jan 2018
|
* Optimized by JayDDee@github Jan 2018
|
||||||
*/
|
*/
|
||||||
#include "x16r-gate.h"
|
#include "x16r-gate.h"
|
||||||
|
|
||||||
#if defined (X16R_4WAY)
|
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
@@ -20,6 +17,7 @@
|
|||||||
#include "algo/keccak/keccak-hash-4way.h"
|
#include "algo/keccak/keccak-hash-4way.h"
|
||||||
#include "algo/shavite/sph_shavite.h"
|
#include "algo/shavite/sph_shavite.h"
|
||||||
#include "algo/luffa/luffa-hash-2way.h"
|
#include "algo/luffa/luffa-hash-2way.h"
|
||||||
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
#include "algo/simd/simd-hash-2way.h"
|
#include "algo/simd/simd-hash-2way.h"
|
||||||
#include "algo/echo/aes_ni/hash_api.h"
|
#include "algo/echo/aes_ni/hash_api.h"
|
||||||
@@ -32,6 +30,392 @@
|
|||||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||||
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
|
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
|
||||||
|
|
||||||
|
#if defined (X16R_8WAY)
|
||||||
|
|
||||||
|
union _x16r_8way_context_overlay
|
||||||
|
{
|
||||||
|
blake512_8way_context blake;
|
||||||
|
bmw512_8way_context bmw;
|
||||||
|
hashState_groestl groestl;
|
||||||
|
skein512_8way_context skein;
|
||||||
|
jh512_8way_context jh;
|
||||||
|
keccak512_8way_context keccak;
|
||||||
|
luffa_4way_context luffa;
|
||||||
|
cube_4way_context cube;
|
||||||
|
sph_shavite512_context shavite;
|
||||||
|
simd_4way_context simd;
|
||||||
|
hashState_echo echo;
|
||||||
|
hamsi512_8way_context hamsi;
|
||||||
|
sph_fugue512_context fugue;
|
||||||
|
shabal512_8way_context shabal;
|
||||||
|
sph_whirlpool_context whirlpool;
|
||||||
|
sha512_8way_context sha512;
|
||||||
|
} __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
|
||||||
|
|
||||||
|
void x16r_8way_hash( void* output, const void* input )
|
||||||
|
{
|
||||||
|
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t hash0[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash1[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash2[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash3[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash4[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash5[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash6[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash7[24] __attribute__ ((aligned (64)));
|
||||||
|
x16r_8way_context_overlay ctx;
|
||||||
|
void *in0 = (void*) hash0;
|
||||||
|
void *in1 = (void*) hash1;
|
||||||
|
void *in2 = (void*) hash2;
|
||||||
|
void *in3 = (void*) hash3;
|
||||||
|
void *in4 = (void*) hash4;
|
||||||
|
void *in5 = (void*) hash5;
|
||||||
|
void *in6 = (void*) hash6;
|
||||||
|
void *in7 = (void*) hash7;
|
||||||
|
int size = 80;
|
||||||
|
|
||||||
|
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
input, 640 );
|
||||||
|
|
||||||
|
for ( int i = 0; i < 16; i++ )
|
||||||
|
{
|
||||||
|
const char elem = hashOrder[i];
|
||||||
|
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
|
||||||
|
|
||||||
|
switch ( algo )
|
||||||
|
{
|
||||||
|
case BLAKE:
|
||||||
|
blake512_8way_init( &ctx.blake );
|
||||||
|
if ( i == 0 )
|
||||||
|
blake512_8way_update( &ctx.blake, input, size );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
blake512_8way_update( &ctx.blake, vhash, size );
|
||||||
|
}
|
||||||
|
blake512_8way_close( &ctx.blake, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case BMW:
|
||||||
|
bmw512_8way_init( &ctx.bmw );
|
||||||
|
if ( i == 0 )
|
||||||
|
bmw512_8way_update( &ctx.bmw, input, size );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
bmw512_8way_update( &ctx.bmw, vhash, size );
|
||||||
|
}
|
||||||
|
bmw512_8way_close( &ctx.bmw, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case GROESTL:
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash0,
|
||||||
|
(const char*)in0, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash1,
|
||||||
|
(const char*)in1, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash2,
|
||||||
|
(const char*)in2, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash3,
|
||||||
|
(const char*)in3, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash4,
|
||||||
|
(const char*)in4, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash5,
|
||||||
|
(const char*)in5, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash6,
|
||||||
|
(const char*)in6, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash7,
|
||||||
|
(const char*)in7, size<<3 );
|
||||||
|
break;
|
||||||
|
case SKEIN:
|
||||||
|
skein512_8way_init( &ctx.skein );
|
||||||
|
if ( i == 0 )
|
||||||
|
skein512_8way_update( &ctx.skein, input, size );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
skein512_8way_update( &ctx.skein, vhash, size );
|
||||||
|
}
|
||||||
|
skein512_8way_close( &ctx.skein, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case JH:
|
||||||
|
jh512_8way_init( &ctx.jh );
|
||||||
|
if ( i == 0 )
|
||||||
|
jh512_8way_update( &ctx.jh, input, size );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
jh512_8way_update( &ctx.jh, vhash, size );
|
||||||
|
}
|
||||||
|
jh512_8way_close( &ctx.jh, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case KECCAK:
|
||||||
|
keccak512_8way_init( &ctx.keccak );
|
||||||
|
if ( i == 0 )
|
||||||
|
keccak512_8way_update( &ctx.keccak, input, size );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
keccak512_8way_update( &ctx.keccak, vhash, size );
|
||||||
|
}
|
||||||
|
keccak512_8way_close( &ctx.keccak, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case LUFFA:
|
||||||
|
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||||
|
luffa_4way_init( &ctx.luffa, 512 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||||
|
luffa_4way_init( &ctx.luffa, 512 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
break;
|
||||||
|
case CUBEHASH:
|
||||||
|
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||||
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||||
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
break;
|
||||||
|
case SHAVITE:
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in0, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in1, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in2, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in3, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in4, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash4 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in5, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash5 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in6, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash6 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in7, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash7 );
|
||||||
|
break;
|
||||||
|
case SIMD:
|
||||||
|
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||||
|
simd_4way_init( &ctx.simd, 512 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||||
|
simd_4way_init( &ctx.simd, 512 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
break;
|
||||||
|
case ECHO:
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
|
||||||
|
(const BitSequence*)in0, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash1,
|
||||||
|
(const BitSequence*)in1, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash2,
|
||||||
|
(const BitSequence*)in2, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash3,
|
||||||
|
(const BitSequence*)in3, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash4,
|
||||||
|
(const BitSequence*)in4, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash5,
|
||||||
|
(const BitSequence*)in5, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash6,
|
||||||
|
(const BitSequence*)in6, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash7,
|
||||||
|
(const BitSequence*)in7, size<<3 );
|
||||||
|
break;
|
||||||
|
case HAMSI:
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
|
||||||
|
hamsi512_8way_init( &ctx.hamsi );
|
||||||
|
hamsi512_8way_update( &ctx.hamsi, vhash, size );
|
||||||
|
hamsi512_8way_close( &ctx.hamsi, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case FUGUE:
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in0, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in1, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in2, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in3, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in4, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash4 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in5, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash5 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in6, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash6 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in7, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash7 );
|
||||||
|
break;
|
||||||
|
case SHABAL:
|
||||||
|
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
shabal512_8way_init( &ctx.shabal );
|
||||||
|
shabal512_8way_update( &ctx.shabal, vhash, size );
|
||||||
|
shabal512_8way_close( &ctx.shabal, vhash );
|
||||||
|
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case WHIRLPOOL:
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in0, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in1, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash1 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in2, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash2 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in3, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash3 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in4, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash4 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in5, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash5 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in6, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash6 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in7, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash7 );
|
||||||
|
break;
|
||||||
|
case SHA_512:
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
sha512_8way_init( &ctx.sha512 );
|
||||||
|
sha512_8way_update( &ctx.sha512, vhash, size );
|
||||||
|
sha512_8way_close( &ctx.sha512, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
size = 64;
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy( output, hash0, 32 );
|
||||||
|
memcpy( output+32, hash1, 32 );
|
||||||
|
memcpy( output+64, hash2, 32 );
|
||||||
|
memcpy( output+96, hash3, 32 );
|
||||||
|
memcpy( output+128, hash4, 32 );
|
||||||
|
memcpy( output+160, hash5, 32 );
|
||||||
|
memcpy( output+192, hash6, 32 );
|
||||||
|
memcpy( output+224, hash7, 32 );
|
||||||
|
}
|
||||||
|
|
||||||
|
int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr)
|
||||||
|
{
|
||||||
|
uint32_t hash[8*16] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t bedata1[2] __attribute__((aligned(64)));
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const uint32_t last_nonce = max_nonce - 8;
|
||||||
|
uint32_t n = first_nonce;
|
||||||
|
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||||
|
int thr_id = mythr->id;
|
||||||
|
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||||
|
|
||||||
|
if ( opt_benchmark )
|
||||||
|
ptarget[7] = 0x0cff;
|
||||||
|
|
||||||
|
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||||
|
|
||||||
|
bedata1[0] = bswap_32( pdata[1] );
|
||||||
|
bedata1[1] = bswap_32( pdata[2] );
|
||||||
|
const uint32_t ntime = bswap_32( pdata[17] );
|
||||||
|
if ( s_ntime != ntime )
|
||||||
|
{
|
||||||
|
x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
|
||||||
|
s_ntime = ntime;
|
||||||
|
if ( opt_debug && !thr_id )
|
||||||
|
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
|
||||||
|
}
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||||
|
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||||
|
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||||
|
|
||||||
|
x16r_8way_hash( hash, vdata );
|
||||||
|
pdata[19] = n;
|
||||||
|
|
||||||
|
for ( int i = 0; i < 8; i++ )
|
||||||
|
if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
|
||||||
|
if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
|
||||||
|
{
|
||||||
|
pdata[19] = n+i;
|
||||||
|
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||||
|
}
|
||||||
|
n += 8;
|
||||||
|
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
|
||||||
|
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#elif defined (X16R_4WAY)
|
||||||
|
|
||||||
union _x16r_4way_context_overlay
|
union _x16r_4way_context_overlay
|
||||||
{
|
{
|
||||||
blake512_4way_context blake;
|
blake512_4way_context blake;
|
||||||
@@ -50,16 +434,16 @@ union _x16r_4way_context_overlay
|
|||||||
shabal512_4way_context shabal;
|
shabal512_4way_context shabal;
|
||||||
sph_whirlpool_context whirlpool;
|
sph_whirlpool_context whirlpool;
|
||||||
sha512_4way_context sha512;
|
sha512_4way_context sha512;
|
||||||
};
|
} __attribute__ ((aligned (64)));
|
||||||
typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
|
typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
|
||||||
|
|
||||||
void x16r_4way_hash( void* output, const void* input )
|
void x16r_4way_hash( void* output, const void* input )
|
||||||
{
|
{
|
||||||
|
uint32_t vhash[24*4] __attribute__ ((aligned (128)));
|
||||||
uint32_t hash0[24] __attribute__ ((aligned (64)));
|
uint32_t hash0[24] __attribute__ ((aligned (64)));
|
||||||
uint32_t hash1[24] __attribute__ ((aligned (64)));
|
uint32_t hash1[24] __attribute__ ((aligned (64)));
|
||||||
uint32_t hash2[24] __attribute__ ((aligned (64)));
|
uint32_t hash2[24] __attribute__ ((aligned (64)));
|
||||||
uint32_t hash3[24] __attribute__ ((aligned (64)));
|
uint32_t hash3[24] __attribute__ ((aligned (64)));
|
||||||
uint32_t vhash[24*4] __attribute__ ((aligned (64)));
|
|
||||||
x16r_4way_context_overlay ctx;
|
x16r_4way_context_overlay ctx;
|
||||||
void *in0 = (void*) hash0;
|
void *in0 = (void*) hash0;
|
||||||
void *in1 = (void*) hash1;
|
void *in1 = (void*) hash1;
|
||||||
@@ -86,7 +470,7 @@ void x16r_4way_hash( void* output, const void* input )
|
|||||||
blake512_4way( &ctx.blake, vhash, size );
|
blake512_4way( &ctx.blake, vhash, size );
|
||||||
}
|
}
|
||||||
blake512_4way_close( &ctx.blake, vhash );
|
blake512_4way_close( &ctx.blake, vhash );
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
break;
|
break;
|
||||||
case BMW:
|
case BMW:
|
||||||
bmw512_4way_init( &ctx.bmw );
|
bmw512_4way_init( &ctx.bmw );
|
||||||
@@ -98,7 +482,7 @@ void x16r_4way_hash( void* output, const void* input )
|
|||||||
bmw512_4way( &ctx.bmw, vhash, size );
|
bmw512_4way( &ctx.bmw, vhash, size );
|
||||||
}
|
}
|
||||||
bmw512_4way_close( &ctx.bmw, vhash );
|
bmw512_4way_close( &ctx.bmw, vhash );
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
break;
|
break;
|
||||||
case GROESTL:
|
case GROESTL:
|
||||||
init_groestl( &ctx.groestl, 64 );
|
init_groestl( &ctx.groestl, 64 );
|
||||||
@@ -124,7 +508,7 @@ void x16r_4way_hash( void* output, const void* input )
|
|||||||
skein512_4way( &ctx.skein, vhash, size );
|
skein512_4way( &ctx.skein, vhash, size );
|
||||||
}
|
}
|
||||||
skein512_4way_close( &ctx.skein, vhash );
|
skein512_4way_close( &ctx.skein, vhash );
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
break;
|
break;
|
||||||
case JH:
|
case JH:
|
||||||
jh512_4way_init( &ctx.jh );
|
jh512_4way_init( &ctx.jh );
|
||||||
@@ -136,7 +520,7 @@ void x16r_4way_hash( void* output, const void* input )
|
|||||||
jh512_4way( &ctx.jh, vhash, size );
|
jh512_4way( &ctx.jh, vhash, size );
|
||||||
}
|
}
|
||||||
jh512_4way_close( &ctx.jh, vhash );
|
jh512_4way_close( &ctx.jh, vhash );
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
break;
|
break;
|
||||||
case KECCAK:
|
case KECCAK:
|
||||||
keccak512_4way_init( &ctx.keccak );
|
keccak512_4way_init( &ctx.keccak );
|
||||||
@@ -148,17 +532,17 @@ void x16r_4way_hash( void* output, const void* input )
|
|||||||
keccak512_4way( &ctx.keccak, vhash, size );
|
keccak512_4way( &ctx.keccak, vhash, size );
|
||||||
}
|
}
|
||||||
keccak512_4way_close( &ctx.keccak, vhash );
|
keccak512_4way_close( &ctx.keccak, vhash );
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
break;
|
break;
|
||||||
case LUFFA:
|
case LUFFA:
|
||||||
intrlv_2x128( vhash, in0, in1, size<<3 );
|
intrlv_2x128( vhash, in0, in1, size<<3 );
|
||||||
luffa_2way_init( &ctx.luffa, 512 );
|
luffa_2way_init( &ctx.luffa, 512 );
|
||||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
|
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
|
||||||
dintrlv_2x128( hash0, hash1, vhash, 512 );
|
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||||
intrlv_2x128( vhash, in2, in3, size<<3 );
|
intrlv_2x128( vhash, in2, in3, size<<3 );
|
||||||
luffa_2way_init( &ctx.luffa, 512 );
|
luffa_2way_init( &ctx.luffa, 512 );
|
||||||
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
|
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
|
||||||
dintrlv_2x128( hash2, hash3, vhash, 512 );
|
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||||
break;
|
break;
|
||||||
case CUBEHASH:
|
case CUBEHASH:
|
||||||
cubehashInit( &ctx.cube, 512, 16, 32 );
|
cubehashInit( &ctx.cube, 512, 16, 32 );
|
||||||
@@ -192,11 +576,11 @@ void x16r_4way_hash( void* output, const void* input )
|
|||||||
intrlv_2x128( vhash, in0, in1, size<<3 );
|
intrlv_2x128( vhash, in0, in1, size<<3 );
|
||||||
simd_2way_init( &ctx.simd, 512 );
|
simd_2way_init( &ctx.simd, 512 );
|
||||||
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
|
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
|
||||||
dintrlv_2x128( hash0, hash1, vhash, 512 );
|
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||||
intrlv_2x128( vhash, in2, in3, size<<3 );
|
intrlv_2x128( vhash, in2, in3, size<<3 );
|
||||||
simd_2way_init( &ctx.simd, 512 );
|
simd_2way_init( &ctx.simd, 512 );
|
||||||
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
|
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
|
||||||
dintrlv_2x128( hash2, hash3, vhash, 512 );
|
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||||
break;
|
break;
|
||||||
case ECHO:
|
case ECHO:
|
||||||
init_echo( &ctx.echo, 512 );
|
init_echo( &ctx.echo, 512 );
|
||||||
@@ -217,7 +601,7 @@ void x16r_4way_hash( void* output, const void* input )
|
|||||||
hamsi512_4way_init( &ctx.hamsi );
|
hamsi512_4way_init( &ctx.hamsi );
|
||||||
hamsi512_4way( &ctx.hamsi, vhash, size );
|
hamsi512_4way( &ctx.hamsi, vhash, size );
|
||||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
break;
|
break;
|
||||||
case FUGUE:
|
case FUGUE:
|
||||||
sph_fugue512_init( &ctx.fugue );
|
sph_fugue512_init( &ctx.fugue );
|
||||||
@@ -238,7 +622,7 @@ void x16r_4way_hash( void* output, const void* input )
|
|||||||
shabal512_4way_init( &ctx.shabal );
|
shabal512_4way_init( &ctx.shabal );
|
||||||
shabal512_4way( &ctx.shabal, vhash, size );
|
shabal512_4way( &ctx.shabal, vhash, size );
|
||||||
shabal512_4way_close( &ctx.shabal, vhash );
|
shabal512_4way_close( &ctx.shabal, vhash );
|
||||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
break;
|
break;
|
||||||
case WHIRLPOOL:
|
case WHIRLPOOL:
|
||||||
sph_whirlpool_init( &ctx.whirlpool );
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
@@ -259,7 +643,7 @@ void x16r_4way_hash( void* output, const void* input )
|
|||||||
sha512_4way_init( &ctx.sha512 );
|
sha512_4way_init( &ctx.sha512 );
|
||||||
sha512_4way( &ctx.sha512, vhash, size );
|
sha512_4way( &ctx.sha512, vhash, size );
|
||||||
sha512_4way_close( &ctx.sha512, vhash );
|
sha512_4way_close( &ctx.sha512, vhash );
|
||||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
size = 64;
|
size = 64;
|
||||||
@@ -280,6 +664,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
|
|||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
const uint32_t Htarg = ptarget[7];
|
const uint32_t Htarg = ptarget[7];
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const uint32_t last_nonce = max_nonce - 4;
|
||||||
uint32_t n = first_nonce;
|
uint32_t n = first_nonce;
|
||||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||||
int thr_id = mythr->id;
|
int thr_id = mythr->id;
|
||||||
@@ -317,9 +702,9 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
|
|||||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||||
}
|
}
|
||||||
n += 4;
|
n += 4;
|
||||||
} while ( likely( ( n < max_nonce ) && !(*restart) ) );
|
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
|
||||||
|
|
||||||
*hashes_done = n - first_nonce + 1;
|
*hashes_done = n - first_nonce;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -34,14 +34,17 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )
|
|||||||
|
|
||||||
bool register_x16r_algo( algo_gate_t* gate )
|
bool register_x16r_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined (X16R_4WAY)
|
#if defined (X16R_8WAY)
|
||||||
|
gate->scanhash = (void*)&scanhash_x16r_8way;
|
||||||
|
gate->hash = (void*)&x16r_8way_hash;
|
||||||
|
#elif defined (X16R_4WAY)
|
||||||
gate->scanhash = (void*)&scanhash_x16r_4way;
|
gate->scanhash = (void*)&scanhash_x16r_4way;
|
||||||
gate->hash = (void*)&x16r_4way_hash;
|
gate->hash = (void*)&x16r_4way_hash;
|
||||||
#else
|
#else
|
||||||
gate->scanhash = (void*)&scanhash_x16r;
|
gate->scanhash = (void*)&scanhash_x16r;
|
||||||
gate->hash = (void*)&x16r_hash;
|
gate->hash = (void*)&x16r_hash;
|
||||||
#endif
|
#endif
|
||||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
|
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
|
||||||
opt_target_factor = 256.0;
|
opt_target_factor = 256.0;
|
||||||
return true;
|
return true;
|
||||||
@@ -49,14 +52,17 @@ bool register_x16r_algo( algo_gate_t* gate )
|
|||||||
|
|
||||||
bool register_x16rv2_algo( algo_gate_t* gate )
|
bool register_x16rv2_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined (X16R_4WAY)
|
#if defined (X16R_8WAY)
|
||||||
|
gate->scanhash = (void*)&scanhash_x16rv2_8way;
|
||||||
|
gate->hash = (void*)&x16rv2_8way_hash;
|
||||||
|
#elif defined (X16R_4WAY)
|
||||||
gate->scanhash = (void*)&scanhash_x16rv2_4way;
|
gate->scanhash = (void*)&scanhash_x16rv2_4way;
|
||||||
gate->hash = (void*)&x16rv2_4way_hash;
|
gate->hash = (void*)&x16rv2_4way_hash;
|
||||||
#else
|
#else
|
||||||
gate->scanhash = (void*)&scanhash_x16rv2;
|
gate->scanhash = (void*)&scanhash_x16rv2;
|
||||||
gate->hash = (void*)&x16rv2_hash;
|
gate->hash = (void*)&x16rv2_hash;
|
||||||
#endif
|
#endif
|
||||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
|
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
|
||||||
opt_target_factor = 256.0;
|
opt_target_factor = 256.0;
|
||||||
return true;
|
return true;
|
||||||
@@ -64,14 +70,17 @@ bool register_x16rv2_algo( algo_gate_t* gate )
|
|||||||
|
|
||||||
bool register_x16s_algo( algo_gate_t* gate )
|
bool register_x16s_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined (X16R_4WAY)
|
#if defined (X16R_8WAY)
|
||||||
|
gate->scanhash = (void*)&scanhash_x16r_8way;
|
||||||
|
gate->hash = (void*)&x16r_8way_hash;
|
||||||
|
#elif defined (X16R_4WAY)
|
||||||
gate->scanhash = (void*)&scanhash_x16r_4way;
|
gate->scanhash = (void*)&scanhash_x16r_4way;
|
||||||
gate->hash = (void*)&x16r_4way_hash;
|
gate->hash = (void*)&x16r_4way_hash;
|
||||||
#else
|
#else
|
||||||
gate->scanhash = (void*)&scanhash_x16r;
|
gate->scanhash = (void*)&scanhash_x16r;
|
||||||
gate->hash = (void*)&x16r_hash;
|
gate->hash = (void*)&x16r_hash;
|
||||||
#endif
|
#endif
|
||||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
|
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
|
||||||
opt_target_factor = 256.0;
|
opt_target_factor = 256.0;
|
||||||
return true;
|
return true;
|
||||||
@@ -196,28 +205,34 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
|||||||
|
|
||||||
bool register_x16rt_algo( algo_gate_t* gate )
|
bool register_x16rt_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined (X16R_4WAY)
|
#if defined (X16R_8WAY)
|
||||||
|
gate->scanhash = (void*)&scanhash_x16rt_8way;
|
||||||
|
gate->hash = (void*)&x16rt_8way_hash;
|
||||||
|
#elif defined (X16R_4WAY)
|
||||||
gate->scanhash = (void*)&scanhash_x16rt_4way;
|
gate->scanhash = (void*)&scanhash_x16rt_4way;
|
||||||
gate->hash = (void*)&x16rt_4way_hash;
|
gate->hash = (void*)&x16rt_4way_hash;
|
||||||
#else
|
#else
|
||||||
gate->scanhash = (void*)&scanhash_x16rt;
|
gate->scanhash = (void*)&scanhash_x16rt;
|
||||||
gate->hash = (void*)&x16rt_hash;
|
gate->hash = (void*)&x16rt_hash;
|
||||||
#endif
|
#endif
|
||||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
opt_target_factor = 256.0;
|
opt_target_factor = 256.0;
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
bool register_x16rt_veil_algo( algo_gate_t* gate )
|
bool register_x16rt_veil_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined (X16R_4WAY)
|
#if defined (X16R_8WAY)
|
||||||
|
gate->scanhash = (void*)&scanhash_x16rt_8way;
|
||||||
|
gate->hash = (void*)&x16rt_8way_hash;
|
||||||
|
#elif defined (X16R_4WAY)
|
||||||
gate->scanhash = (void*)&scanhash_x16rt_4way;
|
gate->scanhash = (void*)&scanhash_x16rt_4way;
|
||||||
gate->hash = (void*)&x16rt_4way_hash;
|
gate->hash = (void*)&x16rt_4way_hash;
|
||||||
#else
|
#else
|
||||||
gate->scanhash = (void*)&scanhash_x16rt;
|
gate->scanhash = (void*)&scanhash_x16rt;
|
||||||
gate->hash = (void*)&x16rt_hash;
|
gate->hash = (void*)&x16rt_hash;
|
||||||
#endif
|
#endif
|
||||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
gate->build_extraheader = (void*)&veil_build_extraheader;
|
gate->build_extraheader = (void*)&veil_build_extraheader;
|
||||||
opt_target_factor = 256.0;
|
opt_target_factor = 256.0;
|
||||||
return true;
|
return true;
|
||||||
@@ -231,7 +246,7 @@ bool register_hex_algo( algo_gate_t* gate )
|
|||||||
{
|
{
|
||||||
gate->scanhash = (void*)&scanhash_hex;
|
gate->scanhash = (void*)&scanhash_hex;
|
||||||
gate->hash = (void*)&hex_hash;
|
gate->hash = (void*)&hex_hash;
|
||||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||||
opt_target_factor = 128.0;
|
opt_target_factor = 128.0;
|
||||||
return true;
|
return true;
|
||||||
|
@@ -6,8 +6,10 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
#if defined(__AVX2__) && defined(__AES__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
#define X16R_4WAY
|
#define X16R_8WAY 1
|
||||||
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
|
#define X16R_4WAY 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
enum x16r_Algo {
|
enum x16r_Algo {
|
||||||
@@ -44,7 +46,20 @@ bool register_x16rt_algo( algo_gate_t* gate );
|
|||||||
bool register_hex__algo( algo_gate_t* gate );
|
bool register_hex__algo( algo_gate_t* gate );
|
||||||
bool register_x21s__algo( algo_gate_t* gate );
|
bool register_x21s__algo( algo_gate_t* gate );
|
||||||
|
|
||||||
#if defined(X16R_4WAY)
|
#if defined(X16R_8WAY)
|
||||||
|
|
||||||
|
void x16r_8way_hash( void *state, const void *input );
|
||||||
|
int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
|
void x16rv2_8way_hash( void *state, const void *input );
|
||||||
|
int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
void x16rt_8way_hash( void *state, const void *input );
|
||||||
|
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
|
#elif defined(X16R_4WAY)
|
||||||
|
|
||||||
void x16r_4way_hash( void *state, const void *input );
|
void x16r_4way_hash( void *state, const void *input );
|
||||||
int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
|
||||||
@@ -58,12 +73,7 @@ void x16rt_4way_hash( void *state, const void *input );
|
|||||||
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void x21s_4way_hash( void *state, const void *input );
|
#else
|
||||||
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
|
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
|
||||||
bool x21s_4way_thread_init();
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
void x16r_hash( void *state, const void *input );
|
void x16r_hash( void *state, const void *input );
|
||||||
int scanhash_x16r( struct work *work, uint32_t max_nonce,
|
int scanhash_x16r( struct work *work, uint32_t max_nonce,
|
||||||
@@ -77,9 +87,16 @@ void x16rt_hash( void *state, const void *input );
|
|||||||
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
|
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void hex_hash( void *state, const void *input );
|
#endif
|
||||||
int scanhash_hex( struct work *work, uint32_t max_nonce,
|
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
#if defined(X16R_4WAY)
|
||||||
|
|
||||||
|
void x21s_4way_hash( void *state, const void *input );
|
||||||
|
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
bool x21s_4way_thread_init();
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
void x21s_hash( void *state, const void *input );
|
void x21s_hash( void *state, const void *input );
|
||||||
int scanhash_x21s( struct work *work, uint32_t max_nonce,
|
int scanhash_x21s( struct work *work, uint32_t max_nonce,
|
||||||
@@ -88,3 +105,9 @@ bool x21s_thread_init();
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
void hex_hash( void *state, const void *input );
|
||||||
|
int scanhash_hex( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
@@ -1,7 +1,4 @@
|
|||||||
#include "x16r-gate.h"
|
#include "x16r-gate.h"
|
||||||
|
|
||||||
#if defined (X16R_4WAY)
|
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
@@ -15,6 +12,7 @@
|
|||||||
#include "algo/shavite/sph_shavite.h"
|
#include "algo/shavite/sph_shavite.h"
|
||||||
#include "algo/luffa/luffa-hash-2way.h"
|
#include "algo/luffa/luffa-hash-2way.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
#include "algo/simd/simd-hash-2way.h"
|
#include "algo/simd/simd-hash-2way.h"
|
||||||
#include "algo/echo/aes_ni/hash_api.h"
|
#include "algo/echo/aes_ni/hash_api.h"
|
||||||
#include "algo/hamsi/hamsi-hash-4way.h"
|
#include "algo/hamsi/hamsi-hash-4way.h"
|
||||||
@@ -26,6 +24,391 @@
|
|||||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||||
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
|
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
|
||||||
|
|
||||||
|
#if defined (X16R_8WAY)
|
||||||
|
|
||||||
|
union _x16rt_8way_context_overlay
|
||||||
|
{
|
||||||
|
blake512_8way_context blake;
|
||||||
|
bmw512_8way_context bmw;
|
||||||
|
hashState_groestl groestl;
|
||||||
|
skein512_8way_context skein;
|
||||||
|
jh512_8way_context jh;
|
||||||
|
keccak512_8way_context keccak;
|
||||||
|
luffa_4way_context luffa;
|
||||||
|
cube_4way_context cube;
|
||||||
|
sph_shavite512_context shavite;
|
||||||
|
simd_4way_context simd;
|
||||||
|
hashState_echo echo;
|
||||||
|
hamsi512_8way_context hamsi;
|
||||||
|
sph_fugue512_context fugue;
|
||||||
|
shabal512_8way_context shabal;
|
||||||
|
sph_whirlpool_context whirlpool;
|
||||||
|
sha512_8way_context sha512;
|
||||||
|
} __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
typedef union _x16rt_8way_context_overlay x16rt_8way_context_overlay;
|
||||||
|
|
||||||
|
void x16rt_8way_hash( void* output, const void* input )
|
||||||
|
{
|
||||||
|
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t hash0[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash1[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash2[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash3[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash4[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash5[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash6[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash7[24] __attribute__ ((aligned (64)));
|
||||||
|
x16rt_8way_context_overlay ctx;
|
||||||
|
void *in0 = (void*) hash0;
|
||||||
|
void *in1 = (void*) hash1;
|
||||||
|
void *in2 = (void*) hash2;
|
||||||
|
void *in3 = (void*) hash3;
|
||||||
|
void *in4 = (void*) hash4;
|
||||||
|
void *in5 = (void*) hash5;
|
||||||
|
void *in6 = (void*) hash6;
|
||||||
|
void *in7 = (void*) hash7;
|
||||||
|
int size = 80;
|
||||||
|
|
||||||
|
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
input, 640 );
|
||||||
|
|
||||||
|
for ( int i = 0; i < 16; i++ )
|
||||||
|
{
|
||||||
|
const char elem = hashOrder[i];
|
||||||
|
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
|
||||||
|
|
||||||
|
switch ( algo )
|
||||||
|
{
|
||||||
|
case BLAKE:
|
||||||
|
blake512_8way_init( &ctx.blake );
|
||||||
|
if ( i == 0 )
|
||||||
|
blake512_8way_update( &ctx.blake, input, size );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
blake512_8way_update( &ctx.blake, vhash, size );
|
||||||
|
}
|
||||||
|
blake512_8way_close( &ctx.blake, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case BMW:
|
||||||
|
bmw512_8way_init( &ctx.bmw );
|
||||||
|
if ( i == 0 )
|
||||||
|
bmw512_8way_update( &ctx.bmw, input, size );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
bmw512_8way_update( &ctx.bmw, vhash, size );
|
||||||
|
}
|
||||||
|
bmw512_8way_close( &ctx.bmw, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case GROESTL:
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash0,
|
||||||
|
(const char*)in0, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash1,
|
||||||
|
(const char*)in1, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash2,
|
||||||
|
(const char*)in2, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash3,
|
||||||
|
(const char*)in3, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash4,
|
||||||
|
(const char*)in4, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash5,
|
||||||
|
(const char*)in5, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash6,
|
||||||
|
(const char*)in6, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash7,
|
||||||
|
(const char*)in7, size<<3 );
|
||||||
|
break;
|
||||||
|
case SKEIN:
|
||||||
|
skein512_8way_init( &ctx.skein );
|
||||||
|
if ( i == 0 )
|
||||||
|
skein512_8way_update( &ctx.skein, input, size );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
skein512_8way_update( &ctx.skein, vhash, size );
|
||||||
|
}
|
||||||
|
skein512_8way_close( &ctx.skein, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case JH:
|
||||||
|
jh512_8way_init( &ctx.jh );
|
||||||
|
if ( i == 0 )
|
||||||
|
jh512_8way_update( &ctx.jh, input, size );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
jh512_8way_update( &ctx.jh, vhash, size );
|
||||||
|
}
|
||||||
|
jh512_8way_close( &ctx.jh, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case KECCAK:
|
||||||
|
keccak512_8way_init( &ctx.keccak );
|
||||||
|
if ( i == 0 )
|
||||||
|
keccak512_8way_update( &ctx.keccak, input, size );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
keccak512_8way_update( &ctx.keccak, vhash, size );
|
||||||
|
}
|
||||||
|
keccak512_8way_close( &ctx.keccak, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case LUFFA:
|
||||||
|
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||||
|
luffa_4way_init( &ctx.luffa, 512 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||||
|
luffa_4way_init( &ctx.luffa, 512 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
break;
|
||||||
|
case CUBEHASH:
|
||||||
|
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||||
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||||
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
break;
|
||||||
|
case SHAVITE:
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in0, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in1, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in2, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in3, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in4, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash4 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in5, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash5 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in6, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash6 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in7, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash7 );
|
||||||
|
break;
|
||||||
|
case SIMD:
|
||||||
|
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||||
|
simd_4way_init( &ctx.simd, 512 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||||
|
simd_4way_init( &ctx.simd, 512 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
break;
|
||||||
|
case ECHO:
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
|
||||||
|
(const BitSequence*)in0, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash1,
|
||||||
|
(const BitSequence*)in1, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash2,
|
||||||
|
(const BitSequence*)in2, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash3,
|
||||||
|
(const BitSequence*)in3, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash4,
|
||||||
|
(const BitSequence*)in4, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash5,
|
||||||
|
(const BitSequence*)in5, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash6,
|
||||||
|
(const BitSequence*)in6, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash7,
|
||||||
|
(const BitSequence*)in7, size<<3 );
|
||||||
|
break;
|
||||||
|
case HAMSI:
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
|
||||||
|
hamsi512_8way_init( &ctx.hamsi );
|
||||||
|
hamsi512_8way_update( &ctx.hamsi, vhash, size );
|
||||||
|
hamsi512_8way_close( &ctx.hamsi, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case FUGUE:
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in0, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in1, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in2, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in3, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in4, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash4 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in5, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash5 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in6, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash6 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in7, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash7 );
|
||||||
|
break;
|
||||||
|
case SHABAL:
|
||||||
|
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
shabal512_8way_init( &ctx.shabal );
|
||||||
|
shabal512_8way_update( &ctx.shabal, vhash, size );
|
||||||
|
shabal512_8way_close( &ctx.shabal, vhash );
|
||||||
|
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case WHIRLPOOL:
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in0, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in1, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash1 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in2, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash2 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in3, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash3 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in4, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash4 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in5, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash5 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in6, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash6 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in7, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash7 );
|
||||||
|
break;
|
||||||
|
case SHA_512:
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
sha512_8way_init( &ctx.sha512 );
|
||||||
|
sha512_8way_update( &ctx.sha512, vhash, size );
|
||||||
|
sha512_8way_close( &ctx.sha512, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
size = 64;
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy( output, hash0, 32 );
|
||||||
|
memcpy( output+32, hash1, 32 );
|
||||||
|
memcpy( output+64, hash2, 32 );
|
||||||
|
memcpy( output+96, hash3, 32 );
|
||||||
|
memcpy( output+128, hash4, 32 );
|
||||||
|
memcpy( output+160, hash5, 32 );
|
||||||
|
memcpy( output+192, hash6, 32 );
|
||||||
|
memcpy( output+224, hash7, 32 );
|
||||||
|
}
|
||||||
|
|
||||||
|
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr)
|
||||||
|
{
|
||||||
|
uint32_t hash[8*16] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t _ALIGN(64) timeHash[8*8];
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const uint32_t last_nonce = max_nonce - 8;
|
||||||
|
uint32_t n = first_nonce;
|
||||||
|
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||||
|
int thr_id = mythr->id;
|
||||||
|
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||||
|
|
||||||
|
if ( opt_benchmark )
|
||||||
|
ptarget[7] = 0x0cff;
|
||||||
|
|
||||||
|
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||||
|
|
||||||
|
uint32_t ntime = bswap_32( pdata[17] );
|
||||||
|
if ( s_ntime != ntime )
|
||||||
|
{
|
||||||
|
x16rt_getTimeHash( ntime, &timeHash );
|
||||||
|
x16rt_getAlgoString( &timeHash[0], hashOrder );
|
||||||
|
s_ntime = ntime;
|
||||||
|
if ( opt_debug && !thr_id )
|
||||||
|
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
|
||||||
|
hashOrder, ntime, timeHash );
|
||||||
|
}
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||||
|
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||||
|
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||||
|
|
||||||
|
x16rt_8way_hash( hash, vdata );
|
||||||
|
pdata[19] = n;
|
||||||
|
|
||||||
|
for ( int i = 0; i < 8; i++ )
|
||||||
|
if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
|
||||||
|
if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
|
||||||
|
{
|
||||||
|
pdata[19] = n+i;
|
||||||
|
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||||
|
}
|
||||||
|
n += 8;
|
||||||
|
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
|
||||||
|
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined (X16R_4WAY)
|
||||||
|
|
||||||
union _x16rt_4way_context_overlay
|
union _x16rt_4way_context_overlay
|
||||||
{
|
{
|
||||||
blake512_4way_context blake;
|
blake512_4way_context blake;
|
||||||
|
@@ -5,9 +5,6 @@
|
|||||||
* Optimized by JayDDee@github Jan 2018
|
* Optimized by JayDDee@github Jan 2018
|
||||||
*/
|
*/
|
||||||
#include "x16r-gate.h"
|
#include "x16r-gate.h"
|
||||||
|
|
||||||
#if defined (X16R_4WAY)
|
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
@@ -21,6 +18,7 @@
|
|||||||
#include "algo/shavite/sph_shavite.h"
|
#include "algo/shavite/sph_shavite.h"
|
||||||
#include "algo/luffa/luffa-hash-2way.h"
|
#include "algo/luffa/luffa-hash-2way.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
#include "algo/simd/simd-hash-2way.h"
|
#include "algo/simd/simd-hash-2way.h"
|
||||||
#include "algo/echo/aes_ni/hash_api.h"
|
#include "algo/echo/aes_ni/hash_api.h"
|
||||||
#include "algo/hamsi/hamsi-hash-4way.h"
|
#include "algo/hamsi/hamsi-hash-4way.h"
|
||||||
@@ -33,6 +31,477 @@
|
|||||||
static __thread uint32_t s_ntime = UINT32_MAX;
|
static __thread uint32_t s_ntime = UINT32_MAX;
|
||||||
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
|
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
|
||||||
|
|
||||||
|
#if defined (X16R_8WAY)
|
||||||
|
|
||||||
|
union _x16rv2_8way_context_overlay
|
||||||
|
{
|
||||||
|
blake512_8way_context blake;
|
||||||
|
bmw512_8way_context bmw;
|
||||||
|
hashState_groestl groestl;
|
||||||
|
skein512_8way_context skein;
|
||||||
|
jh512_8way_context jh;
|
||||||
|
keccak512_8way_context keccak;
|
||||||
|
luffa_4way_context luffa;
|
||||||
|
cube_4way_context cube;
|
||||||
|
sph_shavite512_context shavite;
|
||||||
|
simd_4way_context simd;
|
||||||
|
hashState_echo echo;
|
||||||
|
hamsi512_8way_context hamsi;
|
||||||
|
sph_fugue512_context fugue;
|
||||||
|
shabal512_8way_context shabal;
|
||||||
|
sph_whirlpool_context whirlpool;
|
||||||
|
sha512_8way_context sha512;
|
||||||
|
sph_tiger_context tiger;
|
||||||
|
} __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay;
|
||||||
|
|
||||||
|
void x16rv2_8way_hash( void* output, const void* input )
|
||||||
|
{
|
||||||
|
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t hash0[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash1[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash2[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash3[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash4[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash5[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash6[24] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t hash7[24] __attribute__ ((aligned (64)));
|
||||||
|
x16rv2_8way_context_overlay ctx;
|
||||||
|
void *in0 = (void*) hash0;
|
||||||
|
void *in1 = (void*) hash1;
|
||||||
|
void *in2 = (void*) hash2;
|
||||||
|
void *in3 = (void*) hash3;
|
||||||
|
void *in4 = (void*) hash4;
|
||||||
|
void *in5 = (void*) hash5;
|
||||||
|
void *in6 = (void*) hash6;
|
||||||
|
void *in7 = (void*) hash7;
|
||||||
|
int size = 80;
|
||||||
|
|
||||||
|
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
input, 640 );
|
||||||
|
|
||||||
|
for ( int i = 0; i < 16; i++ )
|
||||||
|
{
|
||||||
|
const char elem = hashOrder[i];
|
||||||
|
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
|
||||||
|
|
||||||
|
switch ( algo )
|
||||||
|
{
|
||||||
|
case BLAKE:
|
||||||
|
blake512_8way_init( &ctx.blake );
|
||||||
|
if ( i == 0 )
|
||||||
|
blake512_8way_update( &ctx.blake, input, size );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
blake512_8way_update( &ctx.blake, vhash, size );
|
||||||
|
}
|
||||||
|
blake512_8way_close( &ctx.blake, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case BMW:
|
||||||
|
bmw512_8way_init( &ctx.bmw );
|
||||||
|
if ( i == 0 )
|
||||||
|
bmw512_8way_update( &ctx.bmw, input, size );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
bmw512_8way_update( &ctx.bmw, vhash, size );
|
||||||
|
}
|
||||||
|
bmw512_8way_close( &ctx.bmw, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case GROESTL:
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash0,
|
||||||
|
(const char*)in0, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash1,
|
||||||
|
(const char*)in1, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash2,
|
||||||
|
(const char*)in2, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash3,
|
||||||
|
(const char*)in3, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash4,
|
||||||
|
(const char*)in4, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash5,
|
||||||
|
(const char*)in5, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash6,
|
||||||
|
(const char*)in6, size<<3 );
|
||||||
|
init_groestl( &ctx.groestl, 64 );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash7,
|
||||||
|
(const char*)in7, size<<3 );
|
||||||
|
break;
|
||||||
|
case SKEIN:
|
||||||
|
skein512_8way_init( &ctx.skein );
|
||||||
|
if ( i == 0 )
|
||||||
|
skein512_8way_update( &ctx.skein, input, size );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
skein512_8way_update( &ctx.skein, vhash, size );
|
||||||
|
}
|
||||||
|
skein512_8way_close( &ctx.skein, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case JH:
|
||||||
|
jh512_8way_init( &ctx.jh );
|
||||||
|
if ( i == 0 )
|
||||||
|
jh512_8way_update( &ctx.jh, input, size );
|
||||||
|
else
|
||||||
|
{
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
jh512_8way_update( &ctx.jh, vhash, size );
|
||||||
|
}
|
||||||
|
jh512_8way_close( &ctx.jh, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case KECCAK:
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in0, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash0 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in1, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash1 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in2, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash2 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in3, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash3 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in4, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash4 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in5, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash5 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in6, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash6 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in7, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash7 );
|
||||||
|
|
||||||
|
for ( int i = (24/4); i < (64/4); i++ )
|
||||||
|
hash0[i] = hash1[i] = hash2[i] = hash3[i] =
|
||||||
|
hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
|
||||||
|
|
||||||
|
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
|
||||||
|
hash6, hash7 );
|
||||||
|
keccak512_8way_init( &ctx.keccak );
|
||||||
|
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||||
|
keccak512_8way_close( &ctx.keccak, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case LUFFA:
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in0, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash0 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in1, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash1 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in2, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash2 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in3, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash3 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in4, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash4 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in5, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash5 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in6, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash6 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in7, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash7 );
|
||||||
|
|
||||||
|
for ( int i = (24/4); i < (64/4); i++ )
|
||||||
|
hash0[i] = hash1[i] = hash2[i] = hash3[i] =
|
||||||
|
hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
|
||||||
|
|
||||||
|
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3);
|
||||||
|
luffa_4way_init( &ctx.luffa, 512 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7);
|
||||||
|
luffa_4way_init( &ctx.luffa, 512 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
break;
|
||||||
|
case CUBEHASH:
|
||||||
|
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||||
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||||
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
break;
|
||||||
|
case SHAVITE:
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in0, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in1, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in2, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in3, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in4, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash4 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in5, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash5 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in6, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash6 );
|
||||||
|
sph_shavite512_init( &ctx.shavite );
|
||||||
|
sph_shavite512( &ctx.shavite, in7, size );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash7 );
|
||||||
|
break;
|
||||||
|
case SIMD:
|
||||||
|
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
|
||||||
|
simd_4way_init( &ctx.simd, 512 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
|
||||||
|
simd_4way_init( &ctx.simd, 512 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
break;
|
||||||
|
case ECHO:
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
|
||||||
|
(const BitSequence*)in0, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash1,
|
||||||
|
(const BitSequence*)in1, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash2,
|
||||||
|
(const BitSequence*)in2, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash3,
|
||||||
|
(const BitSequence*)in3, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash4,
|
||||||
|
(const BitSequence*)in4, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash5,
|
||||||
|
(const BitSequence*)in5, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash6,
|
||||||
|
(const BitSequence*)in6, size<<3 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo ( &ctx.echo, (BitSequence *)hash7,
|
||||||
|
(const BitSequence*)in7, size<<3 );
|
||||||
|
break;
|
||||||
|
case HAMSI:
|
||||||
|
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
|
||||||
|
hamsi512_8way_init( &ctx.hamsi );
|
||||||
|
hamsi512_8way_update( &ctx.hamsi, vhash, size );
|
||||||
|
hamsi512_8way_close( &ctx.hamsi, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case FUGUE:
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in0, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in1, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in2, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in3, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in4, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash4 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in5, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash5 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in6, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash6 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, in7, size );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash7 );
|
||||||
|
break;
|
||||||
|
case SHABAL:
|
||||||
|
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
|
||||||
|
size<<3 );
|
||||||
|
shabal512_8way_init( &ctx.shabal );
|
||||||
|
shabal512_8way_update( &ctx.shabal, vhash, size );
|
||||||
|
shabal512_8way_close( &ctx.shabal, vhash );
|
||||||
|
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
case WHIRLPOOL:
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in0, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in1, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash1 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in2, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash2 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in3, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash3 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in4, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash4 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in5, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash5 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in6, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash6 );
|
||||||
|
sph_whirlpool_init( &ctx.whirlpool );
|
||||||
|
sph_whirlpool( &ctx.whirlpool, in7, size );
|
||||||
|
sph_whirlpool_close( &ctx.whirlpool, hash7 );
|
||||||
|
break;
|
||||||
|
case SHA_512:
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in0, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash0 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in1, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash1 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in2, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash2 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in3, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash3 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in4, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash4 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in5, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash5 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in6, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash6 );
|
||||||
|
sph_tiger_init( &ctx.tiger );
|
||||||
|
sph_tiger( &ctx.tiger, in7, size );
|
||||||
|
sph_tiger_close( &ctx.tiger, hash7 );
|
||||||
|
|
||||||
|
for ( int i = (24/4); i < (64/4); i++ )
|
||||||
|
hash0[i] = hash1[i] = hash2[i] = hash3[i] =
|
||||||
|
hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
|
||||||
|
|
||||||
|
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
|
||||||
|
hash6, hash7 );
|
||||||
|
sha512_8way_init( &ctx.sha512 );
|
||||||
|
sha512_8way_update( &ctx.sha512, vhash, 64 );
|
||||||
|
sha512_8way_close( &ctx.sha512, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash );
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
size = 64;
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy( output, hash0, 32 );
|
||||||
|
memcpy( output+32, hash1, 32 );
|
||||||
|
memcpy( output+64, hash2, 32 );
|
||||||
|
memcpy( output+96, hash3, 32 );
|
||||||
|
memcpy( output+128, hash4, 32 );
|
||||||
|
memcpy( output+160, hash5, 32 );
|
||||||
|
memcpy( output+192, hash6, 32 );
|
||||||
|
memcpy( output+224, hash7, 32 );
|
||||||
|
}
|
||||||
|
|
||||||
|
int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr)
|
||||||
|
{
|
||||||
|
uint32_t hash[8*16] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t bedata1[2] __attribute__((aligned(64)));
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const uint32_t last_nonce = max_nonce - 8;
|
||||||
|
uint32_t n = first_nonce;
|
||||||
|
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||||
|
int thr_id = mythr->id;
|
||||||
|
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||||
|
|
||||||
|
if ( opt_benchmark )
|
||||||
|
ptarget[7] = 0x0cff;
|
||||||
|
|
||||||
|
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||||
|
|
||||||
|
bedata1[0] = bswap_32( pdata[1] );
|
||||||
|
bedata1[1] = bswap_32( pdata[2] );
|
||||||
|
const uint32_t ntime = bswap_32( pdata[17] );
|
||||||
|
if ( s_ntime != ntime )
|
||||||
|
{
|
||||||
|
x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
|
||||||
|
s_ntime = ntime;
|
||||||
|
if ( opt_debug && !thr_id )
|
||||||
|
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
|
||||||
|
}
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||||
|
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||||
|
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||||
|
|
||||||
|
x16rv2_8way_hash( hash, vdata );
|
||||||
|
pdata[19] = n;
|
||||||
|
|
||||||
|
for ( int i = 0; i < 8; i++ )
|
||||||
|
if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
|
||||||
|
if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
|
||||||
|
{
|
||||||
|
pdata[19] = n+i;
|
||||||
|
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||||
|
}
|
||||||
|
n += 8;
|
||||||
|
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
|
||||||
|
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#elif defined (X16R_4WAY)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
union _x16rv2_4way_context_overlay
|
union _x16rv2_4way_context_overlay
|
||||||
{
|
{
|
||||||
blake512_4way_context blake;
|
blake512_4way_context blake;
|
||||||
|
@@ -4,6 +4,8 @@
|
|||||||
# during development. However the information contained may provide compilation
|
# during development. However the information contained may provide compilation
|
||||||
# tips to users.
|
# tips to users.
|
||||||
|
|
||||||
|
rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen
|
||||||
|
|
||||||
make distclean || echo clean
|
make distclean || echo clean
|
||||||
rm -f config.status
|
rm -f config.status
|
||||||
./autogen.sh || echo done
|
./autogen.sh || echo done
|
||||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
|||||||
#! /bin/sh
|
#! /bin/sh
|
||||||
# Guess values for system-dependent variables and create Makefiles.
|
# Guess values for system-dependent variables and create Makefiles.
|
||||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.2.
|
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.4.
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
|||||||
# Identity of this package.
|
# Identity of this package.
|
||||||
PACKAGE_NAME='cpuminer-opt'
|
PACKAGE_NAME='cpuminer-opt'
|
||||||
PACKAGE_TARNAME='cpuminer-opt'
|
PACKAGE_TARNAME='cpuminer-opt'
|
||||||
PACKAGE_VERSION='3.10.2'
|
PACKAGE_VERSION='3.10.4'
|
||||||
PACKAGE_STRING='cpuminer-opt 3.10.2'
|
PACKAGE_STRING='cpuminer-opt 3.10.4'
|
||||||
PACKAGE_BUGREPORT=''
|
PACKAGE_BUGREPORT=''
|
||||||
PACKAGE_URL=''
|
PACKAGE_URL=''
|
||||||
|
|
||||||
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
|
|||||||
# Omit some internal or obsolete options to make the list less imposing.
|
# Omit some internal or obsolete options to make the list less imposing.
|
||||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||||
cat <<_ACEOF
|
cat <<_ACEOF
|
||||||
\`configure' configures cpuminer-opt 3.10.2 to adapt to many kinds of systems.
|
\`configure' configures cpuminer-opt 3.10.4 to adapt to many kinds of systems.
|
||||||
|
|
||||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||||
|
|
||||||
@@ -1404,7 +1404,7 @@ fi
|
|||||||
|
|
||||||
if test -n "$ac_init_help"; then
|
if test -n "$ac_init_help"; then
|
||||||
case $ac_init_help in
|
case $ac_init_help in
|
||||||
short | recursive ) echo "Configuration of cpuminer-opt 3.10.2:";;
|
short | recursive ) echo "Configuration of cpuminer-opt 3.10.4:";;
|
||||||
esac
|
esac
|
||||||
cat <<\_ACEOF
|
cat <<\_ACEOF
|
||||||
|
|
||||||
@@ -1509,7 +1509,7 @@ fi
|
|||||||
test -n "$ac_init_help" && exit $ac_status
|
test -n "$ac_init_help" && exit $ac_status
|
||||||
if $ac_init_version; then
|
if $ac_init_version; then
|
||||||
cat <<\_ACEOF
|
cat <<\_ACEOF
|
||||||
cpuminer-opt configure 3.10.2
|
cpuminer-opt configure 3.10.4
|
||||||
generated by GNU Autoconf 2.69
|
generated by GNU Autoconf 2.69
|
||||||
|
|
||||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||||
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
|
|||||||
This file contains any messages produced by compilers while
|
This file contains any messages produced by compilers while
|
||||||
running configure, to aid debugging if configure makes a mistake.
|
running configure, to aid debugging if configure makes a mistake.
|
||||||
|
|
||||||
It was created by cpuminer-opt $as_me 3.10.2, which was
|
It was created by cpuminer-opt $as_me 3.10.4, which was
|
||||||
generated by GNU Autoconf 2.69. Invocation command line was
|
generated by GNU Autoconf 2.69. Invocation command line was
|
||||||
|
|
||||||
$ $0 $@
|
$ $0 $@
|
||||||
@@ -2993,7 +2993,7 @@ fi
|
|||||||
|
|
||||||
# Define the identity of the package.
|
# Define the identity of the package.
|
||||||
PACKAGE='cpuminer-opt'
|
PACKAGE='cpuminer-opt'
|
||||||
VERSION='3.10.2'
|
VERSION='3.10.4'
|
||||||
|
|
||||||
|
|
||||||
cat >>confdefs.h <<_ACEOF
|
cat >>confdefs.h <<_ACEOF
|
||||||
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
|||||||
# report actual input values of CONFIG_FILES etc. instead of their
|
# report actual input values of CONFIG_FILES etc. instead of their
|
||||||
# values after options handling.
|
# values after options handling.
|
||||||
ac_log="
|
ac_log="
|
||||||
This file was extended by cpuminer-opt $as_me 3.10.2, which was
|
This file was extended by cpuminer-opt $as_me 3.10.4, which was
|
||||||
generated by GNU Autoconf 2.69. Invocation command line was
|
generated by GNU Autoconf 2.69. Invocation command line was
|
||||||
|
|
||||||
CONFIG_FILES = $CONFIG_FILES
|
CONFIG_FILES = $CONFIG_FILES
|
||||||
@@ -6756,7 +6756,7 @@ _ACEOF
|
|||||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||||
ac_cs_version="\\
|
ac_cs_version="\\
|
||||||
cpuminer-opt config.status 3.10.2
|
cpuminer-opt config.status 3.10.4
|
||||||
configured by $0, generated by GNU Autoconf 2.69,
|
configured by $0, generated by GNU Autoconf 2.69,
|
||||||
with options \\"\$ac_cs_config\\"
|
with options \\"\$ac_cs_config\\"
|
||||||
|
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
AC_INIT([cpuminer-opt], [3.10.2])
|
AC_INIT([cpuminer-opt], [3.10.4])
|
||||||
|
|
||||||
AC_PREREQ([2.59c])
|
AC_PREREQ([2.59c])
|
||||||
AC_CANONICAL_SYSTEM
|
AC_CANONICAL_SYSTEM
|
||||||
|
50
cpu-miner.c
50
cpu-miner.c
@@ -3410,39 +3410,39 @@ bool check_cpu_capability ()
|
|||||||
printf(".\n");
|
printf(".\n");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
printf("CPU features:");
|
printf("CPU features: ");
|
||||||
if ( cpu_has_vaes ) printf( " VAES" );
|
|
||||||
else if ( cpu_has_aes ) printf( " AES" );
|
|
||||||
if ( cpu_has_sha ) printf( " SHA" );
|
|
||||||
if ( cpu_has_avx512 ) printf( " AVX512" );
|
if ( cpu_has_avx512 ) printf( " AVX512" );
|
||||||
else if ( cpu_has_avx2 ) printf( " AVX2" );
|
else if ( cpu_has_avx2 ) printf( " AVX2 " );
|
||||||
else if ( cpu_has_avx ) printf( " AVX" );
|
else if ( cpu_has_avx ) printf( " AVX " );
|
||||||
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
|
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
|
||||||
else if ( cpu_has_sse2 ) printf( " SSE2" );
|
else if ( cpu_has_sse2 ) printf( " SSE2 " );
|
||||||
|
if ( cpu_has_vaes ) printf( " VAES" );
|
||||||
|
else if ( cpu_has_aes ) printf( " AES" );
|
||||||
|
if ( cpu_has_sha ) printf( " SHA" );
|
||||||
|
|
||||||
printf(".\nSW features:");
|
printf("\nSW features: ");
|
||||||
if ( sw_has_vaes ) printf( " VAES" );
|
|
||||||
else if ( sw_has_aes ) printf( " AES" );
|
|
||||||
if ( sw_has_sha ) printf( " SHA" );
|
|
||||||
if ( sw_has_avx512 ) printf( " AVX512" );
|
if ( sw_has_avx512 ) printf( " AVX512" );
|
||||||
else if ( sw_has_avx2 ) printf( " AVX2" );
|
else if ( sw_has_avx2 ) printf( " AVX2 " );
|
||||||
else if ( sw_has_avx ) printf( " AVX" );
|
else if ( sw_has_avx ) printf( " AVX " );
|
||||||
else if ( sw_has_sse42 ) printf( " SSE4.2" );
|
else if ( sw_has_sse42 ) printf( " SSE4.2" );
|
||||||
else if ( sw_has_sse2 ) printf( " SSE2" );
|
else if ( sw_has_sse2 ) printf( " SSE2 " );
|
||||||
|
if ( sw_has_vaes ) printf( " VAES" );
|
||||||
|
else if ( sw_has_aes ) printf( " AES " );
|
||||||
|
if ( sw_has_sha ) printf( " SHA" );
|
||||||
|
|
||||||
printf(".\nAlgo features:");
|
printf("\nAlgo features:");
|
||||||
if ( algo_features == EMPTY_SET ) printf( " None" );
|
if ( algo_features == EMPTY_SET ) printf( " None" );
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if ( algo_has_vaes ) printf( " VAES" );
|
|
||||||
else if ( algo_has_aes ) printf( " AES" );
|
|
||||||
if ( algo_has_sha ) printf( " SHA" );
|
|
||||||
if ( algo_has_avx512 ) printf( " AVX512" );
|
if ( algo_has_avx512 ) printf( " AVX512" );
|
||||||
else if ( algo_has_avx2 ) printf( " AVX2" );
|
else if ( algo_has_avx2 ) printf( " AVX2 " );
|
||||||
else if ( algo_has_sse42 ) printf( " SSE4.2" );
|
else if ( algo_has_sse42 ) printf( " SSE4.2" );
|
||||||
else if ( algo_has_sse2 ) printf( " SSE2" );
|
else if ( algo_has_sse2 ) printf( " SSE2 " );
|
||||||
|
if ( algo_has_vaes ) printf( " VAES" );
|
||||||
|
else if ( algo_has_aes ) printf( " AES " );
|
||||||
|
if ( algo_has_sha ) printf( " SHA" );
|
||||||
}
|
}
|
||||||
printf(".\n");
|
printf("\n");
|
||||||
|
|
||||||
// Check for CPU and build incompatibilities
|
// Check for CPU and build incompatibilities
|
||||||
if ( !cpu_has_sse2 )
|
if ( !cpu_has_sse2 )
|
||||||
@@ -3483,19 +3483,19 @@ bool check_cpu_capability ()
|
|||||||
use_sha || use_vaes );
|
use_sha || use_vaes );
|
||||||
|
|
||||||
// Display best options
|
// Display best options
|
||||||
printf( "Start mining with" );
|
printf( "\nStarting miner with" );
|
||||||
if ( use_none ) printf( " no optimizations" );
|
if ( use_none ) printf( " no optimizations" );
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if ( use_vaes ) printf( " VAES" );
|
|
||||||
else if ( use_aes ) printf( " AES" );
|
|
||||||
if ( use_avx512 ) printf( " AVX512" );
|
if ( use_avx512 ) printf( " AVX512" );
|
||||||
else if ( use_avx2 ) printf( " AVX2" );
|
else if ( use_avx2 ) printf( " AVX2" );
|
||||||
else if ( use_sse42 ) printf( " SSE4.2" );
|
else if ( use_sse42 ) printf( " SSE4.2" );
|
||||||
else if ( use_sse2 ) printf( " SSE2" );
|
else if ( use_sse2 ) printf( " SSE2" );
|
||||||
|
if ( use_vaes ) printf( " VAES" );
|
||||||
|
else if ( use_aes ) printf( " AES" );
|
||||||
if ( use_sha ) printf( " SHA" );
|
if ( use_sha ) printf( " SHA" );
|
||||||
}
|
}
|
||||||
printf( ".\n\n" );
|
printf( "...\n\n" );
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@@ -2075,9 +2075,6 @@ static inline void dintrlv_2x256( void *dst0, void *dst1,
|
|||||||
d0[3] = s[6]; d1[3] = s[7];
|
d0[3] = s[6]; d1[3] = s[7];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#endif // AVX
|
#endif // AVX
|
||||||
|
|
||||||
///////////////////////////
|
///////////////////////////
|
||||||
@@ -2225,7 +2222,6 @@ static inline void rintrlv_4x32_4x64( void *dst,
|
|||||||
|
|
||||||
// 2x128 -> 4x64
|
// 2x128 -> 4x64
|
||||||
|
|
||||||
|
|
||||||
static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
|
static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
|
||||||
const void *src1, const int bit_len )
|
const void *src1, const int bit_len )
|
||||||
{
|
{
|
||||||
@@ -2268,7 +2264,6 @@ static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
|
|||||||
d[31] = _mm_unpackhi_epi64( s1[14], s1[15] );
|
d[31] = _mm_unpackhi_epi64( s1[14], s1[15] );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
#define RLEAVE_2x128_4x64( i ) do \
|
#define RLEAVE_2x128_4x64( i ) do \
|
||||||
{ \
|
{ \
|
||||||
@@ -2339,7 +2334,6 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
|
|||||||
d1[15] = _mm_unpackhi_epi64( s[29], s[31] );
|
d1[15] = _mm_unpackhi_epi64( s[29], s[31] );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
#define RLEAVE_4x64_2x128( i ) do \
|
#define RLEAVE_4x64_2x128( i ) do \
|
||||||
{ \
|
{ \
|
||||||
@@ -2364,6 +2358,354 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
|
|||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
// 4x128 -> 8x64
|
||||||
|
|
||||||
|
static inline void rintrlv_4x128_8x64( void *dst, const void *src0,
|
||||||
|
const void *src1, const int bit_len )
|
||||||
|
{
|
||||||
|
__m128i *d = (__m128i*)dst;
|
||||||
|
const __m128i *s0 = (const __m128i*)src0;
|
||||||
|
const __m128i *s1 = (const __m128i*)src1;
|
||||||
|
|
||||||
|
d[ 0] = _mm_unpacklo_epi64( s0[ 0], s0[ 1] );
|
||||||
|
d[ 1] = _mm_unpacklo_epi64( s0[ 2], s0[ 3] );
|
||||||
|
d[ 2] = _mm_unpacklo_epi64( s1[ 0], s1[ 1] );
|
||||||
|
d[ 3] = _mm_unpacklo_epi64( s1[ 2], s1[ 3] );
|
||||||
|
d[ 4] = _mm_unpackhi_epi64( s0[ 0], s0[ 1] );
|
||||||
|
d[ 5] = _mm_unpackhi_epi64( s0[ 2], s0[ 3] );
|
||||||
|
d[ 6] = _mm_unpackhi_epi64( s1[ 0], s1[ 1] );
|
||||||
|
d[ 7] = _mm_unpackhi_epi64( s1[ 2], s1[ 3] );
|
||||||
|
|
||||||
|
d[ 8] = _mm_unpacklo_epi64( s0[ 4], s0[ 5] );
|
||||||
|
d[ 9] = _mm_unpacklo_epi64( s0[ 6], s0[ 7] );
|
||||||
|
d[10] = _mm_unpacklo_epi64( s1[ 4], s1[ 5] );
|
||||||
|
d[11] = _mm_unpacklo_epi64( s1[ 6], s1[ 7] );
|
||||||
|
d[12] = _mm_unpackhi_epi64( s0[ 4], s0[ 5] );
|
||||||
|
d[13] = _mm_unpackhi_epi64( s0[ 6], s0[ 7] );
|
||||||
|
d[14] = _mm_unpackhi_epi64( s1[ 4], s1[ 5] );
|
||||||
|
d[15] = _mm_unpackhi_epi64( s1[ 6], s1[ 7] );
|
||||||
|
|
||||||
|
if ( bit_len <= 256 ) return;
|
||||||
|
|
||||||
|
d[16] = _mm_unpacklo_epi64( s0[ 8], s0[ 9] );
|
||||||
|
d[17] = _mm_unpacklo_epi64( s0[10], s0[11] );
|
||||||
|
d[18] = _mm_unpacklo_epi64( s1[ 8], s1[ 9] );
|
||||||
|
d[19] = _mm_unpacklo_epi64( s1[10], s1[11] );
|
||||||
|
d[20] = _mm_unpackhi_epi64( s0[ 8], s0[ 9] );
|
||||||
|
d[21] = _mm_unpackhi_epi64( s0[10], s0[11] );
|
||||||
|
d[22] = _mm_unpackhi_epi64( s1[ 8], s1[ 9] );
|
||||||
|
d[23] = _mm_unpackhi_epi64( s1[10], s1[11] );
|
||||||
|
|
||||||
|
d[24] = _mm_unpacklo_epi64( s0[12], s0[13] );
|
||||||
|
d[25] = _mm_unpacklo_epi64( s0[14], s0[15] );
|
||||||
|
d[26] = _mm_unpacklo_epi64( s1[12], s1[13] );
|
||||||
|
d[27] = _mm_unpacklo_epi64( s1[14], s1[15] );
|
||||||
|
d[28] = _mm_unpackhi_epi64( s0[12], s0[13] );
|
||||||
|
d[29] = _mm_unpackhi_epi64( s0[14], s0[15] );
|
||||||
|
d[30] = _mm_unpackhi_epi64( s1[12], s1[13] );
|
||||||
|
d[31] = _mm_unpackhi_epi64( s1[14], s1[15] );
|
||||||
|
|
||||||
|
if ( bit_len <= 512 ) return;
|
||||||
|
|
||||||
|
d[32] = _mm_unpacklo_epi64( s0[16], s0[17] );
|
||||||
|
d[33] = _mm_unpacklo_epi64( s0[18], s0[19] );
|
||||||
|
d[34] = _mm_unpacklo_epi64( s1[16], s1[17] );
|
||||||
|
d[35] = _mm_unpacklo_epi64( s1[18], s1[19] );
|
||||||
|
d[36] = _mm_unpackhi_epi64( s0[16], s0[17] );
|
||||||
|
d[37] = _mm_unpackhi_epi64( s0[18], s0[19] );
|
||||||
|
d[38] = _mm_unpackhi_epi64( s1[16], s1[17] );
|
||||||
|
d[39] = _mm_unpackhi_epi64( s1[18], s1[19] );
|
||||||
|
|
||||||
|
d[40] = _mm_unpacklo_epi64( s0[20], s0[21] );
|
||||||
|
d[41] = _mm_unpacklo_epi64( s0[22], s0[23] );
|
||||||
|
d[42] = _mm_unpacklo_epi64( s1[20], s1[21] );
|
||||||
|
d[43] = _mm_unpacklo_epi64( s1[22], s1[23] );
|
||||||
|
d[44] = _mm_unpackhi_epi64( s0[20], s0[21] );
|
||||||
|
d[45] = _mm_unpackhi_epi64( s0[22], s0[23] );
|
||||||
|
d[46] = _mm_unpackhi_epi64( s1[20], s1[21] );
|
||||||
|
d[47] = _mm_unpackhi_epi64( s1[22], s1[23] );
|
||||||
|
|
||||||
|
d[48] = _mm_unpacklo_epi64( s0[24], s0[25] );
|
||||||
|
d[49] = _mm_unpacklo_epi64( s0[26], s0[27] );
|
||||||
|
d[50] = _mm_unpacklo_epi64( s1[24], s1[25] );
|
||||||
|
d[51] = _mm_unpacklo_epi64( s1[26], s1[27] );
|
||||||
|
d[52] = _mm_unpackhi_epi64( s0[24], s0[25] );
|
||||||
|
d[53] = _mm_unpackhi_epi64( s0[26], s0[27] );
|
||||||
|
d[54] = _mm_unpackhi_epi64( s1[24], s1[25] );
|
||||||
|
d[55] = _mm_unpackhi_epi64( s1[26], s1[27] );
|
||||||
|
|
||||||
|
d[56] = _mm_unpacklo_epi64( s0[28], s0[29] );
|
||||||
|
d[57] = _mm_unpacklo_epi64( s0[30], s0[31] );
|
||||||
|
d[58] = _mm_unpacklo_epi64( s1[28], s1[29] );
|
||||||
|
d[59] = _mm_unpacklo_epi64( s1[30], s1[31] );
|
||||||
|
d[60] = _mm_unpackhi_epi64( s0[28], s0[29] );
|
||||||
|
d[61] = _mm_unpackhi_epi64( s0[30], s0[31] );
|
||||||
|
d[62] = _mm_unpackhi_epi64( s1[28], s1[29] );
|
||||||
|
d[63] = _mm_unpackhi_epi64( s1[30], s1[31] );
|
||||||
|
}
|
||||||
|
|
||||||
|
// 8x64 -> 4x128
|
||||||
|
|
||||||
|
static inline void rintrlv_8x64_4x128( void *dst0, void *dst1,
|
||||||
|
const void *src, const int bit_len )
|
||||||
|
{
|
||||||
|
__m128i *d0 = (__m128i*)dst0;
|
||||||
|
__m128i *d1 = (__m128i*)dst1;
|
||||||
|
const __m128i* s = (const __m128i*)src;
|
||||||
|
|
||||||
|
d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] );
|
||||||
|
d0[ 1] = _mm_unpackhi_epi64( s[ 0], s[ 4] );
|
||||||
|
d1[ 0] = _mm_unpacklo_epi64( s[ 2], s[ 6] );
|
||||||
|
d1[ 1] = _mm_unpackhi_epi64( s[ 2], s[ 6] );
|
||||||
|
d0[ 2] = _mm_unpacklo_epi64( s[ 1], s[ 5] );
|
||||||
|
d0[ 3] = _mm_unpackhi_epi64( s[ 1], s[ 5] );
|
||||||
|
d1[ 2] = _mm_unpacklo_epi64( s[ 3], s[ 7] );
|
||||||
|
d1[ 3] = _mm_unpackhi_epi64( s[ 3], s[ 7] );
|
||||||
|
|
||||||
|
d0[ 4] = _mm_unpacklo_epi64( s[ 8], s[12] );
|
||||||
|
d0[ 5] = _mm_unpackhi_epi64( s[ 8], s[12] );
|
||||||
|
d1[ 4] = _mm_unpacklo_epi64( s[10], s[14] );
|
||||||
|
d1[ 5] = _mm_unpackhi_epi64( s[10], s[14] );
|
||||||
|
d0[ 6] = _mm_unpacklo_epi64( s[ 9], s[13] );
|
||||||
|
d0[ 7] = _mm_unpackhi_epi64( s[ 9], s[13] );
|
||||||
|
d1[ 6] = _mm_unpacklo_epi64( s[11], s[15] );
|
||||||
|
d1[ 7] = _mm_unpackhi_epi64( s[11], s[15] );
|
||||||
|
|
||||||
|
if ( bit_len <= 256 ) return;
|
||||||
|
|
||||||
|
d0[ 8] = _mm_unpacklo_epi64( s[16], s[20] );
|
||||||
|
d0[ 9] = _mm_unpackhi_epi64( s[16], s[20] );
|
||||||
|
d1[ 8] = _mm_unpacklo_epi64( s[18], s[22] );
|
||||||
|
d1[ 9] = _mm_unpackhi_epi64( s[18], s[22] );
|
||||||
|
d0[10] = _mm_unpacklo_epi64( s[17], s[21] );
|
||||||
|
d0[11] = _mm_unpackhi_epi64( s[17], s[21] );
|
||||||
|
d1[10] = _mm_unpacklo_epi64( s[19], s[23] );
|
||||||
|
d1[11] = _mm_unpackhi_epi64( s[19], s[23] );
|
||||||
|
|
||||||
|
d0[12] = _mm_unpacklo_epi64( s[24], s[28] );
|
||||||
|
d0[13] = _mm_unpackhi_epi64( s[24], s[28] );
|
||||||
|
d1[12] = _mm_unpacklo_epi64( s[26], s[30] );
|
||||||
|
d1[13] = _mm_unpackhi_epi64( s[26], s[30] );
|
||||||
|
d0[14] = _mm_unpacklo_epi64( s[25], s[29] );
|
||||||
|
d0[15] = _mm_unpackhi_epi64( s[25], s[29] );
|
||||||
|
d1[14] = _mm_unpacklo_epi64( s[27], s[31] );
|
||||||
|
d1[15] = _mm_unpackhi_epi64( s[27], s[31] );
|
||||||
|
|
||||||
|
if ( bit_len <= 512 ) return;
|
||||||
|
|
||||||
|
d0[16] = _mm_unpacklo_epi64( s[32], s[36] );
|
||||||
|
d0[17] = _mm_unpackhi_epi64( s[32], s[36] );
|
||||||
|
d1[16] = _mm_unpacklo_epi64( s[34], s[38] );
|
||||||
|
d1[17] = _mm_unpackhi_epi64( s[34], s[38] );
|
||||||
|
d0[18] = _mm_unpacklo_epi64( s[33], s[37] );
|
||||||
|
d0[19] = _mm_unpackhi_epi64( s[33], s[37] );
|
||||||
|
d1[18] = _mm_unpacklo_epi64( s[35], s[39] );
|
||||||
|
d1[19] = _mm_unpackhi_epi64( s[35], s[39] );
|
||||||
|
|
||||||
|
d0[20] = _mm_unpacklo_epi64( s[40], s[44] );
|
||||||
|
d0[21] = _mm_unpackhi_epi64( s[40], s[44] );
|
||||||
|
d1[20] = _mm_unpacklo_epi64( s[42], s[46] );
|
||||||
|
d1[21] = _mm_unpackhi_epi64( s[42], s[46] );
|
||||||
|
d0[22] = _mm_unpacklo_epi64( s[41], s[45] );
|
||||||
|
d0[23] = _mm_unpackhi_epi64( s[41], s[45] );
|
||||||
|
d1[22] = _mm_unpacklo_epi64( s[43], s[47] );
|
||||||
|
d1[23] = _mm_unpackhi_epi64( s[43], s[47] );
|
||||||
|
|
||||||
|
d0[24] = _mm_unpacklo_epi64( s[48], s[52] );
|
||||||
|
d0[25] = _mm_unpackhi_epi64( s[48], s[52] );
|
||||||
|
d1[24] = _mm_unpacklo_epi64( s[50], s[54] );
|
||||||
|
d1[25] = _mm_unpackhi_epi64( s[50], s[54] );
|
||||||
|
d0[26] = _mm_unpacklo_epi64( s[49], s[53] );
|
||||||
|
d0[27] = _mm_unpackhi_epi64( s[49], s[53] );
|
||||||
|
d1[26] = _mm_unpacklo_epi64( s[51], s[55] );
|
||||||
|
d1[27] = _mm_unpackhi_epi64( s[51], s[55] );
|
||||||
|
|
||||||
|
d0[28] = _mm_unpacklo_epi64( s[56], s[60] );
|
||||||
|
d0[29] = _mm_unpackhi_epi64( s[56], s[60] );
|
||||||
|
d1[28] = _mm_unpacklo_epi64( s[58], s[62] );
|
||||||
|
d1[29] = _mm_unpackhi_epi64( s[58], s[62] );
|
||||||
|
d0[30] = _mm_unpacklo_epi64( s[57], s[61] );
|
||||||
|
d0[31] = _mm_unpackhi_epi64( s[57], s[61] );
|
||||||
|
d1[30] = _mm_unpacklo_epi64( s[59], s[63] );
|
||||||
|
d1[31] = _mm_unpackhi_epi64( s[59], s[63] );
|
||||||
|
}
|
||||||
|
|
||||||
|
// 8x64 -> 2x256
|
||||||
|
|
||||||
|
static inline void rintrlv_8x64_2x256( void *dst0, void *dst1, void *dst2,
|
||||||
|
void *dst3, const void *src, const int bit_len )
|
||||||
|
{
|
||||||
|
__m128i *d0 = (__m128i*)dst0;
|
||||||
|
__m128i *d1 = (__m128i*)dst1;
|
||||||
|
__m128i *d2 = (__m128i*)dst2;
|
||||||
|
__m128i *d3 = (__m128i*)dst3;
|
||||||
|
const __m128i* s = (const __m128i*)src;
|
||||||
|
|
||||||
|
d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] );
|
||||||
|
d1[ 0] = _mm_unpackhi_epi64( s[ 0], s[ 4] );
|
||||||
|
d2[ 0] = _mm_unpacklo_epi64( s[ 1], s[ 5] );
|
||||||
|
d3[ 0] = _mm_unpackhi_epi64( s[ 1], s[ 5] );
|
||||||
|
d0[ 1] = _mm_unpacklo_epi64( s[ 2], s[ 6] );
|
||||||
|
d1[ 1] = _mm_unpackhi_epi64( s[ 2], s[ 6] );
|
||||||
|
d2[ 1] = _mm_unpacklo_epi64( s[ 3], s[ 7] );
|
||||||
|
d3[ 1] = _mm_unpackhi_epi64( s[ 3], s[ 7] );
|
||||||
|
|
||||||
|
d0[ 2] = _mm_unpacklo_epi64( s[ 8], s[12] );
|
||||||
|
d1[ 2] = _mm_unpackhi_epi64( s[ 8], s[12] );
|
||||||
|
d2[ 2] = _mm_unpacklo_epi64( s[ 9], s[13] );
|
||||||
|
d3[ 2] = _mm_unpackhi_epi64( s[ 9], s[13] );
|
||||||
|
d0[ 3] = _mm_unpacklo_epi64( s[10], s[14] );
|
||||||
|
d1[ 3] = _mm_unpackhi_epi64( s[10], s[14] );
|
||||||
|
d2[ 3] = _mm_unpacklo_epi64( s[11], s[15] );
|
||||||
|
d3[ 3] = _mm_unpackhi_epi64( s[11], s[15] );
|
||||||
|
|
||||||
|
if ( bit_len <= 256 ) return;
|
||||||
|
|
||||||
|
d0[ 4] = _mm_unpacklo_epi64( s[16], s[20] );
|
||||||
|
d1[ 4] = _mm_unpackhi_epi64( s[16], s[20] );
|
||||||
|
d2[ 4] = _mm_unpacklo_epi64( s[17], s[21] );
|
||||||
|
d3[ 4] = _mm_unpackhi_epi64( s[17], s[21] );
|
||||||
|
d0[ 5] = _mm_unpacklo_epi64( s[18], s[22] );
|
||||||
|
d1[ 5] = _mm_unpackhi_epi64( s[18], s[22] );
|
||||||
|
d2[ 5] = _mm_unpacklo_epi64( s[19], s[23] );
|
||||||
|
d3[ 5] = _mm_unpackhi_epi64( s[19], s[23] );
|
||||||
|
|
||||||
|
d0[ 6] = _mm_unpacklo_epi64( s[24], s[28] );
|
||||||
|
d1[ 6] = _mm_unpackhi_epi64( s[24], s[28] );
|
||||||
|
d2[ 6] = _mm_unpacklo_epi64( s[25], s[29] );
|
||||||
|
d3[ 6] = _mm_unpackhi_epi64( s[25], s[29] );
|
||||||
|
d0[ 7] = _mm_unpacklo_epi64( s[26], s[30] );
|
||||||
|
d1[ 7] = _mm_unpackhi_epi64( s[26], s[30] );
|
||||||
|
d2[ 7] = _mm_unpacklo_epi64( s[27], s[31] );
|
||||||
|
d3[ 7] = _mm_unpackhi_epi64( s[27], s[31] );
|
||||||
|
|
||||||
|
if ( bit_len <= 512 ) return;
|
||||||
|
|
||||||
|
d0[ 8] = _mm_unpacklo_epi64( s[32], s[36] );
|
||||||
|
d1[ 8] = _mm_unpackhi_epi64( s[32], s[36] );
|
||||||
|
d2[ 8] = _mm_unpacklo_epi64( s[33], s[37] );
|
||||||
|
d3[ 8] = _mm_unpackhi_epi64( s[33], s[37] );
|
||||||
|
d0[ 9] = _mm_unpacklo_epi64( s[34], s[38] );
|
||||||
|
d1[ 9] = _mm_unpackhi_epi64( s[34], s[38] );
|
||||||
|
d2[ 9] = _mm_unpacklo_epi64( s[35], s[39] );
|
||||||
|
d3[ 9] = _mm_unpackhi_epi64( s[35], s[39] );
|
||||||
|
|
||||||
|
d0[10] = _mm_unpacklo_epi64( s[40], s[44] );
|
||||||
|
d1[10] = _mm_unpackhi_epi64( s[40], s[44] );
|
||||||
|
d2[10] = _mm_unpacklo_epi64( s[41], s[45] );
|
||||||
|
d3[10] = _mm_unpackhi_epi64( s[41], s[45] );
|
||||||
|
d0[11] = _mm_unpacklo_epi64( s[42], s[46] );
|
||||||
|
d1[11] = _mm_unpackhi_epi64( s[42], s[46] );
|
||||||
|
d2[11] = _mm_unpacklo_epi64( s[43], s[47] );
|
||||||
|
d3[11] = _mm_unpackhi_epi64( s[43], s[47] );
|
||||||
|
|
||||||
|
d0[12] = _mm_unpacklo_epi64( s[48], s[52] );
|
||||||
|
d1[12] = _mm_unpackhi_epi64( s[48], s[52] );
|
||||||
|
d2[12] = _mm_unpacklo_epi64( s[49], s[53] );
|
||||||
|
d3[12] = _mm_unpackhi_epi64( s[49], s[53] );
|
||||||
|
d0[13] = _mm_unpacklo_epi64( s[50], s[54] );
|
||||||
|
d1[13] = _mm_unpackhi_epi64( s[50], s[54] );
|
||||||
|
d2[13] = _mm_unpacklo_epi64( s[51], s[55] );
|
||||||
|
d3[13] = _mm_unpackhi_epi64( s[51], s[55] );
|
||||||
|
|
||||||
|
d0[14] = _mm_unpacklo_epi64( s[56], s[60] );
|
||||||
|
d1[14] = _mm_unpackhi_epi64( s[56], s[60] );
|
||||||
|
d2[14] = _mm_unpacklo_epi64( s[57], s[61] );
|
||||||
|
d3[14] = _mm_unpackhi_epi64( s[57], s[61] );
|
||||||
|
d0[15] = _mm_unpacklo_epi64( s[58], s[62] );
|
||||||
|
d1[15] = _mm_unpackhi_epi64( s[58], s[62] );
|
||||||
|
d2[15] = _mm_unpacklo_epi64( s[59], s[63] );
|
||||||
|
d3[15] = _mm_unpackhi_epi64( s[59], s[63] );
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2x256 -> 8x64
|
||||||
|
|
||||||
|
static inline void rintrlv_2x256_8x64( void *dst, const void *src0,
|
||||||
|
const void *src1, const void *src2, const void *src3, const int bit_len )
|
||||||
|
{
|
||||||
|
__m128i *d = (__m128i*)dst;
|
||||||
|
__m128i *s0 = (__m128i*)src0;
|
||||||
|
__m128i *s1 = (__m128i*)src1;
|
||||||
|
__m128i *s2 = (__m128i*)src2;
|
||||||
|
__m128i *s3 = (__m128i*)src3;
|
||||||
|
|
||||||
|
d[ 0] = _mm_unpacklo_epi64( s0[0], s0[2] );
|
||||||
|
d[ 1] = _mm_unpacklo_epi64( s1[0], s1[2] );
|
||||||
|
d[ 2] = _mm_unpacklo_epi64( s2[0], s2[2] );
|
||||||
|
d[ 3] = _mm_unpacklo_epi64( s3[0], s3[2] );
|
||||||
|
d[ 4] = _mm_unpackhi_epi64( s0[0], s0[2] );
|
||||||
|
d[ 5] = _mm_unpackhi_epi64( s1[0], s1[2] );
|
||||||
|
d[ 6] = _mm_unpackhi_epi64( s2[0], s2[2] );
|
||||||
|
d[ 7] = _mm_unpackhi_epi64( s3[0], s3[2] );
|
||||||
|
|
||||||
|
d[ 8] = _mm_unpacklo_epi64( s0[1], s0[3] );
|
||||||
|
d[ 9] = _mm_unpacklo_epi64( s1[1], s1[3] );
|
||||||
|
d[10] = _mm_unpacklo_epi64( s2[1], s2[3] );
|
||||||
|
d[11] = _mm_unpacklo_epi64( s3[1], s3[3] );
|
||||||
|
d[12] = _mm_unpackhi_epi64( s0[1], s0[3] );
|
||||||
|
d[13] = _mm_unpackhi_epi64( s1[1], s1[3] );
|
||||||
|
d[14] = _mm_unpackhi_epi64( s2[1], s2[3] );
|
||||||
|
d[15] = _mm_unpackhi_epi64( s3[1], s3[3] );
|
||||||
|
|
||||||
|
if ( bit_len <= 256 ) return;
|
||||||
|
|
||||||
|
d[16] = _mm_unpacklo_epi64( s0[4], s0[6] );
|
||||||
|
d[17] = _mm_unpacklo_epi64( s1[4], s1[6] );
|
||||||
|
d[18] = _mm_unpacklo_epi64( s2[4], s2[6] );
|
||||||
|
d[19] = _mm_unpacklo_epi64( s3[4], s3[6] );
|
||||||
|
d[20] = _mm_unpackhi_epi64( s0[4], s0[6] );
|
||||||
|
d[21] = _mm_unpackhi_epi64( s1[4], s1[6] );
|
||||||
|
d[22] = _mm_unpackhi_epi64( s2[4], s2[6] );
|
||||||
|
d[23] = _mm_unpackhi_epi64( s3[4], s3[6] );
|
||||||
|
|
||||||
|
d[24] = _mm_unpacklo_epi64( s0[5], s0[7] );
|
||||||
|
d[25] = _mm_unpacklo_epi64( s1[5], s1[7] );
|
||||||
|
d[26] = _mm_unpacklo_epi64( s2[5], s2[7] );
|
||||||
|
d[27] = _mm_unpacklo_epi64( s3[5], s3[7] );
|
||||||
|
d[28] = _mm_unpackhi_epi64( s0[5], s0[7] );
|
||||||
|
d[29] = _mm_unpackhi_epi64( s1[5], s1[7] );
|
||||||
|
d[30] = _mm_unpackhi_epi64( s2[5], s2[7] );
|
||||||
|
d[31] = _mm_unpackhi_epi64( s3[5], s3[7] );
|
||||||
|
|
||||||
|
if ( bit_len <= 512 ) return;
|
||||||
|
|
||||||
|
d[32] = _mm_unpacklo_epi64( s0[8], s0[10] );
|
||||||
|
d[33] = _mm_unpacklo_epi64( s1[8], s1[10] );
|
||||||
|
d[34] = _mm_unpacklo_epi64( s2[8], s2[10] );
|
||||||
|
d[35] = _mm_unpacklo_epi64( s3[8], s3[10] );
|
||||||
|
d[36] = _mm_unpackhi_epi64( s0[8], s0[10] );
|
||||||
|
d[37] = _mm_unpackhi_epi64( s1[8], s1[10] );
|
||||||
|
d[38] = _mm_unpackhi_epi64( s2[8], s2[10] );
|
||||||
|
d[39] = _mm_unpackhi_epi64( s3[8], s3[10] );
|
||||||
|
|
||||||
|
d[40] = _mm_unpacklo_epi64( s0[9], s0[11] );
|
||||||
|
d[41] = _mm_unpacklo_epi64( s1[9], s1[11] );
|
||||||
|
d[42] = _mm_unpacklo_epi64( s2[9], s2[11] );
|
||||||
|
d[43] = _mm_unpacklo_epi64( s3[9], s3[11] );
|
||||||
|
d[44] = _mm_unpackhi_epi64( s0[9], s0[11] );
|
||||||
|
d[45] = _mm_unpackhi_epi64( s1[9], s1[11] );
|
||||||
|
d[46] = _mm_unpackhi_epi64( s2[9], s2[11] );
|
||||||
|
d[47] = _mm_unpackhi_epi64( s3[9], s3[11] );
|
||||||
|
|
||||||
|
d[48] = _mm_unpacklo_epi64( s0[12], s0[14] );
|
||||||
|
d[49] = _mm_unpacklo_epi64( s1[12], s1[14] );
|
||||||
|
d[50] = _mm_unpacklo_epi64( s2[12], s2[14] );
|
||||||
|
d[51] = _mm_unpacklo_epi64( s3[12], s3[14] );
|
||||||
|
d[52] = _mm_unpackhi_epi64( s0[12], s0[14] );
|
||||||
|
d[53] = _mm_unpackhi_epi64( s1[12], s1[14] );
|
||||||
|
d[54] = _mm_unpackhi_epi64( s2[12], s2[14] );
|
||||||
|
d[55] = _mm_unpackhi_epi64( s3[12], s3[14] );
|
||||||
|
|
||||||
|
d[56] = _mm_unpacklo_epi64( s0[13], s0[15] );
|
||||||
|
d[57] = _mm_unpacklo_epi64( s1[13], s1[15] );
|
||||||
|
d[58] = _mm_unpacklo_epi64( s2[13], s2[15] );
|
||||||
|
d[59] = _mm_unpacklo_epi64( s3[13], s3[15] );
|
||||||
|
d[60] = _mm_unpackhi_epi64( s0[13], s0[15] );
|
||||||
|
d[61] = _mm_unpackhi_epi64( s1[13], s1[15] );
|
||||||
|
d[62] = _mm_unpackhi_epi64( s2[13], s2[15] );
|
||||||
|
d[63] = _mm_unpackhi_epi64( s3[13], s3[15] );
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// Some functions customized for mining.
|
// Some functions customized for mining.
|
||||||
|
|
||||||
|
@@ -252,7 +252,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
|||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
|
||||||
#define mm128_ror_64 mm128_ror_var_64
|
#define mm128_ror_64 mm128_ror_var_64
|
||||||
#define mm128_rol_64 mm128_rol_var_64
|
#define mm128_rol_64 mm128_rol_var_64
|
||||||
#define mm128_ror_32 mm128_ror_var_32
|
#define mm128_ror_32 mm128_ror_var_32
|
||||||
@@ -274,6 +273,15 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
|||||||
#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
|
#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
|
||||||
#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
|
#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
|
||||||
|
|
||||||
|
// Rotate 16 byte (128 bit) vector by c bytes.
|
||||||
|
// Less efficient using shift but more versatile. Use only for odd number
|
||||||
|
// byte rotations. Use shuffle above whenever possible.
|
||||||
|
#define mm128_ror_x8( v, c ) \
|
||||||
|
_mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
|
||||||
|
|
||||||
|
#define mm128_rol_x8( v, c ) \
|
||||||
|
_mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
|
||||||
|
|
||||||
#if defined (__SSE3__)
|
#if defined (__SSE3__)
|
||||||
// no SSE2 implementation, no current users
|
// no SSE2 implementation, no current users
|
||||||
|
|
||||||
@@ -289,17 +297,21 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
|||||||
#define mm128_rol_1x8( v ) \
|
#define mm128_rol_1x8( v ) \
|
||||||
_mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \
|
_mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \
|
||||||
0x060504030201000f ) )
|
0x060504030201000f ) )
|
||||||
#endif // SSE3
|
#else // SSE2
|
||||||
|
|
||||||
// Rotate 16 byte (128 bit) vector by c bytes.
|
#define mm128_ror_1x16( v ) \
|
||||||
// Less efficient using shift but more versatile. Use only for odd number
|
_mm_or_si128( _mm_srli_si128( v, 2 ), _mm_slli_si128( v, 14 ) )
|
||||||
// byte rotations. Use shuffle above whenever possible.
|
|
||||||
#define mm128_bror( v, c ) \
|
|
||||||
_mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
|
|
||||||
|
|
||||||
#define mm128_brol( v, c ) \
|
#define mm128_rol_1x16( v ) \
|
||||||
_mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
|
_mm_or_si128( _mm_slli_si128( v, 2 ), _mm_srli_si128( v, 14 ) )
|
||||||
|
|
||||||
|
#define mm128_ror_1x8( v ) \
|
||||||
|
_mm_or_si128( _mm_srli_si128( v, 1 ), _mm_slli_si128( v, 15 ) )
|
||||||
|
|
||||||
|
#define mm128_rol_1x8( v ) \
|
||||||
|
_mm_or_si128( _mm_slli_si128( v, 1 ), _mm_srli_si128( v, 15 ) )
|
||||||
|
|
||||||
|
#endif // SSE3 else SSE2
|
||||||
|
|
||||||
// Invert vector: {3,2,1,0} -> {0,1,2,3}
|
// Invert vector: {3,2,1,0} -> {0,1,2,3}
|
||||||
#define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b )
|
#define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b )
|
||||||
@@ -319,19 +331,24 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
|||||||
//
|
//
|
||||||
// Rotate elements within lanes.
|
// Rotate elements within lanes.
|
||||||
|
|
||||||
#define mm128_swap32_64( v ) _mm_shuffle_epi32( v, 0xb1 )
|
#define mm128_swap_64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
|
||||||
|
|
||||||
#define mm128_ror16_64( v ) \
|
// Rotate each 64 bit element of v left by c bytes; (c)<<3 converts the byte
// count to the bit count expected by the shift intrinsics.
#define mm128_rol64_8( v, c ) \
|
||||||
_mm_shuffle_epi8( v, m128_const_64( 0x09080f0e0d0c0b0a, \
|
_mm_or_si128( _mm_slli_epi64( v, ( (c)<<3 ) ), \
|
||||||
0x0100070605040302 )
|
_mm_srli_epi64( v, ( 64 - ( (c)<<3 ) ) ) )
|
||||||
|
|
||||||
#define mm128_rol16_64( v ) \
|
// Rotate each 64 bit element of v right by c bytes; (c)<<3 converts the byte
// count to the bit count expected by the shift intrinsics.
#define mm128_ror64_8( v, c ) \
|
||||||
_mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080f0e, \
|
_mm_or_si128( _mm_srli_epi64( v, ( (c)<<3 ) ), \
|
||||||
0x0504030201000706 )
|
_mm_slli_epi64( v, ( 64 - ( (c)<<3 ) ) ) )
|
||||||
|
|
||||||
#define mm128_swap16_32( v ) \
|
// Rotate each 32 bit element of v left by c bytes; (c)<<3 converts the byte
// count to the bit count expected by the shift intrinsics.
#define mm128_rol32_8( v, c ) \
|
||||||
_mm_shuffle_epi8( v, m128_const_64( 0x0d0c0f0e09080b0a, \
|
_mm_or_si128( _mm_slli_epi32( v, ( (c)<<3 ) ), \
|
||||||
0x0504070601000302 )
|
_mm_srli_epi32( v, ( 32 - ( (c)<<3 ) ) ) )
|
||||||
|
|
||||||
|
// Rotate each 32 bit element of v right by c bytes; (c)<<3 converts the byte
// count to the bit count expected by the shift intrinsics.
#define mm128_ror32_8( v, c ) \
|
||||||
|
_mm_or_si128( _mm_srli_epi32( v, ( (c)<<3 ) ), \
|
||||||
|
_mm_slli_epi32( v, ( 32 - ( (c)<<3 ) ) ) )
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Endian byte swap.
|
// Endian byte swap.
|
||||||
@@ -431,64 +448,65 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
|
|||||||
|
|
||||||
// Swap 128 bit vectors.
|
// Swap 128 bit vectors.
|
||||||
|
|
||||||
#define mm128_swap128_256( v1, v2 ) \
|
#define mm128_swap256_128( v1, v2 ) \
|
||||||
v1 = _mm_xor_si128( v1, v2 ); \
|
v1 = _mm_xor_si128( v1, v2 ); \
|
||||||
v2 = _mm_xor_si128( v1, v2 ); \
|
v2 = _mm_xor_si128( v1, v2 ); \
|
||||||
v1 = _mm_xor_si128( v1, v2 );
|
v1 = _mm_xor_si128( v1, v2 );
|
||||||
|
|
||||||
|
|
||||||
// Concatenate v1 & v2 and rotate as one 256 bit vector.
|
// Concatenate v1 & v2 and rotate as one 256 bit vector.
|
||||||
#if defined(__SSE4_1__)
|
#if defined(__SSE4_1__)
|
||||||
|
|
||||||
#define mm128_ror1x64_256( v1, v2 ) \
|
#define mm128_ror256_64( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
|
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
|
||||||
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
|
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
|
||||||
v2 = t; \
|
v2 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm128_rol1x64_256( v1, v2 ) \
|
#define mm128_rol256_64( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
|
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
|
||||||
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
|
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
|
||||||
v1 = t; \
|
v1 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm128_ror1x32_256( v1, v2 ) \
|
#define mm128_ror256_32( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
|
__m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
|
||||||
v1 = _mm_alignr_epi8( v2, v1, 4 ); \
|
v1 = _mm_alignr_epi8( v2, v1, 4 ); \
|
||||||
v2 = t; \
|
v2 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm128_rol1x32_256( v1, v2 ) \
|
#define mm128_rol256_32( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_alignr_epi8( v1, v2, 12 ); \
|
__m128i t = _mm_alignr_epi8( v1, v2, 12 ); \
|
||||||
v2 = _mm_alignr_epi8( v2, v1, 12 ); \
|
v2 = _mm_alignr_epi8( v2, v1, 12 ); \
|
||||||
v1 = t; \
|
v1 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm128_ror1x16_256( v1, v2 ) \
|
#define mm128_ror256_16( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
|
__m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
|
||||||
v1 = _mm_alignr_epi8( v2, v1, 2 ); \
|
v1 = _mm_alignr_epi8( v2, v1, 2 ); \
|
||||||
v2 = t; \
|
v2 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm128_rol1x16_256( v1, v2 ) \
|
#define mm128_rol256_16( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_alignr_epi8( v1, v2, 14 ); \
|
__m128i t = _mm_alignr_epi8( v1, v2, 14 ); \
|
||||||
v2 = _mm_alignr_epi8( v2, v1, 14 ); \
|
v2 = _mm_alignr_epi8( v2, v1, 14 ); \
|
||||||
v1 = t; \
|
v1 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm128_ror1x8_256( v1, v2 ) \
|
#define mm128_ror256_8( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_alignr_epi8( v1, v2, 1 ); \
|
__m128i t = _mm_alignr_epi8( v1, v2, 1 ); \
|
||||||
v1 = _mm_alignr_epi8( v2, v1, 1 ); \
|
v1 = _mm_alignr_epi8( v2, v1, 1 ); \
|
||||||
v2 = t; \
|
v2 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm128_rol1x8_256( v1, v2 ) \
|
#define mm128_rol256_8( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_alignr_epi8( v1, v2, 15 ); \
|
__m128i t = _mm_alignr_epi8( v1, v2, 15 ); \
|
||||||
v2 = _mm_alignr_epi8( v2, v1, 15 ); \
|
v2 = _mm_alignr_epi8( v2, v1, 15 ); \
|
||||||
@@ -497,7 +515,7 @@ do { \
|
|||||||
|
|
||||||
#else // SSE2
|
#else // SSE2
|
||||||
|
|
||||||
#define mm128_ror1x64_256( v1, v2 ) \
|
#define mm128_ror256_64( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
|
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
|
||||||
_mm_slli_si128( v2, 8 ) ); \
|
_mm_slli_si128( v2, 8 ) ); \
|
||||||
@@ -506,7 +524,7 @@ do { \
|
|||||||
v1 = t; \
|
v1 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm128_rol1x64_256( v1, v2 ) \
|
#define mm128_rol256_64( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
|
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
|
||||||
_mm_srli_si128( v2, 8 ) ); \
|
_mm_srli_si128( v2, 8 ) ); \
|
||||||
@@ -515,7 +533,7 @@ do { \
|
|||||||
v1 = t; \
|
v1 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm128_ror1x32_256( v1, v2 ) \
|
#define mm128_ror256_32( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
|
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
|
||||||
_mm_slli_si128( v2, 12 ) ); \
|
_mm_slli_si128( v2, 12 ) ); \
|
||||||
@@ -524,7 +542,7 @@ do { \
|
|||||||
v1 = t; \
|
v1 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm128_rol1x32_256( v1, v2 ) \
|
#define mm128_rol256_32( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
|
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
|
||||||
_mm_srli_si128( v2, 12 ) ); \
|
_mm_srli_si128( v2, 12 ) ); \
|
||||||
@@ -533,7 +551,7 @@ do { \
|
|||||||
v1 = t; \
|
v1 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm128_ror1x16_256( v1, v2 ) \
|
#define mm128_ror256_16( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
|
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
|
||||||
_mm_slli_si128( v2, 14 ) ); \
|
_mm_slli_si128( v2, 14 ) ); \
|
||||||
@@ -542,7 +560,7 @@ do { \
|
|||||||
v1 = t; \
|
v1 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm128_rol1x16_256( v1, v2 ) \
|
#define mm128_rol256_16( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
|
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
|
||||||
_mm_srli_si128( v2, 14 ) ); \
|
_mm_srli_si128( v2, 14 ) ); \
|
||||||
@@ -551,7 +569,7 @@ do { \
|
|||||||
v1 = t; \
|
v1 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm128_ror1x8_256( v1, v2 ) \
|
#define mm128_ror256_8( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
|
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
|
||||||
_mm_slli_si128( v2, 15 ) ); \
|
_mm_slli_si128( v2, 15 ) ); \
|
||||||
@@ -560,7 +578,7 @@ do { \
|
|||||||
v1 = t; \
|
v1 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm128_rol1x8_256( v1, v2 ) \
|
#define mm128_rol256_8( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
|
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
|
||||||
_mm_srli_si128( v2, 15 ) ); \
|
_mm_srli_si128( v2, 15 ) ); \
|
||||||
|
@@ -414,99 +414,71 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
|||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Rotate elements within lanes of 256 bit vector.
|
// Rotate elements within each 128 bit lane of 256 bit vector.
|
||||||
|
|
||||||
// Swap 64 bit elements in each 128 bit lane.
|
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
|
||||||
#define mm256_swap64_128( v ) _mm256_shuffle_epi32( v, 0x4e )
|
|
||||||
|
|
||||||
// Rotate each 128 bit lane by one 32 bit element.
|
#define mm256_ror128_32( v ) _mm256_shuffle_epi32( v, 0x39 )
|
||||||
#define mm256_ror1x32_128( v ) _mm256_shuffle_epi32( v, 0x39 )
|
|
||||||
#define mm256_rol1x32_128( v ) _mm256_shuffle_epi32( v, 0x93 )
|
|
||||||
|
|
||||||
#define mm256_ror1x16_128( v ) \
|
#define mm256_rol128_1x32( v ) _mm256_shuffle_epi32( v, 0x93 )
|
||||||
_mm256_shuffle_epi8( v, \
|
|
||||||
m256_const_64( 0x11101f1e1d1c1b1a, 0x1918171615141312, \
|
|
||||||
0x01000f0e0d0c0b0a, 0x0908070605040302 ) )
|
|
||||||
|
|
||||||
#define mm256_rol1x16_128( v ) \
|
// Rotate each 128 bit lane by c elements.
|
||||||
_mm256_shuffle_epi8( v, \
|
#define mm256_ror128_8( v, c ) \
|
||||||
m256_const_64( 0x1d1c1b1a19181716, 0x1514131211101f1e, \
|
|
||||||
0x0d0c0b0a09080706, 0x0504030201000f0e ) )
|
|
||||||
|
|
||||||
#define mm256_ror1x8_128( v ) \
|
|
||||||
_mm256_shuffle_epi8( v, \
|
|
||||||
m256_const_64( 0x101f1e1d1c1b1a19, 0x1817161514131211, \
|
|
||||||
0x000f0e0d0c0b0a09, 0x0807060504030201 ) )
|
|
||||||
|
|
||||||
#define mm256_rol1x8_128( v ) \
|
|
||||||
_mm256_shuffle_epi8( v, \
|
|
||||||
m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
|
|
||||||
0x0d0c0b0a09080f0e, 0x0504030201000706 ) )
|
|
||||||
|
|
||||||
// Rotate each 128 bit lane by c bytes.
|
|
||||||
#define mm256_bror_128( v, c ) \
|
|
||||||
_mm256_or_si256( _mm256_bsrli_epi128( v, c ), \
|
_mm256_or_si256( _mm256_bsrli_epi128( v, c ), \
|
||||||
_mm256_bslli_epi128( v, 16-(c) ) )
|
_mm256_bslli_epi128( v, 16-(c) ) )
|
||||||
#define mm256_brol_128( v, c ) \
|
#define mm256_rol128_8( v, c ) \
|
||||||
_mm256_or_si256( _mm256_bslli_epi128( v, c ), \
|
_mm256_or_si256( _mm256_bslli_epi128( v, c ), \
|
||||||
_mm256_bsrli_epi128( v, 16-(c) ) )
|
_mm256_bsrli_epi128( v, 16-(c) ) )
|
||||||
|
|
||||||
// Swap 32 bit elements in each 64 bit lane
|
|
||||||
#define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 )
|
// Rotate elements in each 64 bit lane
|
||||||
|
|
||||||
|
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
#define mm256_rol1x16_64( v ) _mm256_rol_epi64( v, 16 )
|
#define mm256_rol64_8( v, c ) _mm256_rol_epi64( v, ((c)<<3) )
|
||||||
#define mm256_ror1x16_64( v ) _mm256_ror_epi64( v, 16 )
|
#define mm256_ror64_8( v, c ) _mm256_ror_epi64( v, ((c)<<3) )
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define mm256_ror1x16_64( v ) \
|
#define mm256_rol64_8( v, c ) \
|
||||||
_mm256_shuffle_epi8( v, \
|
_mm256_or_si256( _mm256_slli_epi64( v, ( ( (c)<<3 ) ), \
|
||||||
m256_const_64( 0x19181f1e1d1c1b1a, 0x1110171615141312, \
|
_mm256_srli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) )
|
||||||
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
|
|
||||||
|
#define mm256_ror64_8( v, c ) \
|
||||||
|
_mm256_or_si256( _mm256_srli_epi64( v, ( ( (c)<<3 ) ), \
|
||||||
|
_mm256_slli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) )
|
||||||
|
|
||||||
#define mm256_rol1x16_64( v ) \
|
|
||||||
_mm256_shuffle_epi8( v, \
|
|
||||||
m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
|
|
||||||
0x0d0c0b0a09080f0e, 0x0504030201000706 ) )
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define mm256_ror1x8_64( v ) \
|
|
||||||
_mm256_shuffle_epi8( v, \
|
|
||||||
m256_const_64( 0x181f1e1d1c1b1a19, 0x1017161514131211, \
|
|
||||||
0x080f0e0d0c0b0a09, 0x0007060504030201 ) )
|
|
||||||
|
|
||||||
#define mm256_rol1x8_64( v ) \
|
// Rotate elements in each 32 bit lane
|
||||||
_mm256_shuffle_epi8( v, \
|
|
||||||
m256_const_64( 0x1e1d1c1b1a19181f, 0x1615141312111017, \
|
|
||||||
0x0e0d0c0b0a09080f, 0x0605040302010007 ) )
|
|
||||||
|
|
||||||
#define mm256_ror3x8_64( v ) \
|
|
||||||
_mm256_shuffle_epi8( v, \
|
|
||||||
m256_const_64( 0x1a19181f1e1d1c1b, 0x1211101716151413, \
|
|
||||||
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
|
|
||||||
|
|
||||||
#define mm256_rol3x8_64( v ) \
|
|
||||||
_mm256_shuffle_epi8( v, \
|
|
||||||
m256_const_64( 0x1c1b1a19181f1e1d, 0x1413121110171615, \
|
|
||||||
0x0c0b0a09080f0e0d, 0x0403020100070605 ) )
|
|
||||||
|
|
||||||
|
|
||||||
// Swap 16 bit elements in each 32 bit lane
|
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
#define mm256_swap16_32( v ) _mm256_rol_epi32( v, 16 )
|
#define mm256_swap32_16( v ) _mm256_rol_epi32( v, 16 )
|
||||||
|
|
||||||
|
#define mm256_rol32_8( v ) _mm256_rol_epi32( v, 8 )
|
||||||
|
#define mm256_ror32_8( v ) _mm256_ror_epi32( v, 8 )
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define mm256_swap16_32( v ) \
|
#define mm256_swap32_16( v ) \
|
||||||
_mm256_shuffle_epi8( v, \
|
_mm256_or_si256( _mm256_slli_epi32( v, 16 ), \
|
||||||
m256_const_64( 0x1b1a19181f1e1d1c, 0x1312111017161514, \
|
_mm256_srli_epi32( v, 16 ) )
|
||||||
0x0b0a09080f0e0d0c, 0x0302010007060504 ) )
|
|
||||||
|
#define mm256_rol32_8( v ) \
|
||||||
|
_mm256_or_si256( _mm256_slli_epi32( v, 8 ), \
|
||||||
|
_mm256_srli_epi32( v, 8 ) )
|
||||||
|
|
||||||
|
#define mm256_ror32_8( v, c ) \
|
||||||
|
_mm256_or_si256( _mm256_srli_epi32( v, 8 ), \
|
||||||
|
_mm256_slli_epi32( v, 8 ) )
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Swap bytes in vector elements, endian bswap.
|
// Swap bytes in vector elements, endian bswap.
|
||||||
#define mm256_bswap_64( v ) \
|
#define mm256_bswap_64( v ) \
|
||||||
@@ -565,19 +537,19 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
|||||||
// _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
|
// _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
|
||||||
// makes these macros unnecessary.
|
// makes these macros unnecessary.
|
||||||
|
|
||||||
#define mm256_swap256_512 (v1, v2) \
|
#define mm256_swap512_256( v1, v2 ) \
|
||||||
v1 = _mm256_xor_si256(v1, v2); \
|
v1 = _mm256_xor_si256( v1, v2 ); \
|
||||||
v2 = _mm256_xor_si256(v1, v2); \
|
v2 = _mm256_xor_si256( v1, v2 ); \
|
||||||
v1 = _mm256_xor_si256(v1, v2);
|
v1 = _mm256_xor_si256( v1, v2 );
|
||||||
|
|
||||||
#define mm256_ror1x128_512( v1, v2 ) \
|
#define mm256_ror512_128( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
|
__m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
|
||||||
v1 = _mm256__mm256_permute2x128( v2, v1, 0x21 ); \
|
v1 = _mm256__mm256_permute2x128( v2, v1, 0x21 ); \
|
||||||
v2 = t; \
|
v2 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm256_rol1x128_512( v1, v2 ) \
|
#define mm256_rol512_128( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
|
__m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
|
||||||
v2 = _mm256__mm256_permute2x128( v2, v1, 0x21 ); \
|
v2 = _mm256__mm256_permute2x128( v2, v1, 0x21 ); \
|
||||||
|
@@ -15,13 +15,13 @@
|
|||||||
|
|
||||||
// AVX512 intrinsics have a few changes from previous conventions.
|
// AVX512 intrinsics have a few changes from previous conventions.
|
||||||
//
|
//
|
||||||
// Some instructions like cmp and blend use the mask registers now instead of
|
// cmp instruction now returns a bitmask instead of a vector mask.
|
||||||
// a vector mask.
|
// This eliminates the need for the blendv instruction.
|
||||||
//
|
//
|
||||||
// The new rotate instructions require the count to be only an 8 bit
|
// The new rotate instructions require the count to be an 8 bit
|
||||||
// immediate value. The documentation is the same as for shift and
|
// immediate value only. Compilation fails if a variable is used.
|
||||||
// it allows variables. Suspect a compiler issue but it still happens
|
// The documentation is the same as for shift and it works with
|
||||||
// in GCC9.
|
// variables.
|
||||||
//
|
//
|
||||||
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
|
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
|
||||||
// usually shuffles across all lanes.
|
// usually shuffles across all lanes.
|
||||||
@@ -109,6 +109,11 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
|
|||||||
#define m512_const2_64( i1, i0 ) \
|
#define m512_const2_64( i1, i0 ) \
|
||||||
m512_const1_128( m128_const_64( i1, i0 ) )
|
m512_const1_128( m128_const_64( i1, i0 ) )
|
||||||
|
|
||||||
|
#define m512_const2_32( i1, i0 ) \
|
||||||
|
m512_const1_64( ( ( ( (uint64_t)(i1) << 32 ) ) \
|
||||||
|
| ( (uint64_t)(i0) & 0xffffffff ) ) )
|
||||||
|
|
||||||
|
|
||||||
static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
||||||
const uint64_t i1, const uint64_t i0 )
|
const uint64_t i1, const uint64_t i0 )
|
||||||
{
|
{
|
||||||
@@ -265,7 +270,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
|||||||
m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
|
m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
|
||||||
0x28292a2b2c2d2e2f, 0x2021222324252627, \
|
0x28292a2b2c2d2e2f, 0x2021222324252627, \
|
||||||
0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
||||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ))
|
0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
|
||||||
|
|
||||||
#define mm512_bswap_32( v ) \
|
#define mm512_bswap_32( v ) \
|
||||||
_mm512_shuffle_epi8( v, \
|
_mm512_shuffle_epi8( v, \
|
||||||
@@ -304,8 +309,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
|||||||
{ \
|
{ \
|
||||||
__m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
|
__m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
|
||||||
0x2c2d2e2f28292a2b, 0x2425262720212223, \
|
0x2c2d2e2f28292a2b, 0x2425262720212223, \
|
||||||
0x0c0d0e0f08090a0b, 0x0405060700010203, \
|
0x1c1d1e1f18191a1b, 0x1415161710111213, \
|
||||||
0x1c1d1e1f18191a1b, 0x1415161710111213 ); \
|
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||||
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
|
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
|
||||||
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
|
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
|
||||||
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
|
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
|
||||||
@@ -320,8 +325,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
|||||||
//
|
//
|
||||||
// Rotate elements in 512 bit vector.
|
// Rotate elements in 512 bit vector.
|
||||||
|
|
||||||
|
|
||||||
#define mm512_swap_256( v ) _mm512_alignr_epi64( v, v, 4 )
|
#define mm512_swap_256( v ) _mm512_alignr_epi64( v, v, 4 )
|
||||||
|
|
||||||
|
// 1x64 notation used to distinguish from bit rotation.
|
||||||
#define mm512_ror_1x128( v ) _mm512_alignr_epi64( v, v, 2 )
|
#define mm512_ror_1x128( v ) _mm512_alignr_epi64( v, v, 2 )
|
||||||
#define mm512_rol_1x128( v ) _mm512_alignr_epi64( v, v, 6 )
|
#define mm512_rol_1x128( v ) _mm512_alignr_epi64( v, v, 6 )
|
||||||
|
|
||||||
@@ -401,51 +408,58 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
|||||||
//
|
//
|
||||||
// Rotate elements within 256 bit lanes of 512 bit vector.
|
// Rotate elements within 256 bit lanes of 512 bit vector.
|
||||||
|
|
||||||
|
// Rename these for consistency. Element size is always last.
|
||||||
|
// mm<vectorsize>_<op><lanesize>_<elementsize>
|
||||||
|
|
||||||
|
|
||||||
// Swap hi & lo 128 bits in each 256 bit lane
|
// Swap hi & lo 128 bits in each 256 bit lane
|
||||||
#define mm512_swap128_256( v ) _mm512_permutex_epi64( v, 0x4e )
|
|
||||||
|
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
|
||||||
|
|
||||||
// Rotate 256 bit lanes by one 64 bit element
|
// Rotate 256 bit lanes by one 64 bit element
|
||||||
#define mm512_ror1x64_256( v ) _mm512_permutex_epi64( v, 0x39 )
|
|
||||||
#define mm512_rol1x64_256( v ) _mm512_permutex_epi64( v, 0x93 )
|
#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 )
|
||||||
|
#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 )
|
||||||
|
|
||||||
|
|
||||||
// Rotate 256 bit lanes by one 32 bit element
|
// Rotate 256 bit lanes by one 32 bit element
|
||||||
#define mm512_ror1x32_256( v ) \
|
|
||||||
|
#define mm512_ror256_32( v ) \
|
||||||
_mm512_permutexvar_epi32( m512_const_64( \
|
_mm512_permutexvar_epi32( m512_const_64( \
|
||||||
0x000000080000000f, 0x0000000e0000000d, \
|
0x000000080000000f, 0x0000000e0000000d, \
|
||||||
0x0000000c0000000b, 0x0000000a00000009, \
|
0x0000000c0000000b, 0x0000000a00000009, \
|
||||||
0x0000000000000007, 0x0000000600000005, \
|
0x0000000000000007, 0x0000000600000005, \
|
||||||
0x0000000400000003, 0x0000000200000001 ), v )
|
0x0000000400000003, 0x0000000200000001 ), v )
|
||||||
|
|
||||||
#define mm512_rol1x32_256( v ) \
|
#define mm512_rol256_32( v ) \
|
||||||
_mm512_permutexvar_epi32( m512_const_64( \
|
_mm512_permutexvar_epi32( m512_const_64( \
|
||||||
0x0000000e0000000d, 0x0000000c0000000b, \
|
0x0000000e0000000d, 0x0000000c0000000b, \
|
||||||
0x0000000a00000009, 0x000000080000000f, \
|
0x0000000a00000009, 0x000000080000000f, \
|
||||||
0x0000000600000005, 0x0000000400000003, \
|
0x0000000600000005, 0x0000000400000003, \
|
||||||
0x0000000200000001, 0x0000000000000007 ), v )
|
0x0000000200000001, 0x0000000000000007 ), v )
|
||||||
|
|
||||||
#define mm512_ror1x16_256( v ) \
|
#define mm512_ror256_16( v ) \
|
||||||
_mm512_permutexvar_epi16( m512_const_64( \
|
_mm512_permutexvar_epi16( m512_const_64( \
|
||||||
0x00100001001e001d, 0x001c001b001a0019, \
|
0x00100001001e001d, 0x001c001b001a0019, \
|
||||||
0x0018001700160015, 0x0014001300120011, \
|
0x0018001700160015, 0x0014001300120011, \
|
||||||
0x0000000f000e000d, 0x000c000b000a0009, \
|
0x0000000f000e000d, 0x000c000b000a0009, \
|
||||||
0x0008000700060005, 0x0004000300020001 ), v )
|
0x0008000700060005, 0x0004000300020001 ), v )
|
||||||
|
|
||||||
#define mm512_rol1x16_256( v ) \
|
#define mm512_rol256_16( v ) \
|
||||||
_mm512_permutexvar_epi16( m512_const_64( \
|
_mm512_permutexvar_epi16( m512_const_64( \
|
||||||
0x001e001d001c001b, 0x001a001900180017, \
|
0x001e001d001c001b, 0x001a001900180017, \
|
||||||
0x0016001500140013, 0x001200110010001f, \
|
0x0016001500140013, 0x001200110010001f, \
|
||||||
0x000e000d000c000b, 0x000a000900080007, \
|
0x000e000d000c000b, 0x000a000900080007, \
|
||||||
0x0006000500040003, 0x000200010000000f ), v )
|
0x0006000500040003, 0x000200010000000f ), v )
|
||||||
|
|
||||||
#define mm512_ror1x8_256( v ) \
|
#define mm512_ror256_8( v ) \
|
||||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
_mm512_shuffle_epi8( v, m512_const_64( \
|
||||||
0x203f3e3d3c3b3a39, 0x3837363534333231, \
|
0x203f3e3d3c3b3a39, 0x3837363534333231, \
|
||||||
0x302f2e2d2c2b2a29, 0x2827262524232221, \
|
0x302f2e2d2c2b2a29, 0x2827262524232221, \
|
||||||
0x001f1e1d1c1b1a19, 0x1817161514131211, \
|
0x001f1e1d1c1b1a19, 0x1817161514131211, \
|
||||||
0x100f0e0d0c0b0a09, 0x0807060504030201 ), v )
|
0x100f0e0d0c0b0a09, 0x0807060504030201 ), v )
|
||||||
|
|
||||||
#define mm512_rol1x8_256( v ) \
|
#define mm512_rol256_8( v ) \
|
||||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
_mm512_shuffle_epi8( v, m512_const_64( \
|
||||||
0x3e3d3c3b3a393837, 0x363534333231302f, \
|
0x3e3d3c3b3a393837, 0x363534333231302f, \
|
||||||
0x2e2d2c2b2a292827, 0x262524232221203f, \
|
0x2e2d2c2b2a292827, 0x262524232221203f, \
|
||||||
@@ -456,45 +470,19 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
|||||||
// Rotate elements within 128 bit lanes of 512 bit vector.
|
// Rotate elements within 128 bit lanes of 512 bit vector.
|
||||||
|
|
||||||
// Swap hi & lo 64 bits in each 128 bit lane
|
// Swap hi & lo 64 bits in each 128 bit lane
|
||||||
#define mm512_swap64_128( v ) _mm512_shuffle_epi32( v, 0x4e )
|
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
|
||||||
|
|
||||||
// Rotate 128 bit lanes by one 32 bit element
|
// Rotate 128 bit lanes by one 32 bit element
|
||||||
#define mm512_ror1x32_128( v ) _mm512_shuffle_epi32( v, 0x39 )
|
#define mm512_ror128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
|
||||||
#define mm512_rol1x32_128( v ) _mm512_shuffle_epi32( v, 0x93 )
|
#define mm512_rol128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
|
||||||
|
|
||||||
#define mm512_ror1x16_128( v ) \
|
|
||||||
_mm512_permutexvar_epi16( m512_const_64( \
|
|
||||||
0x0018001f001e001d, 0x001c001b001a0019, \
|
|
||||||
0x0010001700160015, 0x0014001300120011, \
|
|
||||||
0x0008000f000e000d, 0x000c000b000a0009, \
|
|
||||||
0x0000000700060005, 0x0004000300020001 ), v )
|
|
||||||
|
|
||||||
#define mm512_rol1x16_128( v ) \
|
// Rotate 128 bit lanes by c bytes, faster than building that monstrous
|
||||||
_mm512_permutexvar_epi16( m512_const_64( \
|
// constant above.
|
||||||
0x001e001d001c001b, 0x001a00190018001f, \
|
#define mm512_ror128_8( v, c ) \
|
||||||
0x0016001500140013, 0x0012001100100017, \
|
|
||||||
0x000e000d000c000b, 0x000a00090008000f, \
|
|
||||||
0x0006000500040003, 0x0002000100000007 ), v )
|
|
||||||
|
|
||||||
#define mm512_ror1x8_128( v ) \
|
|
||||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
|
||||||
0x303f3e3d3c3b3a39, 0x3837363534333231, \
|
|
||||||
0x202f2e2d2c2b2a29, 0x2827262524232221, \
|
|
||||||
0x101f1e1d1c1b1a19, 0x1817161514131211, \
|
|
||||||
0x000f0e0d0c0b0a09, 0x0807060504030201 ) )
|
|
||||||
|
|
||||||
#define mm512_rol1x8_128( v ) \
|
|
||||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
|
||||||
0x3e3d3c3b3a393837, 0x363534333231303f, \
|
|
||||||
0x2e2d2c2b2a292827, 0x262524232221202f, \
|
|
||||||
0x1e1d1c1b1a191817, 0x161514131211101f, \
|
|
||||||
0x0e0d0c0b0a090807, 0x060504030201000f ) )
|
|
||||||
|
|
||||||
// Rotate 128 bit lanes by c bytes.
|
|
||||||
#define mm512_bror_128( v, c ) \
|
|
||||||
_mm512_or_si512( _mm512_bsrli_epi128( v, c ), \
|
_mm512_or_si512( _mm512_bsrli_epi128( v, c ), \
|
||||||
_mm512_bslli_epi128( v, 16-(c) ) )
|
_mm512_bslli_epi128( v, 16-(c) ) )
|
||||||
#define mm512_brol_128( v, c ) \
|
#define mm512_rol128_8( v, c ) \
|
||||||
_mm512_or_si512( _mm512_bslli_epi128( v, c ), \
|
_mm512_or_si512( _mm512_bslli_epi128( v, c ), \
|
||||||
_mm512_bsrli_epi128( v, 16-(c) ) )
|
_mm512_bsrli_epi128( v, 16-(c) ) )
|
||||||
|
|
||||||
@@ -502,75 +490,23 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
|||||||
//
|
//
|
||||||
// Rotate elements within 64 bit lanes.
|
// Rotate elements within 64 bit lanes.
|
||||||
|
|
||||||
|
#define mm512_rol64_x8( v, c ) _mm512_rol_epi64( v, ((c)<<3) )
|
||||||
|
#define mm512_ror64_x8( v, c ) _mm512_ror_epi64( v, ((c)<<3) )
|
||||||
|
|
||||||
// Swap 32 bit elements in each 64 bit lane
|
// Swap 32 bit elements in each 64 bit lane
|
||||||
#define mm512_swap32_64( v ) _mm512_shuffle_epi32( v, 0xb1 )
|
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
|
||||||
|
|
||||||
// Rotate each 64 bit lane by one 16 bit element.
|
// Rotate each 64 bit lane by one 16 bit element.
|
||||||
#define mm512_ror1x16_64( v ) _mm512_ror_epi64( v, 16 )
|
#define mm512_ror64_16( v ) _mm512_ror_epi64( v, 16 )
|
||||||
#define mm512_rol1x16_64( v ) _mm512_rol_epi64( v, 16 )
|
#define mm512_rol64_16( v ) _mm512_rol_epi64( v, 16 )
|
||||||
#define mm512_ror1x8_64( v ) _mm512_ror_epi64( v, 8 )
|
#define mm512_ror64_8( v ) _mm512_ror_epi64( v, 8 )
|
||||||
#define mm512_rol1x8_64( v ) _mm512_rol_epi64( v, 8 )
|
#define mm512_rol64_8( v ) _mm512_rol_epi64( v, 8 )
|
||||||
|
|
||||||
/*
|
|
||||||
#define mm512_ror1x16_64( v ) \
|
|
||||||
_mm512_permutexvar_epi16( m512_const_64( \
|
|
||||||
0x001c001f001e001d, 0x0018001b001a0019, \
|
|
||||||
0x0014001700160015, 0x0010001300120011, \
|
|
||||||
0x000c000f000e000d, 0x0008000b000a0009, \
|
|
||||||
0x0004000700060005, 0x0000000300020001, v )
|
|
||||||
|
|
||||||
#define mm512_rol1x16_64( v ) \
|
|
||||||
_mm512_permutexvar_epi16( m512_const_64( \
|
|
||||||
0x001e001d001c001f, 0x001a00190018001b, \
|
|
||||||
0x0016001500140017, 0x0012001100100013, \
|
|
||||||
0x000e000d000c000f, 0x000a00090008000b, \
|
|
||||||
0x0006000500040007, 0x0002000100000003, v )
|
|
||||||
|
|
||||||
// Rotate each 64 bit lane by one byte.
|
|
||||||
#define mm512_ror1x8_64( v ) \
|
|
||||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
|
||||||
0x383F3E3D3C3B3A39, 0x3037363534333231, \
|
|
||||||
0x282F2E2D2C2B2A29, 0x2027262524232221, \
|
|
||||||
0x181F1E1D1C1B1A19, 0x1017161514131211, \
|
|
||||||
0x080F0E0D0C0B0A09, 0x0007060504030201 ) )
|
|
||||||
#define mm512_rol1x8_64( v ) \
|
|
||||||
_mm512_shuffle( v, m512_const_64( \
|
|
||||||
0x3E3D3C3B3A39383F, 0x3635343332313037, \
|
|
||||||
0x2E2D2C2B2A29282F, 0x2625242322212027, \
|
|
||||||
0x1E1D1C1B1A19181F, 0x1615141312111017, \
|
|
||||||
0x0E0D0C0B0A09080F, 0x0605040302010007 ) )
|
|
||||||
*/
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Rotate elements within 32 bit lanes.
|
// Rotate elements within 32 bit lanes.
|
||||||
|
|
||||||
#define mm512_swap16_32( v ) _mm512_ror_epi32( v, 16 )
|
#define mm512_rol32_x8( v, c ) _mm512_rol_epi32( v, ((c)<<2) )
|
||||||
#define mm512_ror1x8_32( v ) _mm512_ror_epi32( v, 8 )
|
#define mm512_ror32_x8( v, c ) _mm512_ror_epi32( v, ((c)<<2) )
|
||||||
#define mm512_rol1x8_32( v ) _mm512_rol_epi32( v, 8 )
|
|
||||||
|
|
||||||
/*
|
|
||||||
#define mm512_swap16_32( v ) \
|
|
||||||
_mm512_permutexvar_epi16( m512_const_64( \
|
|
||||||
0x001e001f001c001d, 0x001a001b00180019, \
|
|
||||||
0x0016001700140015, 0x0012001300100011, \
|
|
||||||
0x000e000f000c000d, 0x000a000b00080009, \
|
|
||||||
0x0006000700040005, 0x0002000300000001 ), v )
|
|
||||||
|
|
||||||
#define mm512_ror1x8_32( v ) \
|
|
||||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
|
||||||
0x3C3F3E3D383B3A39, 0x3437363530333231, \
|
|
||||||
0x2C2F2E2D282B2A29, 0x2427262520232221, \
|
|
||||||
0x1C1F1E1D181B1A19, 0x1417161510131211, \
|
|
||||||
0x0C0F0E0D080B0A09, 0x0407060500030201 ))
|
|
||||||
|
|
||||||
#define mm512_rol1x8_32( v ) \
|
|
||||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
|
||||||
0x3E3D3C3F3A39383B, 0x3635343732313033, \
|
|
||||||
0x2E2D2C2F2A29282B, 0x2625242722212023, \
|
|
||||||
0x1E1D1C1F1A19181B, 0x1615141712111013, \
|
|
||||||
0x0E0D0C0F0A09080B, 0x0605040702010003 ) )
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
@@ -579,61 +515,61 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
|||||||
// These can all be done with 2 permutex2var instructions but they are
|
// These can all be done with 2 permutex2var instructions but they are
|
||||||
// slower than either xor or alignr and require AVX512VBMI.
|
// slower than either xor or alignr and require AVX512VBMI.
|
||||||
|
|
||||||
#define mm512_swap512_1024(v1, v2) \
|
#define mm512_swap1024_512(v1, v2) \
|
||||||
v1 = _mm512_xor_si512(v1, v2); \
|
v1 = _mm512_xor_si512(v1, v2); \
|
||||||
v2 = _mm512_xor_si512(v1, v2); \
|
v2 = _mm512_xor_si512(v1, v2); \
|
||||||
v1 = _mm512_xor_si512(v1, v2);
|
v1 = _mm512_xor_si512(v1, v2);
|
||||||
|
|
||||||
#define mm512_ror1x256_1024( v1, v2 ) \
|
#define mm512_ror1024_256( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
|
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
|
||||||
v1 = _mm512_alignr_epi64( v2, v1, 4 ); \
|
v1 = _mm512_alignr_epi64( v2, v1, 4 ); \
|
||||||
v2 = t; \
|
v2 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm512_rol1x256_1024( v1, v2 ) \
|
#define mm512_rol1024_256( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
|
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
|
||||||
v2 = _mm512_alignr_epi64( v2, v1, 4 ); \
|
v2 = _mm512_alignr_epi64( v2, v1, 4 ); \
|
||||||
v1 = t; \
|
v1 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm512_ror1x128_1024( v1, v2 ) \
|
#define mm512_ror1024_128( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \
|
__m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \
|
||||||
v1 = _mm512_alignr_epi64( v2, v1, 2 ); \
|
v1 = _mm512_alignr_epi64( v2, v1, 2 ); \
|
||||||
v2 = t; \
|
v2 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm512_rol1x128_1024( v1, v2 ) \
|
#define mm512_rol1024_128( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \
|
__m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \
|
||||||
v2 = _mm512_alignr_epi64( v2, v1, 6 ); \
|
v2 = _mm512_alignr_epi64( v2, v1, 6 ); \
|
||||||
v1 = t; \
|
v1 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm512_ror1x64_1024( v1, v2 ) \
|
#define mm512_ror1024_64( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \
|
__m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \
|
||||||
v1 = _mm512_alignr_epi64( v2, v1, 1 ); \
|
v1 = _mm512_alignr_epi64( v2, v1, 1 ); \
|
||||||
v2 = t; \
|
v2 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm512_rol1x64_1024( v1, v2 ) \
|
#define mm512_rol1024_64( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \
|
__m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \
|
||||||
v2 = _mm512_alignr_epi64( v2, v1, 7 ); \
|
v2 = _mm512_alignr_epi64( v2, v1, 7 ); \
|
||||||
v1 = t; \
|
v1 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm512_ror1x32_1024( v1, v2 ) \
|
#define mm512_ror1024_32( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \
|
__m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \
|
||||||
v1 = _mm512_alignr_epi32( v2, v1, 1 ); \
|
v1 = _mm512_alignr_epi32( v2, v1, 1 ); \
|
||||||
v2 = t; \
|
v2 = t; \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define mm512_rol1x32_1024( v1, v2 ) \
|
#define mm512_rol1024_32( v1, v2 ) \
|
||||||
do { \
|
do { \
|
||||||
__m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \
|
__m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \
|
||||||
v2 = _mm512_alignr_epi32( v2, v1, 15 ); \
|
v2 = _mm512_alignr_epi32( v2, v1, 15 ); \
|
||||||
|
Reference in New Issue
Block a user