Jay D Dee
2017-12-30 19:19:46 -05:00
parent 79164c24b5
commit 2d2e54f001
66 changed files with 4321 additions and 1475 deletions


@@ -79,7 +79,6 @@ cpuminer_SOURCES = \
algo/heavy/sph_hefty1.c \
algo/heavy/heavy.c \
algo/heavy/bastion.c \
algo/hmq1725.c \
algo/hodl/aes.c \
algo/hodl/hodl-gate.c \
algo/hodl/hodl-wolf.c \
@@ -110,7 +109,7 @@ cpuminer_SOURCES = \
algo/lyra2/lyra2z330.c \
algo/lyra2/lyra2h.c \
algo/m7m.c \
algo/neoscrypt.c \
algo/neoscrypt/neoscrypt.c \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
@@ -159,16 +158,36 @@ cpuminer_SOURCES = \
algo/whirlpool/whirlpoolx.c \
algo/x11/x11-gate.c \
algo/x11/x11.c \
algo/x11/x11evo.c \
algo/x11/x11-4way.c \
algo/x11/x11gost-gate.c \
algo/x11/x11gost.c \
algo/x11/x11gost-4way.c \
algo/x11/c11-gate.c \
algo/x11/c11.c \
algo/x11/phi1612.c \
algo/x11/c11-4way.c \
algo/x11/x11evo.c \
algo/x13/x13-gate.c \
algo/x13/x13.c \
algo/x13/x13-4way.c \
algo/x13/x13sm3-gate.c \
algo/x13/x13sm3.c \
algo/x13/x13sm3-4way.c \
algo/x13/phi1612-gate.c \
algo/x13/phi1612.c \
algo/x13/phi1612-4way.c \
algo/x14/x14-gate.c \
algo/x14/x14.c \
algo/x14/x14-4way.c \
algo/x15/x15-gate.c \
algo/x15/x15.c \
algo/x15/x15-4way.c \
algo/x17/x17-gate.c \
algo/x17/x17.c \
algo/xevan.c \
algo/x17/x17-4way.c \
algo/x17/xevan-gate.c \
algo/x17/xevan.c \
algo/x17/xevan-4way.c \
algo/x17/hmq1725.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c\
algo/yescrypt/yescrypt-simd.c\


@@ -96,13 +96,16 @@ algorithms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.
ARM CPUs are not supported.
2. 64 bit Linux OS. Ubuntu- and Fedora-based distributions, including Mint and
CentOS, are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
3. Stratum pool, cpuminer-opt only supports stratum mining. Some algos
may work wallet mining but there are no guarantees.
macOS (OS X) is not supported.
3. Stratum pool. Some algos may work with wallet mining using getwork.
Errata
------


@@ -17,17 +17,21 @@ supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
Users are recommended to use an unoptimized miner such as cpuminer-multi.
Exe name Compile opts Arch name
Exe name Compile flags Arch name
cpuminer-sse2.exe -march=core2 Core2
cpuminer-sse42.exe -march=corei7 Nehalem
cpuminer-aes-sse42.exe -maes -msse4.2" Westmere
cpuminer-aes-avx.exe -march=corei7-avx" Sandybridge, Ivybridge
cpuminer-aes-avx2.exe "-march=core-avx2" Haswell, Broadwell, Skylake, Kabylake
cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY"
cpuminer-sse2.exe "-march=core2" Core2
cpuminer-sse42.exe "-march=corei7" Nehalem
cpuminer-aes-sse42.exe "-maes -msse4.2" Westmere
cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell...
cpuminer-avx-sha.exe "-march=corei7-avx -msha" Ryzen...
cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY" same as avx2
cpuminer-4way-sha.exe "-march=core-avx2 -msha -DFOUR_WAY" same as avx2-sha
4way requires a CPU with AES and AVX2. It is still under development and
only a few algos are supported. See change log in RELEASE_NOTES in source
package for supported algos.
There is no binary support available for SHA on AMD Ryzen CPUs.
Ryzen CPUs perform better with AVX than with AVX2, therefore an avx-sha
build is provided. Four way still uses AVX2.


@@ -27,8 +27,9 @@ Compile Instructions
Requirements:
Intel Core2 or newer, or AMD Steamroller or newer CPU.
64 bit Linux or Windows operating system.
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
64 bit Linux or Windows operating system. Apple is not supported.
Building on linux prerequisites:
@@ -164,6 +165,10 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log
----------
v3.7.8
Partial 4way optimization for most X algos including c11, xevan, phi, hsr
v3.7.7
Fixed regression caused by 64 CPU support.
@@ -182,7 +187,7 @@ New algo keccakc for Creative coin with 4way optimizations
Rewrote some AVX/AVX2 code for more consistent implementation and some
optimizing.
Enhanced capabilities check to support 4way, mor eprecise reporting of
Enhanced capabilities check to support 4way, more precise reporting of
features (not all algos use SSE2), and better error messages when using
an incompatible pre-built version (Windows users).


@@ -211,7 +211,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
case ALGO_X11: register_x11_algo ( gate ); break;
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
case ALGO_X11GOST: register_sib_algo ( gate ); break;
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
case ALGO_X13: register_x13_algo ( gate ); break;
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
case ALGO_X14: register_x14_algo ( gate ); break;


@@ -849,9 +849,9 @@ blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm_set_epi32( iv[i], iv[i], iv[i], iv[i] );
sc->H[i] = _mm_set1_epi32( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm_set_epi32( salt[i], salt[i], salt[i], salt[i] );
sc->S[i] = _mm_set1_epi32( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
@@ -941,10 +941,9 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
// memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
_mm_set_epi32( 0x010000000, 0x01000000,
0x010000000, 0x01000000 ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
_mm_set1_epi32( 0x01000000 ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
}
else
@@ -955,10 +954,9 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
sc->T1 = SPH_C32(0xFFFFFFFF);
memset_zero_128( u.buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_set_epi32( 0x010000000, 0x01000000,
0x010000000, 0x01000000 );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
u.buf[52>>2] = _mm_set1_epi32( 0x01000000 );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf, 64 );
}
out = (__m128i*)dst;
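For reference, a minimal standalone sketch of the broadcast idiom this hunk switches to (illustrative only, not part of the commit):

#include <stdint.h>
#include <emmintrin.h>   // SSE2

// _mm_set1_epi32(v) broadcasts one 32-bit value into all four lanes,
// producing the same result as _mm_set_epi32(v, v, v, v) but with no
// opportunity for one lane's constant to be mistyped.
static inline __m128i broadcast32( uint32_t v )
{
    return _mm_set1_epi32( (int)v );
}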


@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
#if defined(HASH_4WAY) && defined(__AES__)
#define NIST5_4WAY
#endif


@@ -36,15 +36,15 @@ void sha256t_hash(void* output, const void* input, uint32_t len)
memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );
SHA256_Update( &ctx_sha256, input + midlen, tail );
SHA256_Final( hashA, &ctx_sha256 );
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
SHA256_Update( &ctx_sha256, hashA, 32 );
SHA256_Final( hashA, &ctx_sha256 );
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
SHA256_Update( &ctx_sha256, hashA, 32 );
SHA256_Final( hashA, &ctx_sha256 );
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
#else
sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );
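The casts added above match OpenSSL's prototype, where SHA256_Final() writes through an unsigned char *. A minimal sketch of the pattern, assuming the standard <openssl/sha.h> API (the helper name is illustrative):

#include <stdint.h>
#include <stddef.h>
#include <openssl/sha.h>

// Hashing into a uint32_t[8] buffer is convenient for the word-level
// target tests used by the miner; the explicit cast silences the
// pointer-type mismatch without changing the memory layout.
static void sha256_words( uint32_t out[8], const void *data, size_t len )
{
    SHA256_CTX ctx;
    SHA256_Init( &ctx );
    SHA256_Update( &ctx, data, len );
    SHA256_Final( (unsigned char*)out, &ctx );
}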


@@ -267,9 +267,6 @@ c512(sph_shavite_big_context *sc, const void *msg)
#else
/*
* This function assumes that "msg" is aligned for 32-bit access.
*/
static void
c512( sph_shavite_big_context *sc, const void *msg )
{
@@ -379,36 +376,36 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 2, 6, 10
k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p0 = _mm_xor_si128( p0, x );
@@ -461,36 +458,36 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 4, 8, 12
k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
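The renamed helper is defined elsewhere in the tree; based on its name and usage, a plausible standalone reading of mm_rotr256hi_1x32( hi, lo, 1 ) is sketched below (an assumption, not the project's actual macro):

#include <emmintrin.h>

// Treat lo:hi as one 256-bit value, rotate it right by one 32-bit lane,
// and return the resulting high 128 bits: the low lane of the old
// 256-bit value wraps around into the top lane of the result.
static inline __m128i rotr256hi_1x32( __m128i hi, __m128i lo )
{
    return _mm_or_si128( _mm_srli_si128( hi, 4 ),
                         _mm_slli_si128( lo, 12 ) );
}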


@@ -6,12 +6,11 @@ int64_t skein_get_max64() { return 0x7ffffLL; }
bool register_skein_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT| AVX2_OPT | SHA_OPT;
gate->optimizations = FOUR_WAY_OPT | SHA_OPT;
#if defined (SKEIN_4WAY)
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#else
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif

View File

@@ -10,8 +10,14 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/echo/aes_ni/hash_api.h"
//hashState_echo tribus_4way_ctx __attribute__ ((aligned (64)));
static __thread jh512_4way_context ctx_mid;
/*
void init_tribus_4way_ctx()
{
init_echo( &tribus_4way_ctx, 512 );
}
*/
void tribus_hash_4way(void *state, const void *input)
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
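ctx_mid is the usual midstate optimization: Tribus starts with JH over an 80-byte header whose first 64 bytes are constant per job, so that prefix can be absorbed once per work unit and reused for every nonce. A hedged sketch of the idea (the function name and call site are illustrative, not from this diff):

#include "algo/jh/jh-hash-4way.h"

// Absorb the constant 64-byte header prefix once per new work; the hash
// function then copies this context and processes only the final 16
// bytes (which contain the nonce) for each candidate.
static __thread jh512_4way_context jh_mid;

void tribus_4way_midstate( const void *input )
{
    jh512_4way_init( &jh_mid );
    jh512_4way( &jh_mid, input, 64 );
}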


@@ -1,22 +1,11 @@
#include "tribus-gate.h"
/*
bool tribus_thread_init()
{
sph_jh512_init( &tribus_ctx.jh );
sph_keccak512_init( &tribus_ctx.keccak );
#ifdef NO_AES_NI
sph_echo512_init( &tribus_ctx.echo );
#else
init_echo( &tribus_ctx.echo, 512 );
#endif
return true;
}
*/
bool register_tribus_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x1ffff;
#if defined (TRIBUS_4WAY)
// init_tribus_4way_ctx();
gate->scanhash = (void*)&scanhash_tribus_4way;
gate->hash = (void*)&tribus_hash_4way;
#else


@@ -4,12 +4,14 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
#if defined(HASH_4WAY) && defined(__AES__)
#define TRIBUS_4WAY
#endif
#if defined(TRIBUS_4WAY)
//void init_tribus_4way_ctx();
void tribus_hash_4way( void *state, const void *input );
int scanhash_tribus_4way( int thr_id, struct work *work, uint32_t max_nonce,


@@ -4,6 +4,7 @@ bool register_whirlpool_algo( algo_gate_t* gate )
{
#if defined (WHIRLPOOL_4WAY)
four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_whirlpool_4way;
gate->hash = (void*)&whirlpool_hash_4way;
#else


@@ -4,9 +4,11 @@
#include "algo-gate-api.h"
#include <stdint.h>
/*
#if defined(FOUR_WAY) && defined(__AVX2__)
#define WHIRLPOOL_4WAY
#endif
*/
#if defined (WHIRLPOOL_4WAY)


@@ -3345,8 +3345,10 @@ do { \
#define READ_STATE MUL8(READ_STATE_W)
#define ROUND0 MUL8(ROUND0_W)
#define UPDATE_STATE MUL8(UPDATE_STATE_W)
#define BYTE(x, n) \
_mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
//#define BYTE(x, n) \
// _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF)
// A very complex, but structured, expression with a mix of scalar
// and vector operations to retrieve specific 64 bit constants from
@@ -3357,23 +3359,51 @@ do { \
// Extract 64 bit vector elements from "in" representing offsets. Unmask the
// low byte of each and scale for use as vector indexes.
// Pack the data in a vector and return it.
/*
#define t_row( inv, row ) \
_mm256_and_si256( \
_mm256_srli_epi64( inv, row << 3 ), _mm256_set1_epi64x( 0xFF ) )
// Extract vector element from "lane" of vector "in[row]" and use it to index
// scalar array of constants "table" and return referenced 64 bit entry.
#define t_lane( table, inv, row, lane ) \
table[ _mm256_extract_epi64( t_row( inv, row ), lane ) ]
// table[ t_row( inv, row )[ lane ] ];
*/
// Build a vector from elements of non-contiguous 64 bit data extracted from
// scalar "table".
// reference scalar version 1480 kH/s
/*
// version 1, extract with gather
// 955 kH/s
#define t_lane( inv, row, lane ) \
BYTE( _mm256_extract_epi64( inv, lane ), row ) \
#define t_vec( table, inv, row ) \
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
t_lane( table, inv, row, 0 ) )
_mm256_i32gather_epi64( table, _mm_set_epi32( t_lane( inv, row, 3 ), \
t_lane( inv, row, 2 ), t_lane( inv, row, 1 ), \
t_lane( inv, row, 0) ), 1 )
*/
/*
// version 2, extract with set
// 1100 kH/s
#define t_lane( table, inv, row, lane ) \
table[ BYTE( _mm256_extract_epi64( inv, lane ), row ) ] \
#define t_vec( table, inv, row ) \
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
t_lane( table, inv, row, 0 ) )
*/
// version 3, vector indexing with set
// 1105 kH/s
#define t_lane( table, inv, row, lane ) \
table[ BYTE( inv[ lane ], row ) ] \
#define t_vec( table, inv, row ) \
_mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
t_lane( table, inv, row, 0 ) )
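Written out as a function, the version-3 lookup reads as follows (an illustrative rendering of the macros above, with the vector's four 64-bit lanes viewed through memory; not project code):

#include <stdint.h>
#include <immintrin.h>

// For each of the four lanes, take byte 'row' of the lane's 64-bit word,
// use it to index the 256-entry constant table, and pack the four
// fetched 64-bit constants back into one AVX2 vector.
static inline __m256i t_vec_lookup( const uint64_t table[256],
                                    const uint64_t inv[4], int row )
{
    return _mm256_set_epi64x(
        (long long)table[ ( inv[3] >> (8 * row) ) & 0xFF ],
        (long long)table[ ( inv[2] >> (8 * row) ) & 0xFF ],
        (long long)table[ ( inv[1] >> (8 * row) ) & 0xFF ],
        (long long)table[ ( inv[0] >> (8 * row) ) & 0xFF ] );
}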
#if SPH_SMALL_FOOTPRINT_WHIRLPOOL

algo/x11/c11-4way.c Normal file

@@ -0,0 +1,261 @@
#include "cpuminer-config.h"
#include "c11-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} c11_4way_ctx_holder;
c11_4way_ctx_holder c11_4way_ctx;
void init_c11_4way_ctx()
{
blake512_4way_init( &c11_4way_ctx.blake );
sph_bmw512_init( &c11_4way_ctx.bmw );
init_groestl( &c11_4way_ctx.groestl, 64 );
skein512_4way_init( &c11_4way_ctx.skein );
jh512_4way_init( &c11_4way_ctx.jh );
keccak512_4way_init( &c11_4way_ctx.keccak );
init_luffa( &c11_4way_ctx.luffa, 512 );
cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_4way_ctx.shavite );
init_sd( &c11_4way_ctx.simd, 512 );
init_echo( &c11_4way_ctx.echo, 512 );
}
void c11_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
c11_4way_ctx_holder ctx;
memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
// 1 Blake 4way
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 2 Bmw
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 5 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// 6 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
c11_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
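mm256_interleave_4x64 and mm256_deinterleave_4x64 are defined elsewhere in the tree; a scalar model of the layout they are assumed to produce (illustrative only):

#include <stdint.h>

// Pack four independent streams lane by lane so that each 256-bit vector
// holds 64-bit word i of all four streams; bit_len is the per-stream
// length in bits (640 for an 80-byte block header). Deinterleaving is
// the inverse permutation.
static void interleave_4x64( uint64_t *dst, const uint64_t *s0,
                             const uint64_t *s1, const uint64_t *s2,
                             const uint64_t *s3, int bit_len )
{
    for ( int i = 0; i < bit_len / 64; i++ )
    {
        dst[ 4*i     ] = s0[i];
        dst[ 4*i + 1 ] = s1[i];
        dst[ 4*i + 2 ] = s2[i];
        dst[ 4*i + 3 ] = s3[i];
    }
}

Under this layout, 32-bit header word 19 (the nonce) sits in 64-bit word 9, so lane 0's nonce lands at 32-bit offset 9*8 + 1 = 73 of vdata, matching noncep0 above; lanes 1-3 follow at 75, 77 and 79.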

algo/x11/c11-gate.c Normal file

@@ -0,0 +1,18 @@
#include "c11-gate.h"
bool register_c11_algo( algo_gate_t* gate )
{
#if defined (C11_4WAY)
init_c11_4way_ctx();
gate->scanhash = (void*)&scanhash_c11_4way;
gate->hash = (void*)&c11_4way_hash;
#else
init_c11_ctx();
gate->scanhash = (void*)&scanhash_c11;
gate->hash = (void*)&c11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

algo/x11/c11-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef C11_GATE_H__
#define C11_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define C11_4WAY
#endif
bool register_c11_algo( algo_gate_t* gate );
#if defined(C11_4WAY)
void c11_4way_hash( void *state, const void *input );
int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_c11_4way_ctx();
#endif
void c11_hash( void *state, const void *input );
int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_c11_ctx();
#endif


@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "c11-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -64,7 +64,7 @@ void init_c11_ctx()
#endif
}
void c11hash( void *output, const void *input )
void c11_hash( void *output, const void *input )
{
unsigned char hash[128] _ALIGN(64); // uint32_t hashA[16], hashB[16];
// uint32_t _ALIGN(64) hash[16];
@@ -157,7 +157,7 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
do
{
be32enc( &endiandata[19], nonce );
c11hash( hash, endiandata );
c11_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
{
pdata[19] = nonce;
@@ -171,13 +171,3 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
bool register_c11_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_c11_ctx();
gate->scanhash = (void*)&scanhash_c11;
gate->hash = (void*)&c11hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

algo/x11/x11-4way.c Normal file

@@ -0,0 +1,261 @@
#include "cpuminer-config.h"
#include "x11-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} x11_4way_ctx_holder;
x11_4way_ctx_holder x11_4way_ctx;
void init_x11_4way_ctx()
{
blake512_4way_init( &x11_4way_ctx.blake );
sph_bmw512_init( &x11_4way_ctx.bmw );
init_groestl( &x11_4way_ctx.groestl, 64 );
skein512_4way_init( &x11_4way_ctx.skein );
jh512_4way_init( &x11_4way_ctx.jh );
keccak512_4way_init( &x11_4way_ctx.keccak );
init_luffa( &x11_4way_ctx.luffa, 512 );
cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_4way_ctx.shavite );
init_sd( &x11_4way_ctx.simd, 512 );
init_echo( &x11_4way_ctx.echo, 512 );
}
void x11_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x11_4way_ctx_holder ctx;
memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) );
// 1 Blake 4way
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 2 Bmw
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x11_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
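All of these scanhash loops share the same difficulty pre-filter: a mask is chosen from the share target so that most non-qualifying hashes are rejected with a single AND on the high word before the full 256-bit fulltest() comparison. As a scalar function, the selection reads:

#include <stdint.h>

// Pick the widest mask such that any hash with (hash[7] & mask) == 0
// could still satisfy the target word Htarg; only those candidates are
// passed on to the full 256-bit target comparison.
static uint32_t prefilter_mask( uint32_t Htarg )
{
    const uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };
    const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                               0xFFFFF000, 0xFFFF0000, 0 };
    for ( int m = 0; m < 6; m++ )
        if ( Htarg <= htmax[m] )
            return masks[m];
    return 0;
}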


@@ -5,13 +5,13 @@ bool register_x11_algo( algo_gate_t* gate )
#if defined (X11_4WAY)
init_x11_4way_ctx();
gate->scanhash = (void*)&scanhash_x11_4way;
gate->hash = (void*)&x11_hash_4way;
gate->hash = (void*)&x11_4way_hash;
#else
init_x11_ctx();
gate->scanhash = (void*)&scanhash_x11;
gate->hash = (void*)&x11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};


@@ -4,19 +4,21 @@
#include "algo-gate-api.h"
#include <stdint.h>
//#if defined(HASH_4WAY) && !defined(NO_AES_NI)
// #define X11_4WAY
//#endif
#if defined(HASH_4WAY) && defined(__AES__)
#define X11_4WAY
#endif
bool register_x11_algo( algo_gate_t* gate );
#if defined(X11_4WAY)
void x11_hash_4way( void *state, const void *input );
void x11_4way_hash( void *state, const void *input );
int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11_4way_ctx();
#endif
void x11_hash( void *state, const void *input );

algo/x11/x11gost-4way.c Normal file

@@ -0,0 +1,268 @@
#include "cpuminer-config.h"
#include "x11gost-gate.h"
#if defined (__AVX2__) && defined (__AES__)
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/gost/sph_gost.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
sph_gost512_context gost;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
} x11gost_4way_ctx_holder;
x11gost_4way_ctx_holder x11gost_4way_ctx;
void init_x11gost_4way_ctx()
{
blake512_4way_init( &x11gost_4way_ctx.blake );
sph_bmw512_init( &x11gost_4way_ctx.bmw );
init_groestl( &x11gost_4way_ctx.groestl, 64 );
skein512_4way_init( &x11gost_4way_ctx.skein );
jh512_4way_init( &x11gost_4way_ctx.jh );
keccak512_4way_init( &x11gost_4way_ctx.keccak );
sph_gost512_init( &x11gost_4way_ctx.gost );
init_luffa( &x11gost_4way_ctx.luffa, 512 );
cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11gost_4way_ctx.shavite );
init_sd( &x11gost_4way_ctx.simd, 512 );
init_echo( &x11gost_4way_ctx.echo, 512 );
}
void x11gost_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x11gost_4way_ctx_holder ctx;
memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
sph_gost512( &ctx.gost, hash2, 64 );
sph_gost512_close( &ctx.gost, hash2 );
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
sph_gost512( &ctx.gost, hash3, 64 );
sph_gost512_close( &ctx.gost, hash3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x11gost_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x11/x11gost-gate.c Normal file

@@ -0,0 +1,18 @@
#include "x11gost-gate.h"
bool register_x11gost_algo( algo_gate_t* gate )
{
#if defined (X11GOST_4WAY)
init_x11gost_4way_ctx();
gate->scanhash = (void*)&scanhash_x11gost_4way;
gate->hash = (void*)&x11gost_4way_hash;
#else
init_x11gost_ctx();
gate->scanhash = (void*)&scanhash_x11gost;
gate->hash = (void*)&x11gost_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

algo/x11/x11gost-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef X11GOST_GATE_H__
#define X11GOST_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X11GOST_4WAY
#endif
bool register_x11gost_algo( algo_gate_t* gate );
#if defined(X11GOST_4WAY)
void x11gost_4way_hash( void *state, const void *input );
int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11gost_4way_ctx();
#endif
void x11gost_hash( void *state, const void *input );
int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11gost_ctx();
#endif


@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x11gost-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -37,28 +37,28 @@ typedef struct {
hashState_echo echo;
hashState_groestl groestl;
#endif
} sib_ctx_holder;
} x11gost_ctx_holder;
sib_ctx_holder sib_ctx;
x11gost_ctx_holder x11gost_ctx;
void init_sib_ctx()
void init_x11gost_ctx()
{
sph_gost512_init(&sib_ctx.gost);
sph_shavite512_init(&sib_ctx.shavite);
init_luffa( &sib_ctx.luffa, 512 );
cubehashInit( &sib_ctx.cube, 512, 16, 32 );
init_sd( &sib_ctx.simd, 512 );
sph_gost512_init( &x11gost_ctx.gost );
sph_shavite512_init( &x11gost_ctx.shavite );
init_luffa( &x11gost_ctx.luffa, 512 );
cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
init_sd( &x11gost_ctx.simd, 512 );
#ifdef NO_AES_NI
sph_groestl512_init( &sib_ctx.groestl );
sph_echo512_init( &sib_ctx.echo );
sph_groestl512_init( &x11gost_ctx.groestl );
sph_echo512_init( &x11gost_ctx.echo );
#else
init_echo( &sib_ctx.echo, 512 );
init_groestl( &sib_ctx.groestl, 64 );
init_echo( &x11gost_ctx.echo, 512 );
init_groestl( &x11gost_ctx.groestl, 64 );
#endif
}
void sibhash(void *output, const void *input)
void x11gost_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (64)));
#define hashA hash
@@ -69,8 +69,8 @@ void sibhash(void *output, const void *input)
sph_u64 hashctA;
sph_u64 hashctB;
sib_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sib_ctx, sizeof(sib_ctx) );
x11gost_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &x11gost_ctx, sizeof(x11gost_ctx) );
DECL_BLK;
BLK_I;
@@ -135,8 +135,8 @@ void sibhash(void *output, const void *input)
memcpy(output, hashA, 32);
}
int scanhash_sib(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -156,7 +156,7 @@ int scanhash_sib(int thr_id, struct work *work,
do {
uint32_t hash[8];
be32enc(&endiandata[19], nonce);
sibhash(hash, endiandata);
x11gost_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
@@ -172,12 +172,3 @@ int scanhash_sib(int thr_id, struct work *work,
return 0;
}
bool register_sib_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_sib_ctx();
gate->scanhash = (void*)&scanhash_sib;
gate->hash = (void*)&sibhash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}

algo/x13/phi1612-4way.c Normal file

@@ -0,0 +1,186 @@
#include "x13-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/gost/sph_gost.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
skein512_4way_context skein;
jh512_4way_context jh;
cubehashParam cube;
sph_fugue512_context fugue;
sph_gost512_context gost;
hashState_echo echo;
} phi1612_4way_ctx_holder;
phi1612_4way_ctx_holder phi1612_4way_ctx __attribute__ ((aligned (64)));
void init_phi1612_4way_ctx()
{
skein512_4way_init( &phi1612_4way_ctx.skein );
jh512_4way_init( &phi1612_4way_ctx.jh );
cubehashInit( &phi1612_4way_ctx.cube, 512, 16, 32 );
sph_fugue512_init( &phi1612_4way_ctx.fugue );
sph_gost512_init( &phi1612_4way_ctx.gost );
init_echo( &phi1612_4way_ctx.echo, 512 );
};
void phi1612_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
phi1612_4way_ctx_holder ctx;
memcpy( &ctx, &phi1612_4way_ctx, sizeof(phi1612_4way_ctx) );
// Skein parallel 4way
skein512_4way( &ctx.skein, input, 80 );
skein512_4way_close( &ctx.skein, vhash );
// JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// Gost
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash2, 64 );
sph_gost512_close( &ctx.gost, hash2 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash3, 64 );
sph_gost512_close( &ctx.gost, hash3 );
// Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_phi1612_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t _ALIGN(64) endiandata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0cff;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
phi1612_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x13/phi1612-gate.c Normal file

@@ -0,0 +1,18 @@
#include "phi1612-gate.h"
bool register_phi1612_algo( algo_gate_t* gate )
{
#if defined(PHI1612_4WAY)
init_phi1612_4way_ctx();
gate->scanhash = (void*)&scanhash_phi1612_4way;
gate->hash = (void*)&phi1612_4way_hash;
#else
init_phi1612_ctx();
gate->scanhash = (void*)&scanhash_phi1612;
gate->hash = (void*)&phi1612_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

algo/x13/phi1612-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef PHI1612_GATE_H__
#define PHI1612_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define PHI1612_4WAY
#endif
bool register_phi1612_algo( algo_gate_t* gate );
#if defined(PHI1612_4WAY)
void phi1612_4way_hash( void *state, const void *input );
int scanhash_phi1612_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_phi1612_4way_ctx();
#endif
void phi1612_hash( void *state, const void *input );
int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_phi1612_ctx();
#endif


@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "phi1612-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -33,7 +33,7 @@ phi_ctx_holder phi_ctx;
static __thread sph_skein512_context phi_skein_mid
__attribute__ ((aligned (64)));
void init_phi_ctx()
void init_phi1612_ctx()
{
sph_skein512_init( &phi_ctx.skein );
sph_jh512_init( &phi_ctx.jh );
@@ -53,7 +53,7 @@ void phi_skein_midstate( const void* input )
sph_skein512( &phi_skein_mid, input, 64 );
}
void phi1612hash(void *output, const void *input)
void phi1612_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (64)));
phi_ctx_holder ctx __attribute__ ((aligned (64)));
@@ -112,7 +112,7 @@ int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
do {
uint32_t hash[8];
be32enc(&endiandata[19], nonce);
phi1612hash(hash, endiandata);
phi1612_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
@@ -128,12 +128,3 @@ int scanhash_phi1612( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
bool register_phi1612_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_phi_ctx();
gate->scanhash = (void*)&scanhash_phi1612;
gate->hash = (void*)&phi1612hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}

293
algo/x13/x13-4way.c Normal file

@@ -0,0 +1,293 @@
#include "x13-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
} x13_4way_ctx_holder;
x13_4way_ctx_holder x13_4way_ctx __attribute__ ((aligned (64)));
void init_x13_4way_ctx()
{
blake512_4way_init( &x13_4way_ctx.blake );
sph_bmw512_init( &x13_4way_ctx.bmw );
init_groestl( &x13_4way_ctx.groestl, 64 );
skein512_4way_init( &x13_4way_ctx.skein );
jh512_4way_init( &x13_4way_ctx.jh );
keccak512_4way_init( &x13_4way_ctx.keccak );
init_luffa( &x13_4way_ctx.luffa, 512 );
cubehashInit( &x13_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13_4way_ctx.shavite );
init_sd( &x13_4way_ctx.simd, 512 );
init_echo( &x13_4way_ctx.echo, 512 );
sph_hamsi512_init( &x13_4way_ctx.hamsi );
sph_fugue512_init( &x13_4way_ctx.fugue );
};
void x13_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x13_4way_ctx_holder ctx;
memcpy( &ctx, &x13_4way_ctx, sizeof(x13_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 2 Bmw
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x13_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x13_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x13_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x13_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x13_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi
sph_hamsi512( &ctx.hamsi, hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x13_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
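// Paired fast-reject tables: pick the smallest mask whose bits must all
// be zero for this target, so most candidates fail the single AND test
// below without paying for the full 256-bit fulltest.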
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x13_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
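The pattern above, repeated in the x13sm3/x14/x15/x17 variants below, alternates 4-way SIMD stages with serial per-lane stages, converting between layouts at each boundary. A minimal sketch of the hand-off, with argument order as used in this commit (exact prototypes live in the tree's avxdefs.h, assumed here):

#include <stdint.h>
#include "avxdefs.h"   // mm256_interleave_4x64 / mm256_deinterleave_4x64 (assumed)

static void layout_roundtrip( void )
{
    uint64_t h0[8], h1[8], h2[8], h3[8];  // serial form, one 512-bit hash per lane
    uint64_t vhash[8*4];                  // 4-way interleaved vector form

    mm256_interleave_4x64( vhash, h0, h1, h2, h3, 512 );   // lanes -> vector
    // ... 4-way stages (skein, jh, keccak) operate on vhash ...
    mm256_deinterleave_4x64( h0, h1, h2, h3, vhash, 512 ); // vector -> lanes
    // ... serial stages (luffa, cubehash, ...) run on each hN ...
}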

18
algo/x13/x13-gate.c Normal file

@@ -0,0 +1,18 @@
#include "x13-gate.h"
bool register_x13_algo( algo_gate_t* gate )
{
#if defined (X13_4WAY)
init_x13_4way_ctx();
gate->scanhash = (void*)&scanhash_x13_4way;
gate->hash = (void*)&x13_4way_hash;
#else
init_x13_ctx();
gate->scanhash = (void*)&scanhash_x13;
gate->hash = (void*)&x13hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

32
algo/x13/x13-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef X13_GATE_H__
#define X13_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X13_4WAY
#endif
bool register_x13_algo( algo_gate_t* gate );
#if defined(X13_4WAY)
void x13_4way_hash( void *state, const void *input );
int scanhash_x13_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x13_4way_ctx();
#endif
void x13hash( void *state, const void *input );
int scanhash_x13( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x13_ctx();
#endif

algo/x13/x13.c

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x13-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -68,7 +68,7 @@ void init_x13_ctx()
sph_fugue512_init( &x13_ctx.fugue );
};
static void x13hash(void *output, const void *input)
void x13hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
#define hashB hash+64
@@ -249,15 +249,3 @@ int scanhash_x13(int thr_id, struct work *work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
bool register_x13_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_x13_ctx();
gate->scanhash = (void*)&scanhash_x13;
gate->hash = (void*)&x13hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

328
algo/x13/x13sm3-4way.c Normal file

@@ -0,0 +1,328 @@
#include "x13sm3-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/sm3/sph_sm3.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sm3_ctx_t sm3;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
} x13sm3_4way_ctx_holder;
x13sm3_4way_ctx_holder x13sm3_4way_ctx __attribute__ ((aligned (64)));
static __thread blake512_4way_context x13sm3_ctx_mid;
void init_x13sm3_4way_ctx()
{
blake512_4way_init( &x13sm3_4way_ctx.blake );
sph_bmw512_init( &x13sm3_4way_ctx.bmw );
init_groestl( &x13sm3_4way_ctx.groestl, 64 );
skein512_4way_init( &x13sm3_4way_ctx.skein );
jh512_4way_init( &x13sm3_4way_ctx.jh );
keccak512_4way_init( &x13sm3_4way_ctx.keccak );
init_luffa( &x13sm3_4way_ctx.luffa, 512 );
cubehashInit( &x13sm3_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13sm3_4way_ctx.shavite );
init_sd( &x13sm3_4way_ctx.simd, 512 );
init_echo( &x13sm3_4way_ctx.echo, 512 );
sm3_init( &x13sm3_4way_ctx.sm3 );
sph_hamsi512_init( &x13sm3_4way_ctx.hamsi );
sph_fugue512_init( &x13sm3_4way_ctx.fugue );
};
void x13sm3_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x13sm3_4way_ctx_holder ctx;
memcpy( &ctx, &x13sm3_4way_ctx, sizeof(x13sm3_4way_ctx) );
// Blake
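// The saved midstate already covers the first 64 header bytes of each
// lane; in the 4-way interleaved input those occupy 64*4 = (64<<2) bytes,
// leaving only the 16-byte tail (words 16..19, including the nonce).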
memcpy( &ctx.blake, &x13sm3_ctx_mid, sizeof(x13sm3_ctx_mid) );
blake512_4way( &ctx.blake, input + (64<<2), 16 );
// blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Bmw
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
// Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x13sm3_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x13sm3_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x13sm3_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// SM3
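// SM3 is a 256-bit hash but the following Hamsi stage consumes 64 bytes,
// so the output buffers are zeroed first and the digest is effectively
// zero-padded from 32 to 64 bytes.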
uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
memset( sm3_hash0, 0, sizeof sm3_hash0 );
uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
memset( sm3_hash1, 0, sizeof sm3_hash1 );
uint32_t sm3_hash2[32] __attribute__ ((aligned (32)));
memset( sm3_hash2, 0, sizeof sm3_hash2 );
uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
memset( sm3_hash3, 0, sizeof sm3_hash3 );
sph_sm3( &ctx.sm3, hash0, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash0 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash1, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash1 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash2, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash2 );
memcpy( &ctx.sm3, &x13sm3_4way_ctx.sm3, sizeof(sm3_ctx_t) );
sph_sm3( &ctx.sm3, hash3, 64 );
sph_sm3_close( &ctx.sm3, sm3_hash3 );
// Hamsi
sph_hamsi512( &ctx.hamsi, sm3_hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, sm3_hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, sm3_hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, sm3_hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
// Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x13sm3_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
blake512_4way_init( &x13sm3_ctx_mid );
blake512_4way( &x13sm3_ctx_mid, vdata, 64 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x13sm3_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
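x13sm3 is the only variant here that precomputes a Blake midstate: the first 64 header bytes are nonce-independent, so they are absorbed once per work and each attempt hashes only the 16-byte tail per lane. A minimal sketch of the pattern, assuming the 4-way Blake API used above (function names are illustrative):

#include <string.h>
#include "algo/blake/blake-hash-4way.h"

static __thread blake512_4way_context blake_mid;

static void midstate_setup( const void *vdata )   // once per new work
{
    blake512_4way_init( &blake_mid );
    blake512_4way( &blake_mid, vdata, 64 );       // constant 64-byte prefix
}

static void midstate_hash( const void *vdata, void *vhash )  // per attempt
{
    blake512_4way_context ctx;
    memcpy( &ctx, &blake_mid, sizeof blake_mid );  // restore saved midstate
    // 4 lanes interleaved: skip 64*4 bytes, absorb the 16-byte tail
    blake512_4way( &ctx, (const char*)vdata + (64<<2), 16 );
    blake512_4way_close( &ctx, vhash );
}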

18
algo/x13/x13sm3-gate.c Normal file

@@ -0,0 +1,18 @@
#include "x13sm3-gate.h"
bool register_x13sm3_algo( algo_gate_t* gate )
{
#if defined (X13SM3_4WAY)
init_x13sm3_4way_ctx();
gate->scanhash = (void*)&scanhash_x13sm3_4way;
gate->hash = (void*)&x13sm3_4way_hash;
#else
init_x13sm3_ctx();
gate->scanhash = (void*)&scanhash_x13sm3;
gate->hash = (void*)&x13sm3_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

32
algo/x13/x13sm3-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef X13SM3_GATE_H__
#define X13SM3_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X13SM3_4WAY
#endif
bool register_x13sm3_algo( algo_gate_t* gate );
#if defined(X13SM3_4WAY)
void x13sm3_4way_hash( void *state, const void *input );
int scanhash_x13sm3_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x13sm3_4way_ctx();
#endif
void x13sm3_hash( void *state, const void *input );
int scanhash_x13sm3( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x13sm3_ctx();
#endif

algo/x13/x13sm3.c

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x13sm3-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -49,7 +49,7 @@ typedef struct {
hsr_ctx_holder hsr_ctx;
void init_hsr_ctx()
void init_x13sm3_ctx()
{
#ifdef NO_AES_NI
sph_groestl512_init(&hsr_ctx.groestl);
@@ -67,7 +67,7 @@ void init_hsr_ctx()
sph_fugue512_init(&hsr_ctx.fugue);
};
static void x13sm3hash(void *output, const void *input)
void x13sm3_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
@@ -213,7 +213,7 @@ int scanhash_x13sm3( int thr_id, struct work *work,
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
x13sm3hash(hash64, endiandata);
x13sm3_hash(hash64, endiandata);
#ifndef DEBUG_ALGO
if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
@@ -240,13 +240,3 @@ int scanhash_x13sm3( int thr_id, struct work *work,
return 0;
}
bool register_x13sm3_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_hsr_ctx();
gate->scanhash = (void*)&scanhash_x13sm3;
gate->hash = (void*)&x13sm3hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

310
algo/x14/x14-4way.c Normal file

@@ -0,0 +1,310 @@
#include "x14-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
} x14_4way_ctx_holder;
x14_4way_ctx_holder x14_4way_ctx __attribute__ ((aligned (64)));
void init_x14_4way_ctx()
{
blake512_4way_init( &x14_4way_ctx.blake );
sph_bmw512_init( &x14_4way_ctx.bmw );
init_groestl( &x14_4way_ctx.groestl, 64 );
skein512_4way_init( &x14_4way_ctx.skein );
jh512_4way_init( &x14_4way_ctx.jh );
keccak512_4way_init( &x14_4way_ctx.keccak );
init_luffa( &x14_4way_ctx.luffa, 512 );
cubehashInit( &x14_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x14_4way_ctx.shavite );
init_sd( &x14_4way_ctx.simd, 512 );
init_echo( &x14_4way_ctx.echo, 512 );
sph_hamsi512_init( &x14_4way_ctx.hamsi );
sph_fugue512_init( &x14_4way_ctx.fugue );
sph_shabal512_init( &x14_4way_ctx.shabal );
};
void x14_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x14_4way_ctx_holder ctx;
memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 2 Bmw
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x14_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x14_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi
sph_hamsi512( &ctx.hamsi, hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal
sph_shabal512( &ctx.shabal, hash0, 64 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash1, 64 );
sph_shabal512_close( &ctx.shabal, hash1 );
memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash2, 64 );
sph_shabal512_close( &ctx.shabal, hash2 );
memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash3, 64 );
sph_shabal512_close( &ctx.shabal, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x14_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
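Every serial stage in these hashes clones a statically initialized context with memcpy rather than re-running the algorithm's init per lane: copying a ready state is cheaper than recomputing it. A minimal sketch of the pattern using the BMW API from above (names are illustrative):

#include <string.h>
#include "algo/bmw/sph_bmw.h"

static sph_bmw512_context bmw_ref;                // initialized once

static void bmw_ref_init( void ) { sph_bmw512_init( &bmw_ref ); }

static void bmw_one_lane( void *hash64 )          // 64-byte in/out buffer
{
    sph_bmw512_context c;
    memcpy( &c, &bmw_ref, sizeof c );             // cheaper than re-init
    sph_bmw512( &c, hash64, 64 );
    sph_bmw512_close( &c, hash64 );
}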

18
algo/x14/x14-gate.c Normal file

@@ -0,0 +1,18 @@
#include "x14-gate.h"
bool register_x14_algo( algo_gate_t* gate )
{
#if defined (X14_4WAY)
init_x14_4way_ctx();
gate->scanhash = (void*)&scanhash_x14_4way;
gate->hash = (void*)&x14_4way_hash;
#else
init_x14_ctx();
gate->scanhash = (void*)&scanhash_x14;
gate->hash = (void*)&x14hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

32
algo/x14/x14-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef X14_GATE_H__
#define X14_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X14_4WAY
#endif
bool register_x14_algo( algo_gate_t* gate );
#if defined(X14_4WAY)
void x14_4way_hash( void *state, const void *input );
int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x14_4way_ctx();
#endif
void x14hash( void *state, const void *input );
int scanhash_x14( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x14_ctx();
#endif

algo/x14/x14.c

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x14-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -72,7 +72,7 @@ void init_x14_ctx()
sph_shabal512_init(&x14_ctx.shabal);
};
static void x14hash(void *output, const void *input)
void x14hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
#define hashB hash+64
@@ -248,14 +248,3 @@ int scanhash_x14(int thr_id, struct work *work,
pdata[19] = n;
return 0;
}
bool register_x14_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_x14_ctx();
gate->scanhash = (void*)&scanhash_x14;
gate->hash = (void*)&x14hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

329
algo/x15/x15-4way.c Normal file

@@ -0,0 +1,329 @@
#include "x15-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
} x15_4way_ctx_holder;
x15_4way_ctx_holder x15_4way_ctx __attribute__ ((aligned (64)));
void init_x15_4way_ctx()
{
blake512_4way_init( &x15_4way_ctx.blake );
sph_bmw512_init( &x15_4way_ctx.bmw );
init_groestl( &x15_4way_ctx.groestl, 64 );
skein512_4way_init( &x15_4way_ctx.skein );
jh512_4way_init( &x15_4way_ctx.jh );
keccak512_4way_init( &x15_4way_ctx.keccak );
init_luffa( &x15_4way_ctx.luffa, 512 );
cubehashInit( &x15_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x15_4way_ctx.shavite );
init_sd( &x15_4way_ctx.simd, 512 );
init_echo( &x15_4way_ctx.echo, 512 );
sph_hamsi512_init( &x15_4way_ctx.hamsi );
sph_fugue512_init( &x15_4way_ctx.fugue );
sph_shabal512_init( &x15_4way_ctx.shabal );
sph_whirlpool_init( &x15_4way_ctx.whirlpool );
};
void x15_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x15_4way_ctx_holder ctx;
memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 2 Bmw
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x15_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x15_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x15_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x15_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x15_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi
sph_hamsi512( &ctx.hamsi, hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x15_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal
sph_shabal512( &ctx.shabal, hash0, 64 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash1, 64 );
sph_shabal512_close( &ctx.shabal, hash1 );
memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash2, 64 );
sph_shabal512_close( &ctx.shabal, hash2 );
memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash3, 64 );
sph_shabal512_close( &ctx.shabal, hash3 );
// 15 Whirlpool
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &x15_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x15_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
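The four per-lane result checks repeat verbatim in every scanhash in this commit. A hypothetical refactoring sketch (check_lane is not in the tree; struct work, fulltest and work_set_target_ratio are assumed to come from miner.h as used elsewhere in the source):

#include <stdint.h>
#include <stdbool.h>
#include "miner.h"   // struct work, fulltest, work_set_target_ratio (assumed)

static inline void check_lane( struct work *work, uint32_t *lane_hash,
                               uint32_t mask, uint32_t lane_nonce,
                               int lane, int *num_found )
{
    if ( ( lane_hash[7] & mask ) == 0 && fulltest( lane_hash, work->target ) )
    {
        work->nfound[lane] = true;                // flag this lane's result
        work->nonces[lane] = lane_nonce;
        (*num_found)++;
        work_set_target_ratio( work, lane_hash );
    }
}

// usage inside the scan loop:
//    for ( int i = 0; i < 4; i++ )
//       check_lane( work, hash + 8*i, mask, n + i, i, &num_found );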

17
algo/x15/x15-gate.c Normal file

@@ -0,0 +1,17 @@
#include "x15-gate.h"
bool register_x15_algo( algo_gate_t* gate )
{
#if defined (X15_4WAY)
init_x15_4way_ctx();
gate->scanhash = (void*)&scanhash_x15_4way;
gate->hash = (void*)&x15_4way_hash;
#else
init_x15_ctx();
gate->scanhash = (void*)&scanhash_x15;
gate->hash = (void*)&x15hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};

32
algo/x15/x15-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef X15_GATE_H__
#define X15_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X15_4WAY
#endif
bool register_x15_algo( algo_gate_t* gate );
#if defined(X15_4WAY)
void x15_4way_hash( void *state, const void *input );
int scanhash_x15_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x15_4way_ctx();
#endif
void x15hash( void *state, const void *input );
int scanhash_x15( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x15_ctx();
#endif

algo/x15/x15.c

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x15-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -74,7 +74,7 @@ void init_x15_ctx()
sph_whirlpool_init( &x15_ctx.whirlpool );
};
static void x15hash(void *output, const void *input)
void x15hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
#define hashB hash+64
@@ -260,13 +260,3 @@ int scanhash_x15(int thr_id, struct work *work,
pdata[19] = n;
return 0;
}
bool register_x15_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_x15_ctx();
gate->scanhash = (void*)&scanhash_x15;
gate->hash = (void*)&x15hash;
return true;
};

364
algo/x17/x17-4way.c Normal file

@@ -0,0 +1,364 @@
#include "x17-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include <openssl/sha.h>
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
sph_haval256_5_context haval;
} x17_4way_ctx_holder;
x17_4way_ctx_holder x17_4way_ctx __attribute__ ((aligned (64)));
void init_x17_4way_ctx()
{
blake512_4way_init( &x17_4way_ctx.blake );
sph_bmw512_init( &x17_4way_ctx.bmw );
init_groestl( &x17_4way_ctx.groestl, 64 );
skein512_4way_init( &x17_4way_ctx.skein );
jh512_4way_init( &x17_4way_ctx.jh );
keccak512_4way_init( &x17_4way_ctx.keccak );
init_luffa( &x17_4way_ctx.luffa, 512 );
cubehashInit( &x17_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x17_4way_ctx.shavite );
init_sd( &x17_4way_ctx.simd, 512 );
init_echo( &x17_4way_ctx.echo, 512 );
sph_hamsi512_init( &x17_4way_ctx.hamsi );
sph_fugue512_init( &x17_4way_ctx.fugue );
sph_shabal512_init( &x17_4way_ctx.shabal );
sph_whirlpool_init( &x17_4way_ctx.whirlpool );
SHA512_Init( &x17_4way_ctx.sha512 );
sph_haval256_5_init( &x17_4way_ctx.haval );
};
void x17_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x17_4way_ctx_holder ctx;
memcpy( &ctx, &x17_4way_ctx, sizeof(x17_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 2 Bmw
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, 64 );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, 64 );
sph_bmw512_close( &ctx.bmw, hash3 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial to the end
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi
sph_hamsi512( &ctx.hamsi, hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal
sph_shabal512( &ctx.shabal, hash0, 64 );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash1, 64 );
sph_shabal512_close( &ctx.shabal, hash1 );
memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash2, 64 );
sph_shabal512_close( &ctx.shabal, hash2 );
memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash3, 64 );
sph_shabal512_close( &ctx.shabal, hash3 );
// 15 Whirlpool
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// 16 SHA512
SHA512_Update( &ctx.sha512, hash0, 64 );
SHA512_Final( (unsigned char*)hash0, &ctx.sha512 );
memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash1, 64 );
SHA512_Final( (unsigned char*)hash1, &ctx.sha512 );
memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash2, 64 );
SHA512_Final( (unsigned char*)hash2, &ctx.sha512 );
memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash3, 64 );
SHA512_Final( (unsigned char*)hash3, &ctx.sha512 );
// 17 Haval
sph_haval256_5( &ctx.haval, (const void*)hash0, 64 );
sph_haval256_5_close( &ctx.haval, hash0 );
memcpy( &ctx.haval, &x17_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash1, 64 );
sph_haval256_5_close( &ctx.haval, hash1 );
memcpy( &ctx.haval, &x17_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash2, 64 );
sph_haval256_5_close( &ctx.haval, hash2 );
memcpy( &ctx.haval, &x17_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash3, 64 );
sph_haval256_5_close( &ctx.haval, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
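/* For reference, a plain-C sketch of what the (de)interleave helpers are
   assumed to do here: mm256_interleave_4x64 packs the i-th 64-bit word of
   each of the four lane buffers into four consecutive words of the vector
   buffer, and mm256_deinterleave_4x64 reverses it. Hypothetical scalar
   equivalent:

      void interleave_4x64_ref( uint64_t *v, const uint64_t *s0,
                                const uint64_t *s1, const uint64_t *s2,
                                const uint64_t *s3, int bit_len )
      {
         for ( int i = 0; i < bit_len/64; i++ )
         {
            v[ 4*i     ] = s0[i];   v[ 4*i + 1 ] = s1[i];
            v[ 4*i + 2 ] = s2[i];   v[ 4*i + 3 ] = s3[i];
         }
      }
*/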
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
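// The htmax/masks tables pair each target ceiling with the widest
// high-bit mask a solution must zero, so the loop below rejects most
// non-solutions with a single AND before calling fulltest().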
// big endian encode the 20 words of the block header; word 19 (the
// nonce) is rewritten into vdata for each lane in the search loop.
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x17_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x17/x17-gate.c Normal file

@@ -0,0 +1,17 @@
#include "x17-gate.h"
bool register_x17_algo( algo_gate_t* gate )
{
#if defined (X17_4WAY)
init_x17_4way_ctx();
gate->scanhash = (void*)&scanhash_x17_4way;
gate->hash = (void*)&x17_4way_hash;
#else
init_x17_ctx();
gate->scanhash = (void*)&scanhash_x17;
gate->hash = (void*)&x17_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
return true;
};

algo/x17/x17-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef X17_GATE_H__
#define X17_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define X17_4WAY
#endif
bool register_x17_algo( algo_gate_t* gate );
#if defined(X17_4WAY)
void x17_4way_hash( void *state, const void *input );
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x17_4way_ctx();
#endif
void x17_hash( void *state, const void *input );
int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x17_ctx();
#endif

algo/x17/x17.c

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "x17-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -86,7 +86,7 @@ void init_x17_ctx()
sph_haval256_5_init(&x17_ctx.haval);
};
static void x17hash(void *output, const void *input)
void x17_hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (64)));
#define hashB hash+64
@@ -248,7 +248,7 @@ int scanhash_x17(int thr_id, struct work *work,
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
x17hash(hash64, endiandata);
x17_hash(hash64, endiandata);
#ifndef DEBUG_ALGO
if (!(hash64[7] & mask))
{
@@ -281,7 +281,7 @@ int scanhash_x17(int thr_id, struct work *work,
pdata[19] = n;
return 0;
}
/*
bool register_x17_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
@@ -290,4 +290,4 @@ bool register_x17_algo( algo_gate_t* gate )
gate->hash = (void*)&x17hash;
return true;
};
*/

algo/x17/xevan-4way.c Normal file

@@ -0,0 +1,556 @@
#include "xevan-gate.h"
#if defined(__AVX2__) && defined(__AES__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h"
#include <openssl/sha.h>
typedef struct {
blake512_4way_context blake;
sph_bmw512_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
hashState_echo echo;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
sph_haval256_5_context haval;
} xevan_4way_ctx_holder;
xevan_4way_ctx_holder xevan_4way_ctx __attribute__ ((aligned (64)));
static __thread blake512_4way_context xevan_blake_4way_mid
__attribute__ ((aligned (64)));
void init_xevan_4way_ctx()
{
blake512_4way_init(&xevan_4way_ctx.blake);
sph_bmw512_init(&xevan_4way_ctx.bmw);
init_groestl( &xevan_4way_ctx.groestl, 64 );
skein512_4way_init(&xevan_4way_ctx.skein);
jh512_4way_init(&xevan_4way_ctx.jh);
keccak512_4way_init(&xevan_4way_ctx.keccak);
init_luffa( &xevan_4way_ctx.luffa, 512 );
cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &xevan_4way_ctx.shavite );
init_sd( &xevan_4way_ctx.simd, 512 );
init_echo( &xevan_4way_ctx.echo, 512 );
sph_hamsi512_init( &xevan_4way_ctx.hamsi );
sph_fugue512_init( &xevan_4way_ctx.fugue );
sph_shabal512_init( &xevan_4way_ctx.shabal );
sph_whirlpool_init( &xevan_4way_ctx.whirlpool );
SHA512_Init( &xevan_4way_ctx.sha512 );
sph_haval256_5_init( &xevan_4way_ctx.haval );
};
void xevan_4way_blake512_midstate( const void* input )
{
memcpy( &xevan_blake_4way_mid, &xevan_4way_ctx.blake,
sizeof(xevan_blake_4way_mid) );
blake512_4way( &xevan_blake_4way_mid, input, 64 );
}
void xevan_4way_hash( void *output, const void *input )
{
uint64_t hash0[16] __attribute__ ((aligned (64)));
uint64_t hash1[16] __attribute__ ((aligned (64)));
uint64_t hash2[16] __attribute__ ((aligned (64)));
uint64_t hash3[16] __attribute__ ((aligned (64)));
uint64_t vhash[16<<2] __attribute__ ((aligned (64)));
const int dataLen = 128;
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
xevan_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
memcpy( &ctx.blake, &xevan_blake_4way_mid,
sizeof(xevan_blake_4way_mid) );
blake512_4way( &ctx.blake, input + (midlen<<2), tail );
blake512_4way_close(&ctx.blake, vhash);
memset( &vhash[8<<2], 0, 64<<2 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_bmw512( &ctx.bmw, hash0, dataLen );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, dataLen );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, dataLen );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, dataLen );
sph_bmw512_close( &ctx.bmw, hash3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
skein512_4way( &ctx.skein, vhash, dataLen );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
dataLen );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
sph_hamsi512( &ctx.hamsi, hash0, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash3 );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_shabal512( &ctx.shabal, hash0, dataLen );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash1, dataLen );
sph_shabal512_close( &ctx.shabal, hash1 );
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash2, dataLen );
sph_shabal512_close( &ctx.shabal, hash2 );
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash3, dataLen );
sph_shabal512_close( &ctx.shabal, hash3 );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
SHA512_Update( &ctx.sha512, hash0, dataLen );
SHA512_Final( (unsigned char*)hash0, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash1, dataLen );
SHA512_Final( (unsigned char*)hash1, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash2, dataLen );
SHA512_Final( (unsigned char*)hash2, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash3, dataLen );
SHA512_Final( (unsigned char*)hash3, &ctx.sha512 );
sph_haval256_5( &ctx.haval, (const void*)hash0, dataLen );
sph_haval256_5_close( &ctx.haval, hash0 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash1, dataLen );
sph_haval256_5_close( &ctx.haval, hash1 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash2, dataLen );
sph_haval256_5_close( &ctx.haval, hash2 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash3, dataLen );
sph_haval256_5_close( &ctx.haval, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 );
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
blake512_4way( &ctx.blake, vhash, dataLen );
blake512_4way_close(&ctx.blake, vhash);
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_bmw512( &ctx.bmw, hash0, dataLen );
sph_bmw512_close( &ctx.bmw, hash0 );
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash1, dataLen );
sph_bmw512_close( &ctx.bmw, hash1 );
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash2, dataLen );
sph_bmw512_close( &ctx.bmw, hash2 );
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
sph_bmw512( &ctx.bmw, hash3, dataLen );
sph_bmw512_close( &ctx.bmw, hash3 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
skein512_4way( &ctx.skein, vhash, dataLen );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
(const BitSequence*)hash0, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
(const BitSequence*)hash1, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
dataLen );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0,
(const BitSequence *)hash0, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash1,
(const BitSequence *)hash1, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
sph_hamsi512( &ctx.hamsi, hash0, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, dataLen );
sph_hamsi512_close( &ctx.hamsi, hash3 );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_shabal512( &ctx.shabal, hash0, dataLen );
sph_shabal512_close( &ctx.shabal, hash0 );
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash1, dataLen );
sph_shabal512_close( &ctx.shabal, hash1 );
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash2, dataLen );
sph_shabal512_close( &ctx.shabal, hash2 );
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
sizeof(sph_shabal512_context) );
sph_shabal512( &ctx.shabal, hash3, dataLen );
sph_shabal512_close( &ctx.shabal, hash3 );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
SHA512_Update( &ctx.sha512, hash0, dataLen );
SHA512_Final( (unsigned char*)hash0, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash1, dataLen );
SHA512_Final( (unsigned char*)hash1, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash2, dataLen );
SHA512_Final( (unsigned char*)hash2, &ctx.sha512 );
memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) );
SHA512_Update( &ctx.sha512, hash3, dataLen );
SHA512_Final( (unsigned char*)hash3, &ctx.sha512 );
sph_haval256_5( &ctx.haval, (const void*)hash0, dataLen );
sph_haval256_5_close( &ctx.haval, hash0 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash1, dataLen );
sph_haval256_5_close( &ctx.haval, hash1 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash2, dataLen );
sph_haval256_5_close( &ctx.haval, hash2 );
memcpy( &ctx.haval, &xevan_4way_ctx.haval,
sizeof(sph_haval256_5_context) );
sph_haval256_5( &ctx.haval, (const void*)hash3, dataLen );
sph_haval256_5_close( &ctx.haval, hash3 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
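/* Flow note: Xevan runs the full 17-algo chain twice over fixed 128-byte
   (dataLen) blocks. Each 64-byte intermediate digest is left zero-padded
   to 128 bytes for the next stage, the 32-byte Haval digest is re-padded
   before the second pass, and only 32 bytes per lane are returned. */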
int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
// uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int k=0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
xevan_4way_blake512_midstate( vdata );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
xevan_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] <= Htarg ) && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] <= Htarg ) && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] <= Htarg ) && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] <= Htarg ) && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x17/xevan-gate.c Normal file

@@ -0,0 +1,24 @@
#include "xevan-gate.h"
void xevan_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
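/* Worked example: with job_diff = 1 and opt_diff_factor = 1 this sets a
   share target of 1/256, i.e. 256 times easier than the canonical diff-1
   target, presumably to match Xevan pools' difficulty convention. */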
bool register_xevan_algo( algo_gate_t* gate )
{
#if defined (XEVAN_4WAY)
init_xevan_4way_ctx();
gate->scanhash = (void*)&scanhash_xevan_4way;
gate->hash = (void*)&xevan_4way_hash;
#else
init_xevan_ctx();
gate->scanhash = (void*)&scanhash_xevan;
gate->hash = (void*)&xevan_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->set_target = (void*)&xevan_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

algo/x17/xevan-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef XEVAN_GATE_H__
#define XEVAN_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#define XEVAN_4WAY
#endif
bool register_xevan_algo( algo_gate_t* gate );
#if defined(XEVAN_4WAY)
void xevan_4way_hash( void *state, const void *input );
int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_xevan_4way_ctx();
#endif
void xevan_hash( void *state, const void *input );
int scanhash_xevan( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_xevan_ctx();
#endif

algo/x17/xevan.c

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "xevan-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -286,19 +286,3 @@ int scanhash_xevan(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *
return 0;
}
void xevan_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_xevan_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_xevan_ctx();
gate->scanhash = (void*)&scanhash_xevan;
gate->hash = (void*)&xevan_hash;
gate->set_target = (void*)&xevan_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};


@@ -1,935 +0,0 @@
/*-
* Copyright 2009 Colin Percival
* Copyright 2013,2014 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include "sha256_Y.h"
#include "sysendian.h"
#include "yescrypt-platform.h"
static __inline void blkcpy(uint64_t * dest, const uint64_t * src, size_t count)
{
do {
*dest++ = *src++; *dest++ = *src++;
*dest++ = *src++; *dest++ = *src++;
} while (count -= 4);
}
static __inline void blkxor(uint64_t * dest, const uint64_t * src, size_t count)
{
do {
*dest++ ^= *src++; *dest++ ^= *src++;
*dest++ ^= *src++; *dest++ ^= *src++;
} while (count -= 4);
}
typedef union {
uint32_t w[16];
uint64_t d[8];
} salsa20_blk_t;
static __inline void salsa20_simd_shuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout)
{
#define COMBINE(out, in1, in2) \
Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32);
COMBINE(0, 0, 2)
COMBINE(1, 5, 7)
COMBINE(2, 2, 4)
COMBINE(3, 7, 1)
COMBINE(4, 4, 6)
COMBINE(5, 1, 3)
COMBINE(6, 6, 0)
COMBINE(7, 3, 5)
#undef COMBINE
}
static __inline void salsa20_simd_unshuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout)
{
#define COMBINE(out, in1, in2) \
Bout->w[out * 2] = (uint32_t) Bin->d[in1]; \
Bout->w[out * 2 + 1] = Bin->d[in2] >> 32;
COMBINE(0, 0, 6)
COMBINE(1, 5, 3)
COMBINE(2, 2, 0)
COMBINE(3, 7, 5)
COMBINE(4, 4, 2)
COMBINE(5, 1, 7)
COMBINE(6, 6, 4)
COMBINE(7, 3, 1)
#undef COMBINE
}
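/* salsa20_simd_shuffle and salsa20_simd_unshuffle are exact inverses:
   unshuffle(shuffle(B)) == B. The shuffled layout stores each 64-bit
   word as a pair of 32-bit halves drawn from different positions, e.g.
   d[0] = w[0] | ((uint64_t)w[5] << 32). */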
/**
* salsa20_8(B):
* Apply the salsa20/8 core to the provided block.
*/
static void salsa20_8(uint64_t B[8])
{
size_t i;
salsa20_blk_t X;
#define x X.w
salsa20_simd_unshuffle((const salsa20_blk_t *)B, &X);
for (i = 0; i < 8; i += 2) {
#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
/* Operate on columns */
x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9);
x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18);
x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9);
x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18);
x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9);
x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18);
x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9);
x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18);
/* Operate on rows */
x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9);
x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18);
x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9);
x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18);
x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9);
x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18);
x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9);
x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18);
#undef R
}
#undef x
{
salsa20_blk_t Y;
salsa20_simd_shuffle(&X, &Y);
for (i = 0; i < 16; i += 4) {
((salsa20_blk_t *)B)->w[i] += Y.w[i];
((salsa20_blk_t *)B)->w[i + 1] += Y.w[i + 1];
((salsa20_blk_t *)B)->w[i + 2] += Y.w[i + 2];
((salsa20_blk_t *)B)->w[i + 3] += Y.w[i + 3];
}
}
}
/**
* blockmix_salsa8(Bin, Bout, X, r):
* Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r
* bytes in length; the output Bout must also be the same size. The
* temporary space X must be 64 bytes.
*/
static void
blockmix_salsa8(const uint64_t * Bin, uint64_t * Bout, uint64_t * X, size_t r)
{
size_t i;
/* 1: X <-- B_{2r - 1} */
blkcpy(X, &Bin[(2 * r - 1) * 8], 8);
/* 2: for i = 0 to 2r - 1 do */
for (i = 0; i < 2 * r; i += 2) {
/* 3: X <-- H(X \xor B_i) */
blkxor(X, &Bin[i * 8], 8);
salsa20_8(X);
/* 4: Y_i <-- X */
/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
blkcpy(&Bout[i * 4], X, 8);
/* 3: X <-- H(X \xor B_i) */
blkxor(X, &Bin[i * 8 + 8], 8);
salsa20_8(X);
/* 4: Y_i <-- X */
/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
blkcpy(&Bout[i * 4 + r * 8], X, 8);
}
}
/* These are tunable */
#define S_BITS 8
#define S_SIMD 2
#define S_P 4
#define S_ROUNDS 6
/* Number of S-boxes. Not tunable, hard-coded in a few places. */
#define S_N 2
/* Derived values. Not tunable on their own. */
#define S_SIZE1 (1 << S_BITS)
#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8)
#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK)
#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD)
#define S_P_SIZE (S_P * S_SIMD)
#define S_MIN_R ((S_P * S_SIMD + 15) / 16)
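/* With the defaults above: S_SIZE1 = 256, S_MASK = 0xFF0,
   S_SIZE_ALL = 1024 64-bit words (8 KiB of S-boxes), S_P_SIZE = 8,
   and S_MIN_R = 1. */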
/**
* pwxform(B):
* Transform the provided block using the provided S-boxes.
*/
static void block_pwxform(uint64_t * B, const uint64_t * S)
{
uint64_t (*X)[S_SIMD] = (uint64_t (*)[S_SIMD])B;
const uint8_t *S0 = (const uint8_t *)S;
const uint8_t *S1 = (const uint8_t *)(S + S_SIZE1 * S_SIMD);
size_t i, j;
#if S_SIMD > 2
size_t k;
#endif
for (j = 0; j < S_P; j++) {
uint64_t *Xj = X[j];
uint64_t x0 = Xj[0];
#if S_SIMD > 1
uint64_t x1 = Xj[1];
#endif
for (i = 0; i < S_ROUNDS; i++) {
uint64_t x = x0 & S_MASK2;
const uint64_t *p0, *p1;
p0 = (const uint64_t *)(S0 + (uint32_t)x);
p1 = (const uint64_t *)(S1 + (x >> 32));
x0 = (uint64_t)(x0 >> 32) * (uint32_t)x0;
x0 += p0[0];
x0 ^= p1[0];
#if S_SIMD > 1
x1 = (uint64_t)(x1 >> 32) * (uint32_t)x1;
x1 += p0[1];
x1 ^= p1[1];
#endif
#if S_SIMD > 2
for (k = 2; k < S_SIMD; k++) {
x = Xj[k];
x = (uint64_t)(x >> 32) * (uint32_t)x;
x += p0[k];
x ^= p1[k];
Xj[k] = x;
}
#endif
}
Xj[0] = x0;
#if S_SIMD > 1
Xj[1] = x1;
#endif
}
}
/**
* blockmix_pwxform(Bin, Bout, S, r):
* Compute Bout = BlockMix_pwxform{salsa20/8, S, r}(Bin). The input Bin must
* be 128r bytes in length; the output Bout must also be the same size.
*
* S lacks the const qualifier to match blockmix_salsa8()'s prototype, since
* we need to refer to both functions via the same function pointer type.
*/
static void blockmix_pwxform(const uint64_t * Bin, uint64_t * Bout, uint64_t * S, size_t r)
{
size_t r1, r2, i;
/* Convert 128-byte blocks to (S_P_SIZE * 64-bit) blocks */
r1 = r * 128 / (S_P_SIZE * 8);
/* X <-- B_{r1 - 1} */
blkcpy(Bout, &Bin[(r1 - 1) * S_P_SIZE], S_P_SIZE);
/* X <-- X \xor B_i */
blkxor(Bout, Bin, S_P_SIZE);
/* X <-- H'(X) */
/* B'_i <-- X */
block_pwxform(Bout, S);
/* for i = 0 to r1 - 1 do */
for (i = 1; i < r1; i++) {
/* X <-- X \xor B_i */
blkcpy(&Bout[i * S_P_SIZE], &Bout[(i - 1) * S_P_SIZE],
S_P_SIZE);
blkxor(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE], S_P_SIZE);
/* X <-- H'(X) */
/* B'_i <-- X */
block_pwxform(&Bout[i * S_P_SIZE], S);
}
/* Handle partial blocks */
if (i * S_P_SIZE < r * 16)
blkcpy(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE],
r * 16 - i * S_P_SIZE);
i = (r1 - 1) * S_P_SIZE / 8;
/* Convert 128-byte blocks to 64-byte blocks */
r2 = r * 2;
/* B'_i <-- H(B'_i) */
salsa20_8(&Bout[i * 8]);
i++;
for (; i < r2; i++) {
/* B'_i <-- H(B'_i \xor B'_{i-1}) */
blkxor(&Bout[i * 8], &Bout[(i - 1) * 8], 8);
salsa20_8(&Bout[i * 8]);
}
}
/**
* integerify(B, r):
* Return the result of parsing B_{2r-1} as a little-endian integer.
*/
static __inline uint64_t
integerify(const uint64_t * B, size_t r)
{
/*
* Our 64-bit words are in host byte order, and word 6 holds the second 32-bit
* word of B_{2r-1} due to SIMD shuffling. The 64-bit value we return is also
* in host byte order, as it should be.
*/
const uint64_t * X = &B[(2 * r - 1) * 8];
uint32_t lo = (uint32_t) X[0];
uint32_t hi = (uint32_t) (X[6] >> 32);
return ((uint64_t)hi << 32) + lo;
}
/**
* smix1(B, r, N, flags, V, NROM, shared, XY, S):
* Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in
* length; the temporary storage V must be 128rN bytes in length; the temporary
* storage XY must be 256r + 64 bytes in length. The value N must be even and
* no smaller than 2.
*/
static void
smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags,
uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared,
uint64_t * XY, uint64_t * S)
{
void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) =
(S ? blockmix_pwxform : blockmix_salsa8);
const uint64_t * VROM = shared->shared1.aligned;
uint32_t VROM_mask = shared->mask1;
size_t s = 16 * r;
uint64_t * X = V;
uint64_t * Y = &XY[s];
uint64_t * Z = S ? S : &XY[2 * s];
uint64_t n, i, j;
size_t k;
/* 1: X <-- B */
/* 3: V_i <-- X */
for (i = 0; i < 2 * r; i++) {
const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8];
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8];
for (k = 0; k < 16; k++)
tmp->w[k] = le32dec(&src->w[k]);
salsa20_simd_shuffle(tmp, dst);
}
/* 4: X <-- H(X) */
/* 3: V_i <-- X */
blockmix(X, Y, Z, r);
blkcpy(&V[s], Y, s);
X = XY;
if (NROM && (VROM_mask & 1)) {
if ((1 & VROM_mask) == 1) {
/* j <-- Integerify(X) mod NROM */
j = integerify(Y, r) & (NROM - 1);
/* X <-- H(X \xor VROM_j) */
blkxor(Y, &VROM[j * s], s);
}
blockmix(Y, X, Z, r);
/* 2: for i = 0 to N - 1 do */
for (n = 1, i = 2; i < N; i += 2) {
/* 3: V_i <-- X */
blkcpy(&V[i * s], X, s);
if ((i & (i - 1)) == 0)
n <<= 1;
/* j <-- Wrap(Integerify(X), i) */
j = integerify(X, r) & (n - 1);
j += i - n;
/* X <-- X \xor V_j */
blkxor(X, &V[j * s], s);
/* 4: X <-- H(X) */
blockmix(X, Y, Z, r);
/* 3: V_i <-- X */
blkcpy(&V[(i + 1) * s], Y, s);
j = integerify(Y, r);
if (((i + 1) & VROM_mask) == 1) {
/* j <-- Integerify(X) mod NROM */
j &= NROM - 1;
/* X <-- H(X \xor VROM_j) */
blkxor(Y, &VROM[j * s], s);
} else {
/* j <-- Wrap(Integerify(X), i) */
j &= n - 1;
j += i + 1 - n;
/* X <-- H(X \xor V_j) */
blkxor(Y, &V[j * s], s);
}
blockmix(Y, X, Z, r);
}
} else {
yescrypt_flags_t rw = flags & YESCRYPT_RW;
/* 4: X <-- H(X) */
blockmix(Y, X, Z, r);
/* 2: for i = 0 to N - 1 do */
for (n = 1, i = 2; i < N; i += 2) {
/* 3: V_i <-- X */
blkcpy(&V[i * s], X, s);
if (rw) {
if ((i & (i - 1)) == 0)
n <<= 1;
/* j <-- Wrap(Integerify(X), i) */
j = integerify(X, r) & (n - 1);
j += i - n;
/* X <-- X \xor V_j */
blkxor(X, &V[j * s], s);
}
/* 4: X <-- H(X) */
blockmix(X, Y, Z, r);
/* 3: V_i <-- X */
blkcpy(&V[(i + 1) * s], Y, s);
if (rw) {
/* j <-- Wrap(Integerify(X), i) */
j = integerify(Y, r) & (n - 1);
j += (i + 1) - n;
/* X <-- X \xor V_j */
blkxor(Y, &V[j * s], s);
}
/* 4: X <-- H(X) */
blockmix(Y, X, Z, r);
}
}
/* B' <-- X */
for (i = 0; i < 2 * r; i++) {
const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8];
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8];
for (k = 0; k < 16; k++)
le32enc(&tmp->w[k], src->w[k]);
salsa20_simd_unshuffle(tmp, dst);
}
}
/**
* smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S):
* Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in
* length; the temporary storage V must be 128rN bytes in length; the temporary
* storage XY must be 256r + 64 bytes in length. The value N must be a
* power of 2 greater than 1. The value Nloop must be even.
*/
static void
smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop,
yescrypt_flags_t flags,
uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared,
uint64_t * XY, uint64_t * S)
{
void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) =
(S ? blockmix_pwxform : blockmix_salsa8);
const uint64_t * VROM = shared->shared1.aligned;
uint32_t VROM_mask = shared->mask1 | 1;
size_t s = 16 * r;
yescrypt_flags_t rw = flags & YESCRYPT_RW;
uint64_t * X = XY;
uint64_t * Y = &XY[s];
uint64_t * Z = S ? S : &XY[2 * s];
uint64_t i, j;
size_t k;
if (Nloop == 0)
return;
/* X <-- B' */
for (i = 0; i < 2 * r; i++) {
const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8];
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8];
for (k = 0; k < 16; k++)
tmp->w[k] = le32dec(&src->w[k]);
salsa20_simd_shuffle(tmp, dst);
}
if (NROM) {
/* 6: for i = 0 to N - 1 do */
for (i = 0; i < Nloop; i += 2) {
/* 7: j <-- Integerify(X) mod N */
j = integerify(X, r) & (N - 1);
/* 8: X <-- H(X \xor V_j) */
blkxor(X, &V[j * s], s);
/* V_j <-- Xprev \xor V_j */
if (rw)
blkcpy(&V[j * s], X, s);
blockmix(X, Y, Z, r);
j = integerify(Y, r);
if (((i + 1) & VROM_mask) == 1) {
/* j <-- Integerify(X) mod NROM */
j &= NROM - 1;
/* X <-- H(X \xor VROM_j) */
blkxor(Y, &VROM[j * s], s);
} else {
/* 7: j <-- Integerify(X) mod N */
j &= N - 1;
/* 8: X <-- H(X \xor V_j) */
blkxor(Y, &V[j * s], s);
/* V_j <-- Xprev \xor V_j */
if (rw)
blkcpy(&V[j * s], Y, s);
}
blockmix(Y, X, Z, r);
}
} else {
/* 6: for i = 0 to N - 1 do */
i = Nloop / 2;
do {
/* 7: j <-- Integerify(X) mod N */
j = integerify(X, r) & (N - 1);
/* 8: X <-- H(X \xor V_j) */
blkxor(X, &V[j * s], s);
/* V_j <-- Xprev \xor V_j */
if (rw)
blkcpy(&V[j * s], X, s);
blockmix(X, Y, Z, r);
/* 7: j <-- Integerify(X) mod N */
j = integerify(Y, r) & (N - 1);
/* 8: X <-- H(X \xor V_j) */
blkxor(Y, &V[j * s], s);
/* V_j <-- Xprev \xor V_j */
if (rw)
blkcpy(&V[j * s], Y, s);
blockmix(Y, X, Z, r);
} while (--i);
}
/* 10: B' <-- X */
for (i = 0; i < 2 * r; i++) {
const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8];
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8];
for (k = 0; k < 16; k++)
le32enc(&tmp->w[k], src->w[k]);
salsa20_simd_unshuffle(tmp, dst);
}
}
/**
* p2floor(x):
* Largest power of 2 not greater than argument.
*/
static uint64_t
p2floor(uint64_t x)
{
uint64_t y;
while ((y = x & (x - 1)))
x = y;
return x;
}
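/* e.g. p2floor(12) == 8, p2floor(8) == 8, p2floor(1) == 1 */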
/**
* smix(B, r, N, p, t, flags, V, NROM, shared, XY, S):
* Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the
* temporary storage V must be 128rN bytes in length; the temporary storage
* XY must be 256r+64 or (256r+64)*p bytes in length (the larger size is
* required with OpenMP-enabled builds). The value N must be a power of 2
* greater than 1.
*/
static void
smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t,
yescrypt_flags_t flags,
uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared,
uint64_t * XY, uint64_t * S)
{
size_t s = 16 * r;
uint64_t Nchunk = N / p, Nloop_all, Nloop_rw;
uint32_t i;
Nloop_all = Nchunk;
if (flags & YESCRYPT_RW) {
if (t <= 1) {
if (t)
Nloop_all *= 2; /* 2/3 */
Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */
} else {
Nloop_all *= t - 1;
}
} else if (t) {
if (t == 1)
Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */
Nloop_all *= t;
}
Nloop_rw = 0;
if (flags & __YESCRYPT_INIT_SHARED)
Nloop_rw = Nloop_all;
else if (flags & YESCRYPT_RW)
Nloop_rw = Nloop_all / p;
Nchunk &= ~(uint64_t)1; /* round down to even */
Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */
Nloop_rw &= ~(uint64_t)1; /* round down to even */
#ifdef _OPENMP
#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, shared, XY, S, s, Nchunk, Nloop_all, Nloop_rw)
{
#pragma omp for
#endif
for (i = 0; i < p; i++) {
uint64_t Vchunk = i * Nchunk;
uint64_t * Bp = &B[i * s];
uint64_t * Vp = &V[Vchunk * s];
#ifdef _OPENMP
uint64_t * XYp = &XY[i * (2 * s + 8)];
#else
uint64_t * XYp = XY;
#endif
uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk);
uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S;
if (Sp)
smix1(Bp, 1, S_SIZE_ALL / 16,
flags & ~YESCRYPT_PWXFORM,
Sp, NROM, shared, XYp, NULL);
if (!(flags & __YESCRYPT_INIT_SHARED_2))
smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp);
smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp,
NROM, shared, XYp, Sp);
}
if (Nloop_all > Nloop_rw) {
#ifdef _OPENMP
#pragma omp for
#endif
for (i = 0; i < p; i++) {
uint64_t * Bp = &B[i * s];
#ifdef _OPENMP
uint64_t * XYp = &XY[i * (2 * s + 8)];
#else
uint64_t * XYp = XY;
#endif
uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S;
smix2(Bp, r, N, Nloop_all - Nloop_rw,
flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp);
}
}
#ifdef _OPENMP
}
#endif
}
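/* The first loop above runs each thread's read-write iterations (Nloop_rw)
   on its own chunk of V; the second finishes the remaining iterations
   read-only across all of V, which is what keeps YESCRYPT_PARALLEL_SMIX
   safe to run concurrently. */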
/**
* yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen,
* N, r, p, t, flags, buf, buflen):
* Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
* p, buflen), or a revision of scrypt as requested by flags and shared, and
* write the result into buf. The parameters r, p, and buflen must satisfy
* r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power
* of 2 greater than 1.
*
* t controls computation time while not affecting peak memory usage. shared
* and flags may request special modes as described in yescrypt.h. local is
* the thread-local data structure, allowing a memory allocation to be
* preserved and reused across calls, thereby reducing its overhead.
*
* Return 0 on success; or -1 on error.
*/
int
yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
const uint8_t * passwd, size_t passwdlen,
const uint8_t * salt, size_t saltlen,
uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags,
uint8_t * buf, size_t buflen)
{
yescrypt_region_t tmp;
uint64_t NROM;
size_t B_size, V_size, XY_size, need;
uint64_t * B, * V, * XY, * S;
uint64_t sha256[4];
/*
* YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose,
* so don't let it have side-effects. Without this adjustment, it'd
* enable the SHA-256 password pre-hashing and output post-hashing,
* because any deviation from classic scrypt implies those.
*/
if (p == 1)
flags &= ~YESCRYPT_PARALLEL_SMIX;
/* Sanity-check parameters */
if (flags & ~YESCRYPT_KNOWN_FLAGS) {
errno = EINVAL;
return -1;
}
#if SIZE_MAX > UINT32_MAX
if (buflen > (((uint64_t)(1) << 32) - 1) * 32) {
errno = EFBIG;
return -1;
}
#endif
if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) {
errno = EFBIG;
return -1;
}
if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) {
errno = EINVAL;
return -1;
}
if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) {
errno = EINVAL;
return -1;
}
#if S_MIN_R > 1
if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) {
errno = EINVAL;
return -1;
}
#endif
if ((p > SIZE_MAX / ((size_t)256 * r + 64)) ||
#if SIZE_MAX / 256 <= UINT32_MAX
(r > SIZE_MAX / 256) ||
#endif
(N > SIZE_MAX / 128 / r)) {
errno = ENOMEM;
return -1;
}
if (N > UINT64_MAX / ((uint64_t)t + 1)) {
errno = EFBIG;
return -1;
}
#ifdef _OPENMP
if (!(flags & YESCRYPT_PARALLEL_SMIX) &&
(N > SIZE_MAX / 128 / (r * p))) {
errno = ENOMEM;
return -1;
}
#endif
if ((flags & YESCRYPT_PWXFORM) &&
#ifndef _OPENMP
(flags & YESCRYPT_PARALLEL_SMIX) &&
#endif
p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) {
errno = ENOMEM;
return -1;
}
NROM = 0;
if (shared->shared1.aligned) {
NROM = shared->shared1.aligned_size / ((size_t)128 * r);
if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) ||
!(flags & YESCRYPT_RW)) {
errno = EINVAL;
return -1;
}
}
/* Allocate memory */
V = NULL;
V_size = (size_t)128 * r * N;
#ifdef _OPENMP
if (!(flags & YESCRYPT_PARALLEL_SMIX))
V_size *= p;
#endif
need = V_size;
if (flags & __YESCRYPT_INIT_SHARED) {
if (local->aligned_size < need) {
if (local->base || local->aligned ||
local->base_size || local->aligned_size) {
errno = EINVAL;
return -1;
}
if (!alloc_region(local, need))
return -1;
}
V = (uint64_t *)local->aligned;
need = 0;
}
B_size = (size_t)128 * r * p;
need += B_size;
if (need < B_size) {
errno = ENOMEM;
return -1;
}
XY_size = (size_t)256 * r + 64;
#ifdef _OPENMP
XY_size *= p;
#endif
need += XY_size;
if (need < XY_size) {
errno = ENOMEM;
return -1;
}
if (flags & YESCRYPT_PWXFORM) {
size_t S_size = S_SIZE_ALL * sizeof(*S);
#ifdef _OPENMP
S_size *= p;
#else
if (flags & YESCRYPT_PARALLEL_SMIX)
S_size *= p;
#endif
need += S_size;
if (need < S_size) {
errno = ENOMEM;
return -1;
}
}
if (flags & __YESCRYPT_INIT_SHARED) {
if (!alloc_region(&tmp, need))
return -1;
B = (uint64_t *)tmp.aligned;
XY = (uint64_t *)((uint8_t *)B + B_size);
} else {
init_region(&tmp);
if (local->aligned_size < need) {
if (free_region(local))
return -1;
if (!alloc_region(local, need))
return -1;
}
B = (uint64_t *)local->aligned;
V = (uint64_t *)((uint8_t *)B + B_size);
XY = (uint64_t *)((uint8_t *)V + V_size);
}
S = NULL;
if (flags & YESCRYPT_PWXFORM)
S = (uint64_t *)((uint8_t *)XY + XY_size);
if (t || flags) {
SHA256_CTX_Y ctx;
SHA256_Init_Y(&ctx);
SHA256_Update_Y(&ctx, passwd, passwdlen);
SHA256_Final_Y((uint8_t *)sha256, &ctx);
passwd = (uint8_t *)sha256;
passwdlen = sizeof(sha256);
}
/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1,
(uint8_t *)B, B_size);
if (t || flags)
blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0]));
if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) {
smix(B, r, N, p, t, flags, V, NROM, shared, XY, S);
} else {
uint32_t i;
/* 2: for i = 0 to p - 1 do */
#ifdef _OPENMP
#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, shared, XY, S)
#endif
for (i = 0; i < p; i++) {
/* 3: B_i <-- MF(B_i, N) */
#ifdef _OPENMP
smix(&B[(size_t)16 * r * i], r, N, 1, t, flags,
&V[(size_t)16 * r * i * N],
NROM, shared,
&XY[((size_t)32 * r + 8) * i],
S ? &S[S_SIZE_ALL * i] : S);
#else
smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, V,
NROM, shared, XY, S);
#endif
}
}
/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen);
/*
* Except when computing classic scrypt, allow all computation so far
* to be performed on the client. The final steps below match those of
* SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so
* far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of
* SCRAM's use of SHA-1) would be usable with yescrypt hashes.
*/
if ((t || flags) && buflen == sizeof(sha256)) {
/* Compute ClientKey */
{
HMAC_SHA256_CTX ctx;
HMAC_SHA256_Init(&ctx, buf, buflen);
HMAC_SHA256_Update(&ctx, salt, saltlen);
HMAC_SHA256_Final((uint8_t *)sha256, &ctx);
}
/* Compute StoredKey */
{
SHA256_CTX_Y ctx;
SHA256_Init_Y(&ctx);
SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256));
SHA256_Final_Y(buf, &ctx);
}
}
if (free_region(&tmp))
return -1;
/* Success! */
return 0;
}
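/* A minimal usage sketch (hypothetical, not part of this file; it assumes
   the yescrypt_init_local()/yescrypt_free_local() helpers and YESCRYPT_RW
   flag declared in yescrypt.h):

      yescrypt_shared_t shared = { 0 };   // no ROM
      yescrypt_local_t local;
      uint8_t dk[32];
      if ( yescrypt_init_local( &local ) == 0 &&
           yescrypt_kdf( &shared, &local,
                         (const uint8_t*)"password", 8,
                         (const uint8_t*)"salt", 4,
                         4096, 16, 1, 0, YESCRYPT_RW,
                         dk, sizeof(dk) ) == 0 )
      {
          // dk now holds the 32-byte derived key
      }
      yescrypt_free_local( &local );
*/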

algo/yescrypt/yescrypt.c

@@ -426,7 +426,7 @@ int64_t yescryptr16_get_max64()
bool register_yescrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;
@@ -440,7 +440,7 @@ bool register_yescrypt_algo( algo_gate_t* gate )
bool register_yescryptr16_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT;
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;

avxdefs.h

@@ -1,71 +1,96 @@
#ifndef AVXDEFS_H__
#define AVXDEFS_H__
// Some tools to help using AVX and AVX2.
// At this time SSE2 is sufficient for all 128 bit code in this file
// but could change without notice.
// 256 bit requires AVX2.
// AVX512 has more powerful 256 bit instructions but with AVX512 available
// there is little reason to use them.
// Proper alignment of data is required, 16 bytes for 128 bit vectors and
// 32 bytes for 256 bit vectors. 64 byte alignment is recommended for
// best cache alignment.
//
// There exist duplicates of some functions. In general the first defined
// is preferred as it is more efficient, but it is also more restrictive
// and may not always be applicable. The less efficient versions are more
// flexible.
#include <inttypes.h>
#include <immintrin.h>
#include <memory.h>
#include <stdbool.h>
//
// 128 bit utilities and shortcuts
//
// Pseudo constants, there are no real vector constants.
// These can't be used for compile time initialization.
// Constant zero
#define mm_zero    _mm_setzero_si128()
// Constant 1
#define mm_one_128 _mm_set_epi64x( 0ULL, 1ULL )
#define mm_one_64 _mm_set1_epi64x( 1ULL )
#define mm_one_32 _mm_set1_epi32( 1UL )
#define mm_one_16 _mm_set1_epi16( 1U )
// Constant minus 1
#define mm_neg1    _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
//
// Basic operations without equivalent SIMD intrinsic
// Bitwise not (~x)
#define mm_not( x )  _mm_xor_si128( (x), mm_neg1 )
// Unary negation (-a)
#define mm_negate_64( a ) _mm_sub_epi64( mm_zero, a )
#define mm_negate_32( a ) _mm_sub_epi32( mm_zero, a )
#define mm_negate_16( a ) _mm_sub_epi16( mm_zero, a )
//
// Bit operations, functional but not very efficient
// Bit operations
// Return x with bit n set/clear in all elements
#define mm_bitset_128( x, n ) \
_mm_or_si128( _mm_slli_si128( _mm_set_epi64x( 0ULL, 1ULL ), n ) )
#define mm_bitclr_128( x, n ) \
_mm_and_si128( x, mm_not( _mm_slli_si128( \
_mm_set_epi64x( 0ULL, 1ULL ), n ) ) )
#define mm_bitset_64( x, n ) \
_mm_or_si128( _mm_slli_epi64( _mm_set1_epi64x( 1ULL ), n ) )
#define mm_bitclr_64( x, n ) \
_mm_and_si128( x, mm_not( _mm_slli_epi64( _mm_set1_epi64x( 1ULL ), n ) ) )
#define mm_bitset_32( x, n ) \
_mm_or_si128( _mm_slli_epi32( _mm_set1_epi32( 1UL ), n ) )
#define mm_bitclr_32( x, n ) \
_mm_and_si128( x, mm_not( _mm_slli_epi32( _mm_set1_epi32( 1UL ), n ) ) )
#define mm_bitset_16( x, n ) \
_mm_or_si128( _mm_slli_epi16( _mm_set1_epi16( 1U ), n ) )
#define mm_bitclr_16( x, n ) \
_mm_and_si128( x, mm_not( _mm_slli_epi16( _mm_set1_epi16( 1U ), n ) ) )
// return vector of bool
#define mm_bittest_128( x, n ) \
_mm_and_si256( _mm_srli_si128( x, n ), _mm_set_epi64x( 0ULL, 1ULL ) )
// Return bit n in position, all other bits zeroed.
#define mm_bitextract_64( x, n ) \
   _mm_and_si128( _mm_set1_epi64x( 1ULL << (n) ), x )
#define mm_bitextract_32( x, n ) \
   _mm_and_si128( _mm_set1_epi32( 1UL << (n) ), x )
#define mm_bitextract_16( x, n ) \
   _mm_and_si128( _mm_set1_epi16( 1U << (n) ), x )
// Return bit n as bool
#define mm_bittest_64( x, n ) \
_mm_and_si256( _mm_srli_epi64( x, n ), _mm_set1_epi64x( 1ULL ) )
_mm_and_si128( mm_one_64, _mm_srli_epi64( x, n ) )
#define mm_bittest_32( x, n ) \
_mm_and_si256( _mm_srli_epi32( x, n ), _mm_set1_epi32( 1UL ) )
_mm_and_si128( mm_one_32, _mm_srli_epi32( x, n ) )
#define mm_bittest_16( x, n ) \
_mm_and_si256( _mm_srli_epi16( x, n ), _mm_set1_epi16( 1U ) )
_mm_and_si128( mm_one_16, _mm_srli_epi16( x, n ) )
// Return x with bit n set/cleared in all elements
#define mm_bitset_64( x, n ) \
_mm_or_si128( _mm_slli_epi64( mm_one_64, n ), x )
#define mm_bitclr_64( x, n ) \
_mm_andnot_si128( _mm_slli_epi64( mm_one_64, n ), x )
#define mm_bitset_32( x, n ) \
_mm_or_si128( _mm_slli_epi32( mm_one_32, n ), x )
#define mm_bitclr_32( x, n ) \
_mm_andnot_si128( _mm_slli_epi32( mm_one_32, n ), x )
#define mm_bitset_16( x, n ) \
_mm_or_si128( _mm_slli_epi16( mm_one_16, n ), x )
#define mm_bitclr_16( x, n ) \
_mm_andnot_si128( _mm_slli_epi16( mm_one_16, n ), x )
// Return x with bit n toggled
#define mm_bitflip_64( x, n ) \
_mm_xor_si128( _mm_slli_epi64( mm_one_64, n ), x )
#define mm_bitflip_32( x, n ) \
_mm_xor_si128( _mm_slli_epi32( mm_one_32, n ), x )
#define mm_bitflip_16( x, n ) \
_mm_xor_si128( _mm_slli_epi16( mm_one_16, n ), x )
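// A usage sketch of the bit operations (hypothetical helper), values noted
// per 64 bit lane:
static inline __m128i mm_bitops_demo()
{
   __m128i x = mm_zero;
   x = mm_bitset_64( x, 5 );            // each lane: 0x20
   __m128i t = mm_bittest_64( x, 5 );   // each lane: 1, bit was set
   x = mm_bitflip_64( x, 5 );           // each lane: 0 again
   return _mm_or_si128( x, t );
}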
//
// Memory functions
@@ -86,13 +111,33 @@ inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
for ( int i = 0; i < n; i ++ ) dst[i] = src[i];
}
// Scalar 64 bit copy, n = bytes/8
inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n )
// Compare data in memory, return true if different
inline bool memcmp_128( __m128i src1, __m128i src2, int n )
{
for ( int i = 0; i < n; i++ )
dst[i] = src[i];
if ( src1[i] != src2[i] ) return true;
return false;
}
// A couple of 64 bit scalar functions
// n = bytes/8
inline void memcpy_64( uint64_t *dst, const uint64_t *src, int n )
{
for ( int i = 0; i < n; i++ ) dst[i] = src[i];
}
inline void memset_zero_64( uint64_t *src, int n )
{
for ( int i = 0; i < n; i++ ) src[i] = 0;
}
inline void memset_64( uint64_t *dst, uint64_t a, int n )
{
for ( int i = 0; i < n; i++ ) dst[i] = a;
}
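// For example, copying an 80 byte block header with the 64 bit helpers,
// n = bytes/8 (illustrative sketch, hypothetical buffers):
static inline void mm_scalar_demo()
{
   uint64_t hdr[10] = {0};          // 80 bytes
   uint64_t work[10];
   memcpy_64( work, hdr, 10 );      // copy the header
   memset_zero_64( work, 10 );      // clear it again
}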
//
// Pointer cast
@@ -108,149 +153,136 @@ inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n )
// returns p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
//
// Bit rotations
// XOP is an obsolete AMD feature that has native rotation.
// _mm_roti_epi64( w, c)
// Never implemented by Intel and since removed from Zen by AMD.
// Rotate bits in vector elements
#define mm_rotr_64( w, c ) _mm_or_si128( _mm_srli_epi64( w, c ), \
_mm_slli_epi64( w, 64-c ) )
_mm_slli_epi64( w, 64-(c) ) )
#define mm_rotl_64( w, c ) _mm_or_si128( _mm_slli_epi64( w, c ), \
_mm_srli_epi64( w, 64-c ) )
_mm_srli_epi64( w, 64-(c) ) )
#define mm_rotr_32( w, c ) _mm_or_si128( _mm_srli_epi32( w, c ), \
_mm_slli_epi32( w, 32-c ) )
_mm_slli_epi32( w, 32-(c) ) )
#define mm_rotl_32( w, c ) _mm_or_si128( _mm_slli_epi32( w, c ), \
_mm_srli_epi32( w, 32-c ) )
_mm_srli_epi32( w, 32-(c) ) )
#define mm_rotr_16( w, c ) _mm_or_si128( _mm_srli_epi16( w, c ), \
_mm_slli_epi16( w, 16-c ) )
_mm_slli_epi16( w, 16-(c) ) )
#define mm_rotl_16( w, c ) _mm_or_si128( _mm_slli_epi16( w, c ), \
_mm_srli_epi16( w, 16-c ) )
_mm_srli_epi16( w, 16-(c) ) )
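// Each rotate is the usual or of two shifts; a quick sketch (hypothetical
// helper) showing mm_rotr_64 matches the scalar rotation it implements:
static inline __m128i mm_rot_demo()
{
   uint64_t a = 0x0123456789abcdefULL;
   __m128i v = _mm_set1_epi64x( a );
   return mm_rotr_64( v, 8 );       // each lane == (a >> 8) | (a << 56)
}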
//
// Shuffle vector elements
// Rotate elements in vector
// Swap upper and lower 64 bits of 128 bit source vector
#define mm_swap_64(s) _mm_shuffle_epi32( s, 0x4e )
// Optimized shuffle
// Rotate 128 bit vector by one 32 bit element.
// Swap hi/lo 64 bits in 128 bit vector
#define mm_swap_64( w ) _mm_shuffle_epi32( w, 0x4e )
// rotate 128 bit vector by 32 bits
#define mm_rotr_1x32( w ) _mm_shuffle_epi32( w, 0x39 )
#define mm_rotl_1x32( w ) _mm_shuffle_epi32( w, 0x93 )
// Shuffle elements across two 128 bit vectors
// Swap hi/lo 32 bits in each 64 bit element
#define mm_swap64_32( x ) _mm_shuffle_epi32( x, 0xb1 )
// Swap 128 bit source vectors in place.
// Less efficient but more versatile. Use only for odd number rotations.
// Use shuffle above when possible.
// Rotate vector by n bytes.
#define mm_rotr128_x8( w, n ) \
_mm_or_si128( _mm_srli_si128( w, n ), _mm_slli_si128( w, 16-(n) ) )
#define mm_rotl128_x8( w, n ) \
_mm_or_si128( _mm_slli_si128( w, n ), _mm_srli_si128( w, 16-(n) ) )
// Rotate vector by c elements, use only for odd number rotations
#define mm_rotr128_x32( w, c ) mm_rotr128_x8( w, (c)<<2 )
#define mm_rotl128_x32( w, c ) mm_rotl128_x8( w, (c)<<2 )
#define mm_rotr128_x16( w, c ) mm_rotr128_x8( w, (c)<<1 )
#define mm_rotl128_x16( w, c ) mm_rotl128_x8( w, (c)<<1 )
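// A sketch of the element rotations (hypothetical helper), lanes written
// low to high:
static inline __m128i mm_rot_elements_demo()
{
   __m128i w = _mm_set_epi32( 3, 2, 1, 0 );   // lanes {0,1,2,3}
   w = mm_rotr_1x32( w );                     // lanes {1,2,3,0}
   w = mm_swap_64( w );                       // lanes {3,0,1,2}
   return w;
}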
//
// Rotate elements across two 128 bit vectors as one 256 bit vector {hi,lo}
// Swap 128 bit source vectors in place, aka rotate 256 bits by 128 bits.
// void mm128_swap128( __m128i, __m128i )
#define mm_swap_128(hi, lo) hi = _mm_xor_si128(hi, lo); \
lo = _mm_xor_si128(hi, lo); \
hi = _mm_xor_si128(hi, lo);
// Rotate two 128 bit vectors in place as one 256 vector by 1 element
#define mm_rotl256_1x64( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
s0 = t; \
} while(0)
#define mm_rotr256_1x64( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
s0 = t; \
} while(0)
#define mm_rotl256_1x32( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \
s0 = t; \
} while(0)
#define mm_rotr256_1x32( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \
s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi32( \
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
s0 = t; \
} while(0)
// Older slower
#define mm_rotl256_1x64x( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ); \
s1 = mm_swap_64( s1 ); \
t = _mm_or_si128( \
_mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
s1 = _mm_or_si128( \
_mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
s0 = t; \
} while(0)
#define mm_rotr256_1x64x( s0, s1 ) \
do { \
__m128i t; \
s0 = mm_swap_64( s0 ) ; \
s1 = mm_swap_64( s1 ); \
t = _mm_or_si128( \
_mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
s1 = _mm_or_si128( \
_mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
s0 = t; \
} while(0)
// need a better name, not rot, poke? step?
// Return s0 with elements shifted right/left and low/high element from
// s1 shifted into the vacated high/low element of s0.
// Partially rotate elements in two 128 bit vectors as one 256 bit vector
// and return the rotated s0.
// Similar to mm_rotr256_1x32 but only a partial rotation as s1 is not
// completed. It's faster than a full rotation.
inline __m128i mm_rotr256_32( __m128i s0, __m128i s1, int n )
{
return _mm_or_si128( _mm_srli_si128( s0, n<<2 ),
_mm_slli_si128( s1, 16 - (n<<2) ) );
#define mm_swap_128(hi, lo) \
{ \
hi = _mm_xor_si128(hi, lo); \
lo = _mm_xor_si128(hi, lo); \
hi = _mm_xor_si128(hi, lo); \
}
inline __m128i mm_rotl256_32( __m128i s0, __m128i s1, int n )
// Rotate two 128 bit vectors in place as one 256 vector by 1 element
#define mm_rotl256_1x64( hi, lo ) \
do { \
__m128i t; \
hi = mm_swap_64( hi ); \
lo = mm_swap_64( lo ); \
t = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
lo = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
hi = t; \
} while(0)
#define mm_rotr256_1x64( hi, lo ) \
do { \
__m128i t; \
hi = mm_swap_64( hi ); \
lo = mm_swap_64( lo ); \
t = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \
lo = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \
hi = t; \
} while(0)
#define mm_rotl256_1x32( hi, lo ) \
do { \
__m128i t; \
hi = mm_swap_64( hi ); \
lo = mm_swap_64( lo ); \
t = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
lo = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \
hi = t; \
} while(0)
#define mm_rotr256_1x32( hi, lo ) \
do { \
__m128i t; \
hi = mm_swap_64( hi ); \
lo = mm_swap_64( lo ); \
t = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \
lo = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
hi = t; \
} while(0)
// Return hi 128 bits with elements shifted one lane with vacated lane filled
// with data rotated from lo.
// Partially rotate elements in two 128 bit vectors as one 256 bit vector
// and return the rotated high 128 bits.
// Similar to mm_rotr256_1x32 but only a partial rotation as lo is not
// completed. It's faster than a full rotation.
inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n )
{
return _mm_or_si128( _mm_slli_si128( s0, n<<2 ),
_mm_srli_si128( s1, 16 - (n<<2) ) );
return _mm_or_si128( _mm_srli_si128( hi, n<<2 ),
_mm_slli_si128( lo, 16 - (n<<2) ) );
}
inline __m128i mm_rotl256hi_1x32( __m128i hi, __m128i lo, int n )
{
return _mm_or_si128( _mm_slli_si128( hi, n<<2 ),
_mm_srli_si128( lo, 16 - (n<<2) ) );
}
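// A sketch of the partial rotation with n = 1: hi's elements move down one
// lane and lo's low element fills the vacated high lane (hypothetical
// helper name).
static inline __m128i mm_rotr256hi_demo()
{
   __m128i hi = _mm_set_epi32( 7, 6, 5, 4 );
   __m128i lo = _mm_set_epi32( 3, 2, 1, 0 );
   return mm_rotr256hi_1x32( hi, lo, 1 );     // lanes {5,6,7,0} low to high
}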
//
// Swap bytes in vector elements
inline __m128i mm_byteswap_32( __m128i x )
{
return _mm_shuffle_epi8( x, _mm_set_epi8(
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
}
inline __m128i mm_byteswap_64( __m128i x )
{
return _mm_shuffle_epi8( x, _mm_set_epi8(
@@ -258,96 +290,95 @@ inline __m128i mm_byteswap_64( __m128i x )
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
}
// older slower
inline __m128i mm_byteswap_32x( __m128i x )
inline __m128i mm_byteswap_32( __m128i x )
{
__m128i x1 = _mm_and_si128( x, _mm_set1_epi32( 0x0000ff00 ) );
__m128i x2 = _mm_and_si128( x, _mm_set1_epi32( 0x00ff0000 ) );
__m128i x0 = _mm_slli_epi32( x, 24 ); // x0 = x << 24
x1 = _mm_slli_epi32( x1, 8 ); // x1 = mask(x) << 8
x2 = _mm_srli_epi32( x2, 8 ); // x2 = mask(x) >> 8
__m128i x3 = _mm_srli_epi32( x, 24 ); // x3 = x >> 24
return _mm_or_si128( _mm_or_si128( x0, x1 ), _mm_or_si128( x2, x3 ) );
return _mm_shuffle_epi8( x, _mm_set_epi8(
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
}
inline __m128i mm_byteswap_64x( __m128i x )
inline __m128i mm_byteswap_16( __m128i x )
{
x = _mm_or_si128( _mm_srli_epi64( x, 32 ), _mm_slli_epi64( x, 32 ));
x = _mm_or_si128( _mm_srli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ),
_mm_slli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 ));
return _mm_or_si128( _mm_srli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ),
_mm_slli_epi64( _mm_and_si128( x,
_mm_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 ));
return _mm_shuffle_epi8( x, _mm_set_epi8(
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
}
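// Note _mm_shuffle_epi8 requires SSSE3. Typical use is endian conversion,
// e.g. byte swapping four big endian 32 bit words at once (illustrative
// sketch):
static inline __m128i mm_byteswap_demo()
{
   uint32_t data[4] __attribute__ ((aligned (16))) =
      { 0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10 };
   __m128i be = casti_m128i( data, 0 );   // load 16 bytes
   return mm_byteswap_32( be );           // lane 0 becomes 0x04030201, etc.
}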
/////////////////////////////////////////////////////////////////////
#if defined (__AVX2__)
//
// 256 bit utilities and Shortcuts
//
// Pseudo constants, there are no real vector constants.
// These can't be used for compile time initialization
// Constant zero
#define mm256_zero _mm256_setzero_si256()
// Constant 1
#define mm256_one_128 _mm256_set_epi64x( 0ULL, 1ULL, 0ULL, 1ULL )
#define mm256_one_64 _mm256_set1_epi64x( 1ULL )
#define mm256_one_32 _mm256_set1_epi32( 1UL )
#define mm256_one_16 _mm256_set1_epi16( 1U )
// Constant minus 1
#define mm256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFF )
#define mm256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
//
// Basic operations without SIMD equivalent
// Bitwise not ( ~x )
#define mm256_not( x ) _mm256_xor_si256( (x), mm256_neg1 ) \
#define mm256_not( x )       _mm256_xor_si256( (x), mm256_neg1 )
// Unary negation ( -a )
#define mm256_negate_64( a ) _mm256_sub_epi64( mm256_zero, a )
#define mm256_negate_32( a ) _mm256_sub_epi32( mm256_zero, a )
#define mm256_negate_16( a ) _mm256_sub_epi16( mm256_zero, a )
//
// Bit operations
// Return x with bit n set/clear in all elements
#define mm256_bitset_128( x, n ) \
_mm256_or_si256( _mm256_slli_si256( _mm256_set_m128i( 1U, 1U ), n ) )
#define mm256_bitclr_128( x, n ) \
_mm256_and_si256( x, mm256_not( \
_mm256_slli_si256( _mm256_set_m128i( 1U, 1U ), n ) ) )
#define mm256_bitset_64( x, n ) \
_mm256_or_si256( x, _mm256_set1_epi64x( 1ULL << n ) )
#define mm256_bitclr_64( x, n ) \
_mm256_and_si256( x, mm256_not( _mm256_set1_epi64x( 1ULL << n ) ) )
#define mm256_bitset_32( x, n ) \
_mm256_or_si256( x, _mm256_set1_epi32( 1UL << n ) )
#define mm256_bitclr_32( x, n ) \
_mm256_and_si256( x, mm256_not( _mm256_set1_epi32( 1UL << n ) ) )
#define mm256_bitset_16( x, n ) \
_mm256_or_si256( x, _mm256_set1_epi16( 1U << n ) )
#define mm256_bitclr_16( x, n ) \
_mm256_and_si256( x, mm256_not( _mm256_set1_epi16( 1U << n ) ) )
// return vector of bool
#define mm256_bittest_128( x, n ) \
_mm256_and_si256( _mm256_srli_si256( x, n ), \
_mm256_set_m128i( _mm_set_epi64x( 0ULL, 1ULL ) ) )
// return bit n in position, all other bits cleared
#define mm256_bitextract_64( x, n ) \
   _mm256_and_si256( _mm256_set1_epi64x( 1ULL << (n) ), x )
#define mm256_bitextract_32( x, n ) \
   _mm256_and_si256( _mm256_set1_epi32( 1UL << (n) ), x )
#define mm256_bitextract_16( x, n ) \
   _mm256_and_si256( _mm256_set1_epi16( 1U << (n) ), x )
// Return bit n as bool (bit 0)
#define mm256_bittest_64( x, n ) \
_mm256_and_si256( _mm256_srli_epi64( x, n ), \
_mm256_set1_epi64x( 1ULL << n ) )
_mm256_and_si256( mm256_one_64, _mm256_srli_epi64( x, n ) )
#define mm256_bittest_32( x, n ) \
_mm256_and_si256( _mm256_srli_epi32( x, n ), \
_mm256_set1_epi32( 1UL << n ) )
_mm256_and_si256( mm256_one_32, _mm256_srli_epi32( x, n ) )
#define mm256_bittest_16( x, n ) \
_mm256_and_si256( _mm256_srli_epi16( x, n ), \
_mm256_set1_epi16( 1U << n ) )
_mm256_and_si256( mm256_one_16, _mm256_srli_epi16( x, n ) )
// Return x with bit n set/cleared in all elements
#define mm256_bitset_64( x, n ) \
_mm256_or_si256( _mm256_set1_epi64x( 1ULL << (n) ), x )
#define mm256_bitclr_64( x, n ) \
_mm256_andnot_si256( _mm256_set1_epi64x( 1ULL << (n) ), x )
#define mm256_bitset_32( x, n ) \
_mm256_or_si256( _mm256_set1_epi32( 1UL << (n) ), x )
#define mm256_bitclr_32( x, n ) \
_mm256_andnot_si256( _mm256_set1_epi32( 1UL << (n) ), x )
#define mm256_bitset_16( x, n ) \
_mm256_or_si256( _mm256_set1_epi16( 1U << (n) ), x )
#define mm256_bitclr_16( x, n ) \
_mm256_andnot_si256( _mm256_set1_epi16( 1U << (n) ), x )
// Return x with bit n toggled
#define mm256_bitflip_64( x, n ) \
   _mm256_xor_si256( _mm256_slli_epi64( mm256_one_64, n ), x )
#define mm256_bitflip_32( x, n ) \
   _mm256_xor_si256( _mm256_slli_epi32( mm256_one_32, n ), x )
#define mm256_bitflip_16( x, n ) \
   _mm256_xor_si256( _mm256_slli_epi16( mm256_one_16, n ), x )
//
// Memory functions
@@ -368,6 +399,14 @@ inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
for ( int i = 0; i < n; i ++ ) dst[i] = src[i];
}
// Compare data in memory, return true if different
inline bool memcmp_256( __m256i src1, __m256i src2, int n )
{
for ( int i = 0; i < n; i++ )
if ( src1[i] != src2[i] ) return true;
return false;
}
//
// Pointer casting
@@ -383,39 +422,128 @@ inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
// returns p[i]
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
//
// Bit rotations
//
// Rotate bits in vector elements
// w = packed data, c = number of bits to rotate
// Rotate bits in 64 bit elements
// w = packed 64 bit data, c = number of bits to rotate
#define mm256_rotr_64( w, c ) \
_mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) )
_mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64-(c)) )
#define mm256_rotl_64( w, c ) \
_mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64 - c) )
// Rotate bits in 32 bit elements
_mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64-(c)) )
#define mm256_rotr_32( w, c ) \
_mm256_or_si256( _mm256_srli_epi32(w, c), _mm256_slli_epi32(w, 32 - c) )
_mm256_or_si256( _mm256_srli_epi32(w, c), _mm256_slli_epi32(w, 32-(c)) )
#define mm256_rotl_32( w, c ) \
_mm256_or_si256( _mm256_slli_epi32(w, c), _mm256_srli_epi32(w, 32 - c) )
_mm256_or_si256( _mm256_slli_epi32(w, c), _mm256_srli_epi32(w, 32-(c)) )
#define mm256_rotr_16( w, c ) \
   _mm256_or_si256( _mm256_srli_epi16(w, c), _mm256_slli_epi16(w, 16-(c)) )
#define mm256_rotl_16( w, c ) \
   _mm256_or_si256( _mm256_slli_epi16(w, c), _mm256_srli_epi16(w, 16-(c)) )
//
// Rotate elements in vector
// There is no full vector permute for elements less than 64 bits and no
// full 256 bit shift, so a little more work is needed.
// Swap 128 bit elements (aka rotate by two 64 bit or four 32 bit elements)
// Identical functionality but "f" is AVX and "x" is AVX2, likely faster.
#define mm256_swap_128( w ) _mm256_permute2x128_si256( w, w, 1 )
//#define mm256_swap_128( w ) _mm256_permute2f128_si256( w, w, 1 )
// Optimized 64 bit permutations
// Swap 128, aka rotate 2x64, 4x32, 8x16, 16x8
#define mm256_swap_128( w ) _mm256_permute4x64_epi64( w, 0x4e )
//#define mm256_swap_128( w ) _mm256_permute2x128_si256( w, w, 1 )
// Rotate vector by one 64 bit element (aka two 32 bit elements)
//__m256i mm256_rotl256_1x64( _mm256i, int )
// Rotate 256 bit vector by one 64 bit element, aka 2x32, 4x16, 8x8
#define mm256_rotl256_1x64( w ) _mm256_permute4x64_epi64( w, 0x93 )
#define mm256_rotr256_1x64( w ) _mm256_permute4x64_epi64( w, 0x39 )
// Rotate by one 32 bit element (aka two 16 bit elements)
#define mm256_rotl256_1x32( w ) _mm256_shuffle_epi32( w, 0x93 )
#define mm256_rotr256_1x32( w ) _mm256_shuffle_epi32( w, 0x39 )
// Swap hi/lo 64 bits in each 128 bit element
#define mm256_swap128_64( x ) _mm256_shuffle_epi32( x, 0x4e )
// Rotate 128 bit elements by 32 bits
#define mm256_rotr128_1x32( x ) _mm256_shuffle_epi32( x, 0x39 )
#define mm256_rotl128_1x32( x ) _mm256_shuffle_epi32( x, 0x93 )
// Swap hi/lo 32 bits in each 64 bit element
#define mm256_swap64_32( x ) _mm256_shuffle_epi32( x, 0xb1 )
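// A sketch of the optimized permutations (hypothetical helper), lanes
// written low to high:
static inline __m256i mm256_permute_demo()
{
   __m256i v = _mm256_set_epi64x( 3, 2, 1, 0 );   // lanes {0,1,2,3}
   v = mm256_rotr256_1x64( v );                   // lanes {1,2,3,0}
   v = mm256_swap_128( v );                       // lanes {3,0,1,2}
   return v;
}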
// Less efficient but more versatile. Use only for rotations that are not
// integral multiples of 64 bits. Use the permutations above when possible.
// Rotate 256 bit vector by c bytes.
#define mm256_rotr256_x8( w, c ) \
   _mm256_or_si256( _mm256_srli_si256( w, c ), \
                    mm256_swap_128( _mm256_slli_si256( w, 32-(c) ) ) )
#define mm256_rotl256_x8( w, c ) \
   _mm256_or_si256( _mm256_slli_si256( w, c ), \
                    mm256_swap_128( _mm256_srli_si256( w, 32-(c) ) ) )
// Rotate 256 bit vector by c elements, use only for odd value rotations
#define mm256_rotr256_x32( w, c ) mm256_rotr256_x8( w, (c)<<2 )
#define mm256_rotl256_x32( w, c ) mm256_rotl256_x8( w, (c)<<2 )
#define mm256_rotr256_x16( w, c ) mm256_rotr256_x8( w, (c)<<1 )
#define mm256_rotl256_x16( w, c ) mm256_rotl256_x8( w, (c)<<1 )
//
// Rotate two 256 bit vectors as one 512 bit vector
// Fast but limited to 128 bit granularity
#define mm256_swap512_256(a, b) _mm256_permute2x128_si256( a, b, 0x1032 )
#define mm256_rotr512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x0321 )
#define mm256_rotl512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x2103 )
// Much slower, for 64 and 32 bit granularity
#define mm256_rotr512_1x64(a, b) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_srli_si256(a,8), _mm256_slli_si256(b,24) ); \
b = _mm256_or_si256( _mm256_srli_si256(b,8), _mm256_slli_si256(a,24) ); \
a = t; \
} while (0)
#define mm256_rotl512_1x64(a, b) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_slli_si256(a,8), _mm256_srli_si256(b,24) ); \
b = _mm256_or_si256( _mm256_slli_si256(b,8), _mm256_srli_si256(a,24) ); \
a = t; \
} while (0)
#define mm256_rotr512_1x32(a, b) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_srli_si256(a,4), _mm256_slli_si256(b,28) ); \
b = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a,28) ); \
a = t; \
} while (0)
#define mm256_rotl512_1x32(a, b) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_slli_si256(a,4), _mm256_srli_si256(b,28) ); \
b = _mm256_or_si256( _mm256_slli_si256(b,4), _mm256_srli_si256(a,28) ); \
a = t; \
} while (0)
// Byte granularity but even a bit slower
#define mm256_rotr512_x8( a, b, n ) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_srli_epi64( a, n ), \
_mm256_slli_epi64( b, ( 32 - (n) ) ) ); \
b = _mm256_or_si256( _mm256_srli_epi64( b, n ), \
_mm256_slli_epi64( a, ( 32 - (n) ) ) ); \
a = t; \
} while (0)
#define mm256_rotl512_x8( a, b, n ) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_slli_epi64( a, n ), \
_mm256_srli_epi64( b, ( 32 - (n) ) ) ); \
b = _mm256_or_si256( _mm256_slli_epi64( b, n ), \
_mm256_srli_epi64( a, ( 32 - (n) ) ) ); \
a = t; \
} while (0)
//
// Swap bytes in vector elements
@@ -438,47 +566,30 @@ inline __m256i mm256_byteswap_32( __m256i x )
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
}
// older, slower
inline __m256i mm256_byteswap_32x( __m256i x )
inline __m256i mm256_byteswap_16( __m256i x )
{
__m256i x1 = _mm256_and_si256( x, _mm256_set1_epi32( 0x0000ff00 ) );
__m256i x2 = _mm256_and_si256( x, _mm256_set1_epi32( 0x00ff0000 ) );
__m256i x0 = _mm256_slli_epi32( x, 24 ); // x0 = x << 24
x1 = _mm256_slli_epi32( x1, 8 ); // x1 = mask1(x) << 8
x2 = _mm256_srli_epi32( x2, 8 ); // x2 = mask2(x) >> 8
__m256i x3 = _mm256_srli_epi32( x, 24 ); // x3 = x >> 24
return _mm256_or_si256( _mm256_or_si256( x0, x1 ),
_mm256_or_si256( x2, x3 ) );
}
inline __m256i mm256_byteswap_64x( __m256i x )
{
x = _mm256_or_si256( _mm256_srli_epi64( x, 32 ), _mm256_slli_epi64( x, 32 ));
x = _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ),
_mm256_slli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 ));
return _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ),
_mm256_slli_epi64( _mm256_and_si256( x,
_mm256_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 ));
return _mm256_shuffle_epi8( x, _mm256_set_epi8(
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01,
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
}
// Pack/Unpack two 128 bit vectors into/from one 256 bit vector
// usefulness tbd
// __m128i hi, __m128i lo, returns __m256i
#define mm256_pack_2x128( hi, lo ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
// __m128i hi, __m128i lo, __m256i src
#define mm256_unpack_2x128( hi, lo, src ) \
lo = _mm256_castsi256_si128( src ); \
hi = _mm256_castsi256_si128( mm256_swap_128( src ) );
// hi = _mm256_extracti128_si256( src, 1 );
// Pseudo parallel AES
// Probably noticeably slower than using pure 128 bit vectors
// More efficient if one key for both lanes.
inline __m256i mm256_aesenc_2x128( __m256i x, __m256i k )
{
__m128i hi, lo, khi, klo;
@@ -487,7 +598,6 @@ inline __m256i mm256_aesenc_2x128( __m256i x, __m256i k )
mm256_unpack_2x128( khi, klo, k );
lo = _mm_aesenc_si128( lo, klo );
hi = _mm_aesenc_si128( hi, khi );
return mm256_pack_2x128( hi, lo );
}
@@ -498,7 +608,6 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x )
mm256_unpack_2x128( hi, lo, x );
lo = _mm_aesenc_si128( lo, mm_zero );
hi = _mm_aesenc_si128( hi, mm_zero );
return mm256_pack_2x128( hi, lo );
}
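// A usage sketch assuming AES-NI with AVX2: two independent 128 bit AES
// states advance one round in a single 256 bit vector. The helper name and
// key values are placeholders.
static inline __m256i mm256_aes_demo()
{
   __m128i k0 = _mm_set1_epi32( 1 );
   __m128i k1 = _mm_set1_epi32( 2 );
   __m256i state = mm256_pack_2x128( _mm_setzero_si128(),
                                     _mm_setzero_si128() );
   __m256i rkeys = mm256_pack_2x128( k1, k0 );
   return mm256_aesenc_2x128( state, rkeys );   // one AES round per lane
}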
@@ -533,8 +642,6 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x )
// interleave 4 arrays of 32 bit elements for 128 bit processing
// bit_len must be 256, 512 or 640 bits.
// Vector indexing doesn't work with 32 bit data.
// There's no vector indexing here!!!
inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1,
const void *src2, const void *src3, int bit_len )
{
@@ -591,8 +698,6 @@ inline void mm_interleave_4x32x( void *dst, void *src0, void *src1,
}
}
// doesn't work with 32 bit elements
// no vector indexing here?
inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len )
{
@@ -632,7 +737,6 @@ inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
d3[4] = _mm_set_epi32( s[79], s[75], s[71], s[67] );
}
// deinterleave 4 arrays into individual buffers for scalar processing
// bit_len must be multiple of 32
inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2,
@@ -656,7 +760,7 @@ inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2,
#if defined (__AVX2__)
// Interleave 4 source buffers containing 64 bit data into the destination
// buffer
// buffer. Only bit_len 256, 512, 640 & 1024 are supported.
inline void mm256_interleave_4x64( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3, int bit_len )
{
@@ -682,6 +786,17 @@ inline void mm256_interleave_4x64( void *dst, const void *src0,
d[8] = _mm256_set_epi64x( s3[8], s2[8], s1[8], s0[8] );
d[9] = _mm256_set_epi64x( s3[9], s2[9], s1[9], s0[9] );
if ( bit_len <= 640 ) return;
d[10] = _mm256_set_epi64x( s3[10], s2[10], s1[10], s0[10] );
d[11] = _mm256_set_epi64x( s3[11], s2[11], s1[11], s0[11] );
d[12] = _mm256_set_epi64x( s3[12], s2[12], s1[12], s0[12] );
d[13] = _mm256_set_epi64x( s3[13], s2[13], s1[13], s0[13] );
d[14] = _mm256_set_epi64x( s3[14], s2[14], s1[14], s0[14] );
d[15] = _mm256_set_epi64x( s3[15], s2[15], s1[15], s0[15] );
// bit_len == 1024
}
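// A usage sketch of 4-way interleaving with bit_len 256 (hypothetical
// helper): four scalar states are transposed so lane i of each vector
// element holds stream i.
static inline void mm256_interleave_demo()
{
   uint64_t s0[4] __attribute__ ((aligned (32))) = {0};
   uint64_t s1[4] __attribute__ ((aligned (32))) = {0};
   uint64_t s2[4] __attribute__ ((aligned (32))) = {0};
   uint64_t s3[4] __attribute__ ((aligned (32))) = {0};
   __m256i v[4];                 // v[i] = { s3[i], s2[i], s1[i], s0[i] }
   mm256_interleave_4x64( v, s0, s1, s2, s3, 256 );
   // ... run 4-way compression on v, then mm256_deinterleave_4x64 to
   // extract the four results.
}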
// Slower version
@@ -705,7 +820,7 @@ inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1,
}
// Deinterleave 4 buffers of 64 bit data from the source buffer.
// bit_len must be 256, 512 or 640 bits.
// bit_len must be 256, 512, 640 or 1024 bits.
// Requires overrun padding for 640 bit len.
inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len )
@@ -730,11 +845,26 @@ inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2,
if ( bit_len <= 512 ) return;
// null change to overrun area
d0[2] = _mm256_set_epi64x( d0[2][3], d0[2][2], s[36], s[32] );
d1[2] = _mm256_set_epi64x( d1[2][3], d1[2][2], s[37], s[33] );
d2[2] = _mm256_set_epi64x( d2[2][3], d2[2][2], s[38], s[34] );
d3[2] = _mm256_set_epi64x( d3[2][3], d3[2][2], s[39], s[35] );
if ( bit_len <= 640 )
{
// null change to overrun area
d0[2] = _mm256_set_epi64x( d0[2][3], d0[2][2], s[36], s[32] );
d1[2] = _mm256_set_epi64x( d1[2][3], d1[2][2], s[37], s[33] );
d2[2] = _mm256_set_epi64x( d2[2][3], d2[2][2], s[38], s[34] );
d3[2] = _mm256_set_epi64x( d3[2][3], d3[2][2], s[39], s[35] );
return;
}
d0[2] = _mm256_set_epi64x( s[44], s[40], s[36], s[32] );
d1[2] = _mm256_set_epi64x( s[45], s[41], s[37], s[33] );
d2[2] = _mm256_set_epi64x( s[46], s[42], s[38], s[34] );
d3[2] = _mm256_set_epi64x( s[47], s[43], s[39], s[35] );
d0[3] = _mm256_set_epi64x( s[60], s[56], s[52], s[48] );
d1[3] = _mm256_set_epi64x( s[61], s[57], s[53], s[49] );
d2[3] = _mm256_set_epi64x( s[62], s[58], s[54], s[50] );
d3[3] = _mm256_set_epi64x( s[63], s[59], s[55], s[51] );
// bit_len == 1024
}
// Slower version
@@ -785,9 +915,9 @@ inline void mm256_interleave_8x32( void *dst, const void *src0,
s3[4], s2[4], s1[4], s0[4] );
d[ 5] = _mm256_set_epi32( s7[5], s6[5], s5[5], s4[5],
s3[5], s2[5], s1[5], s0[5] );
d [6] = _mm256_set_epi32( s7[6], s6[6], s5[6], s4[6],
d[ 6] = _mm256_set_epi32( s7[6], s6[6], s5[6], s4[6],
s3[6], s2[6], s1[6], s0[6] );
d [7] = _mm256_set_epi32( s7[7], s6[7], s5[7], s4[7],
d[ 7] = _mm256_set_epi32( s7[7], s6[7], s5[7], s4[7],
s3[7], s2[7], s1[7], s0[7] );
if ( bit_len <= 256 ) return;
@@ -904,22 +1034,22 @@ inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2,
d = ((uint32_t*)d1) + 8;
d1[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[153], s[145], s[137], s[129] );
d = ((uint32_t*)d1) + 8;
d = ((uint32_t*)d2) + 8;
d2[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[154], s[146], s[138], s[130]);
d = ((uint32_t*)d1) + 8;
d = ((uint32_t*)d3) + 8;
d3[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[155], s[147], s[139], s[131] );
d = ((uint32_t*)d1) + 8;
d = ((uint32_t*)d4) + 8;
d4[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[156], s[148], s[140], s[132] );
d = ((uint32_t*)d1) + 8;
d = ((uint32_t*)d5) + 8;
d5[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[157], s[149], s[141], s[133] );
d = ((uint32_t*)d1) + 8;
d = ((uint32_t*)d6) + 8;
d6[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[158], s[150], s[142], s[134] );
d = ((uint32_t*)d1) + 8;
d = ((uint32_t*)d7) + 8;
d7[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[159], s[151], s[143], s[135] );
}


@@ -1,10 +1,5 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
# Linux build
make distclean || echo clean
@@ -12,14 +7,8 @@ make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
# Ubuntu 10.04 (gcc 4.4)
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl
make -j 4


@@ -3,7 +3,7 @@
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA -DFOUR_WAY" ./configure --with-curl
CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-4way.exe
@@ -13,7 +13,7 @@ mv cpuminer cpuminer-4way
make clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx2.exe
@@ -23,7 +23,7 @@ mv cpuminer cpuminer-aes-avx2
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=corei7-avx -Wall -DUSE_SPH_SHA" ./configure --with-curl
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx.exe
@@ -33,7 +33,7 @@ mv cpuminer cpuminer-aes-avx
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -maes -msse4.2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-sse42.exe
@@ -43,7 +43,7 @@ mv cpuminer cpuminer-aes-sse42
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=corei7 -Wall -DUSE_SPH_SHA" ./configure --with-curl
CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse42.exe
@@ -53,7 +53,7 @@ mv cpuminer cpuminer-sse42
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse2.exe

configure

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.7.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.8.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.7.7'
PACKAGE_STRING='cpuminer-opt 3.7.7'
PACKAGE_VERSION='3.7.8'
PACKAGE_STRING='cpuminer-opt 3.7.8'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.7.7 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.7.8 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.7.7:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.7.8:";;
esac
cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.7.7
cpuminer-opt configure 3.7.8
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.7.7, which was
It was created by cpuminer-opt $as_me 3.7.8, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.7.7'
VERSION='3.7.8'
cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.7.7, which was
This file was extended by cpuminer-opt $as_me 3.7.8, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.7.7
cpuminer-opt config.status 3.7.8
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"


@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.7.7])
AC_INIT([cpuminer-opt], [3.7.8])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM


@@ -358,8 +358,8 @@ struct work {
char *job_id;
size_t xnonce2_len;
unsigned char *xnonce2;
uint32_t nonces[4];
bool nfound[4];
uint32_t nonces[8];
bool nfound[8];
};
struct stratum_job {

winbuild-cross.sh

@@ -0,0 +1,82 @@
#!/bin/bash
LOCAL_LIB="$HOME/usr/lib"
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
F="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
mkdir release
cp README.txt release/
cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
cp /usr/lib/gcc/x86_64-w64-mingw32/5.3-win32/libstdc++-6.dll release/
cp /usr/lib/gcc/x86_64-w64-mingw32/5.3-win32/libgcc_s_seh-1.dll release/
cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -msha -Wall -DFOUR_WAY" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-4way-sha.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-4way.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx-sha.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-avx2.exe
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=znver1 -Wall" ./configure $F
#make -j
#strip -s cpuminer.exe
#mv cpuminer.exe release/cpuminer-aes-sha.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-avx.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-sse42.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=corei7 -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-sse42.exe
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=core2 -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-sse2.exe
make clean || echo clean